In [100]:
import csv
import numpy as np
import pandas as pd
import pylab as p
from patsy import dmatrices
from sklearn.cross_validation import cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier 
%matplotlib inline  
# Read the training and test tables; row 0 of each CSV supplies the column names.
train, test = (pd.read_csv(csv_path, header=0) for csv_path in ('train.csv', 'test.csv'))

In [101]:
# Show column dtypes and non-null counts to see which columns have missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 90.5+ KB


In [102]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [103]:
def clean_data(train):
    """Return a cleaned copy of a passenger table with missing values imputed.

    The input frame is not modified; all work happens on a copy.

    Steps:
      * Missing ``Embarked`` values are filled with ``'S'``
        (presumably the most common port -- TODO confirm).
      * ``Fare`` values equal to 0 are treated as missing, then every missing
        fare is replaced by the mean fare of the passenger's ``Pclass``.
      * Missing ``Age`` values are replaced by the median age of the
        passenger's (``Pclass``, ``Sex``) group.
      * ``Family_Size`` is added as ``SibSp + Parch``.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the columns ``Embarked``, ``Fare``, ``Pclass``, ``Sex``,
        ``Age``, ``SibSp`` and ``Parch``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the imputed columns and the extra ``Family_Size``
        column. A value stays NaN only when its whole group has no observed
        value to impute from.
    """
    cleaned = train.copy()

    cleaned['Embarked'] = cleaned['Embarked'].fillna('S')

    # A fare of exactly 0 is treated as "unknown" rather than a real price.
    fare = cleaned['Fare'].where(cleaned['Fare'] != 0)
    # transform() broadcasts each class mean back onto the rows of that class,
    # so fillna() can align on the index without a per-row Python lookup.
    class_mean_fare = fare.groupby(cleaned['Pclass']).transform('mean')
    cleaned['Fare'] = fare.fillna(class_mean_fare)

    group_median_age = cleaned.groupby(['Pclass', 'Sex'])['Age'].transform('median')
    cleaned['Age'] = cleaned['Age'].fillna(group_median_age)

    cleaned['Family_Size'] = cleaned['SibSp'] + cleaned['Parch']
    return cleaned

In [104]:
# Push both tables through the same cleaning routine.
train, test = clean_data(train), clean_data(test)
# Placeholder response column for the test table; presumably only there so a
# formula with Survived on the left-hand side can be evaluated on it.
test['Survived'] = 123



In [105]:
# Re-check dtypes and non-null counts after cleaning.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 13 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       891 non-null object
Family_Size    891 non-null int64
dtypes: float64(2), int64(6), object(5)
memory usage: 97.5+ KB


In [76]:
# Model specification in patsy formula syntax: response on the left,
# predictors on the right, C(...) wrapping the categorical ones.
formula = 'Survived ~ C(Pclass) + C(Sex) + Age + SibSp + Parch + C(Embarked)'
y, x = dmatrices(formula, data=train, return_type="dataframe")
# Flatten the single-column response to shape (n_samples,): scikit-learn
# estimators warn when they are handed a column vector for y.
y = np.ravel(y.values)
x = x.values
# Build the test design matrix from the same formula. testy is only the
# response column of the test frame and is not a real label.
# NOTE(review): the columns match the training matrix only if every category
# level seen in train also occurs in test -- confirm.
testy, testx = dmatrices(formula, data=test, return_type="dataframe")

In [77]:
# Logistic-regression baseline: fit on the training design matrix, then
# predict a label for every row of the test design matrix.
LRtestdata = testx.values
LRmodel = LogisticRegression().fit(x, y)
output = LRmodel.predict(LRtestdata)

  y = column_or_1d(y, warn=True)


In [78]:
# Display the logistic-regression predictions for the test rows.
output

array([ 0.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  1.,
        0.,  1.,  1.,  0.,  0.,  1.,  1.,  0.,  0.,  1.,  1.,  1.,  0.,
        1.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  1.,  1.,  0.,
        0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,
        1.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  1.,
        1.,  1.,  0.,  1.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,  0.,  1.,
        0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  0.,  1.,
        0.,  1.,  0.,  1.,  0.,  1.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,
        1.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.,  0.,  0.,
        1.,  1.,  1.,  1.,  0.,  1.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,
        0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,
        1.,  1.,  0.,  1.,  1.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  1.,
        1.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  1.,  1.,  0

In [79]:
# Random forest of 100 trees, fitted on the training design matrix.
# random_state fixes the forest's internal randomness so that re-running
# this cell reproduces the same predictions.
RandomForestModel = RandomForestClassifier(n_estimators=100, random_state=0)
RandomForestModel.fit(x, y)
RRdata = testx.values
RandomForestOutput = RandomForestModel.predict(RRdata)



In [84]:
# Mean 10-fold cross-validated accuracy of the random forest on the training
# data. random_state makes the reported score repeatable between runs.
scores = cross_val_score(RandomForestClassifier(n_estimators=100, random_state=0),
                         x, y.ravel(), scoring='accuracy', cv=10)
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(scores.mean())

0.803753830439


In [57]:
# Write the random-forest predictions to a two-column CSV.
# "wb" is the mode the Python 2 csv module expects; the with-block closes
# the file even if a write fails part-way through.
with open("RRPrediction.csv", "wb") as prediction_file:
    out = csv.writer(prediction_file)
    out.writerow(["PassengerId", "Survived"])
    # Pair ids and predictions by position rather than by index label, and
    # cast to int so the column holds 0/1 even when the labels are floats.
    for passenger_id, survived in zip(test.PassengerId, RandomForestOutput):
        out.writerow([passenger_id, int(survived)])

In [56]:
# Write the logistic-regression predictions to a two-column CSV.
# "wb" is the mode the Python 2 csv module expects; the with-block closes
# the file even if a write fails part-way through.
with open("LRPrediction.csv", "wb") as prediction_file:
    out = csv.writer(prediction_file)
    out.writerow(["PassengerId", "Survived"])
    # Pair ids and predictions by position rather than by index label, and
    # cast the float labels (0., 1.) to int so the column holds 0/1.
    for passenger_id, survived in zip(test.PassengerId, output):
        out.writerow([passenger_id, int(survived)])