In [2]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import sklearn
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
import pandas as pd
import seaborn as sns
sns.set(style="white",color_codes=True)
plt.rcParams['figure.figsize'] = (15,9.27)
# Set the font set of the latex code to computer modern
matplotlib.rcParams['mathtext.fontset'] = "cm"

In [43]:
train = pd.read_csv('data/titanic/train.csv')
test = pd.read_csv('data/titanic/test.csv')
df = pd.concat([train,test])

In [44]:
df = df.drop(['Name','Ticket','Cabin','PassengerId'],axis=1)
df['Embarked'] = df['Embarked'].fillna('S')
average_age = df.Age.mean()
df['Age'] = df.Age.fillna(average_age)
df['Sex'] = df.Sex.map({'male':1,'female':0})
df['Embarked'] = df.Embarked.map({'S':0,'C':1,'Q':2})
df['family_size'] = df.SibSp + df.Parch
df = df.drop(['SibSp','Parch'],axis=1)
df['isalone'] = df.family_size.apply(lambda x: 1 if x==0 else 0)
df = pd.get_dummies(columns=['Embarked'],data=df)
df['Age'] = pd.qcut(df.Age,5,labels=range(5)).cat.codes
df['Fare'] = pd.qcut(df.Fare,5,labels=range(5)).cat.codes

In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 10 columns):
Age            1309 non-null int8
Fare           1309 non-null int8
Pclass         1309 non-null int64
Sex            1309 non-null int64
Survived       891 non-null float64
family_size    1309 non-null int64
isalone        1309 non-null int64
Embarked_0     1309 non-null uint8
Embarked_1     1309 non-null uint8
Embarked_2     1309 non-null uint8
dtypes: float64(1), int64(4), int8(2), uint8(3)
memory usage: 67.8 KB


In [7]:
from sklearn.linear_model import LogisticRegression #logistic regression
from sklearn import svm #support vector Machine
from sklearn.ensemble import RandomForestClassifier #Random Forest
from sklearn.neighbors import KNeighborsClassifier #KNN
from sklearn.naive_bayes import GaussianNB #Naive bayes
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier #Decision Tree
import xgboost as xgb

In [50]:
train_df = df[df.Survived.isnull()==False]
test_df = df[df.Survived.isnull()==True]
y = train_df['Survived']
x = train_df.drop('Survived',axis=1)

In [25]:
rf = RandomForestClassifier(n_estimators=100,n_jobs=-1)
logit = LogisticRegression()
lsvc = svm.SVC(kernel='linear',C=0.1,gamma=0.1,probability=True)
dt =DecisionTreeClassifier()
knn = KNeighborsClassifier()
gnb = GaussianNB()
xb = xgb.XGBClassifier()

In [10]:
def cv_model_selection(models,names,x,y,cv=10):
    means,stds = [],[]
    for model in models:
        model.fit(x,y)
        res_array = cross_val_score(model,x,y,cv=cv)
        means.append(res_array.mean())
        stds.append(res_array.std())
    res_df = pd.DataFrame({'model_name':names,'mean':means,'std':stds})
    res_df['mean/std'] = res_df['mean']/res_df['std']
    res_df = res_df.sort_values('mean',ascending=False)
    return res_df

In [51]:
models = [rf,logit,lsvc,dt,knn,gnb,xb]
cv_model_selection(models,['rf','logit','lsvc','dt','knn','gnb','xb'],x,y)

Unnamed: 0,model_name,mean,std,mean/std
6,xb,0.811456,0.027797,29.191893
0,rf,0.807012,0.039521,20.420067
1,logit,0.799171,0.026184,30.521353
2,lsvc,0.786698,0.027942,28.154442
3,dt,0.7857,0.055271,14.21536
4,knn,0.783466,0.035396,22.134185
5,gnb,0.780321,0.05334,14.6292


In [41]:
xb.fit(x,y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [42]:
from sklearn.model_selection import GridSearchCV

In [45]:
n_estimators=list(range(100,1100,100))
learn_rate=[0.05,0.1,0.2,0.3,0.25,0.4,0.5,0.6,0.7,0.8,0.9,1]
hyper={'n_estimators':n_estimators,'learning_rate':learn_rate}
gd=GridSearchCV(estimator=xgb.XGBClassifier(),param_grid=hyper,verbose=True,n_jobs=-1,cv=10)
gd.fit(x,y)
print(gd.best_score_)
print(gd.best_estimator_)

Fitting 10 folds for each of 120 candidates, totalling 1200 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   32.1s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed:  3.6min finished


0.8226711560044894
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=500,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)


In [49]:
n_estimators=range(100,1000,100)
hyper={'n_estimators':n_estimators}
gd=GridSearchCV(estimator=RandomForestClassifier(random_state=0),
                param_grid=hyper,verbose=True,n_jobs=-1,cv=10)
gd.fit(x,y)
print(gd.best_score_)
print(gd.best_estimator_)

Fitting 10 folds for each of 9 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:   41.7s finished


0.813692480359147
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)


In [12]:
from sklearn.ensemble import VotingClassifier

In [52]:
vcm = VotingClassifier(estimators=[('rf', rf), ('xb', xb),
('logit', logit), ('knn',knn),('lsvc',lsvc)], voting='soft', n_jobs=-1)

In [53]:
a = cross_val_score(vcm,x,y,cv=10)
a.mean(),a.std()

(0.8170491431165589, 0.034135395784352644)

In [56]:
x_test = test_df.drop('Survived',1)
vcm.fit(x,y)
y_pred = vcm.predict(x_test)

In [63]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1


In [62]:
test['Survived'] = y_pred.astype(int)

In [66]:
final = test[['PassengerId','Survived']]
final.to_csv("submit.csv", index=False)

In [70]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64