In [106]:
from __future__ import division
%matplotlib inline

import numpy as np
import pandas as pd
from copy import copy
import matplotlib.pyplot as plt
from IPython.display import display
from sklearn.utils import shuffle


In [107]:
# Load the train/test CSV files with the first column (PassengerId) as the index.
# DataFrame.from_csv was deprecated and later removed from pandas; read_csv with
# index_col=0 is its replacement. from_csv's implicit date parsing of the index is
# not carried over because the index here is an integer id.
data_train = pd.read_csv('train.csv', index_col=0)
data_test = pd.read_csv('test.csv', index_col=0)

In [108]:
# Quick look at the training data: first rows, summary statistics of the
# numeric columns, and the number of missing values per column.
display(data_train.head())
print(data_train.describe())
print(data_train.isnull().sum())

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


         Survived      Pclass         Age       SibSp       Parch        Fare
count  891.000000  891.000000  714.000000  891.000000  891.000000  891.000000
mean     0.383838    2.308642   29.699118    0.523008    0.381594   32.204208
std      0.486592    0.836071   14.526497    1.102743    0.806057   49.693429
min      0.000000    1.000000    0.420000    0.000000    0.000000    0.000000
25%      0.000000    2.000000   20.125000    0.000000    0.000000    7.910400
50%      0.000000    3.000000   28.000000    0.000000    0.000000   14.454200
75%      1.000000    3.000000   38.000000    1.000000    0.000000   31.000000
max      1.000000    3.000000   80.000000    8.000000    6.000000  512.329200
Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64


In [109]:
# Separate features from the target. Every column after the first one
# ('Survived') is a feature; .iloc selects them by position and replaces the
# .ix indexer, which no longer exists in current pandas.
data_train_X = data_train.iloc[:, 1:].copy()
data_train_y = data_train['Survived'].copy()

# Shuffle features and labels together (fixed seed) so rows stay aligned.
data_train_X, data_train_y = shuffle(data_train_X, data_train_y, random_state=0)

In [110]:
# Preview the shuffled training features.
display(data_train_X.head())

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
496,3,"Yousseff, Mr. Gerious",male,,0,0,2627,14.4583,,C
649,3,"Willey, Mr. Edward",male,,0,0,S.O./P.P. 751,7.55,,S
279,3,"Rice, Master. Eric",male,7.0,4,1,382652,29.125,,Q
32,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,,1,0,PC 17569,146.5208,B78,C
256,3,"Touma, Mrs. Darwis (Hanne Youssef Razi)",female,29.0,0,2,2650,15.2458,,C


In [111]:
def split_cabin_string(x):
    """Split a cabin code into its first character and the remainder.

    Parameters
    ----------
    x : object
        A cabin code such as ``'C85'``, or a non-string placeholder
        (for example NaN for a missing cabin).

    Returns
    -------
    tuple
        ``(first_character, rest)`` when ``x`` is a string; ``(x, x)``
        unchanged for any non-string value.
    """
    if isinstance(x, str):
        # Slice rather than index so an empty string yields ('', '')
        # instead of raising IndexError.
        return x[:1], x[1:]
    return x, x
    
def preprocess_data(df):
    """Turn a raw passenger frame into a purely numeric feature frame.

    Steps:
      * drop the free-text 'Name' and 'Ticket' columns;
      * one-hot encode 'Pclass', 'Sex', the first character of 'Cabin'
        and 'Embarked' (prefixes 'Pclass', 'Sex', 'Cabin', 'Embark');
      * fill missing 'Age' and 'Fare' values with the mean of that column
        in ``df`` itself.

    The input frame is left untouched; a new frame is returned. The first
    rows of the result are printed as a preview.

    Note: the dummy columns produced depend on the categories present in
    ``df``, so two frames processed separately can end up with different
    column sets and may need aligning afterwards.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Name', 'Ticket', 'Pclass', 'Sex', 'Cabin',
        'Embarked', 'Age' and 'Fare'.

    Returns
    -------
    pandas.DataFrame
        The remaining original columns followed by the dummy columns.
    """
    # drop() without inplace returns a new frame, so the caller's data is not mutated.
    features = df.drop(['Name', 'Ticket'], axis=1)

    # First character of the cabin code; missing cabins stay missing and
    # therefore get no dummy column. The object cast keeps the .str accessor
    # usable even when every cabin value is missing.
    cabin_letter = features['Cabin'].astype(object).str[0]

    encoded_blocks = [
        pd.get_dummies(features['Pclass'], prefix='Pclass'),
        pd.get_dummies(features['Sex'], prefix='Sex'),
        pd.get_dummies(cabin_letter, prefix='Cabin'),
        pd.get_dummies(features['Embarked'], prefix='Embark'),
    ]

    features = features.drop(['Pclass', 'Sex', 'Cabin', 'Embarked'], axis=1)
    for block in encoded_blocks:
        features = features.join(block)

    # Assign the filled column back instead of writing through a chained
    # df[col].loc[mask] selection, which may act on a temporary copy.
    for column in ('Age', 'Fare'):
        features[column] = features[column].fillna(features[column].mean())

    print(features.head())
    return features

In [112]:
# Replace the raw training features with their encoded, imputed version
# (preprocess_data also prints the first rows of the result).
data_train_X = preprocess_data(data_train_X)

                   Age  SibSp  Parch      Fare  Pclass_1  Pclass_2  Pclass_3  \
PassengerId                                                                    
496          29.699118      0      0   14.4583         0         0         1   
649          29.699118      0      0    7.5500         0         0         1   
279           7.000000      4      1   29.1250         0         0         1   
32           29.699118      1      0  146.5208         1         0         0   
256          29.000000      0      2   15.2458         0         0         1   

             Sex_female  Sex_male  Cabin_A  Cabin_B  Cabin_C  Cabin_D  \
PassengerId                                                             
496                   0         1        0        0        0        0   
649                   0         1        0        0        0        0   
279                   0         1        0        0        0        0   
32                    1         0        0        1        0        0   
2

In [113]:
# Sanity check after preprocessing: summary statistics and remaining
# missing-value counts per column.
print(data_train_X.describe())
print(data_train_X.isnull().sum())

              Age       SibSp       Parch        Fare    Pclass_1    Pclass_2  \
count  891.000000  891.000000  891.000000  891.000000  891.000000  891.000000   
mean    29.699118    0.523008    0.381594   32.204208    0.242424    0.206510   
std     13.002015    1.102743    0.806057   49.693429    0.428790    0.405028   
min      0.420000    0.000000    0.000000    0.000000    0.000000    0.000000   
25%     22.000000    0.000000    0.000000    7.910400    0.000000    0.000000   
50%     29.699118    0.000000    0.000000   14.454200    0.000000    0.000000   
75%     35.000000    1.000000    0.000000   31.000000    0.000000    0.000000   
max     80.000000    8.000000    6.000000  512.329200    1.000000    1.000000   

         Pclass_3  Sex_female    Sex_male     Cabin_A     Cabin_B     Cabin_C  \
count  891.000000  891.000000  891.000000  891.000000  891.000000  891.000000   
mean     0.551066    0.352413    0.647587    0.016835    0.052750    0.066218   
std      0.497665    0.4779

In [114]:
# Preview the test set before it is preprocessed.
data_test.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [115]:
# Apply the same preprocessing to the test set. Missing Age/Fare values are
# filled with the test set's own column means inside preprocess_data.
data_test = preprocess_data(data_test)

              Age  SibSp  Parch     Fare  Pclass_1  Pclass_2  Pclass_3  \
PassengerId                                                              
892          34.5      0      0   7.8292         0         0         1   
893          47.0      1      0   7.0000         0         0         1   
894          62.0      0      0   9.6875         0         1         0   
895          27.0      0      0   8.6625         0         0         1   
896          22.0      1      1  12.2875         0         0         1   

             Sex_female  Sex_male  Cabin_A  Cabin_B  Cabin_C  Cabin_D  \
PassengerId                                                             
892                   0         1        0        0        0        0   
893                   1         0        0        0        0        0   
894                   0         1        0        0        0        0   
895                   0         1        0        0        0        0   
896                   1         0        0 

In [116]:
# Sanity check of the preprocessed test set: summary statistics and
# remaining missing-value counts per column.
print(data_test.describe())
print(data_test.isnull().sum())

              Age       SibSp       Parch        Fare    Pclass_1    Pclass_2  \
count  418.000000  418.000000  418.000000  418.000000  418.000000  418.000000   
mean    30.272590    0.447368    0.392344   35.627188    0.255981    0.222488   
std     12.634534    0.896760    0.981429   55.840500    0.436934    0.416416   
min      0.170000    0.000000    0.000000    0.000000    0.000000    0.000000   
25%     23.000000    0.000000    0.000000    7.895800    0.000000    0.000000   
50%     30.272590    0.000000    0.000000   14.454200    0.000000    0.000000   
75%     35.750000    1.000000    0.000000   31.500000    1.000000    0.000000   
max     76.000000    8.000000    9.000000  512.329200    1.000000    1.000000   

         Pclass_3  Sex_female    Sex_male     Cabin_A     Cabin_B     Cabin_C  \
count  418.000000  418.000000  418.000000  418.000000  418.000000  418.000000   
mean     0.521531    0.363636    0.636364    0.016746    0.043062    0.083732   
std      0.500135    0.4816

In [117]:
# Add, as all-zero indicator columns, every training feature the test frame
# lacks (in this notebook that is 'Cabin_T'). Deriving the names from the
# training columns avoids hard-coding them and makes the cell safe to re-run.
for missing_column in data_train_X.columns.difference(data_test.columns):
    data_test[missing_column] = 0

In [118]:
# Re-inspect the test set after the extra indicator column was added.
print(data_test.describe())
print(data_test.isnull().sum())

              Age       SibSp       Parch        Fare    Pclass_1    Pclass_2  \
count  418.000000  418.000000  418.000000  418.000000  418.000000  418.000000   
mean    30.272590    0.447368    0.392344   35.627188    0.255981    0.222488   
std     12.634534    0.896760    0.981429   55.840500    0.436934    0.416416   
min      0.170000    0.000000    0.000000    0.000000    0.000000    0.000000   
25%     23.000000    0.000000    0.000000    7.895800    0.000000    0.000000   
50%     30.272590    0.000000    0.000000   14.454200    0.000000    0.000000   
75%     35.750000    1.000000    0.000000   31.500000    1.000000    0.000000   
max     76.000000    8.000000    9.000000  512.329200    1.000000    1.000000   

         Pclass_3  Sex_female    Sex_male     Cabin_A     Cabin_B     Cabin_C  \
count  418.000000  418.000000  418.000000  418.000000  418.000000  418.000000   
mean     0.521531    0.363636    0.636364    0.016746    0.043062    0.083732   
std      0.500135    0.4816

In [119]:
# Give the test frame exactly the training columns, in the same order.
# reindex drops test-only columns, and fill_value=0 makes any indicator column
# the test set lacks all zeros rather than NaN.
data_test = data_test.reindex(columns=data_train_X.columns, fill_value=0)

In [120]:
# Preview the test set after its columns were aligned with the training columns.
data_test.head()

Unnamed: 0_level_0,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Embark_C,Embark_Q,Embark_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
892,34.5,0,0,7.8292,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0
893,47.0,1,0,7.0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1
894,62.0,0,0,9.6875,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0
895,27.0,0,0,8.6625,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1
896,22.0,1,1,12.2875,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1


In [165]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
import time

# Time the whole search plus the prediction on the test set.
start_time = time.time()

clf = LogisticRegression()  # not used in this cell

# max_depth=1 is only the constructor default: the grid below sets max_depth
# for every candidate. random_state=0 keeps each fit reproducible.
clf_es = GradientBoostingClassifier(max_depth=1, random_state=0)
parameters = {
    'n_estimators': [16, 64, 128, 512, 1024, 2048],
    'learning_rate': [0.1, 0.3, 0.5],
    'min_samples_leaf': [1, 50, 100],
    # 'auto' was removed from the grid: current scikit-learn rejects it, and
    # for this classifier it meant sqrt(n_features), i.e. the same candidate
    # as 'sqrt'. Add None here to also try using all features.
    'max_features': ['sqrt'],
    'max_depth': [3, 5, 10],
}

# Exhaustive search with 9-fold cross-validation on all available cores;
# the best candidate is refit and used for the test predictions.
reg_gs = GridSearchCV(clf_es, parameters, cv=9, n_jobs=-1)
reg_gs.fit(data_train_X, data_train_y)
y_pred = reg_gs.predict(data_test)

print("Training time: {}".format(time.time() - start_time))

Training time: 242.462343216


In [166]:
# Report the best cross-validated score and the parameter combination behind it.
search_summary = (
    ("Best score", reg_gs.best_score_),
    ("Best params", reg_gs.best_params_),
)
for label, value in search_summary:
    print("{}: {}".format(label, value))

Best score: 0.840628507295
Best params: {'max_features': 'sqrt', 'n_estimators': 512, 'learning_rate': 0.1, 'max_depth': 10, 'min_samples_leaf': 50}


In [167]:
# Wrap the predictions in a single-column 'Survived' frame that shares the
# test set's index.
y_pred = pd.DataFrame(y_pred, index=data_test.index, columns=["Survived"])

In [168]:
# Label the index so it is written out under the 'PassengerId' header.
y_pred.index.name = "PassengerId"

In [169]:
# Preview the first predictions.
y_pred.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,0


In [170]:
# Write the predictions (index plus 'Survived' column) to the submission file.
y_pred.to_csv("submission1.csv")

In [171]:
# Summary of the predicted labels; the mean is the predicted survival rate.
y_pred.describe()

Unnamed: 0,Survived
count,418.0
mean,0.349282
std,0.477315
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0
