In [629]:
from sklearn import cross_validation, preprocessing, pipeline, linear_model, metrics, ensemble, neighbors, grid_search, feature_selection

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [630]:
# Render matplotlib figures directly in the notebook output.
%matplotlib inline

In [631]:
# Load the training data, print its (rows, columns) shape and preview the first rows.
train_data = pd.read_csv('data/train.csv')
print(train_data.shape)
train_data.head()

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [724]:
def prepare_data(raw_data, reference_data=None):
    """Turn a raw passenger frame into a purely numeric feature frame.

    The input frame is never modified: all work happens on a copy, so the
    calling cell can be re-run any number of times with the same result.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Frame containing at least the columns ``Sex``, ``Embarked``,
        ``Name``, ``Age``, ``Fare``, ``Ticket``, ``Cabin`` and
        ``PassengerId``.
    reference_data : pandas.DataFrame, optional
        Frame whose ``Age`` and ``Fare`` means replace missing values.
        Defaults to the notebook-level ``train_data`` so that any frame
        (including the test set) is imputed with training statistics.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``Sex`` is 1 for ``'male'`` and 0 otherwise,
        ``Embarked`` is 1/2/3 for ``'C'``/``'Q'``/``'S'`` and 0 otherwise,
        ``Status`` is 1/2/3 for the titles Mr/Mrs/Miss and 0 otherwise,
        ``FamilySize`` counts the rows of ``raw_data`` sharing the same
        surname, and ``Name``, ``Ticket``, ``Cabin`` and ``PassengerId``
        are removed.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    if reference_data is None:
        reference_data = train_data

    embarked_codes = {'C': 1, 'Q': 2, 'S': 3}
    title_codes = {'Mr': 1, 'Mrs': 2, 'Miss': 3}

    def status_to_num(full_name):
        # Names look like "Surname, Title. Given names". Take the text between
        # the comma and the next period so that multi-word surnames
        # ("Vander Planke, Mrs. ...") are handled as well.
        _, comma, rest = str(full_name).partition(',')
        if not comma:
            return 0
        title = rest.split('.', 1)[0].strip()
        return title_codes.get(title, 0)

    data = raw_data.copy()

    data['Sex'] = (data['Sex'] == 'male').astype(int)
    # Ports outside the mapping (including missing values) become 0.
    data['Embarked'] = data['Embarked'].map(embarked_codes).fillna(0).astype(int)
    data['Status'] = data['Name'].apply(status_to_num)

    data['Age'] = data['Age'].fillna(reference_data['Age'].mean())
    data['Fare'] = data['Fare'].fillna(reference_data['Fare'].mean())

    # Count each surname once and look the count up per row instead of
    # rescanning the whole column for every passenger.
    family_name = data['Name'].apply(lambda name: name.split(',')[0])
    data['FamilySize'] = family_name.map(family_name.value_counts())

    return data.drop(['Name', 'Ticket', 'Cabin', 'PassengerId'], axis=1)

In [633]:
# Build the numeric training frame and check, per column, whether any missing values remain.
clean_data = prepare_data(train_data)
clean_data.isnull().any()

Survived      False
Pclass        False
Sex           False
Age           False
SibSp         False
Parch         False
Fare          False
Embarked      False
Status        False
FamilySize    False
dtype: bool

In [634]:
# Disabled exploratory plot: scatter of each column against 'Survived' on a 3x4 grid.
# fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(15, 10))
# for idx, feature in enumerate(clean_data.columns[:-1]):
#     clean_data.plot(feature, 'Survived', subplots=True, kind='scatter', ax=axes[idx // 4, idx % 4])

In [635]:
# Separate the target column from the predictor columns.
y = clean_data['Survived']
X = clean_data.drop('Survived', axis=1)

In [759]:
# Column groups and, for each group, a boolean mask over X.columns marking
# which columns belong to it. The masks are used to slice the feature matrix.
binary_data = ['Sex']
numeric_data = ['Age', 'Fare', 'FamilySize']
categorical_data = ['Pclass', 'Parch', 'SibSp', 'Embarked', 'Status']

binary_data_indices = X.columns.isin(binary_data)
numeric_data_indices = X.columns.isin(numeric_data)
categorical_data_indices = X.columns.isin(categorical_data)

In [760]:
# Print each column-group mask with aligned labels, then preview the features.
for label, mask in (
        ('binary_data     ', binary_data_indices),
        ('numeric_data    ', numeric_data_indices),
        ('categorical_data', categorical_data_indices),
):
    print(label, mask)

X.head(10)



binary_data      [False  True False False False False False False False]
numeric_data     [False False  True False False  True False False  True]
categorical_data [ True False False  True  True False  True  True False]


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Status,FamilySize
0,3,1,22.0,1,0,7.25,3,1,2
1,1,0,38.0,1,0,71.2833,1,2,1
2,3,0,26.0,0,0,7.925,3,3,1
3,1,0,35.0,1,0,53.1,3,2,2
4,3,1,35.0,0,0,8.05,3,1,2
5,3,1,29.699118,0,0,8.4583,2,1,3
6,1,1,54.0,0,0,51.8625,3,1,1
7,3,1,2.0,3,1,21.075,3,0,4
8,3,0,27.0,0,2,11.1333,3,2,6
9,2,0,14.0,1,0,30.0708,1,2,2


In [761]:
# Keep the four features that score highest on the chi-squared test against
# the target, and preview the first rows of the reduced matrix.
X_new = feature_selection.SelectKBest(
    score_func=feature_selection.chi2,
    k=4,
).fit_transform(X, y)
X_new[:10]

array([[  3.    ,   1.    ,   7.25  ,   1.    ],
       [  1.    ,   0.    ,  71.2833,   2.    ],
       [  3.    ,   0.    ,   7.925 ,   3.    ],
       [  1.    ,   0.    ,  53.1   ,   2.    ],
       [  3.    ,   1.    ,   8.05  ,   1.    ],
       [  3.    ,   1.    ,   8.4583,   1.    ],
       [  1.    ,   1.    ,  51.8625,   1.    ],
       [  3.    ,   1.    ,  21.075 ,   0.    ],
       [  3.    ,   0.    ,  11.1333,   2.    ],
       [  2.    ,   0.    ,  30.0708,   2.    ]])

In [830]:
# Model under evaluation. The commented-out lines are alternative classifiers
# that can be swapped in; keep exactly one `classifier = ...` line active.
# classifier = linear_model.SGDClassifier(random_state=42)
# classifier = ensemble.RandomForestClassifier(n_estimators=500, random_state=42)
classifier = ensemble.AdaBoostClassifier(random_state=42)
# classifier = neighbors.KNeighborsClassifier()

In [831]:
# List the parameter names exposed by the chosen classifier.
classifier.get_params().keys()

dict_keys(['algorithm', 'base_estimator', 'random_state', 'n_estimators', 'learning_rate'])

In [832]:
# Full model: column-group specific preprocessing followed by the classifier.
# The three branches below each slice their columns out of the feature matrix
# using the boolean masks defined earlier, and FeatureUnion concatenates the
# results side by side.
estimator = pipeline.Pipeline(steps=[
        ('feature_processing', pipeline.FeatureUnion(transformer_list=[
                    # Binary columns are passed through unchanged.
                    ('bin_vars_proc', preprocessing.FunctionTransformer(lambda data: data[:, binary_data_indices])),
                    # Numeric columns are scaled and expanded to degree-4 polynomial terms.
                    ('num_vars_proc', pipeline.Pipeline(steps=[
                                ('select', preprocessing.FunctionTransformer(lambda data: data[:, numeric_data_indices])),
                                # `with_mean` is a boolean switch, not a target mean value.
                                # False means: scale to unit variance without centering.
                                ('scaling', preprocessing.StandardScaler(with_mean=False)),
                                ('polynom', preprocessing.PolynomialFeatures(4))
                            ])),
                    # Categorical columns are one-hot encoded; categories not seen
                    # during fitting are ignored at transform time.
                    ('cat_vars_proc', pipeline.Pipeline(steps=[
                                ('select', preprocessing.FunctionTransformer(lambda data: data[:, categorical_data_indices])),
                                ('hot_encode', preprocessing.OneHotEncoder(handle_unknown='ignore')),
                            ]))
                ])),
        ('model_fitting', classifier)
    ])

In [833]:
# Cross-validate the whole pipeline (library-default splitting) and report the
# mean accuracy over the folds.
score = cross_validation.cross_val_score(estimator, X, y, scoring='accuracy')
score.mean()

0.8136924803591471

In [807]:
# Show the mean cross-validation accuracy held in `score` again.
score.mean()

0.8136924803591471

In [834]:
# Fit the full pipeline on all training rows.
estimator.fit(X, y)

Pipeline(steps=[('feature_processing', FeatureUnion(n_jobs=1,
       transformer_list=[('bin_vars_proc', FunctionTransformer(accept_sparse=False,
          func=<function <lambda> at 0x7fdd0a744f28>, pass_y=False,
          validate=True)), ('num_vars_proc', Pipeline(steps=[('select', FunctionTransformer(acc...thm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=42))])

In [837]:
# Apply the same preprocessing to the test set and print its shape as a sanity check.
test_data = prepare_data(pd.read_csv('data/test.csv'))
print(test_data.shape)


(418, 9)


In [838]:
# prepare_data drops PassengerId, so read the ids again from the raw file and
# pair them with the fitted pipeline's predictions.
raw_test_data = pd.read_csv('data/test.csv')
submission = pd.DataFrame(
    {
        'PassengerId': raw_test_data['PassengerId'],
        'Survived': estimator.predict(test_data),
    },
    columns=['PassengerId', 'Survived'],
)

In [839]:
# Write the predictions to submission.csv without the index column.
submission.to_csv('submission.csv', index=False)