In [113]:
import pandas as pd
import numpy as np
from scipy import sparse
import eli5

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, PolynomialFeatures
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.decomposition import PCA

In [114]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

train_df.info()
train_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [115]:
# data cleaning and create features

for df in (train_df, test_df):
    df['Name length'] = df['Name'].map(len)
    df['Sex num'] = df['Sex'].map({'male': 0, 'female': 1})
    df['Has cabin'] = df['Cabin'].map(lambda x: 0 if pd.isna(x) else 1)
    df['Embarked num'] = df['Embarked'].map({'S': 0, 'C': 1})
    df['Family size'] = df['SibSp'] + df['Parch']

print('{} have NaN values'.format(train_df.columns[train_df.isna().any()].tolist()))

orig_num_columns = ['Age', 'Family size', 'Fare', 'Name length', 'Sex num', 'Has cabin', 'Embarked num']
orig_cat_columns = ['Pclass']

for df in (train_df, test_df):
    df[orig_cat_columns] = df[orig_cat_columns].fillna('')
    df[orig_num_columns] = df[orig_num_columns].fillna(train_df[orig_num_columns].median())

['Age', 'Cabin', 'Embarked', 'Embarked num'] have NaN values


In [116]:
# some data exploring

display(train_df[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean())
display(train_df[['Sex', 'Survived']].groupby(['Sex'], as_index=False).mean())
display(train_df[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean())

train_df['Age bins'], b1 = pd.cut(train_df['Age'], 4, retbins=True)
test_df['Age bins'] = pd.cut(test_df['Age'], b1)

# train_df['Family size bins'], b2 = pd.cut(train_df['Family size'], 3, retbins=True)
# test_df['Family size bins'] = pd.cut(test_df['Family size'], b2)

# note it's possible that test_df will contain NaN, let's fix that
df['Age bins'] = df['Age bins'].fillna(test_df['Age bins'].value_counts().index[0])
# df['Family size bins'] = df['Family size bins'].fillna(test_df['Family size bins'].value_counts().index[0])
    
display(train_df[['Age bins', 'Survived']].groupby(['Age bins'], as_index=False).mean())
# display(train_df[['Family size bins', 'Survived']].groupby(['Family size bins'], as_index=False).mean())

num_columns = ['Fare', 'Sex num', 'Has cabin', 'Embarked num', 'Name length', 'Family size']
cat_columns = ['Pclass', 'Age bins']

train_df.head()

Unnamed: 0,Pclass,Survived
0,1,0.62963
1,2,0.472826
2,3,0.242363


Unnamed: 0,Sex,Survived
0,female,0.742038
1,male,0.188908


Unnamed: 0,Embarked,Survived
0,C,0.553571
1,Q,0.38961
2,S,0.336957


Unnamed: 0,Age bins,Survived
0,"(0.34, 20.315]",0.458101
1,"(20.315, 40.21]",0.364769
2,"(40.21, 60.105]",0.390625
3,"(60.105, 80.0]",0.227273


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Name length,Sex num,Has cabin,Embarked num,Family size,Age bins
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,23,0,0,0.0,1,"(20.315, 40.21]"
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,51,1,1,1.0,1,"(20.315, 40.21]"
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,22,1,0,0.0,0,"(20.315, 40.21]"
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,44,1,1,0.0,1,"(20.315, 40.21]"
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,24,0,0,0.0,0,"(20.315, 40.21]"


In [117]:
# utilities

def one_hot_categorical(cat_columns, train_df, test_df):  
    feature_names = []
    
    for cat in cat_columns:
        le = LabelEncoder()
        le.fit(train_df[cat])
        
        feature_names.extend([cat] * len(le.classes_))
    
        train_df[cat] = le.transform(train_df[cat])
        test_df[cat] = le.transform(test_df[cat])
        
    oe = OneHotEncoder()
    oe.fit(train_df[cat_columns])
    categ_mat_train = oe.transform(train_df[cat_columns]).todense()
    categ_mat_test = oe.transform(test_df[cat_columns]).todense()
    
    return categ_mat_train, categ_mat_test, feature_names

In [122]:
categ_mat_train, categ_mat_test, feature_names = one_hot_categorical(cat_columns, train_df, test_df)

rest_mat_train = train_df[num_columns].values
rest_mat_test = test_df[num_columns].values
    
X = np.hstack((rest_mat_train, categ_mat_train))
X_test = np.hstack((rest_mat_test, categ_mat_test))

assert X.shape[1] == X_test.shape[1]

X_max = X.max(axis=0)
X /= X_max
X_test /= X_max

Y = train_df['Survived'].values
Y = to_categorical(Y, num_classes = 10)

print('logreg: {}'.format(np.mean(cross_val_score(LogisticRegression(), X, Y, cv=10))))
print('random forest: {}'.format(np.mean(cross_val_score(RandomForestClassifier(), X, Y, cv=10))))
print('adaboost: {}'.format(np.mean(cross_val_score(AdaBoostClassifier(), X, Y, cv=10))))
print()

X_interactions = PolynomialFeatures(interaction_only=True).fit_transform(X)
print('logreg interaction: {}'.format(np.mean(cross_val_score(LogisticRegression(), X_interactions, Y, cv=10))))
print('random forest interaction: {}'.format(np.mean(cross_val_score(RandomForestClassifier(), X_interactions, Y, cv=10))))
print('adaboost interaction: {}'.format(np.mean(cross_val_score(AdaBoostClassifier(), X_interactions, Y, cv=10))))
print()

X_pca = PCA(n_components=20).fit_transform(X_interactions)
print('logreg pca interaction: {}'.format(np.mean(cross_val_score(LogisticRegression(), X_pca, Y, cv=10))))
print('random forest pca interaction: {}'.format(np.mean(cross_val_score(RandomForestClassifier(), X_pca, Y, cv=10))))
print('adaboost pca interaction: {}'.format(np.mean(cross_val_score(AdaBoostClassifier(), X_pca, Y, cv=10))))
print()

# okay, so regular random forest works pretty well. let's use GridSearchCV to find the best parameters.

parameters = {
    'n_estimators': [4, 6, 8, 10], 
    'max_features': ['log2', 'auto', None], 
    'criterion': ['entropy', 'gini'],
    'max_depth': [2, 3, 5, 7, None], 
    'min_samples_split': [3, 5, 7, 9],
    'min_samples_leaf': [3, 5, 7]
}

X_train, X_validate, Y_train, Y_validate = train_test_split(X, Y, train_size=0.80, test_size=0.20)

# clf = GridSearchCV(RandomForestClassifier(), param_grid=parameters)
# clf = VotingClassifier([('rf', RandomForestClassifier()), ('lr', LogisticRegression()), ('ada', AdaBoostClassifier())])
clf = RandomForestClassifier()
clf.fit(X, Y)
print('final clf: {}'.format(clf.score(X_validate, Y_validate)))

predictions = clf.predict(X_test)
test_df['Survived'] = predictions

test_df[['PassengerId', 'Survived']].to_csv('submission.csv', index=False)

feature_names = num_columns + feature_names
display(eli5.show_weights(clf, feature_names=feature_names))

test_df.head()

['Fare', 'Sex num', 'Has cabin', 'Embarked num', 'Name length']

NameError: name 'to_categorical' is not defined