In [24]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib notebook

# Read the training and test CSVs from the local input/ directory.
train, test = pd.read_csv('input/train.csv'), pd.read_csv('input/test.csv')

# Number of rows in the training set.
train_samples = len(train)

In [25]:
# Peek at the first two rows to see which columns are available.
train.iloc[:2]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [26]:
# Distinct values of the Embarked column.
train['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [27]:
# Class balance: number of passengers for each value of Survived.
train.groupby('Survived')['PassengerId'].count()

# The classes are imbalanced: about 1.6x as many passengers did not survive (549) as survived (342).

Survived
0    549
1    342
Name: PassengerId, dtype: int64

In [28]:
#from mpl_toolkits.mplot3d import Axes3D

#ax3d = plt.figure().gca(projection='3d')
#ax3d.scatter(train.SibSp, train.Parch, train.Age, c=train.Survived, alpha=0.5)

In [31]:
# Which columns contain at least one missing value?
train.isnull().any()

PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin           True
Embarked        True
dtype: bool

In [32]:
# The title is the text between the ", " that follows the surname and the first ". ".
train.Name.map(lambda full_name: full_name.partition(". ")[0].split(", ")[1]).unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'the Countess',
       'Jonkheer'], dtype=object)

In [33]:
# Title of each passenger (e.g. 'Mr', 'Miss'): the text between ", " and the first ". ".
train['Name2'] = train.Name.apply(lambda s: s.split(". ")[0].split(", ")[1])

# Mean known age per title.
gg = train.groupby(by='Name2').Age.mean()

# Fill each missing age with the mean age of passengers sharing the same title.
# map() looks the mean up row by row and keeps train's index, so fillna aligns
# on the index instead of relying on positional order.
train['Age2'] = train.Age.fillna(train.Name2.map(gg))

# Show a sample rather than dumping the whole frame into the notebook.
train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Name2,Age2
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,Mr,22.000000
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,38.000000
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,Miss,26.000000
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,Mrs,35.000000
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,Mr,35.000000
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,Mr,32.368090
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,Mr,54.000000
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S,Master,2.000000
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,Mrs,27.000000
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,Mrs,14.000000


In [34]:
from sklearn.preprocessing import MinMaxScaler

# Module-level scaler shared by preprocess(): it is fitted when preprocess() is
# called with fit_scaler=True and reused unchanged when called with fit_scaler=False.
scaler = MinMaxScaler()

def preprocess(df, fit_scaler=True):
    """Build the scaled feature matrix used by the models.

    Features produced: Pclass, Sex (female=1, male=0), Age (missing values
    imputed) and Family (1 when both Parch > 0 and SibSp > 0, else 0).

    Parameters
    ----------
    df : pandas.DataFrame
        Passenger data containing at least the columns Pclass, Sex, Age,
        Parch, SibSp and Name.
    fit_scaler : bool, default True
        If True, fit the module-level ``scaler`` on these features before
        transforming them. If False, reuse the scaler as already fitted by a
        previous call.

    Returns
    -------
    pandas.DataFrame
        Columns Pclass, Sex, Age, Family after scaling, indexed like ``df``.
    """
    X = df[['Pclass', 'Sex', 'Age']].copy()

    # feature engineering
    X['Sex'] = X['Sex'].map({'female': 1, 'male': 0})

    # NOTE(review): Family flags only passengers with BOTH parents/children and
    # siblings/spouse aboard; confirm that `&` rather than `|` is intended.
    X['Family'] = ((df.Parch > 0) & (df.SibSp > 0)).astype(int)

    # Title ('Mr', 'Miss', ...) taken from the frame being processed. Reading
    # it from the global `train` would hand the rows of any other frame the
    # titles of unrelated training passengers through index alignment.
    title = df.Name.apply(lambda s: s.split(". ")[0].split(", ")[1])

    # Fill missing ages with the mean age of passengers sharing the same
    # title. If a title has no known age at all, fall back to the overall
    # mean so that no NaN reaches the scaler.
    mean_age_by_title = X.groupby(title).Age.transform('mean')
    X['Age'] = X.Age.fillna(mean_age_by_title).fillna(X.Age.mean())

    # Scale, then wrap the result so that a DataFrame (not a bare numpy array)
    # with the original column names and index is returned.
    scaled = scaler.fit_transform(X) if fit_scaler else scaler.transform(X)
    return pd.DataFrame(scaled, columns=X.columns, index=X.index)


# Training features (fit_scaler defaults to True, so this call fits the shared
# scaler) and the target kept as a one-column DataFrame.
X_train = preprocess(train)
y_train = train.loc[:, ['Survived']]

In [35]:
# Scatter matrix of the preprocessed features, one colour per Survived value.
colormap = {0: 'firebrick', 1: 'steelblue'}
colors = y_train['Survived'].map(colormap)

pd.plotting.scatter_matrix(
    X_train,
    c=colors,
    marker='o',
    s=30,
    hist_kwds={'bins': 15},
    figsize=(9, 9),
);



<IPython.core.display.Javascript object>

In [36]:
#baseline
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report

def baseline(X, y):
    """Print the accuracy and a classification report of a dummy classifier.

    X and y are split with train_test_split (random_state=0). A
    DummyClassifier (random_state=0) is fitted on the training part and
    evaluated on the held-out part. Both X and y must expose ``.values``
    (e.g. pandas objects). Nothing is returned.
    """
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(X, y, random_state=0)

    # y may arrive as a single-column frame; flatten it to a 1-D vector.
    y_fit_flat = y_fit.values.reshape(-1)
    y_holdout_flat = y_holdout.values.reshape(-1)

    dummy = DummyClassifier(random_state=0)
    dummy.fit(X_fit, y_fit_flat)

    holdout_accuracy = dummy.score(X_holdout.values, y_holdout_flat)
    print('Accuracy: {:.2f}\n'.format(holdout_accuracy))

    # Precision / recall / f1 per class in a single report.
    holdout_pred = dummy.predict(X_holdout)
    print(classification_report(y_holdout, holdout_pred, target_names=['Not Survived', 'Survived']))

#baseline(X_train, y_train)

In [42]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate

# Plain numpy views of the training data: 2-D feature array and 1-D target vector.
X_train_1 = X_train.values
y_train_1 = y_train.values.reshape(-1)

# Untuned random forest evaluated with 10-fold cross-validation.
rfc = RandomForestClassifier(random_state=0)

# return_train_score must be requested explicitly: recent scikit-learn releases
# default it to False, in which case scores['train_score'] raises KeyError.
scores = cross_validate(rfc, X_train_1, y_train_1, cv=10, scoring='accuracy',
                        return_train_score=True)

print("Train scores: {:.3f}".format(scores['train_score'].mean()))
print("Test scores: {:.3f}".format(scores['test_score'].mean()))

Train scores: 0.892
Test scores: 0.820


In [40]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, roc_auc_score, auc, accuracy_score

def grid_search(X, y):
    """Grid-search random-forest hyper-parameters and report hold-out metrics.

    X and y are split with train_test_split (random_state=0). A GridSearchCV
    (10-fold, accuracy) built around the module-level ``rfc`` is fitted on the
    training part; its best estimator is then evaluated on the held-out part.
    Accuracy, AUC, the best parameters and a classification report are
    printed.

    Returns the search's ``best_estimator_``.
    """
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(X, y, random_state=0)

    param_grid = {
        'n_estimators': [2, 3, 5, 7, 10, 15, 20],
        'class_weight': ['balanced', {1: 2}, {1: 3}],
        'max_features': [1, 2, 3, 4],
        'max_depth': [1, 2, 3, 4, None],
    }
    search = GridSearchCV(rfc, param_grid=param_grid, cv=10, scoring='accuracy')
    search.fit(X_fit, y_fit)
    winner = search.best_estimator_

    holdout_pred = winner.predict(X_holdout)
    print("Accuracy: {:.3f}".format(accuracy_score(y_holdout, holdout_pred)))

    # Second probability column is used as the score for the AUC.
    positive_prob = winner.predict_proba(X_holdout)[:, 1]
    print("AUC:{:.3f}".format(roc_auc_score(y_holdout, positive_prob)))

    print("Best params:\n {}".format(search.best_params_))
    print(classification_report(y_holdout, holdout_pred, target_names=['Not Survived', 'Survived']))

    return winner

# Tune on the training arrays; grid_search() makes its own train/validation
# split internally and returns the best estimator it found.
best_rfc = grid_search(X_train_1, y_train_1)

Accuracy: 0.794
AUC:0.875
Best params:
 {'max_depth': 4, 'class_weight': 'balanced', 'max_features': 2, 'n_estimators': 5}
              precision    recall  f1-score   support

Not Survived       0.82      0.86      0.84       139
    Survived       0.75      0.68      0.71        84

 avg / total       0.79      0.79      0.79       223



In [41]:
# 10-fold cross-validated accuracy of the tuned forest.
# return_train_score must be requested explicitly: recent scikit-learn releases
# default it to False, in which case scores['train_score'] raises KeyError.
scores = cross_validate(best_rfc, X_train_1, y_train_1, cv=10, scoring='accuracy',
                        return_train_score=True)

print("Train scores: {:.3f}".format(scores['train_score'].mean()))
print("Test scores: {:.3f}".format(scores['test_score'].mean()))

# Refit on the whole training set so that later predictions use every labelled
# row. The cross-validation is not repeated afterwards: cross_validate works on
# clones of the estimator, so its scores do not depend on whether best_rfc is
# already fitted (the earlier duplicate run printed identical numbers).
best_rfc.fit(X_train_1, y_train_1)

Train scores: 0.831
Test scores: 0.806
Train scores: 0.831
Test scores: 0.806


In [15]:
import os

# Apply the same preprocessing as for training, reusing the already fitted scaler.
X_test = preprocess(test, fit_scaler=False)
predictions = best_rfc.predict(X_test)

# Take the ids from the test file itself instead of assuming they start at a
# hard-coded number and increase by one.
# NOTE(review): assumes test has a PassengerId column like train - confirm.
submission = pd.DataFrame(
    {
        'PassengerId': test['PassengerId'].values,
        'Survived': predictions.astype(int),
    },
    columns=['PassengerId', 'Survived'],
)

# Save to file. to_csv takes care of line endings itself; writing os.linesep
# through a text-mode file handle would produce '\r\r\n' on Windows.
submission.to_csv('attempt.txt', index=False)