In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
# Read the labelled training data; the path is relative to the notebook's
# working directory.
train_path = 'train.csv'
df_train = pd.read_csv(train_path)

# Show the first five rows as a quick sanity check of the loaded columns.
df_train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Data Cleansing and formatting
- Deal with nulls by:
   - Age: fill with the mean
   - Embarked: drop the affected rows, since there are only a few
   - Cabin: drop the column entirely

In [3]:
# Count the missing values per column. display() is needed because a bare
# expression is only rendered when it is the last statement of a cell.
display(df_train.isnull().sum())

# Cabin is not used as a feature, so the whole column is dropped.
# (An alternative that filled missing cabins with a placeholder category 'U'
# and encoded the first letter of the cabin is intentionally left disabled.)
df_train = df_train.drop(columns=['Cabin'])

# Fill missing ages with the column mean. The mean (rather than the median)
# was chosen on the assumption that Age is roughly normally distributed.
# NOTE(review): no plot in this notebook backs that assumption - confirm.
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection: that chained in-place form is deprecated by pandas and
# does not modify df_train under Copy-on-Write.
df_train['Age'] = df_train['Age'].fillna(df_train['Age'].mean())

# Drop the rows that have no Embarked value (the original note says only two
# rows are affected - see the null counts displayed above).
df_train = df_train.dropna(axis=0, subset=['Embarked'])

### Feature Engineering
- Convert non-ordinal categorical features to one-hot encoding
- Create a feature of alone/ not alone using SibSp and Parch
- Create a feature child or not child with age

In [4]:
# One-hot encode the nominal columns: every distinct Embarked value and every
# distinct Pclass value becomes its own indicator column (Embarked_*, Pclass_*).
nominal_cols = ['Embarked', 'Pclass']
df_train = pd.get_dummies(data=df_train, columns=nominal_cols)

# Binary-encode Sex: 'male' -> 1, any other value -> 0.
df_train['Sex'] = df_train['Sex'].map(lambda value: int(value == 'male'))

In [5]:
# Family indicator derived from the raw SibSp and Parch counts.
# Mind the encoding: despite its name, 'Alone' is 1 when SibSp + Parch >= 1
# (the passenger has relatives counted in those columns) and 0 otherwise.
family_size = df_train['SibSp'] + df_train['Parch']
df_train['Alone'] = (family_size >= 1).astype(int)

In [6]:
# Child flag: 1 when Age is strictly below the limit, 0 otherwise.
child_age_limit = 15
df_train['Childornot'] = (df_train['Age'] < child_age_limit).astype(int)

### Data exploration
- Correlations
- Chi-squared dependency test (TODO: not implemented in the cells below)

In [7]:
# Pairwise correlations between the numeric / indicator columns.
# df_train still contains text columns at this point (e.g. Name, Ticket);
# pandas >= 2.0 no longer drops them silently in corr() and raises instead,
# so the frame is restricted to numeric and boolean columns explicitly.
# Boolean is included because newer pandas emits bool dummy columns.
numeric_view = df_train.select_dtypes(include=['number', 'bool'])
corr = numeric_view.corr()

# Render the matrix as a heat-mapped table (last expression -> cell output).
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,PassengerId,Survived,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3,Alone,Childornot
PassengerId,1.0,-0.00502832,0.043136,0.0302999,-0.0576859,-0.00165658,0.0127032,-0.00120814,-0.0336937,0.0222688,0.0345109,-8.64771e-05,-0.0296006,-0.0576489,-0.0269034
Survived,-0.00502832,1.0,-0.541585,-0.0746729,-0.03404,0.0831508,0.25529,0.169966,0.00453573,-0.151777,0.282368,0.0950022,-0.320171,0.206207,0.124099
Sex,0.043136,-0.541585,1.0,0.0894338,-0.116348,-0.247508,-0.179958,-0.0845205,-0.0752167,0.121405,-0.0931421,-0.066459,0.134227,-0.306985,-0.0968473
Age,0.0302999,-0.0746729,0.0894338,1.0,-0.231875,-0.178232,0.0886045,0.033853,-0.0128232,-0.0215893,0.315984,0.0084158,-0.278527,-0.177712,-0.570362
SibSp,-0.0576859,-0.03404,-0.116348,-0.231875,1.0,0.414542,0.160887,-0.0600738,-0.0266917,0.0694383,-0.0528939,-0.0565071,0.0915148,0.584186,0.364455
Parch,-0.00165658,0.0831508,-0.247508,-0.178232,0.414542,1.0,0.217532,-0.0115884,-0.0815851,0.0615116,-0.0158093,-0.00127823,0.0146337,0.583112,0.360801
Fare,0.0127032,0.25529,-0.179958,0.0886045,0.160887,0.217532,1.0,0.270731,-0.116684,-0.163758,0.590576,-0.117609,-0.411932,0.274079,-0.00244909
Embarked_C,-0.00120814,0.169966,-0.0845205,0.033853,-0.0600738,-0.0115884,0.270731,1.0,-0.148646,-0.782613,0.299472,-0.126039,-0.154785,0.0945128,0.00263895
Embarked_Q,-0.0336937,0.00453573,-0.0752167,-0.0128232,-0.0266917,-0.0815851,-0.116684,-0.148646,1.0,-0.499261,-0.15468,-0.127705,0.237035,-0.0870993,-0.038957
Embarked_S,0.0222688,-0.151777,0.121405,-0.0215893,0.0694383,0.0615116,-0.163758,-0.782613,-0.499261,1.0,-0.165022,0.190824,-0.0135939,-0.0279815,0.0222113


### Data Transformation
- Standardize the numeric features: subtract the mean and divide by the standard deviation

In [8]:
# Standardise the continuous / count columns (per column: subtract the mean,
# divide by the standard deviation). The columns are overwritten, so from this
# point on df_train no longer holds the raw Age / SibSp / Parch / Fare values.
scaled_cols = ['Age', 'SibSp', 'Parch', 'Fare']
df_train[scaled_cols] = preprocessing.scale(df_train[scaled_cols])

  """Entry point for launching an IPython kernel.


### Train 

In [31]:
# Columns fed to the classifiers, in the order the models will see them.
feature_cols = [
    'Sex', 'Childornot', 'Fare',
    'Embarked_C', 'Embarked_S',
    'Pclass_1', 'Pclass_2', 'Pclass_3',
    'Alone',
]
train = df_train[feature_cols]

# NOTE: despite its name, `test` holds the target labels (Survived),
# not a held-out test set.
test = df_train['Survived']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    test,
    test_size=0.20,
    random_state=42,
)

In [32]:
# Hyper-parameter grid explored for the random forest
# (3 x 6 x 5 = 90 combinations, each evaluated with 5-fold cross-validation).
parameters = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5, 6, 7, 8],
    'min_samples_leaf': [3, 5, 10, 15, 20],
}

# The forest is seeded so that the search result is reproducible; without a
# random_state every run can pick a different "best" combination.
clf = GridSearchCV(
    RandomForestClassifier(random_state=42),
    parameters,
    cv=5,
    scoring='f1',
)
clf.fit(X_train, y_train)

# Show which combination won, so it can be compared with any hard-coded
# settings used elsewhere in the notebook.
print(clf.best_params_)

# Evaluate the refitted best estimator on the held-out split.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.82      0.83       109
           1       0.73      0.77      0.75        69

   micro avg       0.80      0.80      0.80       178
   macro avg       0.79      0.79      0.79       178
weighted avg       0.80      0.80      0.80       178



In [10]:
# Alternative models that were tried here and are currently disabled:
#   - clf_1: SVC() with default settings
#   - clf_2: LogisticRegression(random_state=0)
#   - clf_3: MLPClassifier(hidden_layer_sizes=15, max_iter=200)
#   - clf_5: hard-voting VotingClassifier over clf_1, clf_2, clf_3 and clf_4
# Each was fitted on X_train / y_train and reported with classification_report
# on X_test, exactly like the random forest below.

print("RF metrics")
# Random forest with fixed hyper-parameters. It is seeded so that the reported
# metrics (and the predictions made later with clf_4) are reproducible.
clf_4 = RandomForestClassifier(
    n_estimators=200,
    max_depth=5,
    min_samples_leaf=5,
    random_state=42,
)
clf_4.fit(X_train, y_train)
y_pred = clf_4.predict(X_test)
print(classification_report(y_test, y_pred))


RF metrics
              precision    recall  f1-score   support

           0       0.83      0.89      0.86       109
           1       0.80      0.71      0.75        69

   micro avg       0.82      0.82      0.82       178
   macro avg       0.82      0.80      0.81       178
weighted avg       0.82      0.82      0.82       178



### Predict

In [33]:
df_test = pd.read_csv('test.csv')

# Cabin was dropped from the training data as well and is not a model feature.
df_test = df_test.drop(columns=['Cabin'])

# df_train has already been standardised in place, so its Age / Fare means are
# ~0 and must not be used as fill values (a missing Age filled with 0 would be
# flagged as a child). Re-read the raw training file and repeat the
# training-time cleaning (fill Age with its mean, drop rows without Embarked)
# so that the fill values and the scaler come from the raw training data.
numeric_cols = ['Age', 'SibSp', 'Parch', 'Fare']
train_raw = pd.read_csv('train.csv')
train_age_mean = train_raw['Age'].mean()
train_raw['Age'] = train_raw['Age'].fillna(train_age_mean)
train_raw = train_raw.dropna(subset=['Embarked'])
train_fare_mean = train_raw['Fare'].mean()
train_port_mode = train_raw['Embarked'].mode()[0]
train_scaler = preprocessing.StandardScaler().fit(train_raw[numeric_cols])

# Impute missing numeric values with the training means (assigned back rather
# than using the deprecated chained fillna(inplace=True) form).
df_test['Age'] = df_test['Age'].fillna(train_age_mean)
df_test['Fare'] = df_test['Fare'].fillna(train_fare_mean)

# Rows are never dropped from the test set, so that every PassengerId in
# test.csv receives a prediction; a missing Embarked is filled with the most
# frequent training port instead.
df_test['Embarked'] = df_test['Embarked'].fillna(train_port_mode)

# One-hot encode Embarked and Pclass, as was done for the training data.
df_test = pd.get_dummies(df_test, columns=['Embarked', 'Pclass'])

# Change male and female into 1 and 0.
df_test['Sex'] = df_test['Sex'].apply(lambda x: 1 if x == 'male' else 0)

# Family indicator from the test set's OWN SibSp and Parch columns
# (1 when SibSp + Parch >= 1, matching the training-time definition).
df_test['Alone'] = (df_test['SibSp'] + df_test['Parch']).apply(lambda x: 1 if x >= 1 else 0)

# Child flag, computed on the raw (unscaled) age as in training.
df_test['Childornot'] = df_test['Age'].apply(lambda x: 1 if x < 15 else 0)

# Scale with the training statistics; scaling the test set with its own mean
# and std would put its features on a different scale than the model saw.
df_test[numeric_cols] = train_scaler.transform(df_test[numeric_cols])

# Use exactly the columns (and column order) of the training matrix.
# reindex adds an all-zero column for any dummy category that happens to be
# absent from the test file, so the feature count always matches the model.
feature_cols = list(X_train.columns)
test = df_test.reindex(columns=feature_cols, fill_value=0)

y_pred = clf_4.predict(test)

df_test['Survived'] = y_pred

# Write only PassengerId and Survived; the meaningless row index is omitted.
df_test[['PassengerId', 'Survived']].to_csv('outputs.csv', index=False)

