## Titanic Competition Solution
This is my solution for the titanic competition on Kaggle that ranked 753th out of 10456 on the leaderboard. 

This notebook is structured as follows: 
- 1 Load Dataset 
- 2 Handle missing Values
- 3 Festure Engineering
- 4 Scaling
- 5 Train model
- 6 Evaluate model

In [363]:
import pandas as pd
pd.set_option('max_rows', 50)
pd.set_option('max_columns', 100)
pd.set_option('display.width', 1000)

import numpy as np
np.set_printoptions(suppress=True)
np.set_printoptions(linewidth=1000)

%matplotlib inline
import matplotlib.pyplot as plt

import seaborn as sns

from sklearn import preprocessing
#from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder,MinMaxScaler
#from future_encoder import ColumnTransformer, make_column_transformer

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

### Load Dataset
and seperate the 'Survived' column, since this column has to be predicted.

In [364]:
train_ds = pd.read_csv('train.csv')

y_train = train_ds['Survived']
X_train = train_ds.drop(columns = 'Survived')
display(X_train.head())
display(X_train.describe())
print(f'has survived\n\n{y_train[:5]}')


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,2.0,20.125,0.0,0.0,7.9104
50%,446.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,3.0,38.0,1.0,0.0,31.0
max,891.0,3.0,80.0,8.0,6.0,512.3292


has survived

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64


## Add missing values


In [365]:
X_train.isnull().sum(axis=0)

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [366]:
def add_missing(df):
    # use avarage age if missing
    age_avg = df['Age'].mean()
    df.loc[pd.isna(df['Age']), 'Age'] = age_avg
    
    fare_avg = df['Fare'].mean()
    df.loc[pd.isna(df['Fare']), 'Fare'] = fare_avg
    
    
    df['Embarked'].fillna('na', inplace=True)
    df['Cabin'].fillna('na', inplace=True)
    
    drop_column = ['PassengerId','Cabin', 'Ticket']
    df.drop(columns=drop_column, inplace = True)
    
add_missing(X_train) 
X_train.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S


Now there should be no more empty cells.

## Preprocess dataset



In [367]:
X_train.nunique()

Pclass        3
Name        891
Sex           2
Age          89
SibSp         7
Parch         7
Fare        248
Embarked      4
dtype: int64

In [368]:
#use_oh = ['Pclass','Sex','SibSp','Parch','Embarked'] # apply one hot encoding to these columns
#use_oh_index = [i for i,x in enumerate(X_train.index) if x in use_oh ] # index in df 


encode_columns = ['Sex','Embarked','Title']

label = {x : LabelEncoder() for x in encode_columns}
one_hot = OneHotEncoder(sparse=False) # todo add one hot encoder
scaler = MinMaxScaler()
def preprocess(df,train = True):
    
    #feature engineering
    
    #define new features
    df['n_people'] = df ['SibSp'] + df['Parch'] + 1
    df['Title'] = df['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]
    df['name_len'] = df['Name'].apply(lambda x : len(x))
    
    #filter out rare titles 
    title_count =  (df['Title'].value_counts())
    def filter(x):
        if x in title_count:
            if title_count[x] > 10:
                return x 
        return 'other'
    df['Title'] = df['Title'].apply(filter)
    
    
    drop_column = ['Name']
    df.drop(columns=drop_column, inplace = True)
                   
    
    #scaling and converting columns
                       
    for col in encode_columns:
        if train:
            df[col] = label[col].fit_transform(df[col])
        else:
            df[col] = label[col].transform(df[col])
            
    for col in df.columns:
         df[col] = df[col].astype(np.float) 
        
    if train:
        df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
    else:
        df = pd.DataFrame(scaler.transform(df), columns=df.columns)
    return df
                   
df = X_train.copy()
X_train = preprocess(df)
X_train.head()


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,n_people,Title,name_len
0,1.0,1.0,0.271174,0.125,0.0,0.014151,0.666667,0.1,0.5,0.157143
1,0.0,0.0,0.472229,0.125,0.0,0.139136,0.0,0.1,0.75,0.557143
2,1.0,0.0,0.321438,0.0,0.0,0.015469,0.666667,0.0,0.25,0.142857
3,0.0,0.0,0.434531,0.125,0.0,0.103644,0.666667,0.1,0.75,0.457143
4,1.0,1.0,0.434531,0.0,0.0,0.015713,0.666667,0.0,0.5,0.171429


### Train Random Forest Classifier
start by splitting the dataframe into train and val dataframes

In [369]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.20, random_state=1)

define model and use randomsearch to find best hyperparameters

In [345]:
model = RandomForestClassifier()
args = {
    'bootstrap': [True, False],
    'max_depth': [10, 20, 30, 40, 50, 60, None],
    'max_features': ['auto', 'sqrt'],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 8],
    'n_estimators': [200, 300, 400, 600, 1000, 1200, 1400, 1600, 1800, 2000]
}

#model.fit(X_train, y_train)

In [346]:
model = RandomizedSearchCV(model, args, n_iter = 200 , cv = 3, verbose=1, random_state=1, n_jobs = -1)
model.fit(X_train,y_train)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:  5.5min finished


RandomizedSearchCV(cv=3, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid=True, n_iter=200, n_jobs=-1,
          param_distributions={'bootstrap': [True, False], 'max_depth': [10, 20, 30, 40, 50, 60, None], 'max_features': ['auto', 'sqrt'], 'min_samples_leaf': [1, 2, 4], 'min_samples_split': [2, 5, 8], 'n_estimators': [200, 300, 400, 600, 1000, 1200, 1400, 1600, 1800, 2000]},
          pre_dispatch='2*n_jobs', random_state=1, refit=True,
          return_train_score='warn', scoring=None, verbose=1)

In [370]:
print(f'accuracy: ',model.score(X_val, y_val))

accuracy:  0.7932960893854749


#### Test
predict with test dataset and generate Kaggle submission file

In [371]:
df = pd.read_csv('test.csv')
pas_id = df['PassengerId'].values
add_missing(df)
df = preprocess(df,False)
res = model.predict(df)

submission = pd.DataFrame({
    'PassengerId': pas_id,
    'Survived': res
})
display(submission.head())

submission.to_csv('./submission.csv', index=False)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


## Other Classifiers
Experiments with other classifiers in sk-learn. The submission to Kaggle was made with the Random forrest classifier described above.

In [372]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

classifiers = [
    KNeighborsClassifier(3),
    SVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    LogisticRegression()]

In [373]:
acc_dict = {}
for clf in classifiers:
        name = clf.__class__.__name__
        clf.fit(X_train, y_train)
        train_predictions = clf.predict(X_test)
        acc = accuracy_score(y_test, train_predictions)
        if name in acc_dict:
            acc_dict[name] += acc
        else:
            acc_dict[name] = acc



In [374]:
acc_dict

{'KNeighborsClassifier': 0.7932960893854749,
 'SVC': 0.7821229050279329,
 'DecisionTreeClassifier': 0.6871508379888268,
 'RandomForestClassifier': 0.7486033519553073,
 'AdaBoostClassifier': 0.7877094972067039,
 'GradientBoostingClassifier': 0.7821229050279329,
 'GaussianNB': 0.7821229050279329,
 'LinearDiscriminantAnalysis': 0.7932960893854749,
 'QuadraticDiscriminantAnalysis': 0.7821229050279329,
 'LogisticRegression': 0.7877094972067039}