<a href="https://colab.research.google.com/github/robimalco/colab/blob/main/Titanic_Machine_Learning_from_Disaster_RandomForest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q kaggle

In [2]:
from google.colab import files

In [None]:
files.upload()

In [7]:
#!mkdir ~/.kaggle
#!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
!kaggle datasets list

In [None]:
!kaggle competitions download -c titanic

# START

In [10]:
import json
import numpy as np
import pandas as pd
import torch

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

import xgboost as xgb
from hyperopt import hp, tpe, fmin

In [97]:
train_df = pd.read_csv('train.csv')
train_df['df_source'] = 'train.csv'

test_df = pd.read_csv('test.csv')
test_df['df_source'] = 'test.csv'

total_df = pd.concat([train_df, test_df])

In [None]:
# Diplay null values of each column
total_df.isnull().sum()

In [12]:
# Manage names creating TitleCluster

Title_Dictionary = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir" : "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "the Countess":"Royalty",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr" : "Mr",
    "Mrs" : "Mrs",
    "Dona" : "Mrs",
    "Miss" : "Miss",
    "Master" : "Master",
    "Lady" : "Royalty"
}

def get_titles(x):
    title = x['Name'].split(',')[1].split('.')[0].strip()
    return Title_Dictionary[title]
total_df['TitleCluster'] = total_df.apply(get_titles, axis=1)

In [13]:
# Manage missing age and create AgeCluster

def fill_missing_age(x):
    age = x['Age']
    if np.isnan(age):
      return total_df[
        (total_df['TitleCluster'] == x['TitleCluster']) &
        (total_df['Sex'] == x['Sex']) &
        (total_df['Pclass'] == x['Pclass'])
      ]['Age'].median()
    else:
      return age

total_df['Age'] = total_df.apply(fill_missing_age, axis=1)

def cluster_age(x):
    age = x['Age']
    if age < 5:
      return 0
    elif age < 10:
      return 1
    elif age < 20:
      return 2
    elif age < 30:
      return 3
    elif age < 40:
      return 4
    elif age < 50:
      return 5
    elif age < 60:
      return 6
    else:
      return 7
total_df['AgeCluster'] = total_df.apply(cluster_age, axis=1)

In [14]:
# Manage tickets

total_df['TicketCluster'] = total_df['Ticket'].str[0]
total_df['TicketCluster'] = np.where(total_df["TicketCluster"].str.isdigit(), "X", total_df["TicketCluster"])

In [15]:
# Manage missing embarked

def fill_missing_embarked(x):
    embarked = x['Embarked']
    if embarked != embarked:
      return "X"
    else:
      return embarked

total_df['Embarked'] = total_df.apply(fill_missing_embarked, axis=1)

In [16]:
# Manage family

def cluster_family(x):
    pcSib = x['SibSp'] + x['Pclass'] + 1
    if pcSib < 2:
        return 'Single'
    elif pcSib == 2:
        return 'Couple'
    elif pcSib <= 4:
        return 'InterM'
    else:
        return 'Large'
    
total_df['FamilyCluster'] = total_df.apply(cluster_family, axis=1)

In [17]:
# Manage fare

def fill_missing_fare(x):
  fare = x['Fare']
  if fare == 0:
      return total_df[
        (total_df['TitleCluster'] == x['TitleCluster']) &
        (total_df['Sex'] == x['Sex']) &
        (total_df['Pclass'] == x['Pclass'])
      ]['Fare'].median()
  else:
    return fare

total_df['Fare'] = total_df.apply(fill_missing_fare, axis=1)

def cluster_fare(x):
    fare = x['Fare']
    if fare < 51:
        return 0
    elif fare < 101:
        return 1
    else:
        return 2

total_df['FareCluster'] = total_df.apply(cluster_fare, axis=1)

In [82]:
# total_df['Name'].str.split(",").str[0].iloc[888]

total_df['FamilyCode'] = total_df['Name'].str.split(",").str[0] + " | " + total_df['Pclass'].astype(str) + " | " + total_df['Ticket'].str[:-1] + " | " + total_df['Ticket'].str[:-1]

In [83]:
categorical_columns = ['Survived', 'df_source', 'Pclass', 'Sex', 'Embarked', 'AgeCluster', 'FareCluster', 'TicketCluster', 'TitleCluster', 'FamilyCluster', 'FamilyCode']
cat_total_df = total_df[categorical_columns]

In [84]:
dummy_total_df = pd.get_dummies(cat_total_df)

In [85]:
train_total_df = dummy_total_df[dummy_total_df['df_source_train.csv'] == 1]
test_total_df = dummy_total_df[dummy_total_df['df_source_test.csv'] == 1]

In [86]:
pd.options.mode.chained_assignment = None
train_total_df.Survived = train_total_df.Survived.astype('int')
xtrain = train_total_df.drop("Survived", axis=1)
ytrain = train_total_df['Survived']
xtest = test_total_df.drop("Survived", axis=1)

# Random Forest

In [None]:
# Random Forest
RF = RandomForestClassifier(random_state=1)
PRF = [
  {
    'n_estimators': [10,100],
    'max_depth': [3,6],
    'criterion': ['gini','entropy']
  }
]
GSRF = GridSearchCV(
    estimator=RF, 
    param_grid=PRF,
    scoring='accuracy', 
    cv=2
)
scores_rf = cross_val_score(
    GSRF,
    xtrain,
    ytrain,
    scoring='accuracy',
    cv=5
)
np.mean(scores_rf)

In [None]:
# SVM (Support Vector Machine)
svc = make_pipeline(
    StandardScaler(),
    SVC(random_state=1)
)
r = [0.0001, 0.001, 0.1, 1, 10, 50, 100]
PSVM = [
  {
    'svc__C': r,
    'svc__kernel': ['linear']
  },
  {
    'svc__C': r,
    'svc__gamma': r,
    'svc__kernel': ['rbf']
  }
]
GSSVM = GridSearchCV(
    estimator=svc, 
    param_grid=PSVM, 
    scoring='accuracy',
    cv=2
)
scores_svm = cross_val_score(
    GSSVM, 
    xtrain.astype(float), 
    ytrain,scoring='accuracy', 
    cv=5
)
np.mean(scores_rf)

In [49]:
model = GSSVM.fit(xtrain, ytrain)

In [50]:
pred = model.predict(xtest)

In [93]:
pred

NameError: ignored

In [86]:
random_forest_df = pd.DataFrame({'PassengerId':test_df['PassengerId'], 'Survived':pred})
random_forest_df.to_csv('submission.csv', index=False)
# files.download('submission.csv')

# XGBOOST

In [87]:
xgboot_x_train, xgboot_x_valid, xgboot_y_train, xgboot_y_valid = train_test_split(xtrain, ytrain, test_size=0.1, random_state=2020)

In [88]:
space = {
  'n_estimators':hp.quniform('n_estimators', 1000, 5000, 100),
  'gamma':hp.uniform('gamma', 0.01, 0.1),
  'learning_rate':hp.uniform('learning_rate', 0.00001, 0.1),
  'max_depth':hp.quniform('max_depth', 3,7,1),
  'subsample':hp.uniform('subsample', 0.30, 0.95),
  'colsample_bytree':hp.uniform('colsample_bytree', 0.30, 0.98),
  'colsample_bylevel':hp.uniform('colsample_bylevel', 0.30, 0.98),
  'reg_lambda': hp.uniform('reg_lambda', 1, 50)
}

def objective(params):
  params = {
    'n_estimators': int(params['n_estimators']),
    'gamma': params['gamma'],
    'learning_rate': params['learning_rate'],
    'max_depth': int(params['max_depth']),
    'subsample': params['subsample'],
    'colsample_bytree': params['colsample_bytree'],
    'colsample_bylevel': params['colsample_bylevel'],
    'reg_lambda': params['reg_lambda']}
  xb_a = xgb.XGBRegressor(**params)
  score = cross_val_score(xb_a, xgboot_x_train, xgboot_y_train, scoring='neg_mean_squared_error', cv=5, n_jobs=-1).mean()
  return -score

In [89]:
best = fmin(fn=objective, space=space, max_evals=20, rstate=np.random.RandomState(1), algo=tpe.suggest)

100%|██████████| 20/20 [34:10<00:00, 102.53s/it, best loss: 0.1347899151178409]


In [90]:
print(print(json.dumps(best, indent=4, sort_keys=True)))

{
    "colsample_bylevel": 0.3722034758923356,
    "colsample_bytree": 0.3121458021275036,
    "gamma": 0.03988564584911725,
    "learning_rate": 0.012383888651206643,
    "max_depth": 5.0,
    "n_estimators": 3500.0,
    "reg_lambda": 16.517527793960163,
    "subsample": 0.43306045984225505
}
None


In [91]:
xb_b = xgb.XGBRegressor(
  random_state=0,
  n_estimators=int(best['n_estimators']),
  colsample_bytree= best['colsample_bytree'],
  gamma= best['gamma'],
  learning_rate= best['learning_rate'],
  max_depth= int(best['max_depth']),
  subsample= best['subsample'],
  colsample_bylevel= best['colsample_bylevel'],
  reg_lambda= best['reg_lambda']
)

xb_b.fit(xgboot_x_train, xgboot_y_train)



XGBRegressor(base_score=0.5, booster='gbtree',
             colsample_bylevel=0.3722034758923356, colsample_bynode=1,
             colsample_bytree=0.3121458021275036, gamma=0.03988564584911725,
             importance_type='gain', learning_rate=0.012383888651206643,
             max_delta_step=0, max_depth=5, min_child_weight=1, missing=None,
             n_estimators=3500, n_jobs=1, nthread=None, objective='reg:linear',
             random_state=0, reg_alpha=0, reg_lambda=16.517527793960163,
             scale_pos_weight=1, seed=None, silent=None,
             subsample=0.43306045984225505, verbosity=1)

In [95]:
preds = xb_b.predict(xtest)

In [98]:
xgboost_df = pd.DataFrame()
xgboost_df['PassengerId'] = test_df['PassengerId']
xgboost_df['Survived'] = preds
xgboost_df['Survived'] = xgboost_df['Survived'].apply(lambda x: 1 if x>0.6 else 0)

In [99]:
xgboost_df.to_csv('submission.csv', index=False)
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# COMPARE MODELS

In [126]:
models_df = pd.concat([random_forest_df, xgboost_df], axis=1)
models_df.columns = ["PassengerId", "surv_random_forest", "PassengerIdRemove", "surv_xgboost"]
models_df = models_df[["PassengerId", "surv_random_forest", "surv_xgboost"]]
difference_df = models_df[models_df["surv_random_forest"] != models_df["surv_xgboost"]]
difference_df.count()

PassengerId           19
surv_random_forest    19
surv_xgboost          19
dtype: int64

In [127]:
for i, row in xgboost_df.iterrows():
  if xgboost_df.at[i,'Survived'] != random_forest_df.at[i,'Survived']:
    if i % 2 == 0:
      xgboost_df.at[i,'Survived'] = random_forest_df.at[i,'Survived']

In [128]:
xgboost_df.to_csv('submission.csv', index=False)
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>