<a href="https://colab.research.google.com/github/robimalco/colab/blob/main/Titanic_Machine_Learning_from_Disaster_RandomForest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Install the Kaggle CLI with the %pip magic so the package lands in the
# environment of the running kernel rather than whatever `pip` the shell finds.
%pip install -q kaggle

In [3]:
from google.colab import files

In [None]:
# Upload kaggle.json through the Colab file picker.
# The result is bound to a name on purpose: files.upload() returns a mapping of
# file name -> uploaded bytes, and leaving the call as the cell's last
# expression would echo the API token into the saved notebook output.
uploaded = files.upload()

In [5]:
# Copy the uploaded token into ~/.kaggle and make it readable by the owner only.
# -p keeps this cell re-runnable when the directory already exists.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# List datasets through the Kaggle CLI.
# NOTE(review): presumably run only as a smoke test that the CLI and token work — confirm.
!kaggle datasets list

In [None]:
# Download the data of the `titanic` competition with the Kaggle CLI.
!kaggle competitions download -c titanic

In [None]:
# Extract train.zip into ./train.
# -p (mkdir) and -o (unzip) keep the cell re-runnable: no "File exists" error
# and no interactive overwrite prompt on a second run.
# NOTE(review): later cells read train.csv / test.csv from the working
# directory, not from ./train — confirm this extraction is still needed.
!mkdir -p train
!unzip -o train.zip -d train

# START

In [42]:
import numpy as np
import pandas as pd
import torch
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [9]:
# Load both splits and tag every row with the file it came from, so the
# combined frame can be split back apart after feature engineering.
train_df = pd.read_csv('train.csv')
train_df['df_source'] = 'train.csv'

test_df = pd.read_csv('test.csv')
test_df['df_source'] = 'test.csv'

# ignore_index=True gives the stacked frame a unique 0..n-1 index. Without it
# the row labels of test.csv repeat those of train.csv, which makes label-based
# lookups and index-aligned assignments on total_df ambiguous.
total_df = pd.concat([train_df, test_df], ignore_index=True)

In [None]:
# Display the number of null values in each column
total_df.isnull().sum()

In [10]:
# Manage names creating TitleCluster

# Raw honorific (as it appears in the Name column) -> coarser title group.
Title_Dictionary = {
    # Military, clerical and academic titles
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Dr": "Officer",
    "Rev": "Officer",
    # Nobility
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir": "Royalty",
    "the Countess": "Royalty",
    "Lady": "Royalty",
    # Women
    "Mme": "Mrs",
    "Ms": "Mrs",
    "Mrs": "Mrs",
    "Dona": "Mrs",
    "Mlle": "Miss",
    "Miss": "Miss",
    # Men and boys
    "Mr": "Mr",
    "Master": "Master",
}


def get_titles(x):
    """Return the title group for the passenger row `x`.

    The raw title is the text between the first comma and the following
    period of x['Name'], stripped of surrounding whitespace. It is looked up
    in Title_Dictionary; an unknown title raises KeyError.
    """
    after_surname = x['Name'].split(',')[1]
    raw_title = after_surname.split('.')[0].strip()
    return Title_Dictionary[raw_title]
# Row-wise: store the value returned by get_titles in a new TitleCluster column.
total_df['TitleCluster'] = total_df.apply(get_titles, axis=1)

In [11]:
# Manage missing age and create AgeCluster

def fill_missing_age(x, df=None):
    """Return the passenger's age, imputing it when it is missing.

    A missing (NaN) age is replaced by the median age of the passengers that
    share the row's TitleCluster, Sex and Pclass. When that group has no known
    age at all, the median over the whole frame is used instead, so a NaN is
    only returned if no age is known anywhere.

    Args:
        x: row (Series or mapping) with 'Age', 'TitleCluster', 'Sex', 'Pclass'.
        df: reference population to take medians from. Defaults to the
            notebook-level ``total_df``.

    Returns:
        The original age, or the imputed median.
    """
    age = x['Age']
    if not np.isnan(age):
        return age

    if df is None:
        df = total_df

    peer_ages = df[
        (df['TitleCluster'] == x['TitleCluster']) &
        (df['Sex'] == x['Sex']) &
        (df['Pclass'] == x['Pclass'])
    ]['Age']
    group_median = peer_ages.median()

    # An empty or all-NaN group yields NaN; fall back to the overall median so
    # the gap is not silently binned as the oldest age cluster downstream.
    if np.isnan(group_median):
        return df['Age'].median()
    return group_median

# Row-wise: overwrite Age with the value returned by fill_missing_age.
total_df['Age'] = total_df.apply(fill_missing_age, axis=1)

def cluster_age(x):
    """Map x['Age'] to an age-band index from 0 to 7.

    Bands are delimited by the exclusive upper bounds 5, 10, 20, 30, 40, 50
    and 60; any age that is below none of them (including NaN, which compares
    false against every bound) falls into band 7.
    """
    age = x['Age']
    upper_bounds = (5, 10, 20, 30, 40, 50, 60)
    for band, bound in enumerate(upper_bounds):
        if age < bound:
            return band
    return 7
# Row-wise: store the age-band index returned by cluster_age in AgeCluster.
total_df['AgeCluster'] = total_df.apply(cluster_age, axis=1)

In [12]:
# Manage tickets

# Bucket tickets by their first character; a leading digit collapses to "X".
ticket_prefix = total_df['Ticket'].str[0]
total_df['TicketCluster'] = np.where(ticket_prefix.str.isdigit(), "X", ticket_prefix)

In [13]:
# Manage missing embarked

def fill_missing_embarked(x):
    """Return x['Embarked'], or the placeholder "X" when the value is NaN."""
    port = x['Embarked']
    # NaN is detected by self-comparison: it is the value not equal to itself.
    return port if port == port else "X"

# Row-wise: overwrite Embarked with the value returned by fill_missing_embarked.
total_df['Embarked'] = total_df.apply(fill_missing_embarked, axis=1)

In [16]:
# Manage family

def cluster_family(x):
    """Bucket a passenger by the size of the family travelling with them.

    Family size is the passenger plus siblings/spouses (SibSp) plus
    parents/children (Parch). The previous version added Pclass instead of
    Parch, which made the size depend on ticket class and left the 'Single'
    bucket unreachable (Pclass is at least 1).

    Args:
        x: row (Series or mapping) with 'SibSp' and 'Parch'.

    Returns:
        'Single' (size 1), 'Couple' (2), 'InterM' (3-4) or 'Large' (5+).
    """
    family_size = x['SibSp'] + x['Parch'] + 1
    if family_size < 2:
        return 'Single'
    elif family_size == 2:
        return 'Couple'
    elif family_size <= 4:
        return 'InterM'
    else:
        return 'Large'
    
# Row-wise: store the label returned by cluster_family in FamilyCluster.
total_df['FamilyCluster'] = total_df.apply(cluster_family, axis=1)

In [17]:
# Manage fare

def fill_missing_fare(x, df=None):
    """Return the passenger's fare, imputing it when it is missing.

    A fare counts as missing when it is NaN or 0. It is replaced by the median
    of the known (strictly positive) fares of the passengers sharing the row's
    TitleCluster, Sex and Pclass; if that group has no known fare, the median
    of all known fares in the frame is used.

    The previous version only handled 0, so a NaN fare passed through and was
    later binned as the most expensive fare cluster, and it let the 0
    placeholders drag the group median down.

    Args:
        x: row (Series or mapping) with 'Fare', 'TitleCluster', 'Sex', 'Pclass'.
        df: reference population to take medians from. Defaults to the
            notebook-level ``total_df``.

    Returns:
        The original fare, or the imputed median.
    """
    fare = x['Fare']
    if not (np.isnan(fare) or fare == 0):
        return fare

    if df is None:
        df = total_df

    peer_fares = df[
        (df['TitleCluster'] == x['TitleCluster']) &
        (df['Sex'] == x['Sex']) &
        (df['Pclass'] == x['Pclass'])
    ]['Fare']
    # `> 0` drops both the zero placeholders and NaN (NaN > 0 is False).
    group_median = peer_fares[peer_fares > 0].median()

    if np.isnan(group_median):
        all_fares = df['Fare']
        return all_fares[all_fares > 0].median()
    return group_median

# Row-wise: overwrite Fare with the value returned by fill_missing_fare.
total_df['Fare'] = total_df.apply(fill_missing_fare, axis=1)

def cluster_fare(x):
    """Map x['Fare'] to a fare-band index.

    Returns 0 for fares below 51, 1 for fares below 101, and 2 otherwise
    (which includes NaN, since it compares false against both bounds).
    """
    fare = x['Fare']
    if fare < 51:
        return 0
    return 1 if fare < 101 else 2

# Row-wise: store the fare-band index returned by cluster_fare in FareCluster.
total_df['FareCluster'] = total_df.apply(cluster_fare, axis=1)

In [24]:
# Columns kept for modelling: the target, the train/test tag and the
# raw / engineered features.
categorical_columns = [
    'Survived',
    'df_source',
    'Pclass',
    'Sex',
    'Embarked',
    'AgeCluster',
    'FareCluster',
    'TicketCluster',
    'TitleCluster',
    'FamilyCluster',
]
cat_total_df = total_df.loc[:, categorical_columns]

In [25]:
# One-hot encode the selected columns using pandas' get_dummies defaults.
# NOTE(review): with default arguments the integer-coded columns (Pclass,
# AgeCluster, FareCluster) are presumably left numeric rather than expanded
# into indicators — confirm that is intended.
dummy_total_df = pd.get_dummies(cat_total_df)

In [26]:
# Split the encoded frame back into its two sources using the indicator
# columns that get_dummies produced from df_source.
is_train_row = dummy_total_df['df_source_train.csv'] == 1
is_test_row = dummy_total_df['df_source_test.csv'] == 1
train_total_df = dummy_total_df[is_train_row]
test_total_df = dummy_total_df[is_test_row]

In [30]:
# Work on an explicit copy so the Survived cast below is an ordinary column
# assignment; no pandas chained-assignment warning has to be silenced globally.
train_total_df = train_total_df.copy()
train_total_df['Survived'] = train_total_df['Survived'].astype('int')

# The df_source_* indicators only record which file a row came from. They are
# constant within the training rows and flipped in the test rows, so leaving
# them in the feature matrix hands the model values at prediction time that it
# never saw while fitting (after standardisation this shifts every test row
# away from the training data). Drop them together with the target.
non_feature_columns = ['Survived'] + [
    col for col in train_total_df.columns if col.startswith('df_source_')
]

xtrain = train_total_df.drop(non_feature_columns, axis=1)
ytrain = train_total_df['Survived']
xtest = test_total_df.drop(non_feature_columns, axis=1)

In [37]:
# Random Forest, scored with nested cross-validation:
# the inner GridSearchCV (2 folds) tunes the hyper-parameters and the outer
# cross_val_score (5 folds) measures the accuracy of the tuned estimator.
RF = RandomForestClassifier(random_state=1)
PRF = [{
    'n_estimators': [10, 100],
    'max_depth': [3, 6],
    'criterion': ['gini', 'entropy'],
}]
GSRF = GridSearchCV(estimator=RF, param_grid=PRF, scoring='accuracy', cv=2)
scores_rf = cross_val_score(GSRF, xtrain, ytrain, scoring='accuracy', cv=5)

# Mean outer-fold accuracy (displayed as the cell output).
np.mean(scores_rf)

In [None]:
# SVM (Support Vector Machine), scored with nested cross-validation:
# the inner GridSearchCV (2 folds) tunes the pipeline and the outer
# cross_val_score (5 folds) measures the accuracy of the tuned estimator.
svc = make_pipeline(
    StandardScaler(),
    SVC(random_state=1)
)
# Candidate values shared by C and gamma.
r = [0.0001, 0.001, 0.1, 1, 10, 50, 100]
PSVM = [
  {
    'svc__C': r,
    'svc__kernel': ['linear']
  },
  {
    'svc__C': r,
    'svc__gamma': r,
    'svc__kernel': ['rbf']
  }
]
GSSVM = GridSearchCV(
    estimator=svc,
    param_grid=PSVM,
    scoring='accuracy',
    cv=2
)
scores_svm = cross_val_score(
    GSSVM,
    xtrain.astype(float),
    ytrain,
    scoring='accuracy',
    cv=5
)
# Report the SVM scores; this cell previously displayed the Random Forest
# mean (scores_rf) by mistake.
np.mean(scores_svm)

In [44]:
# Fit the SVM grid search on all training rows and keep the result as `model`.
model = GSSVM.fit(xtrain, ytrain)

In [45]:
# Predict the target for the test feature matrix with the fitted model.
pred = model.predict(xtest)

In [48]:
# Pair each test PassengerId with its prediction, write the two columns to
# submission.csv (without the index) and hand the file to the browser.
submission_df = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': pred,
})
submission_df.to_csv('submission.csv', index=False)
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>