# Import Libraries

In [49]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Read Data

In [50]:
titanic_data = pd.read_csv(r"C:\Personal Learning\titanic_dataset_github\Data\train.csv")

# Separating Train and Test Data

In [51]:
df_X = titanic_data.drop(columns=['Survived'])
df_y = titanic_data['Survived']

df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(df_X, df_y, test_size=0.2, random_state=42)

# Preprocessing

## Imputation

### Train Data

In [52]:
df_X_train.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            140
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          553
Embarked         2
dtype: int64

In [53]:
df_X_train['Age'].fillna(df_X_train['Age'].mean(), inplace=True)
df_X_train['Embarked'].fillna(df_X_train['Embarked'].mode()[0], inplace=True)
df_X_train['Cabin'].fillna(df_X_train['Cabin'].mode()[0], inplace=True)

df_X_train.isna().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

### Test Data

In [54]:
df_X_test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             37
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          134
Embarked         0
dtype: int64

In [55]:
df_X_test['Age'].fillna(df_X_test['Age'].mean(), inplace=True)
df_X_test['Embarked'].fillna(df_X_test['Embarked'].mode()[0], inplace=True)
df_X_test['Cabin'].fillna(df_X_test['Cabin'].mode()[0], inplace=True)

df_X_test.isna().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

### Entire Data

In [56]:
df_X.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [57]:
df_X['Age'].fillna(df_X['Age'].mean(), inplace=True)
df_X['Embarked'].fillna(df_X['Embarked'].mode()[0], inplace=True)
df_X['Cabin'].fillna(df_X['Cabin'].mode()[0], inplace=True)

df_X.isna().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

## Encoding of Categorical Variables

### Train Data

In [58]:
le = LabelEncoder()

df_X_train['Sex_cat'] = le.fit_transform(df_X_train['Sex'])
df_X_train['Embarked_cat'] = le.fit_transform(df_X_train['Embarked'])
df_X_train['Name_cat'] = le.fit_transform(df_X_train['Name'])
df_X_train['Ticket_cat'] = le.fit_transform(df_X_train['Ticket'])
df_X_train['Cabin_cat'] = le.fit_transform(df_X_train['Cabin'])

### Test Data

In [59]:
df_X_test['Sex_cat'] = le.fit_transform(df_X_test['Sex'])
df_X_test['Embarked_cat'] = le.fit_transform(df_X_test['Embarked'])
df_X_test['Name_cat'] = le.fit_transform(df_X_test['Name'])
df_X_test['Ticket_cat'] = le.fit_transform(df_X_test['Ticket'])
df_X_test['Cabin_cat'] = le.fit_transform(df_X_test['Cabin'])

### Entire Data

In [60]:
df_X['Sex_cat'] = le.fit_transform(df_X['Sex'])
df_X['Embarked_cat'] = le.fit_transform(df_X['Embarked'])
df_X['Name_cat'] = le.fit_transform(df_X['Name'])
df_X['Ticket_cat'] = le.fit_transform(df_X['Ticket'])
df_X['Cabin_cat'] = le.fit_transform(df_X['Cabin'])

# Modelling

## Fitting Train Data

### Preparing training data

In [61]:
X_cols = ['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_cat', 'Embarked_cat', 'Name_cat', 'Ticket_cat', 'Cabin_cat']
y_cols = ['Survived']

X_train = df_X_train[X_cols]
y_train = df_y_train

### Fitting Model on Train Data

In [62]:
clf_rfc = RandomForestClassifier()

param_grid = [{'n_estimators': [10, 100, 250, 500, 750, 1000]},
              {'max_depth': [1, 3, 5, 7, 10, None]},
              {'min_samples_split': [2,3,4,5]}]

grid_search = GridSearchCV(clf_rfc, param_grid, cv=5, scoring='accuracy', return_train_score=True)
grid_search.fit(X_train, y_train)

In [63]:
final_clf_rcf = grid_search.best_estimator_
final_clf_rcf

### Testing Model on Test Data

In [64]:
X_test = df_X_test[X_cols]
y_test = df_y_test

final_clf_rcf.score(X_test, y_test)

0.7821229050279329

In [65]:
y_pred = final_clf_rcf.predict(X_test)

print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Precision: ", precision_score(y_test, y_pred))
print("Recall: ", recall_score(y_test, y_pred))
print("F1 score: ", f1_score(y_test, y_pred))

Accuracy:  0.7821229050279329
Precision:  0.7272727272727273
Recall:  0.7567567567567568
F1 score:  0.7417218543046358


# Final Model

## Fitting Model on Entire Data

In [66]:
X = df_X[X_cols]
y = df_y

clf_rfc = RandomForestClassifier()

param_grid = [{'n_estimators': [10, 100, 250, 500, 750, 1000]},
              {'max_depth': [1, 3, 5, 7, 10, None]},
              {'min_samples_split': [2,3,4,5]}]

grid_search = GridSearchCV(clf_rfc, param_grid, cv=5, scoring='accuracy', return_train_score=True)
grid_search.fit(X, y)

final_clf_rcf = grid_search.best_estimator_
final_clf_rcf

# Output Prediction on Kaggle Test Data

## Read Kaggle Test Data

In [67]:
titanic_test_data = pd.read_csv(r"C:\Personal Learning\titanic_dataset_github\Data\test.csv")

## Preprocessing

### Imputation

In [68]:
titanic_test_data.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [70]:
titanic_test_data['Fare'].fillna(titanic_test_data['Fare'].mean(), inplace=True)
titanic_test_data['Age'].fillna(titanic_test_data['Age'].mean(), inplace=True)
titanic_test_data['Embarked'].fillna(titanic_test_data['Embarked'].mode()[0], inplace=True)
titanic_test_data['Cabin'].fillna(titanic_test_data['Cabin'].mode()[0], inplace=True)

titanic_test_data.isna().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

### Encoding of Categorical Variables

In [71]:
titanic_test_data['Sex_cat'] = le.fit_transform(titanic_test_data['Sex'])
titanic_test_data['Embarked_cat'] = le.fit_transform(titanic_test_data['Embarked'])
titanic_test_data['Name_cat'] = le.fit_transform(titanic_test_data['Name'])
titanic_test_data['Ticket_cat'] = le.fit_transform(titanic_test_data['Ticket'])
titanic_test_data['Cabin_cat'] = le.fit_transform(titanic_test_data['Cabin'])

## Final Prediction

In [73]:
X_pred = titanic_test_data[X_cols]
y_pred = final_clf_rcf.predict(X_pred)

## Kaggle Output Format

In [75]:
final_output = pd.DataFrame(titanic_test_data['PassengerId'])
final_output['Survived'] = y_pred

final_output.to_csv(r"C:\Personal Learning\titanic_dataset_github\Output\titanic_pred_v1.csv", index=False)