# Import Libraries

In [181]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Read Data

In [182]:
titanic_data = pd.read_csv(r"C:\Personal Learning\titanic_dataset_github\Data\train.csv")

# Separating Train and Test Data

In [183]:
df_X = titanic_data.drop(columns=['Survived'])
df_y = titanic_data['Survived']

df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(df_X, df_y, test_size=0.2, random_state=42)

# Preprocessing

## Imputation

### Train Data

In [184]:
df_X_train.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            140
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          553
Embarked         2
dtype: int64

In [185]:
df_X_train['Age'].fillna(df_X_train['Age'].mean(), inplace=True)
df_X_train['Embarked'].fillna(df_X_train['Embarked'].mode()[0], inplace=True)
df_X_train['Cabin'].fillna(df_X_train['Cabin'].mode()[0], inplace=True)

df_X_train.isna().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

### Test Data

In [186]:
df_X_test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             37
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          134
Embarked         0
dtype: int64

In [187]:
df_X_test['Age'].fillna(df_X_test['Age'].mean(), inplace=True)
df_X_test['Embarked'].fillna(df_X_test['Embarked'].mode()[0], inplace=True)
df_X_test['Cabin'].fillna(df_X_test['Cabin'].mode()[0], inplace=True)

df_X_test.isna().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

### Entire Data

In [188]:
df_X.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [189]:
df_X['Age'].fillna(df_X['Age'].mean(), inplace=True)
df_X['Embarked'].fillna(df_X['Embarked'].mode()[0], inplace=True)
df_X['Cabin'].fillna(df_X['Cabin'].mode()[0], inplace=True)

df_X.isna().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

## Encoding of Categorical Variables

### Train Data

In [190]:
le = LabelEncoder()

df_X_train['Sex_cat'] = le.fit_transform(df_X_train['Sex'])
df_X_train['Embarked_cat'] = le.fit_transform(df_X_train['Embarked'])
df_X_train['Name_cat'] = le.fit_transform(df_X_train['Name'])
df_X_train['Ticket_cat'] = le.fit_transform(df_X_train['Ticket'])
df_X_train['Cabin_cat'] = le.fit_transform(df_X_train['Cabin'])

### Test Data

In [191]:
df_X_test['Sex_cat'] = le.fit_transform(df_X_test['Sex'])
df_X_test['Embarked_cat'] = le.fit_transform(df_X_test['Embarked'])
df_X_test['Name_cat'] = le.fit_transform(df_X_test['Name'])
df_X_test['Ticket_cat'] = le.fit_transform(df_X_test['Ticket'])
df_X_test['Cabin_cat'] = le.fit_transform(df_X_test['Cabin'])

### Entire Data

In [192]:
df_X['Sex_cat'] = le.fit_transform(df_X['Sex'])
df_X['Embarked_cat'] = le.fit_transform(df_X['Embarked'])
df_X['Name_cat'] = le.fit_transform(df_X['Name'])
df_X['Ticket_cat'] = le.fit_transform(df_X['Ticket'])
df_X['Cabin_cat'] = le.fit_transform(df_X['Cabin'])

# Modelling

## Fitting Train Data

### Preparing training data

In [193]:
X_cols = ['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_cat', 'Embarked_cat', 'Name_cat', 'Ticket_cat', 'Cabin_cat']
y_cols = ['Survived']

standard_scaler = StandardScaler()
min_max_scaler = MinMaxScaler()

X_train = min_max_scaler.fit_transform(df_X_train[X_cols])
y_train = df_y_train

### Fitting Model on Train Data

#### Random Forrest Classifier

In [194]:
clf_rfc = RandomForestClassifier()

param_grid = [{'n_estimators': [10, 100, 250, 500, 750, 1000]},
              {'max_depth': [1, 3, 5, 7, 10, None]},
              {'min_samples_split': [2,3,4,5]}]

grid_search = GridSearchCV(clf_rfc, param_grid, cv=5, scoring='accuracy', return_train_score=True)
grid_search.fit(X_train, y_train)
best_params_rfc = grid_search.best_params_

In [195]:
final_clf_rfc = grid_search.best_estimator_
y_train_pred = final_clf_rfc.predict(X_train)

print("Training Accuracy: ", accuracy_score(y_train_pred, y_train))
print("Training Precision: ", precision_score(y_train_pred, y_train))
print("Training Recall: ", recall_score(y_train_pred, y_train))
print("Training F1 score: ", f1_score(y_train_pred, y_train))

Training Accuracy:  1.0
Training Precision:  1.0
Training Recall:  1.0
Training F1 score:  1.0


#### Gradient Boosting Classifier

In [196]:
clf_gbc = GradientBoostingClassifier()

param_grid = [{'n_estimators': [10, 100, 250, 500, 750, 1000, 1500, 2000]},
              {'max_depth': [1, 3, 5, 7, 10, None]},
              {'min_samples_split': [2,3,4,5]}]

grid_search = GridSearchCV(clf_gbc, param_grid, cv=5, scoring='accuracy', return_train_score=True)
grid_search.fit(X_train, y_train)
best_params_gbc = grid_search.best_params_

In [197]:
final_clf_gbc = grid_search.best_estimator_
y_train_pred = final_clf_gbc.predict(X_train)

print("Training Accuracy: ", accuracy_score(y_train_pred, y_train))
print("Training Precision: ", precision_score(y_train_pred, y_train))
print("Training Recall: ", recall_score(y_train_pred, y_train))
print("Training F1 score: ", f1_score(y_train_pred, y_train))

Training Accuracy:  1.0
Training Precision:  1.0
Training Recall:  1.0
Training F1 score:  1.0


### Testing Model on Test Data

#### Preparing Test Data

In [198]:
X_test = min_max_scaler.fit_transform(df_X_test[X_cols])
y_test = df_y_test

#### Random Forrest Classifier

In [199]:
y_pred = final_clf_rfc.predict(X_test)

print("Test Accuracy: ", accuracy_score(y_test, y_pred))
print("Test Precision: ", precision_score(y_test, y_pred))
print("Test Recall: ", recall_score(y_test, y_pred))
print("Test F1 score: ", f1_score(y_test, y_pred))

Test Accuracy:  0.8268156424581006
Test Precision:  0.8412698412698413
Test Recall:  0.7162162162162162
Test F1 score:  0.7737226277372262


#### Gradient Boosting Classifier

In [200]:
y_pred = final_clf_gbc.predict(X_test)

print("Test Accuracy: ", accuracy_score(y_test, y_pred))
print("Test Precision: ", precision_score(y_test, y_pred))
print("Test Recall: ", recall_score(y_test, y_pred))
print("Test F1 score: ", f1_score(y_test, y_pred))

Test Accuracy:  0.7597765363128491
Test Precision:  0.7540983606557377
Test Recall:  0.6216216216216216
Test F1 score:  0.6814814814814815


# Final Model

## Fitting Model on Entire Data

In [245]:
X = min_max_scaler.fit_transform(df_X[X_cols])
y = df_y

# clf_final = RandomForestClassifier()
# param_grid = [{'n_estimators': [10, 100, 250, 500, 750, 1000, 1500, 2000]},
#               {'max_depth': [1, 3, 5, 7, 10, None]},
#               {'min_samples_split': [2, 3, 4, 5, 7, 10]}]

clf_final = GradientBoostingClassifier()
param_grid = [{'n_estimators': [10, 100, 250, 500, 750, 1000, 1500, 2000]},
              {'max_depth': [1, 3, 5, 7, 10, None]},
              {'min_samples_split': [2, 3, 4, 5, 7, 10]}]


grid_search = GridSearchCV(clf_final, param_grid, cv=5, scoring='accuracy', return_train_score=True)
grid_search.fit(X, y)

final_clf = grid_search.best_estimator_
final_clf

# Output Prediction on Kaggle Test Data

## Read Kaggle Test Data

In [246]:
titanic_test_data = pd.read_csv(r"C:\Personal Learning\titanic_dataset_github\Data\test.csv")

## Preprocessing

### Imputation

In [247]:
titanic_test_data.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [248]:
titanic_test_data['Fare'].fillna(titanic_test_data['Fare'].mean(), inplace=True)
titanic_test_data['Age'].fillna(titanic_test_data['Age'].mean(), inplace=True)
titanic_test_data['Embarked'].fillna(titanic_test_data['Embarked'].mode()[0], inplace=True)
titanic_test_data['Cabin'].fillna(titanic_test_data['Cabin'].mode()[0], inplace=True)

titanic_test_data.isna().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

### Encoding of Categorical Variables

In [249]:
titanic_test_data['Sex_cat'] = le.fit_transform(titanic_test_data['Sex'])
titanic_test_data['Embarked_cat'] = le.fit_transform(titanic_test_data['Embarked'])
titanic_test_data['Name_cat'] = le.fit_transform(titanic_test_data['Name'])
titanic_test_data['Ticket_cat'] = le.fit_transform(titanic_test_data['Ticket'])
titanic_test_data['Cabin_cat'] = le.fit_transform(titanic_test_data['Cabin'])

## Final Prediction

In [250]:
X_pred = min_max_scaler.fit_transform(titanic_test_data[X_cols])
y_pred = final_clf.predict(X_pred)

## Kaggle Output Format

In [251]:
final_output = pd.DataFrame(titanic_test_data['PassengerId'])
final_output['Survived'] = y_pred

final_output.to_csv(r"C:\Personal Learning\titanic_dataset_github\Output\titanic_pred_v9.csv", index=False)