# Titanic Survival Prediction Project
In this project, we aim to predict the survival of passengers aboard the RMS Titanic based on various features such as age, sex, fare, and other relevant attributes.

## Libraries

In [72]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV



## Data Loading and Initial Analysis
Import the training set. Kaggle splits the data into training and testing subsets. We will be validating based on the training set and we will only use the test set for submission.

In [106]:
# Load the training set.
try:
    titanic = pd.read_csv('train.csv')
except FileNotFoundError:
    print('File not found.')

In [107]:
# Display the first five rows of the dataset.
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [108]:
# Display basic information of the data. Look for null values.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [109]:
# Display summary statistics.
titanic.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PassengerId,891.0,446.0,257.353842,1.0,223.5,446.0,668.5,891.0
Survived,891.0,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0
Pclass,891.0,2.308642,0.836071,1.0,2.0,3.0,3.0,3.0
Age,714.0,29.699118,14.526497,0.42,20.125,28.0,38.0,80.0
SibSp,891.0,0.523008,1.102743,0.0,0.0,0.0,1.0,8.0
Parch,891.0,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
Fare,891.0,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292


## Preprocessing

In [77]:
# Separate features and target.
X = titanic.drop('Survived', axis=1)
y = titanic['Survived']

Basic plan for each feature:
<ol>
    <li><b>PassengerId:</b> Drop.</li>
    <li><b>Pclass:</b> Treated as a categorical value, so impute and one-hot encode.</li>
    <li><b>Name:</b> Drop.</li>
    <li><b>Sex:</b> Impute and OH encode.</li>
    <li><b>Age:</b> Impute and scale.</li>
    <li><b>SibSp and Parch:</b> Keep unchanged.</li>
    <li><b>Ticket:</b> Drop.</li>
    <li><b>Fare:</b> Impute and scale.</li>
    <li><b>Cabin:</b> Drop.</li>
    <li><b>Embarked:</b> Impute and OH encode.</li>
</ol>

In [78]:
# Define feature goups
numeric_features = ['Age', 'Fare']
categorical_features = ['Pclass', 'Sex', 'Embarked']
unchanged_features = ['SibSp', 'Parch']

In [79]:
# Define numeric pipeline
numeric_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
    ])

In [80]:
# Define categorical pipeline
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('oh_encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))
])

In [81]:
# Combine preprocessing steps
preprocessor = ColumnTransformer([
    ('numeric', numeric_pipeline, numeric_features),
    ('categorical', categorical_pipeline, categorical_features),
    ('unchanged', 'passthrough', unchanged_features)
])

In [82]:
X_final = preprocessor.fit_transform(X)

## Model Training and Evaluation

We will try Random Forest Classifier and Support Vector Machine for this dataset. We will use RandomizedSearchCV to choose the best model and tune its hyperparameters.

In [83]:
# Define hyperparameter grids
param_grid_forest = {
    'n_estimators':[100, 200, 300, 400, 500],
    'max_depth': [10, 20, 30, None]
}

param_grid_svm = {
    'C': [0.1, 1, 10, 100, 1000],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
    'degree': [2, 3, 4, 5]
}

# Define model parameters
model_params = {
    'clf_forest': {
        'model': RandomForestClassifier(),
        'params': param_grid_forest
    },
    'clf_svm': {
        'model': SVC(),
        'params': param_grid_svm
    }
}

In [84]:
# Perform RandomizedGridSearchCV
scores = []
for model_name, mp in model_params.items():
    clf = RandomizedSearchCV(mp['model'], mp['params'])
    clf.fit(X_final, y)
    results = {
        'model': model_name, 
        'best_score': clf.best_score_,
        'best_params' : clf.best_params_
    }
    scores.append(results)

In [88]:
# Display results
pd.DataFrame(scores)

Unnamed: 0,model,best_score,best_params
0,clf_forest,0.830557,"{'n_estimators': 500, 'max_depth': 10}"
1,clf_svm,0.824907,"{'kernel': 'rbf', 'gamma': 0.01, 'degree': 3, ..."


In [90]:
# Train a random forest on the whole training dataset.
clf_forest = RandomForestClassifier(n_estimators=500, max_depth=10)
clf_forest.fit(X_final, y)

In [94]:
# Define a complete pipeline for the data.
complete_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=500, max_depth=10))
])

In [95]:
# Preprocess and train the model.
complete_pipeline.fit(X, y)

In [99]:
# Use the trained model to create predicitons.
predictions = complete_pipeline.predict(X_test)

## Submission

In [89]:
# load testing dataset.
try:
    X_test = pd.read_csv('test.csv')
except FileNotFoundError:
    print('File not found.')

In [111]:
# Create submission dataframe/
submission = pd.DataFrame({'PassengerId': X_test['PassengerId'], 'Survived': predictions})

In [113]:
# Export to .csv file.
submission.to_csv('submission.csv', index=False)