In [1]:
# Import Data Manipulation Libraries
import pandas as pd
import numpy as np

# Import Data Visualization Libararies
import matplotlib.pyplot as plt
import seaborn as sns

# Import Machine Learning Libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler,RobustScaler,LabelEncoder
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier,AdaBoostClassifier,BaggingClassifier,RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE

In [2]:
# Data Ingestion

df = pd.read_csv(r'C:\TitanicPeopleSurvival_PredictionModel\data\raw\Titanic_Dataset.csv')

df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone,Title,AgeGroup,CabinDeck,Ticket,Survived
0,1,3,male,43.2,4,3,23.01,S,8,0,Miss,Adult,Unknown,PC 27225,0
1,2,3,male,21.4,2,0,157.46,C,3,0,Mr,YoungAdult,Unknown,SC 257787,1
2,3,3,female,47.2,1,4,131.88,S,6,0,Mr,Adult,Unknown,CA 147316,1
3,4,1,male,15.7,0,4,3.42,S,5,0,Dr,Teen,Unknown,PC 710570,0
4,5,1,male,49.6,2,0,54.24,S,3,0,Miss,Adult,Unknown,PC 620176,0


In [3]:
# Data Information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  15000 non-null  int64  
 1   Pclass       15000 non-null  int64  
 2   Sex          15000 non-null  object 
 3   Age          15000 non-null  float64
 4   SibSp        15000 non-null  int64  
 5   Parch        15000 non-null  int64  
 6   Fare         15000 non-null  float64
 7   Embarked     15000 non-null  object 
 8   FamilySize   15000 non-null  int64  
 9   IsAlone      15000 non-null  int64  
 10  Title        15000 non-null  object 
 11  AgeGroup     14705 non-null  object 
 12  CabinDeck    15000 non-null  object 
 13  Ticket       15000 non-null  object 
 14  Survived     15000 non-null  int64  
dtypes: float64(2), int64(7), object(6)
memory usage: 1.7+ MB


In [4]:
# Drop Unnecessary Columns
df.drop(columns = ['PassengerId','CabinDeck','Ticket','Title','AgeGroup'],inplace=True,axis =1)

In [5]:
# Data exploration
# Handling Leakage
'''
1. Split the data into X and y
2. Use Train Test Split
3. Using LabelEncoder
4. Using Scaling Technique
5. Using SMOT Technique for data balance
'''
# Split the Data into X and y
X = df.drop(columns='Survived',axis=1)
y = df['Survived']

# Use Train Test Split
X_train,X_test,y_train,y_test = train_test_split(X,y,
                                                 test_size=0.3,
                                                 random_state=1)

# Segregate the Numerical and Categorical Columns
numerical_col = X_train.select_dtypes(exclude='object').columns
categorical_col = X_train.select_dtypes(include='object').columns

numerical_stats = []
from collections import OrderedDict
for i in numerical_col:
    num_stats = OrderedDict({
        "Feature": i,
        "Count": df[i].count(),
        "Maximum": df[i].max(),
        "Minimum": df[i].min(),
        "Mean": df[i].mean(),
        "Q1": df[i].quantile(0.25),
        "Q2": df[i].quantile(0.50),
        "Q3": df[i].quantile(0.75),
        "IQR": df[i].quantile(0.75) - df[i].quantile(0.25),
        "Standard Deviation": df[i].std(),
        "Skewness": df[i].skew(),
        "Kurtosis": df[i].kurt()
    })

    numerical_stats.append(num_stats)
    numerical_stats_report = pd.DataFrame(numerical_stats)

categorical_stats = []
for i in categorical_col:
    cat_stats = OrderedDict({
        "Features": i,
        "Count": df[i].count(),
        "Unique Count": df[i].nunique(),
        "Value Counts": df[i].value_counts(),
        "Mode": df[i].mode()
    })

categorical_stats.append(cat_stats)
categorical_stats_report = pd.DataFrame(categorical_stats)

numerical_stats_report, categorical_stats

(      Feature  Count  Maximum  Minimum       Mean     Q1      Q2       Q3  \
 0      Pclass  15000     3.00      1.0   2.320933   2.00   3.000   3.0000   
 1         Age  15000    78.50      0.0  29.184020  19.70  29.100  38.5000   
 2       SibSp  15000     5.00      0.0   2.486067   1.00   2.000   4.0000   
 3       Parch  15000     4.00      0.0   2.013600   1.00   2.000   3.0000   
 4        Fare  15000   375.72      0.0  31.583209   8.87  21.985  43.6925   
 5  FamilySize  15000    10.00      1.0   5.499667   4.00   5.000   7.0000   
 6     IsAlone  15000     1.00      0.0   0.034200   0.00   0.000   0.0000   
 
        IQR  Standard Deviation  Skewness   Kurtosis  
 0   1.0000            0.829044 -0.656178  -1.232581  
 1  18.8000           13.686311  0.079937  -0.263259  
 2   3.0000            1.716781  0.007059  -1.280820  
 3   2.0000            1.416551 -0.008539  -1.305198  
 4  34.8225           31.843415  2.069897   6.877369  
 5   3.0000            2.220494  0.013744  -

In [6]:
# Apply Label Encoding on Categorical Columns
le = LabelEncoder()
for i in categorical_col:
    X_train[i] = le.fit_transform(X_train[i])  # Seen Data
    X_test[i] = le.transform(X_test[i])           # Unseen Data

# Using Scaling techiques on Numerical Columns
sc = RobustScaler()
X_train[numerical_col] = sc.fit_transform(X_train[numerical_col]) # Seen data
X_test[numerical_col] = sc.transform(X_test[numerical_col]) # Unseen Data

# Using SMOTE
smote = SMOTE(random_state=1)
X_train, y_train = smote.fit_resample(X_train, y_train) # Seen data


In [7]:
models = {
    "LogisticRegression": LogisticRegression(),
    "DecisionTree": DecisionTreeClassifier(),
    "RandomForest": RandomForestClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "Bagging": BaggingClassifier() 
}

for model_name , model in models.items():
    model.fit(X_train,y_train) # Seen Data
    y_pred = model.predict(X_test) # Unseen data
    print(f"Model: {model_name}")
    print(classification_report(y_test,y_pred))
    print("Confusion Matix")
    print(confusion_matrix(y_test,y_pred))
    print("-"*50)


Model: LogisticRegression
              precision    recall  f1-score   support

           0       0.64      0.44      0.52      2876
           1       0.36      0.55      0.43      1624

    accuracy                           0.48      4500
   macro avg       0.50      0.50      0.48      4500
weighted avg       0.54      0.48      0.49      4500

Confusion Matix
[[1279 1597]
 [ 731  893]]
--------------------------------------------------
Model: DecisionTree
              precision    recall  f1-score   support

           0       0.65      0.58      0.61      2876
           1       0.37      0.44      0.40      1624

    accuracy                           0.53      4500
   macro avg       0.51      0.51      0.51      4500
weighted avg       0.55      0.53      0.54      4500

Confusion Matix
[[1669 1207]
 [ 915  709]]
--------------------------------------------------
Model: RandomForest
              precision    recall  f1-score   support

           0       0.64      0.69    