In [1]:
# Import Data Manupulation Library
import pandas as pd
import numpy as np
 
 # Import Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Import Machine Leaning Libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  RobustScaler,MinMaxScaler,LabelEncoder
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier,BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
# Data Ingestion
data = pd.read_csv(r'C:\TitanicPeopleSurvival_PredictionData\data\raw\Titanic_Dataset.csv')
data.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone,Title,AgeGroup,CabinDeck,Ticket,Survived
0,1,3,male,43.2,4,3,23.01,S,8,0,Miss,Adult,Unknown,PC 27225,0
1,2,3,male,21.4,2,0,157.46,C,3,0,Mr,YoungAdult,Unknown,SC 257787,1
2,3,3,female,47.2,1,4,131.88,S,6,0,Mr,Adult,Unknown,CA 147316,1
3,4,1,male,15.7,0,4,3.42,S,5,0,Dr,Teen,Unknown,PC 710570,0
4,5,1,male,49.6,2,0,54.24,S,3,0,Miss,Adult,Unknown,PC 620176,0


In [3]:
# Checking Dataset Information
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  15000 non-null  int64  
 1   Pclass       15000 non-null  int64  
 2   Sex          15000 non-null  object 
 3   Age          15000 non-null  float64
 4   SibSp        15000 non-null  int64  
 5   Parch        15000 non-null  int64  
 6   Fare         15000 non-null  float64
 7   Embarked     15000 non-null  object 
 8   FamilySize   15000 non-null  int64  
 9   IsAlone      15000 non-null  int64  
 10  Title        15000 non-null  object 
 11  AgeGroup     14705 non-null  object 
 12  CabinDeck    15000 non-null  object 
 13  Ticket       15000 non-null  object 
 14  Survived     15000 non-null  int64  
dtypes: float64(2), int64(7), object(6)
memory usage: 1.7+ MB


In [4]:
# Checking Descriptive Stats
data.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,FamilySize,IsAlone,Survived
count,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0
mean,7500.5,2.320933,29.18402,2.486067,2.0136,31.583209,5.499667,0.0342,0.3724
std,4330.271354,0.829044,13.686311,1.716781,1.416551,31.843415,2.220494,0.181749,0.48346
min,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
25%,3750.75,2.0,19.7,1.0,1.0,8.87,4.0,0.0,0.0
50%,7500.5,3.0,29.1,2.0,2.0,21.985,5.0,0.0,0.0
75%,11250.25,3.0,38.5,4.0,3.0,43.6925,7.0,0.0,1.0
max,15000.0,3.0,78.5,5.0,4.0,375.72,10.0,1.0,1.0


In [5]:
# Drop Unnecessary Columns
data.drop(columns = ['PassengerId','CabinDeck','Ticket','Title','AgeGroup'],inplace=True,axis =1)

In [6]:
# Controlling Data Leakage

''' 
1. Split the Dataset into X and y
2. Split the Dataset into Train and Test Sets
3. Use Encoding Techniques on Categorical Features
4. Use Scaling Techniques on Numerical Features
5. Use SMOTE
'''
# 1. Split the Dataset into X and y
X = data.drop('Survived',axis=1)    
y = data['Survived']
# 2. Split the Dataset into Train and Test Sets
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.3,
                                                    random_state = 1)
# 3. Use Encoding Techniques on Categorical Features

# Segregate Categorical and Numerical Features
numerical_col = X.select_dtypes(exclude = 'object').columns
categorical_col = X.select_dtypes(include = 'object').columns
# Apply Label Encoding on Categorical Features
le = LabelEncoder()
for col in categorical_col:
    X_train[col] = le.fit_transform(X_train[col])  # Seen Data
    X_test[col] = le.transform(X_test[col])           # Unseen Data
# 4. Use Scaling Techniques on Numerical Features
sc = RobustScaler()
X_train[numerical_col] = sc.fit_transform(X_train[numerical_col])   # Seen Data
X_test[numerical_col] = sc.transform(X_test[numerical_col])   # Unseen Data

# 5. Use SMOTE
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)  # Seen Data

In [7]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "Bagging Classifier": BaggingClassifier()}

for model_name, model in models.items():
    model.fit(X_train, y_train)  # Seen Data
    y_pred = model.predict(X_test)   # Unseen Data
    print(f"Model: {model_name}")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("-" * 50)

Model: Logistic Regression
              precision    recall  f1-score   support

           0       0.64      0.45      0.53      2876
           1       0.36      0.55      0.44      1624

    accuracy                           0.49      4500
   macro avg       0.50      0.50      0.49      4500
weighted avg       0.54      0.49      0.50      4500

Confusion Matrix:
[[1306 1570]
 [ 725  899]]
--------------------------------------------------
Model: Decision Tree
              precision    recall  f1-score   support

           0       0.64      0.57      0.60      2876
           1       0.36      0.43      0.39      1624

    accuracy                           0.52      4500
   macro avg       0.50      0.50      0.50      4500
weighted avg       0.54      0.52      0.53      4500

Confusion Matrix:
[[1639 1237]
 [ 925  699]]
--------------------------------------------------
Model: Random Forest
              precision    recall  f1-score   support

           0       0.65      0