In [1]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Read the passenger records from the CSV file into a DataFrame
data = pd.read_csv(
    'Titanic-Dataset.csv',
)

In [3]:
# Data Preprocessing

# Fill missing values.
# Each filled Series is assigned back to its column. The previous form,
# data[col].fillna(..., inplace=True), is chained assignment: it mutates the
# object returned by data[col], which pandas 2.x flags with a FutureWarning
# and which leaves `data` unchanged once Copy-on-Write is enabled.
# Age and Fare use the column median; Embarked uses the first value returned
# by mode() (the most frequent value).
data['Age'] = data['Age'].fillna(data['Age'].median())
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])
data['Fare'] = data['Fare'].fillna(data['Fare'].median())


In [4]:
# Feature Engineering
# Create FamilySize feature: the sum of the SibSp and Parch columns.
data['FamilySize'] = data['SibSp'] + data['Parch']

# Extract titles from names: capture the run of letters that follows a space
# and is terminated by a period.
# The pattern is a raw string so that `\.` reaches the regex engine as an
# escaped dot. In a plain string literal `\.` is an invalid escape sequence,
# which Python reports as a DeprecationWarning (SyntaxWarning from 3.12).
data['Title'] = data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Group the titles listed here under a single 'Rare' label.
data['Title'] = data['Title'].replace(['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
                                       'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

# Fold variant spellings into the labels already present in the data.
data['Title'] = data['Title'].replace(['Mlle', 'Ms'], 'Miss')
data['Title'] = data['Title'].replace('Mme', 'Mrs')

In [5]:
# Convert the categorical columns to integer codes.
# NOTE: Series.map yields NaN for values missing from the mapping, so running
# this cell a second time on the already-encoded columns blanks Sex/Embarked.
data['Sex'] = data['Sex'].map(
    {
        'male': 0,
        'female': 1,
    }
)
data['Embarked'] = data['Embarked'].map(
    {
        'S': 0,
        'C': 1,
        'Q': 2,
    }
)

# Learn the title vocabulary, then replace each title with its integer label.
title_encoder = LabelEncoder()
title_encoder.fit(data['Title'])
data['Title'] = title_encoder.transform(data['Title'])

In [6]:
# Pick the model input columns (X) and the prediction target (y)
features = [
    'Pclass',
    'Sex',
    'Age',
    'Fare',
    'Embarked',
    'FamilySize',
    'Title',
]
X = data.loc[:, features]
y = data.loc[:, 'Survived']

In [7]:
# Standardize the data for models sensitive to feature scales (e.g., SVM).
#
# The scaler is fitted on the training rows only, so the mean/variance of the
# held-out rows never influence the transformation. Fitting on every row
# leaks test-set statistics into training and inflates the test score.
#
# train_test_split selects rows from the row count and the seed alone, so
# splitting the row positions here with the same test_size/random_state picks
# exactly the rows that the split cell below assigns to X_train.
# Keep both values in sync with that cell.
train_rows = train_test_split(np.arange(len(data)), test_size=0.2, random_state=42)[0]

# X is rebound to a NumPy array below, so the inputs are read from
# data[features] to keep this cell re-runnable.
scaler = StandardScaler()
scaler.fit(data[features].iloc[train_rows])
X = scaler.transform(data[features])

In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [9]:
# Candidate classifiers, keyed by the display name used in the reports.
# Insertion order is the order in which they are trained and printed.
models = dict([
    ('Logistic Regression', LogisticRegression()),
    ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Support Vector Machine', SVC()),
    ('K-Nearest Neighbors', KNeighborsClassifier(n_neighbors=5)),
])

In [10]:
# Training and evaluating each model
for model_name, model in models.items():
    # Fit on the training split and predict the held-out split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Evaluation
    print(f"Model: {model_name}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

    # Cross-validation
    # X was standardized once over all of its rows, so every fold would
    # otherwise see statistics computed from its own validation rows.
    # Putting a StandardScaler in front of the estimator makes each fold
    # re-fit the scaling on its training part only. Standardization is
    # unaffected by the earlier per-feature rescaling of X, so this matches
    # scaling the raw features per fold.
    # cross_val_score clones the pipeline, so the fitted `model` stored in
    # `models` is left untouched.
    cv_pipeline = make_pipeline(StandardScaler(), model)
    cv_scores = cross_val_score(cv_pipeline, X, y, cv=5)
    print("Cross-validation Accuracy:", np.mean(cv_scores))
    print("\n" + "="*50 + "\n")

Model: Logistic Regression
Accuracy: 0.8044692737430168
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.86      0.84       105
           1       0.78      0.73      0.76        74

    accuracy                           0.80       179
   macro avg       0.80      0.79      0.80       179
weighted avg       0.80      0.80      0.80       179

Confusion Matrix:
 [[90 15]
 [20 54]]
Cross-validation Accuracy: 0.7912497646098802


Model: Random Forest
Accuracy: 0.8324022346368715
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.86      0.86       105
           1       0.80      0.80      0.80        74

    accuracy                           0.83       179
   macro avg       0.83      0.83      0.83       179
weighted avg       0.83      0.83      0.83       179

Confusion Matrix:
 [[90 15]
 [15 59]]
Cross-validation Accuracy: 0.8148138848785387


Model: Decision Tree

In [11]:
# Rank the input features by the importance scores of the fitted Random Forest
rf_model = models['Random Forest']
importances = rf_model.feature_importances_
importance_table = pd.DataFrame({'Feature': features, 'Importance': importances})
feature_importances = importance_table.sort_values(by='Importance', ascending=False)

print("Feature Importances from Random Forest Model:")
print(feature_importances)

# Insights: fixed summary text, not derived from the table above
insight_lines = (
    "\nInsights on Factors Most Likely to Lead to Survival:",
    "1. Gender: Females were more likely to survive.",
    "2. Class: Higher classes (1st and 2nd) had a better chance of survival.",
    "3. Age: Younger passengers, especially children, had a higher survival rate.",
    "4. Family Size: Moderate family sizes showed better survival likelihood due to support.",
)
for insight in insight_lines:
    print(insight)


Feature Importances from Random Forest Model:
      Feature  Importance
3        Fare    0.264621
2         Age    0.229320
1         Sex    0.207376
6       Title    0.106375
5  FamilySize    0.081183
0      Pclass    0.079296
4    Embarked    0.031829

Insights on Factors Most Likely to Lead to Survival:
1. Gender: Females were more likely to survive.
2. Class: Higher classes (1st and 2nd) had a better chance of survival.
3. Age: Younger passengers, especially children, had a higher survival rate.
4. Family Size: Moderate family sizes showed better survival likelihood due to support.
