In [2]:
import seaborn as sns
import pandas as pd
import numpy as np

# Loading the Titanic dataset
df = sns.load_dataset('titanic')

# Basic preprocessing
df = df[['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']]
df.dropna(inplace=True)
df['sex'] = df['sex'].map({'male': 0, 'female': 1})
df['embarked'] = df['embarked'].map({'S': 0, 'C': 1, 'Q': 2})

# Features and target
X = df.drop('survived', axis=1)
y = df['survived']

# Step 2: Train-Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Defining the  models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC()}

# Step 4: Train and Evaluate Models
results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    results.append({'Model': name, 'Accuracy': acc, 'Precision': prec, 'Recall': rec, 'F1 Score': f1})

results_df = pd.DataFrame(results)
print("Initial Model Performance:\n", results_df)

# Step 5: Hyperparameter Tuning

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# GridSearchCV for RandomForest
param_grid_rf = {
    'n_estimators': [50, 100, 150],
    'max_depth': [4, 6, 8, None],
    'min_samples_split': [2, 5, 10]}

grid_rf = GridSearchCV(RandomForestClassifier(), param_grid_rf, cv=5, scoring='f1', n_jobs=-1)
grid_rf.fit(X_train, y_train)

# RandomizedSearchCV for SVC
param_dist_svc = {'C': [0.1, 1, 10, 100],
                  'kernel': ['linear', 'rbf'],
                  'gamma': ['scale', 'auto']}

random_svc = RandomizedSearchCV(SVC(), param_distributions=param_dist_svc, n_iter=10, cv=5, scoring='f1', n_jobs=-1, random_state=42)
random_svc.fit(X_train, y_train)

# Step 6: Evaluating the tuned models
tuned_models = {
    'Random Forest (Tuned)': grid_rf.best_estimator_,
    'SVM (Tuned)': random_svc.best_estimator_
}

for name, model in tuned_models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    results.append({'Model': name, 'Accuracy': acc, 'Precision': prec, 'Recall': rec, 'F1 Score': f1})


final_results_df = pd.DataFrame(results)
print("\nFinal Model Comparison:\n", final_results_df.sort_values(by="F1 Score", ascending=False))

best_model = final_results_df.sort_values(by='F1 Score', ascending=False).iloc[0]
print(f"\n Best Model: {best_model['Model']} with F1 Score = {best_model['F1 Score']:.4f}")


Initial Model Performance:
                  Model  Accuracy  Precision    Recall  F1 Score
0  Logistic Regression  0.797203   0.854167  0.650794  0.738739
1        Random Forest  0.776224   0.771930  0.698413  0.733333
2                  SVM  0.636364   0.648649  0.380952  0.480000

Final Model Comparison:
                    Model  Accuracy  Precision    Recall  F1 Score
3  Random Forest (Tuned)  0.804196   0.830189  0.698413  0.758621
0    Logistic Regression  0.797203   0.854167  0.650794  0.738739
1          Random Forest  0.776224   0.771930  0.698413  0.733333
4            SVM (Tuned)  0.748252   0.764706  0.619048  0.684211
2                    SVM  0.636364   0.648649  0.380952  0.480000

 Best Model: Random Forest (Tuned) with F1 Score = 0.7586
