# Import libraries


In [1]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")


# Load and Preprocess Titanic Dataset

In [2]:
# Load dataset
titanic = sns.load_dataset("titanic")

# Select useful features
df = titanic[["survived", "pclass", "sex", "age", "sibsp", "parch", "fare", "embarked"]]

# Drop missing values
df = df.dropna()

# Encode categorical columns
le_sex = LabelEncoder()
le_embarked = LabelEncoder()
df["sex"] = le_sex.fit_transform(df["sex"])
df["embarked"] = le_embarked.fit_transform(df["embarked"])

# Split features and target
X = df.drop("survived", axis=1)
y = df["survived"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# Train Multiple Models

In [3]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Support Vector Machine": SVC(),
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier()
}

results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    results[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred)
    }

# Display
results_df = pd.DataFrame(results).T
print("Model Evaluation Results:\n")
print(results_df)


Model Evaluation Results:

                        Accuracy  Precision    Recall  F1 Score
Logistic Regression     0.797203   0.854167  0.650794  0.738739
Support Vector Machine  0.825175   0.865385  0.714286  0.782609
Random Forest           0.783217   0.766667  0.730159  0.747967
Decision Tree           0.685315   0.666667  0.571429  0.615385


# Hyperparameter Tuning

In [4]:
#GridSearchCV for SVC

param_grid_svc = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

grid_svc = GridSearchCV(SVC(), param_grid_svc, cv=5, scoring='f1')
grid_svc.fit(X_train, y_train)

print("\nBest SVC Parameters from GridSearchCV:")
print(grid_svc.best_params_)



Best SVC Parameters from GridSearchCV:
{'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}


In [5]:
#RandomizedSearchCV for RandomForest
from scipy.stats import randint

param_dist_rf = {
    "n_estimators": randint(50, 200),
    "max_depth": [None, 5, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}

random_search_rf = RandomizedSearchCV(RandomForestClassifier(), param_dist_rf,
                                      n_iter=10, cv=5, scoring='f1', random_state=42)
random_search_rf.fit(X_train, y_train)

print("\nBest RandomForest Parameters from RandomizedSearchCV:")
print(random_search_rf.best_params_)



Best RandomForest Parameters from RandomizedSearchCV:
{'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 180}


# Evaluate Tuned Models

In [6]:
final_models = {
    "Tuned SVC": grid_svc.best_estimator_,
    "Tuned Random Forest": random_search_rf.best_estimator_
}

tuned_results = {}

for name, model in final_models.items():
    y_pred = model.predict(X_test)
    tuned_results[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred)
    }

tuned_df = pd.DataFrame(tuned_results).T
print("\nTuned Model Evaluation Results:\n")
print(tuned_df)



Tuned Model Evaluation Results:

                     Accuracy  Precision    Recall  F1 Score
Tuned SVC            0.825175   0.865385  0.714286  0.782609
Tuned Random Forest  0.790210   0.823529  0.666667  0.736842


# Best Model

In [7]:
summary = pd.concat([results_df, tuned_df])
print("\nAll Model Performance Summary:\n")
print(summary.sort_values(by="F1 Score", ascending=False))



All Model Performance Summary:

                        Accuracy  Precision    Recall  F1 Score
Support Vector Machine  0.825175   0.865385  0.714286  0.782609
Tuned SVC               0.825175   0.865385  0.714286  0.782609
Random Forest           0.783217   0.766667  0.730159  0.747967
Logistic Regression     0.797203   0.854167  0.650794  0.738739
Tuned Random Forest     0.790210   0.823529  0.666667  0.736842
Decision Tree           0.685315   0.666667  0.571429  0.615385


# Final Model Selection Summary

In [9]:
print("\nSummary and Conclusion:\n")

summary_text = """
After training and evaluating multiple machine learning models on the Titanic dataset, we observed the following:

1) The best-performing model in terms of F1 Score was Support Vector Machine (SVC) with an F1 Score of 0.7826,
which remained the same even after hyperparameter tuning (Tuned SVC), indicating the initial parameters were already optimal.

2)Random Forest and Logistic Regression also performed reasonably well, with F1 Scores of 0.7479 and 0.7387, respectively.
Tuned Random Forest gave a slight improvement in precision but did not outperform the original SVC model.

3) Decision Tree showed the weakest performance across all metrics, suggesting it may not generalize well for this dataset.

Conclusion:
The Support Vector Machine (SVC) model is selected as the best-performing model based on F1 Score and overall balanced performance across all evaluation metrics. No further improvement was observed with tuning, confirming its robustness on this problem.
"""

print(summary_text)



Summary and Conclusion:


After training and evaluating multiple machine learning models on the Titanic dataset, we observed the following:

1) The best-performing model in terms of F1 Score was Support Vector Machine (SVC) with an F1 Score of 0.7826, 
which remained the same even after hyperparameter tuning (Tuned SVC), indicating the initial parameters were already optimal.

2)Random Forest and Logistic Regression also performed reasonably well, with F1 Scores of 0.7479 and 0.7387, respectively.
Tuned Random Forest gave a slight improvement in precision but did not outperform the original SVC model.

3) Decision Tree showed the weakest performance across all metrics, suggesting it may not generalize well for this dataset.

Conclusion: 
The Support Vector Machine (SVC) model is selected as the best-performing model based on F1 Score and overall balanced performance across all evaluation metrics. No further improvement was observed with tuning, confirming its robustness on this proble