In [15]:
import numpy as np
import pandas as pd 
import os
for dirname, _, filenames in os.walk('/kaggle/input/titanic-dataset/Titanic-Dataset.csv'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import seaborn as sns
import matplotlib.pyplot as plt

### Loading Data

In [16]:
df = pd.read_csv("/kaggle/input/titanic-dataset/Titanic-Dataset.csv")

In [17]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


### Data preparation

In [19]:
df = df.drop_duplicates()

In [20]:
df['Title'] = df['Name'].str.extract(r',\s*([^\.]+)\.', expand=False)

In [21]:
df_input = df.drop(["Name", "Survived", "PassengerId"], axis=1)

In [22]:
df_input.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,3,male,22.0,1,0,A/5 21171,7.25,,S,Mr
1,1,female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss
3,1,female,35.0,1,0,113803,53.1,C123,S,Mrs
4,3,male,35.0,0,0,373450,8.05,,S,Mr


### Data Imputation & Encoding

In [23]:
df_input['Age'] = df_input.groupby('Title')['Age'].transform(lambda x: x.fillna(x.median()))

df_input['Fare'] = df_input['Fare'].fillna(df_input['Fare'].median())
df_input['Embarked'] = df_input['Embarked'].fillna(df_input['Embarked'].mode()[0])

df_input['Deck'] = df_input['Cabin'].str[0]
df_input['Deck'] =df_input['Deck'].fillna('Unknown')

df_input['Sex'] = df_input['Sex'].map({'male': 0, 'female': 1})

df_input = pd.get_dummies(df_input, columns=['Title', 'Embarked', 'Deck'], drop_first=True)


df_input= df_input.drop(['Ticket', 'Cabin'], axis=1)
df_input = df_input.astype(int)

df_input['FamilySize'] = df_input['SibSp'] + df_input['Parch']
df_input['IsAlone'] = (df_input['FamilySize'] == 0).astype(int)


In [24]:
df_input.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Title_Col,Title_Don,Title_Dr,Title_Jonkheer,...,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_Unknown,FamilySize,IsAlone
0,3,0,22,1,0,7,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
1,1,1,38,1,0,71,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
2,3,1,26,0,0,7,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
3,1,1,35,1,0,53,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
4,3,0,35,0,0,8,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1


In [25]:
df['Survived'].value_counts()

Survived
0    549
1    342
Name: count, dtype: int64

### Observation:
#### Imbalanced dataset

### Train and Test stratified split (imbalanced dataset)

In [26]:
X = df_input 
y = df['Survived']

# Split into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

### Training dataset on different Models

In [27]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight='balanced'),
    "Decision Tree": DecisionTreeClassifier(class_weight='balanced'),
    "Random Forest": RandomForestClassifier(class_weight='balanced'),
    "Gradient Boosting": GradientBoostingClassifier(),
    "SVM": SVC(probability=True, class_weight='balanced'),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "CatBoost": CatBoostClassifier(verbose=0)
}

In [28]:
results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_prob = model.decision_function(X_test)
    else:
        y_prob = y_pred
    
    results.append({
        'Model': name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred),
        'ROC AUC': roc_auc_score(y_test, y_prob)
    })
results_df = pd.DataFrame(results).sort_values(by='F1 Score', ascending=False)
print(results_df)


                 Model  Accuracy  Precision    Recall  F1 Score   ROC AUC
0  Logistic Regression  0.826816   0.763889  0.797101  0.780142  0.871476
1        Decision Tree  0.804469   0.757576  0.724638  0.740741  0.782279
6             CatBoost  0.815642   0.810345  0.681159  0.740157  0.862780
5              XGBoost  0.804469   0.783333  0.681159  0.728682  0.843808
2        Random Forest  0.787709   0.731343  0.710145  0.720588  0.826285
3    Gradient Boosting  0.782123   0.758621  0.637681  0.692913  0.843544
4                  SVM  0.636872   0.530303  0.507246  0.518519  0.688472


### Performing hyperparameter tuning on CatBoost and XGBoost, as they performing better after logistic regression.

In [29]:
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

param_grid_xgb = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [3, 4, 5, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 1, 5],
    'reg_alpha': [0, 0.01, 0.1],
    'reg_lambda': [1, 1.5, 2]
}

xgb_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_grid_xgb,
    n_iter=50,
    scoring='f1',
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

xgb_search.fit(X_train, y_train)
print("Best XGBoost Params:", xgb_search.best_params_)


NameError: name 'RandomizedSearchCV' is not defined

In [None]:
cat = CatBoostClassifier(verbose=0)

param_grid_cat = {
    'iterations': [200, 300, 500],
    'depth': [4, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1],
    'l2_leaf_reg': [1, 3, 5, 7],
    'border_count': [32, 64, 128],
    'bagging_temperature': [0.2, 0.5, 1.0]
}

cat_search = RandomizedSearchCV(
    estimator=cat,
    param_distributions=param_grid_cat,
    n_iter=30,
    scoring='f1',
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

cat_search.fit(X_train, y_train)
print("Best CatBoost Params:", cat_search.best_params_)

In [None]:
xgb_best = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    subsample=1.0,
    reg_lambda=1,
    reg_alpha=0.01,
    n_estimators=300,
    max_depth=3,
    learning_rate=0.2,
    gamma=0,
    colsample_bytree=0.6
)
cat_best = CatBoostClassifier(
    iterations=200,
    depth=6,
    learning_rate=0.1,
    l2_leaf_reg=5,
    border_count=64,
    bagging_temperature=1.0,
    verbose=0
)

In [None]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight='balanced'),
    "Decision Tree": DecisionTreeClassifier(class_weight='balanced'),
    "Random Forest": RandomForestClassifier(class_weight='balanced'),
    "Gradient Boosting": GradientBoostingClassifier(),
    "SVM": SVC(probability=True, class_weight='balanced'),
    "XGBoost (Tuned)": xgb_best,
    "CatBoost (Tuned)": cat_best 
}

results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Get probabilities for ROC AUC
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_prob = model.decision_function(X_test)
    else:
        y_prob = y_pred

    results.append({
        'Model': name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred),
        'ROC AUC': roc_auc_score(y_test, y_prob)
    })

results_df = pd.DataFrame(results).sort_values(by='F1 Score', ascending=False)
print(results_df)

#### Again Logistic Regression performed better

### A false positive might give a false sense of safety.
### A false negative might be more costly — predicting someone would die when they actually survived might be worse in a rescue simulation.

### After evaluating multiple models on the Titanic dataset, Logistic Regression was selected as the final model due to its strong and consistent performance across key metrics such as Accuracy, Precision, Recall, F1 Score, and ROC AUC.

In [None]:
logreg = LogisticRegression(max_iter=1000, solver='liblinear') 

param_grid = {
    'C': [0.5, 1, 2],      
    'penalty': ['l1', 'l2'],              
    'class_weight': [None, 'balanced']
}

# Setup GridSearchCV
grid_search = GridSearchCV(logreg, param_grid, cv=5, scoring='f1', n_jobs=-1)

# Fit the model
grid_search.fit(X_train, y_train)

# Best parameters
print("Best Logistic Regression Params:", grid_search.best_params_)

In [None]:
logreg_tuned = LogisticRegression(C=1, class_weight='balanced', penalty='l2', solver='liblinear', max_iter=1000)

logreg_tuned.fit(X_train, y_train)

y_pred = logreg_tuned.predict(X_test)
y_prob = logreg_tuned.predict_proba(X_test)[:, 1]

In [None]:
logreg_results = {
    'Model': 'Logistic Regression (Tuned)',
    'Accuracy': accuracy_score(y_test, y_pred),
    'Precision': precision_score(y_test, y_pred),
    'Recall': recall_score(y_test, y_pred),
    'F1 Score': f1_score(y_test, y_pred),
    'ROC AUC': roc_auc_score(y_test, y_prob)
}

# Print results
import pandas as pd
results_df = pd.DataFrame([logreg_results])
print(results_df)

### Conclusion

#### Logistic Regression performs better for predicting Titanic passenger survival.