In [1]:
# Libraries
import pandas as pd
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import joblib

# Load cleaned training data
df = pd.read_csv("Cleaned_Data.csv")

# 1) Feature engineering to create Title, FamilySize, IsAlone from cleaned data
# - The CSV may already carry FamilySize / IsAlone; both are overwritten here so
#   they are always derived from the SibSp and Parch counts in this frame.
# - FamilySize counts the passenger (+1) plus siblings/spouses and parents/children.
df['FamilySize'] = df['SibSp'].add(df['Parch']).add(1)
# - IsAlone is a boolean flag: True exactly when FamilySize is 1.
df['IsAlone'] = df['FamilySize'].eq(1)

# Extract Title from Name and bucket to Miss/Mr/Mrs/Rare
# The honorific sits between the comma and the first period, e.g.
# "Braund, Mr. Owen Harris" -> "Mr". A leading lowercase "the " is skipped so
# that "Rothes, the Countess. of (...)" yields "Countess" rather than
# "the Countess"; otherwise the 'Countess' entry in map_title_to_buckets could
# never match a real name.
_TITLE_PATTERN = re.compile(r',\s*(?:the\s+)?([^\.]+)\.')


def extract_title(name):
    """Return the honorific embedded in a passenger name.

    Parameters
    ----------
    name : object
        Expected to look like "Surname, Title. Given names". Any value is
        accepted; it is converted with ``str()`` before matching, so missing
        values (None / NaN) simply fail to match.

    Returns
    -------
    str
        The title text with surrounding whitespace removed and any leading
        "the " dropped, or 'Rare' when the name has no ", <title>." segment.
    """
    m = _TITLE_PATTERN.search(str(name))
    return m.group(1).strip() if m else 'Rare'

def map_title_to_buckets(title):
    """Collapse a raw title into one of four buckets.

    Returns 'Mr', 'Mrs' or 'Miss' when ``title`` is one of that bucket's
    listed spellings, and 'Rare' for everything else (including values that
    are not strings).
    """
    # (bucket label, titles that fall into it), checked in order.
    buckets = (
        ('Mr', ('Mr',)),
        ('Mrs', ('Mrs', 'Mme', 'Lady', 'Countess')),
        ('Miss', ('Miss', 'Mlle')),
    )
    for label, members in buckets:
        if title in members:
            return label
    return 'Rare'

# Title: parse the honorific out of each Name, then collapse it into a bucket.
df['Title'] = df['Name'].map(extract_title).map(map_title_to_buckets)

# 2) Select ONLY the 7 requested features for X, and Survived as y
feature_cols = ['Sex', 'Pclass', 'Age', 'Fare', 'Title', 'FamilySize', 'IsAlone']
X = df.loc[:, feature_cols].copy()  # copy so later edits to X never touch df
y = df['Survived'].astype(int)

# 3) Train/test split (same seed and stratify as your original)
# 80/20 hold-out; stratifying on y keeps the class ratio the same in both
# parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# 4) Preprocessing: impute + scale numerics; OHE for Title; pass IsAlone as categorical/bool
# Together the two lists cover all 7 columns of X, so the ColumnTransformer
# below has no leftover columns.
# NOTE(review): 'Sex' goes through the numeric branch (median impute + scaling),
# which presumes it is already numerically encoded in Cleaned_Data.csv - confirm.
num_features = ['Sex', 'Pclass', 'Age', 'Fare', 'FamilySize']
cat_features = ['Title', 'IsAlone']  # Title is object; IsAlone is bool

# Numeric branch: fill missing values with the column median, then standardize.
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown='ignore' makes a category unseen during fit
# encode as all zeros at predict time instead of raising; sparse_output=False
# returns a dense array.
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Route each column group through its own branch; the output is the numeric
# block followed by the one-hot block.
preprocessor = ColumnTransformer([
    ('num', num_pipe, num_features),
    ('cat', cat_pipe, cat_features)
])

# 5) Build pipeline and tune RandomForest on ROC AUC
# Preprocessing lives inside the pipeline, so it is re-fit on each CV training
# fold and is saved together with the classifier.
pipe_rf = Pipeline([
    ('pre', preprocessor),
    ('clf', RandomForestClassifier(random_state=42))
])

# The 'clf__' prefix targets parameters of the 'clf' pipeline step.
# 2 x 3 = 6 candidate settings.
param_grid = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [None, 5, 10]
}

# 5-fold cross-validated search over the grid, ranked by ROC AUC; n_jobs=-1
# runs the fits in parallel on all available cores. Only the training split is
# used here - the test split stays untouched until evaluation.
gs = GridSearchCV(pipe_rf, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
gs.fit(X_train, y_train)

# Report the winning setting and its mean cross-validated ROC AUC.
print("Best params:", gs.best_params_)
print("Best CV ROC AUC:", gs.best_score_)

# 6) Evaluate on held-out test set
# best_estimator_ is the full pipeline (preprocessing + forest) chosen by the
# grid search.
best_model = gs.best_estimator_

# Hard class labels for the threshold-based metrics, and the probability of the
# positive class (column 1) for ROC AUC.
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

# Label-based metrics all share the (y_true, y_pred) signature, so print them
# from one table; ROC AUC needs the probabilities and is reported separately.
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1:", f1_score),
):
    print(label, metric(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

# 7) Save the retrained model
# Kept as the final expression so the notebook still echoes the written path.
joblib.dump(best_model, 'titanic_model_7features.pkl')


Best params: {'clf__max_depth': 5, 'clf__n_estimators': 100}
Best CV ROC AUC: 0.8700880276115385
Accuracy: 0.8156424581005587
Precision: 0.8
Recall: 0.6956521739130435
F1: 0.7441860465116279
ROC AUC: 0.8480895915678524
Confusion matrix:
 [[98 12]
 [21 48]]


['titanic_model_7features.pkl']