In [3]:
import pandas as pd
# Load the pre-cleaned passenger table, then show three quick sanity checks:
# its (rows, columns) shape, the class balance of the target, and the column dtypes.
df = pd.read_csv("Cleaned_Data.csv")
print(
    df.shape,
    df['Survived'].value_counts(),
    df.dtypes,
    sep="\n",
)


(891, 17)
Survived
0    549
1    342
Name: count, dtype: int64
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex              int64
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Embarked         int64
FamilySize       int64
IsAlone          int64
Title_Miss        bool
Title_Mr          bool
Title_Mrs         bool
Title_Rare        bool
dtype: object


In [4]:
from sklearn.model_selection import train_test_split
# Columns that must not be fed to the model: the target itself plus the
# identifier-like fields (treated as irrelevant for prediction here).
non_feature_cols = ['Survived', 'Name', 'Ticket', 'PassengerId']

y = df['Survived']
X = df.drop(columns=non_feature_cols)

# Hold out 20% of the rows for testing; stratifying on y keeps the class ratio
# the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [6]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Split the feature columns by dtype so each group gets its own preprocessing.
#
# 'number' matches every numeric dtype (int32, float32, uint8, ...), not only
# int64/float64. With the narrower whitelist, a numeric column of another dtype
# would land in neither list and be dropped without warning, because
# ColumnTransformer defaults to remainder='drop'. bool is not part of 'number',
# so the bool indicator columns still go to cat_cols and the two lists cannot overlap.
#
# The 'Survived' filter is a guard so the target is never used as a feature.
num_cols = [c for c in X.select_dtypes(include='number').columns if c != 'Survived']
cat_cols = X.select_dtypes(include=['object', 'bool', 'category']).columns.tolist()

# NOTE(review): integer-coded columns are standardised as plain numbers here —
# confirm that is intended for any column whose integers are nominal codes.

# Numeric features: standardise.
num_pipe = Pipeline([('scaler', StandardScaler())])
# Categorical/bool features: dense one-hot encoding; categories unseen during
# fit are ignored at transform time instead of raising.
cat_pipe = Pipeline([('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])

# Route each column group through its pipeline.
preprocessor = ColumnTransformer([
    ('num', num_pipe, num_cols),
    ('cat', cat_pipe, cat_cols)
])


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

# Baseline 1: logistic regression behind the shared preprocessor,
# scored with 5-fold cross-validated accuracy on the training split.
pipe = Pipeline(steps=[
    ('pre', preprocessor),
    ('clf', LogisticRegression(max_iter=1000)),
])
scores = cross_val_score(pipe, X_train, y_train, scoring='accuracy', cv=5)
print("Logistic CV accuracy:", scores.mean())

# Baseline 2: a decision tree with a fixed seed. `scoring` is left at its
# default for this call.
pipe_dt = Pipeline(steps=[
    ('pre', preprocessor),
    ('clf', DecisionTreeClassifier(random_state=42)),
])
dt_scores = cross_val_score(pipe_dt, X_train, y_train, cv=5)
print("DecisionTree CV accuracy:", dt_scores.mean())


Logistic CV accuracy: 0.8174628188712696
DecisionTree CV accuracy: 0.7626908302964641


In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Random forest behind the shared preprocessor, seeded for repeatability.
pipe_rf = Pipeline(steps=[
    ('pre', preprocessor),
    ('clf', RandomForestClassifier(random_state=42)),
])

# 5-fold CV on the training split. The metric here is ROC AUC, so this number
# is not directly comparable with a score reported as accuracy.
rf_auc_scores = cross_val_score(pipe_rf, X_train, y_train, scoring='roc_auc', cv=5)
print("RF CV:", rf_auc_scores.mean())


RF CV: 0.8461838473554775


In [9]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Fit the random-forest pipeline on the training split, then score it on the
# held-out test split.
pipe_rf.fit(X_train, y_train)
y_pred = pipe_rf.predict(X_test)                # hard class labels
y_proba = pipe_rf.predict_proba(X_test)[:, 1]   # second probability column, used as the ROC AUC score

# Label-based metrics use y_pred; ROC AUC needs the continuous score y_proba.
print("Accuracy", accuracy_score(y_test, y_pred))
print("Precision", precision_score(y_test, y_pred))
print("Recall", recall_score(y_test, y_pred))
print("F1", f1_score(y_test, y_pred))
print("ROC AUC", roc_auc_score(y_test, y_proba))
# Use a real newline escape: "\\n" printed the two characters backslash + n
# in front of the matrix instead of starting a new line.
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy 0.8156424581005587
Precision 0.7647058823529411
Recall 0.7536231884057971
F1 0.7591240875912408
ROC AUC 0.8337944664031621
Confusion matrix:\n [[94 16]
 [17 52]]


In [10]:
from sklearn.model_selection import GridSearchCV

# Search space for the forest. The 'clf__' prefix addresses the step named
# 'clf' inside the pipeline.
param_grid = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [None, 5, 10],
}

# Start from a fresh pipeline object for the search.
pipe_rf = Pipeline(steps=[
    ('pre', preprocessor),
    ('clf', RandomForestClassifier(random_state=42)),
])

# Exhaustive search over the grid: 5-fold CV, ranked by ROC AUC, using all cores.
gs = GridSearchCV(
    estimator=pipe_rf,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
gs.fit(X_train, y_train)

print("Best params:", gs.best_params_)
print("Best CV score:", gs.best_score_)


Best params: {'clf__max_depth': 5, 'clf__n_estimators': 200}
Best CV score: 0.8638419065260757


In [11]:
import joblib
# Take the pipeline selected by the grid search and write it to disk.
# The dump call is the cell's last expression, so its return value
# (the list of written file names) is shown as the cell output.
best_model = gs.best_estimator_
joblib.dump(value=best_model, filename="titanic_model.pkl")


['titanic_model.pkl']