In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score


In [3]:
df = pd.read_csv("titanic_cleaned.csv")
df.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Familysize,isalone,Title,AgeGroup,Age_scaled,Fare_scales,Familysize_scaled
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,523,7.25,147,2,2,0,Mr,YoungAdult,-0.565736,-0.502445,0.05916
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,596,71.2833,81,0,2,0,Mrs,Adult,0.663861,0.786845,0.05916
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,669,7.925,147,2,1,1,Miss,YoungAdult,-0.258337,-0.488854,-0.560975
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,49,53.1,55,2,2,0,Mrs,YoungAdult,0.433312,0.42073,0.05916
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,472,8.05,147,2,1,1,Mr,YoungAdult,0.433312,-0.486337,-0.560975


In [4]:
num_cols = df.select_dtypes(include=['int64','float64']).columns.tolist()
num_cols.remove("Survived")   # remove target

cat_cols = df.select_dtypes(include=['object']).columns.tolist()

num_cols, cat_cols


(['PassengerId',
  'Pclass',
  'Sex',
  'Age',
  'SibSp',
  'Parch',
  'Ticket',
  'Fare',
  'Cabin',
  'Embarked',
  'Familysize',
  'isalone',
  'Age_scaled',
  'Fare_scales',
  'Familysize_scaled'],
 ['Name', 'Title', 'AgeGroup'])

In [5]:
num_pipeline = Pipeline(steps=[
    ("scaler", StandardScaler())
])


In [6]:
cat_pipeline = Pipeline(steps=[
    ("onehot", OneHotEncoder(drop="first", handle_unknown="ignore"))
])


In [7]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_cols),
        ("cat", cat_pipeline, cat_cols)
    ]
)


In [8]:
clf = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("model", LogisticRegression(max_iter=1000))
])


In [9]:
X = df.drop("Survived", axis=1)
y = df["Survived"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [10]:
clf.fit(X_train, y_train)


In [11]:
y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.8379888268156425
              precision    recall  f1-score   support

           0       0.85      0.89      0.87       110
           1       0.81      0.75      0.78        69

    accuracy                           0.84       179
   macro avg       0.83      0.82      0.83       179
weighted avg       0.84      0.84      0.84       179





In [12]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "model__C": [0.1, 1, 10],
    "model__penalty": ["l2"]
}

gs = GridSearchCV(clf, param_grid, cv=3, scoring="f1")
gs.fit(X_train, y_train)

gs.best_params_




{'model__C': 10, 'model__penalty': 'l2'}

In [13]:
import joblib

joblib.dump(clf, "titanic_pipeline.pkl")


['titanic_pipeline.pkl']

In [14]:
pipeline_loaded = joblib.load("titanic_pipeline.pkl")
pipeline_loaded.predict(X_test)




array([0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 1, 0])