Import 

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import joblib   # for saving models

Load Cleaned Data

In [5]:
df = pd.read_csv("../data/processed/titanic_clean.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S
0,1,0,3,0,22.0,1,0,7.25,False,True
1,2,1,1,1,38.0,1,0,71.2833,False,False
2,3,1,3,1,26.0,0,0,7.925,False,True
3,4,1,1,1,35.0,1,0,53.1,False,True
4,5,0,3,0,35.0,0,0,8.05,False,True


Split Features And Target

In [23]:
# Target variable
y = df["Survived"]

# Features
x = df.drop("Survived", axis=1)

print("x shape:", x.shape)
print("y shape:", y.shape)

x shape: (891, 9)
y shape: (891,)


Train Test Split

In [7]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

print("Train shape:", x_train.shape)
print("Test shape:", x_test.shape)

Train shape: (712, 9)
Test shape: (179, 9)


Feature Scaling (Optional for Logistic Regrtession)

In [8]:
scaler = StandardScaler()

x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

Train Logistic Regression

In [9]:
logreg = LogisticRegression()
logreg.fit(x_train_scaled, y_train)

# Predict
y_pred = logreg.predict(x_test_scaled)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8044692737430168
Confusion Matrix:
 [[89 16]
 [19 55]]
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.85      0.84       105
           1       0.77      0.74      0.76        74

    accuracy                           0.80       179
   macro avg       0.80      0.80      0.80       179
weighted avg       0.80      0.80      0.80       179



Train Random Forest

In [10]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(x_train, y_train)

y_pred_rf = rf.predict(x_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))

Random Forest Accuracy: 0.8379888268156425


Save both models

In [11]:
joblib.dump(logreg, "../models/logreg_model.pkl")
joblib.dump(rf, "../models/rf_model.pkl")
joblib.dump(scaler, "../models/scaler.pkl")

print("Logistic Regression, Random Forest, and Sacler saved sucessfully")

Logistic Regression, Random Forest, and Sacler saved sucessfully


Load test.csv

In [12]:
test_df = pd.read_csv("../data/raw/test.csv")
print("Test data shape:", test_df.shape)
test_df.head()

Test data shape: (418, 11)


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


Clean tes.csv

In [13]:
test_df["Age"].fillna(test_df["Age"].median(), inplace=True)
test_df["Fare"].fillna(test_df["Fare"].median(), inplace=True)
test_df["Sex"] = test_df["Sex"].map({"male": 0, "female": 1})
test_df["Embarked"].fillna(test_df["Embarked"].mode()[0], inplace=True)
test_df = pd.get_dummies(test_df, columns=["Embarked"], drop_first=True)
test_df.drop(["Name", "Ticket", "Cabin"], axis=1, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df["Age"].fillna(test_df["Age"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df["Fare"].fillna(test_df["Fare"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object 

Checking test.csv

In [14]:
test_df.isnull().sum()

PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked_Q     0
Embarked_S     0
dtype: int64

In [15]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S
0,892,3,0,34.5,0,0,7.8292,True,False
1,893,3,1,47.0,1,0,7.0,False,True
2,894,2,0,62.0,0,0,9.6875,True,False
3,895,3,0,27.0,0,0,8.6625,False,True
4,896,3,1,22.0,1,1,12.2875,False,True


Save Cleaned test.csv

In [16]:
test_df.to_csv("../data/processed/titanic_test_clean.csv", index=False)
print("Cleaned test data saved sucessfully")

Cleaned test data saved sucessfully


Allign Columns

In [17]:
missing_cols = set(x.columns) - set(test_df.columns)
for c in missing_cols:
    test_df[c] = 0
test_df = test_df[x.columns]

Load Models and Predict

In [18]:
scaler = joblib.load("../models/scaler.pkl")
logreg = joblib.load("../models/logreg_model.pkl")
rf = joblib.load("../models/rf_model.pkl")

# Scale for logistic regression
X_test_final_scaled = scaler.transform(test_df)

# Predictions from both
pred_log = logreg.predict(X_test_final_scaled)
pred_rf = rf.predict(test_df)

print("Predictions generated from both models")

Predictions generated from both models


Save both submission files

In [19]:
submission_log = pd.DataFrame({
    "PassengerId": test_df["PassengerId"],
    "Survived": pred_log
})
submission_log.to_csv("../data/processed/titanic_submission_logreg.csv", index=False)

submission_rf = pd.DataFrame({
    "PassengerId": test_df["PassengerId"],
    "Survived": pred_rf
})
submission_rf.to_csv("../data/processed/titanic_submission_rf.csv", index=False)

print(" Saved both submissions:")
print("- titanic_submission_logreg.csv")
print("- titanic_submission_rf.csv")

 Saved both submissions:
- titanic_submission_logreg.csv
- titanic_submission_rf.csv


In [20]:
import pickle
import os

os.makedirs("../models", exist_ok=True)

# Save models and scaler properly in binary mode
with open("../models/logreg_model.pkl", "wb") as f:
    pickle.dump(logreg, f)

with open("../models/rf_model.pkl", "wb") as f:
    pickle.dump(rf, f)

with open("../models/scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

print("✅ Models and scaler re-saved successfully!")


✅ Models and scaler re-saved successfully!
