In [107]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss
from sklearn.preprocessing import StandardScaler

# Read the pre-cleaned Titanic table from disk and preview its first rows.
DATA_PATH = "titanic_dataset/cleaned_dataset.csv"
df = pd.read_csv(DATA_PATH)

df.head()


Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S
0,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,7.25,False,True
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,71.2833,False,False
2,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,7.925,False,True
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,53.1,False,True
4,0,3,"Allen, Mr. William Henry",0,35.0,0,0,8.05,False,True


In [108]:

# Baseline: logistic regression on the cleaned columns as-is
# (no scaling, no engineered features).
feature_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_Q', 'Embarked_S']
X, y = df[feature_cols], df['Survived']

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on the training rows, then predict the held-out rows.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report each metric rounded to four decimals; the labels are padded so the
# values line up in the printed output.
print("Evaluation Metrics (Scikit-learn Logistic Regression):")
for label, metric in (
    ("Accuracy: ", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:   ", recall_score),
    ("F1 Score: ", f1_score),
):
    print(label, round(metric(y_test, y_pred), 4))


Evaluation Metrics (Scikit-learn Logistic Regression):
Accuracy:  0.8101
Precision: 0.7857
Recall:    0.7432
F1 Score:  0.7639


In [109]:
# Feature engineering
#
# Build the engineered columns on a fresh frame so `df` itself is left untouched
# (`assign` returns a new DataFrame).
#
# Sex coding: in the head() preview of `df`, the "Mr." rows carry Sex == 0 and
# the "Mrs."/"Miss." rows carry Sex == 1, so 0 appears to be male and 1 female
# (TODO: confirm against the cleaning step). Sex_num therefore maps
# 0 -> 0.5 and 1 -> 1.0.
df_fe = df.assign(
    # Number of relatives aboard (siblings/spouses + parents/children).
    FamilySize=df['SibSp'] + df['Parch'],
    # 1 when the passenger has no relatives aboard, else 0.
    IsAlone=lambda frame: (frame['FamilySize'] == 0).astype(int),
    # Product of age and fare.
    Age_Fare=df['Age'] * df['Fare'],
    # Non-zero weight for both Sex codes so the interaction below never vanishes.
    Sex_num=df['Sex'].map({0: 0.5, 1: 1.0}),
    # Interaction feature: Sex weight multiplied by passenger class.
    Sex_Pclass=lambda frame: frame['Sex_num'] * frame['Pclass'],
)

df_fe.head()


Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S,FamilySize,IsAlone,Age_Fare,Sex_num,Sex_Pclass
0,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,7.25,False,True,1,0,159.5,0.5,1.5
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,71.2833,False,False,1,0,2708.7654,1.0,1.0
2,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,7.925,False,True,0,1,206.05,1.0,3.0
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,53.1,False,True,1,0,1858.5,1.0,1.0
4,0,3,"Allen, Mr. William Henry",0,35.0,0,0,8.05,False,True,0,1,281.75,0.5,1.5


In [110]:
# Columns fed to the engineered-feature model. IsAlone and Sex_num exist in
# df_fe but are not part of this feature set (Sex_num only enters through the
# Sex_Pclass interaction).
final_features = ['Pclass', 'Age', 'Fare', 'FamilySize', 'Age_Fare', 'Sex_Pclass']
X, y = df_fe[final_features], df_fe['Survived']

In [111]:
# Standardize
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [112]:

# Split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)


In [113]:

# Predict the held-out rows with the model trained on the engineered features.
y_pred = model.predict(X_test)

# Evaluate. The metric functions come from the import cell at the top of the
# notebook, so they are not re-imported here. Each score is rounded to four
# decimals; the labels are padded so the values line up in the printed output.
print("Evaluation with Interaction Feature (Sex_num * Pclass):")
for label, metric in (
    ("Accuracy: ", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:   ", recall_score),
    ("F1 Score: ", f1_score),
):
    print(label, round(metric(y_test, y_pred), 4))

Evaluation with Interaction Feature (Sex_num * Pclass):
Accuracy:  0.838
Precision: 0.8
Recall:    0.8108
F1 Score:  0.8054
