In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

Load Titanic dataset

In [2]:
df = pd.read_csv("titanic/train.csv")

Data preprocessing (simplified)

In [3]:
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
df['Embarked'] = df['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})
df['Age'].fillna(df['Age'].median(), inplace=True)
df['Embarked'].fillna(2, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(2, inplace=True)


Feature extraction

In [4]:
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

x = df[features]
y = df['Survived']

Train / Test split

In [5]:
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.2, random_state=42)

Logistic Regression model

In [6]:
log_reg = LogisticRegression(max_iter=200)
log_reg.fit(x_train, y_train)

y_pred_log = log_reg.predict(x_test)

Decision tree model

In [7]:
tree = DecisionTreeClassifier(max_depth=3, random_state=42)
tree.fit(x_train, y_train)
y_pred_tree = tree.predict(x_test)

In [8]:
# Evaluation
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_log))
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_tree))

# Classification Report (Precision, Recall, F1)
print("Logistic Regression Report:\n", classification_report(y_test, y_pred_log))
print("Decision Tree Report:\n", classification_report(y_test, y_pred_tree))

Logistic Regression Accuracy: 0.7994389901823282
Decision Tree Accuracy: 0.7798036465638148
Logistic Regression Report:
               precision    recall  f1-score   support

           0       0.80      0.89      0.84       436
           1       0.79      0.66      0.72       277

    accuracy                           0.80       713
   macro avg       0.80      0.77      0.78       713
weighted avg       0.80      0.80      0.80       713

Decision Tree Report:
               precision    recall  f1-score   support

           0       0.77      0.92      0.84       436
           1       0.82      0.56      0.66       277

    accuracy                           0.78       713
   macro avg       0.79      0.74      0.75       713
weighted avg       0.79      0.78      0.77       713

