In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load and preprocess data (same as before)
train_data = pd.read_csv('titanic/train.csv')
train_data['Age'].fillna(train_data['Age'].median(), inplace=True)
train_data['Embarked'].fillna(train_data['Embarked'].mode()[0], inplace=True)
train_data.drop('Cabin', axis=1, inplace=True)
train_data['FamilySize'] = train_data['SibSp'] + train_data['Parch']
train_data['IsAlone'] = (train_data['FamilySize'] == 0).astype(int)
train_data = pd.get_dummies(train_data, columns=['Sex', 'Embarked'], drop_first=True)

features = ['Pclass', 'Age', 'Fare', 'FamilySize', 'IsAlone', 'Sex_male', 'Embarked_Q', 'Embarked_S']
X = train_data[features]
y = train_data['Survived']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['Age'].fillna(train_data['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['Embarked'].fillna(train_data['Embarked'].mode()[0], inplace=True)


In [2]:
# Initialize models
dt_model = DecisionTreeClassifier(max_depth=3, random_state=42)  # Same as Day 1
lr_model = LogisticRegression(max_iter=1000, random_state=42)
nb_model = GaussianNB()

# Train models
dt_model.fit(X_train, y_train)
lr_model.fit(X_train, y_train)
nb_model.fit(X_train, y_train)

# Make predictions
dt_pred = dt_model.predict(X_val)
lr_pred = lr_model.predict(X_val)
nb_pred = nb_model.predict(X_val)

# Evaluate performance
def evaluate_model(name, y_true, y_pred):
    return {
        'Model': name,
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'F1 Score': f1_score(y_true, y_pred)
    }

results = [
    evaluate_model('Decision Tree (Day 1)', y_val, dt_pred),
    evaluate_model('Logistic Regression', y_val, lr_pred),
    evaluate_model('Naive Bayes', y_val, nb_pred)
]

# Display results
results_df = pd.DataFrame(results)
print(results_df.round(3))

                   Model  Accuracy  Precision  Recall  F1 Score
0  Decision Tree (Day 1)     0.799      0.797   0.689     0.739
1    Logistic Regression     0.799      0.779   0.716     0.746
2            Naive Bayes     0.793      0.761   0.730     0.745
