In [6]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split

# Baseline Titanic model: load train.csv, engineer four features, fit a
# logistic regression on an 80/20 split, then print accuracy followed by F1.
train_df = pd.read_csv("train.csv")

## Step 1: Clean and encode Sex
# Normalize the labels (lower-case, no surrounding whitespace) before mapping.
# Series.map turns any label that is not a key of the dict into NaN; the
# assertion in Step 4 reports that instead of letting it pass unnoticed.
train_df["Sex"] = train_df["Sex"].str.lower().str.strip()
train_df["Sex"] = train_df["Sex"].map({"male": 0, "female": 1})

## Step 2: Fill missing Age values with the median age
train_df["Age"] = train_df["Age"].fillna(train_df["Age"].median())

## Step 3: Create FamilySize
# SibSp + Parch + 1; the +1 presumably counts the passenger themself.
train_df["FamilySize"] = train_df["SibSp"] + train_df["Parch"] + 1

## Step 4: Select final features for model
features = ["Pclass", "Sex", "Age", "FamilySize"]
X = train_df[features]
y = train_df["Survived"]

# A bare `X.isna().sum()` in the middle of a cell is never displayed, so the
# check is made explicit: stop here, with per-column counts, if any feature
# still contains missing values.
missing_per_feature = X.isna().sum()
assert not missing_per_feature.any(), (
    f"Missing values remain in the model features:\n{missing_per_feature}"
)

## Step 5: Split data (Train / Test)
# 20% of the rows are held out; random_state=42 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Step 6: Train Logistic Regression
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

## Step 7: Make predictions and score them on the held-out rows
y_pred = model.predict(X_test)
f1_lr = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

## Step 8: (Optional but useful) Confusion Matrix
# Bound to a name so it can be inspected from a later cell; it is not printed
# here, which keeps this cell's output to the two metric lines below.
lr_confusion = confusion_matrix(y_test, y_pred)

print(accuracy)
print(f1_lr)

0.8044692737430168
0.7552447552447552


### Topic 1: Try Decision Tree & Random Forest

# Day 82: Improve Titanic Model

### Step 2: Decision Tree model

In [7]:
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Fit a single decision tree (seeded with 42) on the training split.
# fit() returns the estimator itself, so construction and training chain.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Evaluate on the held-out split. Note that this rebinds y_pred, which
# previously held the logistic-regression predictions.
y_pred = dt_model.predict(X_test)
dt_acc = accuracy_score(y_test, y_pred)
f1_dt = f1_score(y_test, y_pred)

# Accuracy first, then F1, one value per line.
print(dt_acc, f1_dt, sep="\n")
# Accuracy recorded for this cell: 0.7821229050279329

0.7821229050279329
0.7272727272727273


### Step 3: Random Forest model

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees (seeded with 42) trained on the same split.
# fit() returns the estimator itself, so construction and training chain.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100).fit(
    X_train, y_train
)

# Evaluate on the held-out split under names specific to this model.
rf_pred = rf_model.predict(X_test)
rf_acc = accuracy_score(y_test, rf_pred)
f1_rf = f1_score(y_test, rf_pred)

# Accuracy first, then F1, one value per line.
print(rf_acc, f1_rf, sep="\n")
# Accuracy recorded for this cell: 0.8156424581005587

0.8156424581005587
0.7692307692307693


## Topic 2 : Use GridSearchCV for Hyperparameter Tuning

### Step 1: Import GridSearchCV

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

### Step 2: Define parameter grid (VERY IMPORTANT)
# 3 x 3 x 3 = 27 candidate settings; with cv=5 below that is 135 fits.
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 5, 10],
    "min_samples_split": [2, 5, 10]
}

### Step 3: Create GridSearch object
rf = RandomForestClassifier(random_state=42)

grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,              # 5-fold cross validation
    scoring="accuracy",
    n_jobs=-1          # use all CPU cores
)

### Step 4: Fit GridSearch (this may take time)
grid.fit(X_train, y_train)

### Step 5: See BEST parameters
# Bound to a name: a bare expression in the middle of a cell is not displayed,
# and the final model below is built from this dict.
best_params = grid.best_params_

### Step 6: Best model & accuracy
best_model = grid.best_estimator_
best_model_acc = best_model.score(X_test, y_test)

### Step 7: Train the final model with the parameters the search selected
# Built from best_params rather than retyped numbers, so the final model
# cannot drift from the search result when the grid or the data changes.
best_rf = RandomForestClassifier(random_state=42, **best_params)

best_rf.fit(X_train, y_train)
best_rf.score(X_test, y_test)


0.8156424581005587

In [47]:
### Logistic regression = 0.8044692737430168             f1 score :  0.7552447552447552
### Decision Tree Classifier =  0.7821229050279329       f1 score :  0.7272727272727273
### Random Forest model = 0.8156424581005587             f1 score :  0.7692307692307693

## Generate Submission CSV (PassengerId, Survived)

In [16]:
import pandas as pd

# Build the submission file: read test.csv, apply the same feature
# engineering as the training cell, predict with best_rf, and write
# PassengerId/Survived to submission.csv.
# Relies on train_df, features and best_rf defined in earlier cells.
test_df = pd.read_csv("test.csv")

## Step 2: Apply SAME feature engineering
# Clean Sex, then encode it; labels missing from the dict become NaN.
test_df["Sex"] = test_df["Sex"].astype(str).str.lower().str.strip()
test_df["Sex"] = test_df["Sex"].map({"male": 0, "female": 1})
# Fill Age with the median taken from the training frame, not the test frame.
test_df["Age"] = test_df["Age"].fillna(train_df["Age"].median())
# Create FamilySize
test_df["FamilySize"] = test_df["SibSp"] + test_df["Parch"] + 1

# Step 3: Select features (same as training)
# Reuse the training feature list so the columns and their order cannot
# drift from what the model was fitted on.
X_test_final = test_df[features]

# Stop with per-column counts if any feature is still missing, e.g. a Sex
# label that the mapping above did not recognise.
missing_per_feature = X_test_final.isna().sum()
assert not missing_per_feature.any(), (
    f"Missing values remain in the test features:\n{missing_per_feature}"
)

# Step 4: Predict using BEST model (Random Forest)
test_predictions = best_rf.predict(X_test_final)

# Step 5: Create submission file
submission = pd.DataFrame({
    "PassengerId": test_df["PassengerId"],
    "Survived": test_predictions
})

# Step 6: Save CSV
submission.to_csv("submission.csv", index=False)
