In [1]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier


In [3]:
import pandas as pd

# Read the phase-3 Titanic CSV into a DataFrame, then show its column
# names as the cell output.
DATASET_CSV = "data/phase_3_titanic_dataset.csv"
df = pd.read_csv(DATASET_CSV)
df.columns


Index(['survived', 'pclass', 'sibsp', 'parch', 'alone', 'age_norm',
       'fare_norm', 'sex_encoded', 'embarked_Q', 'embarked_S', 'class_Second',
       'class_Third'],
      dtype='object')

In [3]:
from sklearn.model_selection import train_test_split

# Target (y) is the 'survived' label; features (X) are every other column.
y = df['survived']
X = df.drop(columns=['survived'])

# Hold out 20% of the rows for testing. The split is seeded so it is
# reproducible, and stratified on y so both parts keep the class balance.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


### Set Up Parameter Grid
We’ll test different values for `n_neighbors`, `weights`, and `p`:

In [4]:
# Candidate hyperparameter values for the KNN search; every combination
# of the three lists is a candidate.
param_grid = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=['uniform', 'distance'],
    p=[1, 2],  # 1 = Manhattan distance, 2 = Euclidean
)


### Run Grid Search

In [5]:
# Score every combination in param_grid with 5-fold cross-validation,
# ranking candidates by accuracy. The fit call is left as the last
# expression so the fitted search object is displayed.
knn = KNeighborsClassifier()
grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)


### Best Parameters & Accuracy

In [6]:
# Report the winning KNN hyperparameters and their cross-validated accuracy.
knn_summary = (
    ("Best parameters:", grid_search.best_params_),
    ("Best cross-validated accuracy:", grid_search.best_score_),
)
for label, value in knn_summary:
    print(label, value)


Best parameters: {'n_neighbors': 9, 'p': 1, 'weights': 'uniform'}
Best cross-validated accuracy: 0.756829765545362


### Evaluate on Test Set

In [7]:
from sklearn.metrics import classification_report, accuracy_score

# Take the best KNN found by the search and predict the held-out test rows.
best_knn = grid_search.best_estimator_
y_pred = best_knn.predict(X_test)

# Overall accuracy first, then the per-class precision / recall / F1 table.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on test set:", test_accuracy)
print(classification_report(y_test, y_pred))


Accuracy on test set: 0.7720588235294118
              precision    recall  f1-score   support

           0       0.77      0.92      0.84        86
           1       0.79      0.52      0.63        50

    accuracy                           0.77       136
   macro avg       0.78      0.72      0.73       136
weighted avg       0.77      0.77      0.76       136



### Random Forest Tuning

✅ Step 1: Define Parameter Grid


In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Random Forest search space. Note: this rebinds param_grid, replacing
# the KNN grid defined earlier in the notebook.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    class_weight=[None, 'balanced'],
)


✅ Step 2: Set Up and Run Grid Search

In [9]:
# Tune the forest: 5-fold CV over param_grid, scored by accuracy, with
# n_jobs=-1 to run candidates in parallel. The base estimator is seeded
# so repeated runs give the same result.
search_settings = dict(cv=5, scoring='accuracy', n_jobs=-1)
rf = RandomForestClassifier(random_state=42)
grid_search_rf = GridSearchCV(rf, param_grid, **search_settings)

grid_search_rf.fit(X_train, y_train)


✅ Step 3: Print Results and Evaluate

In [11]:
# Show the selected Random Forest hyperparameters and their CV accuracy.
best_rf_params = grid_search_rf.best_params_
best_rf_cv_accuracy = grid_search_rf.best_score_
print("Best parameters:", best_rf_params)
print("Best cross-validated accuracy:", best_rf_cv_accuracy)

# Predict on test data through the fitted search object.
y_pred_rf = grid_search_rf.predict(X_test)

Best parameters: {'class_weight': None, 'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Best cross-validated accuracy: 0.8046890927624872


In [12]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the tuned Random Forest on the test split.
rf_report = classification_report(y_test, y_pred_rf)
print(rf_report)


              precision    recall  f1-score   support

           0       0.77      0.97      0.86        86
           1       0.89      0.50      0.64        50

    accuracy                           0.79       136
   macro avg       0.83      0.73      0.75       136
weighted avg       0.81      0.79      0.78       136



- Interpretation

| Metric             | Meaning                                                             |
| ------------------ | ------------------------------------------------------------------- |
| **Precision 0.89** | 89% of predicted survivors were correct. Very few false positives.  |
| **Recall 0.50**    | Only 50% of actual survivors were identified. Many false negatives. |
| **Accuracy 0.79**  | Higher than all previous models. Overall good performance.          |


✅ What Next? Improve Recall for Class 1 

We'll now:

1. Adjust decision threshold (⬆ recall)
2. Try `class_weight='balanced'` again (even if it wasn’t best for accuracy)
3. Plot precision-recall curve (to find a better threshold)

> Step A1: Predict Probabilities and Adjust Threshold

In [13]:
from sklearn.metrics import classification_report

# Keep only column 1 of predict_proba: the estimated probability of class 1.
class_probabilities = grid_search_rf.predict_proba(X_test)
y_proba = class_probabilities[:, 1]

# Predict class 1 if probability > 0.4
is_predicted_survivor = y_proba > 0.4
y_pred_adjusted = is_predicted_survivor.astype(int)

print(classification_report(y_test, y_pred_adjusted))


              precision    recall  f1-score   support

           0       0.81      0.93      0.86        86
           1       0.84      0.62      0.71        50

    accuracy                           0.82       136
   macro avg       0.82      0.78      0.79       136
weighted avg       0.82      0.82      0.81       136



Lowering the threshold to 0.4 raised recall for survivors from 0.50 to 0.62 by allowing more class 1 predictions, and accuracy rose from 0.79 to 0.82.

> Step A2: Try class_weight='balanced' Again

Even though `class_weight=None` gave the best cross-validated accuracy in the grid search, let’s see if `class_weight='balanced'` helps recall.

In [14]:
# Start from the hyperparameters the grid search selected and override only
# class_weight. Taking them from best_params_ (instead of copying the printed
# values by hand) keeps this cell in sync if the grid or the data changes.
balanced_params = {**grid_search_rf.best_params_, 'class_weight': 'balanced'}

# Same seed as the tuned forest so the comparison differs only in class_weight.
rf_balanced = RandomForestClassifier(random_state=42, **balanced_params)

rf_balanced.fit(X_train, y_train)
y_pred_bal = rf_balanced.predict(X_test)

print(classification_report(y_test, y_pred_bal))


              precision    recall  f1-score   support

           0       0.84      0.94      0.89        86
           1       0.88      0.70      0.78        50

    accuracy                           0.85       136
   macro avg       0.86      0.82      0.83       136
weighted avg       0.86      0.85      0.85       136



### Interpretation

> A1: Lowering the Decision Threshold to 0.4

| Metric                  | Value |
| ----------------------- | ----- |
| **Precision (class 1)** | 0.84  |
| **Recall (class 1)**    | 0.62  |
| **F1-score (class 1)**  | 0.71  |
| **Accuracy**            | 0.82  |

Improvement: Balanced trade-off — better recall while maintaining good precision.

>  A2: Using class_weight='balanced'

| Metric                  | Value |
| ----------------------- | ----- |
| **Precision (class 1)** | 0.88  |
| **Recall (class 1)**    | 0.70  |
| **F1-score (class 1)**  | 0.78  |
| **Accuracy**            | 0.85  |

Even better: Recall reached 0.70 with strong precision. This is your best model yet for detecting survivors!

### Recommendation
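Use the `class_weight='balanced'` Random Forest for now — on this test set it gives:

- Highest accuracy: 85%
- Best recall for survivors (0.70 at the default threshold)
- Solid F1-score and balanced performance

Note: the candidate models were compared on the same test set, so these scores are likely optimistic. Confirm the choice with cross-validation or a separate validation split before relying on it.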

# Export the Model

In [17]:
import os

import joblib

# Create the output directory if it is missing; otherwise the dump fails
# with FileNotFoundError on a fresh checkout or a clean re-run.
os.makedirs('models', exist_ok=True)

# Save the trained model. The call is the last expression, so the list of
# written file names it returns is shown as the cell output.
joblib.dump(rf_balanced, 'models/random_forest_model.pkl')


['models/random_forest_model.pkl']

> Import the Model Later

In [19]:
import joblib

# Restore the trained model from the file written by the export step.
# NOTE(review): joblib files are pickle-based, so only load files from a
# trusted source.
MODEL_FILE = 'models/random_forest_model.pkl'
loaded_model = joblib.load(MODEL_FILE)


> Test the Loaded Model

In [22]:
# Example: X_test is your test feature set
y_pred = loaded_model.predict(X_test)

# Evaluate performance
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.84      0.94      0.89        86
           1       0.88      0.70      0.78        50

    accuracy                           0.85       136
   macro avg       0.86      0.82      0.83       136
weighted avg       0.86      0.85      0.85       136



> Use predict_proba with Threshold 0.4

In [23]:
# Estimated probability of class 1 from the reloaded model.
y_proba = loaded_model.predict_proba(X_test)[:, 1]

# Predict class 1 if probability > 0.4. A strict ">" is used so this cell
# applies the same rule as the earlier threshold experiment on the tuned
# forest; a probability exactly equal to the threshold stays class 0.
y_pred_thresh = (y_proba > 0.4).astype(int)

print(classification_report(y_test, y_pred_thresh))


              precision    recall  f1-score   support

           0       0.89      0.81      0.85        86
           1       0.72      0.82      0.77        50

    accuracy                           0.82       136
   macro avg       0.80      0.82      0.81       136
weighted avg       0.82      0.82      0.82       136

