# Classification with Titanic Dataset

## Titanic Dataset

In [None]:
# Mount Google Drive at /content/gdrive so the dataset stored there can be read.
# NOTE(review): `google.colab` is presumably only importable inside a Colab
# runtime — confirm the intended execution environment.
from google.colab import drive
drive.mount("/content/gdrive")

In [None]:
import os

# Placeholder: replace with the directory that contains the dataset CSV.
root_dir = "PATH/TO/YOUR/DIRECTORY"

# Checking if our specified directory exists (the boolean is shown as the cell output)
os.path.exists(root_dir)

In [None]:
import pandas as pd

# Path to the training CSV inside the configured data directory
data_path = os.path.join(root_dir, "titanic_train.csv")

# Load data
df = pd.read_csv(data_path)

# Preview the first few rows. A bare last expression uses the notebook's rich
# table rendering instead of the plain-text dump produced by print().
df.head()

In [None]:
# Column dtypes and non-null counts for the raw frame.
df.info()

In [None]:
# Summary statistics of the loaded frame.
df.describe()

## Data Preprocessing

Eliminate variables that are not utilized as inputs or that contain numerous missing values.

In [None]:
# Columns that are not used as model inputs or that contain many missing values.
drop_vars = ["Name", "PassengerId", "Ticket", "Cabin"]

# Reassign instead of mutating in place, and ignore columns that are already
# gone, so re-running this cell does not raise KeyError.
df = df.drop(columns=drop_vars, errors="ignore")
df.info()

Impute the missing values:
* Continuous variables: mean value
* Categorical variables: most frequent value

In [None]:
# NOTE(review): both fill values below are computed on the full frame, before the
# train/test split performed later in this notebook, so the held-out rows
# contribute to the imputation statistics. Consider fitting the imputation on the
# training split only.

# Mean imputation for the continuous Age column
df["Age"] = df["Age"].fillna(df["Age"].mean()) # reassignment rather than inplace=True


# Mode imputation for the categorical Embarked column; mode() returns a Series,
# so [0] selects its first entry
df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mode()[0]) # reassignment rather than inplace=True

df.info()

Encode the Categorical Variables.

In [None]:
# Preview the frame before encoding; head() keeps the output short.
df.head()

In [None]:
# Binary-encode Sex (male -> 0, female -> 1). The explicit astype(int) pins an
# integer dtype instead of relying on replace() to silently downcast the object
# column, which recent pandas versions deprecate. It also fails loudly if an
# unmapped value is left in the column.
df["Sex"] = df["Sex"].replace({"male": 0, "female": 1}).astype(int)

# One-hot encode Embarked. dtype=int keeps the indicator columns numeric:
# pandas >= 2.0 defaults to bool, and bool columns mixed with numeric ones make
# DataFrame.values fall back to an object array. The membership guard lets the
# cell be re-run after Embarked has already been replaced by its indicators.
var = "Embarked"
if var in df.columns:
    one_hot = pd.get_dummies(df[var], prefix=var, dtype=int)
    df = pd.concat([df, one_hot], axis=1).drop(columns=[var])

df.head()

### Train-Test Split

* Split data into train and test sets ([model_selection.train_test_split](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html))

In [None]:
from sklearn.model_selection import train_test_split

# Split configuration; random_state is reused by later cells for seeding.
random_state = 100
shuffle = True
test_size_ratio = 0.25

# Hold out test_size_ratio of the rows as the test set.
train_df, test_df = train_test_split(
    df,
    test_size=test_size_ratio,
    random_state=random_state,
    shuffle=shuffle,
)
print(train_df.shape, test_df.shape)

In [None]:
target = "Survived"

# Predictor columns of each split (everything except the label).
train_features = train_df.drop(columns=[target])
test_features = test_df.drop(columns=[target])

# Plain arrays for scikit-learn.
X_train, y_train = train_features.values, train_df[target].values
X_test, y_test = test_features.values, test_df[target].values

# Human-readable labels for classes and features, used when reporting/plotting.
target_names = ["Not survived", "Survived"]
feature_names = train_features.columns

## Training & Validation

Models
* Decision tree ([tree.DecisionTreeClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html))
* Bagging ([ensemble.BaggingClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html))
* Random Forests ([ensemble.RandomForestClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html))
* Stacking ([ensemble.StackingClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.StackingClassifier.html))

Cross Validation
* K-folds cross validator ([model_selection.KFold](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html))
* Evaluate a score by CV ([model_selection.cross_val_score](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html))

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, StackingClassifier

from sklearn.model_selection import KFold, cross_val_score

### K-Fold Cross Validation for a Single Model

You can change the evaluation metric with the `scoring` parameter of `cross_val_score`.
* `accuracy`: Accuracy (**default**)
* `roc_auc`: Area under the receiver operating characteristic (ROC) curve
* `f1`: F1 score
* `precision`: Precision
* `recall`: Recall

In [None]:
# Shared 5-fold shuffled splitter (seeded with random_state) and scoring metric,
# reused by every cross-validation and grid-search cell in this notebook.
kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
scoring = "roc_auc"

In [None]:
# Decision tree
model = DecisionTreeClassifier(criterion="gini",
                               max_depth=5,
                               min_samples_split=2,
                               min_impurity_decrease=0.0)

scores = cross_val_score(model, X_train, y_train, cv=kf, scoring=scoring)
print("Scores from each iteration:", scores)
print("Average score:", scores.mean())

In [None]:
# Bagging
base_model = DecisionTreeClassifier()
model = BaggingClassifier(estimator=base_model,
                          n_estimators=100,
                          bootstrap=True,
                          n_jobs=-1,
                          random_state=random_state)

scores = cross_val_score(model, X_train, y_train, cv=kf, scoring=scoring)
print("Scores from each iteration:", scores)
print("Average score:", scores.mean())

In [None]:
# Random Forests
model = RandomForestClassifier(n_estimators=100,
                               max_depth=None,
                               min_samples_split=2,
                               max_features="sqrt",
                               bootstrap=True,
                               n_jobs=-1,
                               random_state=random_state)

scores = cross_val_score(model, X_train, y_train, cv=kf, scoring=scoring)
print("Scores from each iteration:", scores)
print("Average score:", scores.mean())

In [None]:
from sklearn.linear_model import RidgeClassifier

# Stacking: three decision trees of increasing depth as base learners, combined
# by a ridge classifier. Each tree is seeded so that repeated runs build the
# same trees (unseeded trees can break ties between equally good splits
# differently from run to run). The shared K-fold splitter is passed as the
# stacker's internal cross-validation strategy.
model = StackingClassifier(
    estimators=[
        ('dt1', DecisionTreeClassifier(max_depth=3, random_state=random_state)),
        ('dt2', DecisionTreeClassifier(max_depth=5, random_state=random_state)),
        ('dt3', DecisionTreeClassifier(max_depth=7, random_state=random_state)),
    ],
    final_estimator=RidgeClassifier(),
    cv=kf,
)

model.fit(X_train, y_train)
# model.predict(X_test)

### K-Fold Cross Validation using Grid Search

* Grid search over specified parameter values ([model_selection.GridSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html))

In [None]:
from sklearn.model_selection import GridSearchCV

# Seed the base estimator: an unseeded tree can break ties between equally good
# splits differently on each run, which would make the selected hyperparameters
# and the best CV score non-reproducible.
model = DecisionTreeClassifier(min_samples_split=2, random_state=random_state)

# Define the hyperparameters and their possible values
param_grid = {
    "criterion": ["gini", "entropy"],
    "max_depth": [5, 10, 20, None],
    "min_impurity_decrease": [0.0, 0.05, 0.1],
}

# Every combination is scored with the shared K-fold splitter; refit=True
# retrains the best combination on the whole training set afterwards.
dt = GridSearchCV(model, param_grid, cv=kf, scoring=scoring, refit=True)
dt.fit(X_train, y_train)
# dt.predict(X_test)
# dt.best_estimator_.predict(X_test)

# Print the best parameters and the corresponding score
print("Best parameters: ", dt.best_params_)
print("Best CV score: {:.6f}".format(dt.best_score_))

In [None]:
# Bagging ensemble whose size is tuned by the grid search below.
base_model = DecisionTreeClassifier()
model = BaggingClassifier(
    estimator=base_model,
    bootstrap=True,
    n_jobs=-1,
    random_state=random_state,
)

# Candidate ensemble sizes
param_grid = {"n_estimators": [50, 100, 200, 300]}

# Score every candidate with the shared splitter, then refit the best one on
# the whole training set.
bagg = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=kf,
    scoring=scoring,
    refit=True,
)
bagg.fit(X_train, y_train)
# bagg.predict(X_test)
# bagg.best_estimator_.predict(X_test)

# Report the winning setting and its mean cross-validated score
print("Best parameters: ", bagg.best_params_)
print("Best CV score: {:.6f}".format(bagg.best_score_))

In [None]:
# Random forest whose size and per-split feature budget are tuned below.
model = RandomForestClassifier(
    max_depth=None,
    min_samples_split=2,
    bootstrap=True,
    n_jobs=-1,
    random_state=random_state,
)

# Candidate values for each tuned hyperparameter
param_grid = {
    "n_estimators": [50, 100, 200, 300],
    "max_features": [0.5, "sqrt", "log2", None],
}

# Score every combination with the shared splitter, then refit the best one on
# the whole training set.
rf = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=kf,
    scoring=scoring,
    refit=True,
)
rf.fit(X_train, y_train)
# rf.predict(X_test)
# rf.best_estimator_.predict(X_test)

# Report the winning setting and its mean cross-validated score
print("Best parameters: ", rf.best_params_)
print("Best CV score: {:.6f}".format(rf.best_score_))

## Evaluation

In [None]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

In [None]:
# Test-set predictions from the tuned bagging search object.
# NOTE: y_prob and y_cls are reassigned by the random-forest prediction cell
# further down, so the metrics computed later in this notebook do not describe
# this bagging model.
y_prob = bagg.predict_proba(X_test)
print("Estimated probs:", y_prob[:10])  # first 10 rows only

y_cls = bagg.predict(X_test)
print("Estimated classes:", y_cls[:10])  # first 10 labels only
print()

In [None]:
import numpy as np

# Stack the predict_proba output of every fitted base estimator of the best
# bagging model along a new leading axis and show the resulting array shape.
np.array([estimator.predict_proba(X_test) for estimator in bagg.best_estimator_.estimators_]).shape

In [None]:
# Test-set predictions from the tuned random-forest search object. These
# assignments replace the y_prob / y_cls values produced for the bagging model.
y_prob = rf.predict_proba(X_test)
print("Estimated probs:", y_prob[:10])  # first 10 rows only

y_cls = rf.predict(X_test)
print("Estimated classes:", y_cls[:10])  # first 10 labels only
print()

Estimate classes with a custom threshold

In [None]:
# Decision threshold applied to column 1 of y_prob; rows at or above it are
# labelled 1, all others 0. Change the value to trade precision against recall.
threshold = 0.5
y_cls = (y_prob[:, 1] >= threshold).astype(int)
print("Estimated classes:", y_cls[:10])

* Accuracy ([metrics.accuracy_score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html))
* F1 ([metrics.f1_score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html))
* ROC AUC ([metrics.roc_auc_score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html))

In [None]:
# Accuracy and F1 are computed from the thresholded labels (y_cls); ROC AUC is
# computed from the scores in column 1 of y_prob instead of hard labels.
print("Accuracy:", accuracy_score(y_test, y_cls))
print("F1:", f1_score(y_test, y_cls))
print("ROC AUC:", roc_auc_score(y_test, y_prob[:, 1]))

* Confusion Matrix ([metrics.confusion_matrix](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html))

In [None]:
from sklearn.metrics import confusion_matrix

# Axis labels for the table: rows are the actual outcome, columns the prediction.
row_labels = ["Actual Not-Survived", "Actual Survived"]
col_labels = ["Predicted Not-Survived", "Predicted Survived"]

# Count test samples per (actual, predicted) pair and wrap the counts in a
# labelled frame for readable output.
conf_matrix = confusion_matrix(y_test, y_cls)
conf_matrix_df = pd.DataFrame(conf_matrix, index=row_labels, columns=col_labels)
print(conf_matrix_df)

* ROC Curve ([metrics.roc_curve](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html))

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve

# False/true positive rates over all thresholds of the column-1 scores.
fpr, tpr, _ = roc_curve(y_test, y_prob[:, 1])

fig, ax = plt.subplots()
# Model curve, plus the diagonal as the chance-level reference.
ax.plot(fpr, tpr, color="darkorange", lw=2)
ax.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
ax.set_xlabel("1 - Specificity (FP Rate)")
ax.set_ylabel("Sensitivity (TP Rate)")
ax.set_title("ROC Curve")
plt.show()

## Interpretation

### Random Forest

In [None]:
# Raw feature importances of the refit best random forest, one value per input column.
rf.best_estimator_.feature_importances_

In [None]:
import numpy as np

# Importances of the refit best random forest
importances = rf.best_estimator_.feature_importances_

# Ascending order, so the most important feature ends up as the top bar
indices = np.argsort(importances)

# Feature names arranged in the same order as the sorted importances
names = [feature_names[i] for i in indices]

# One horizontal bar per input feature
bar_positions = range(X_train.shape[1])

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(bar_positions, importances[indices])
ax.set_yticks(bar_positions)
ax.set_yticklabels(names)
ax.set_ylabel("Feature")
ax.set_xlabel("Importance")
ax.set_title("Feature Importance in Random Forest Model")
fig.tight_layout()
plt.show()