# Classification with Titanic Dataset

## Titanic Dataset

In [None]:
# Mount Google Drive at /content/gdrive so that files stored on Drive can be
# accessed through the file system (requires the Google Colab environment).
from google.colab import drive
drive.mount("/content/gdrive")

In [None]:
import os

# Directory that holds the dataset files; replace the placeholder with your own path
root_dir = "PATH/TO/YOUR/DIRECTORY"

# Checking if our specified directory exists
# (being the last expression of the cell, the True/False result is displayed)
os.path.exists(root_dir)

In [None]:
import pandas as pd

# Full path of the training CSV inside the configured root directory
data_path = os.path.join(root_dir, "titanic_train.csv")

# Read the CSV file into a DataFrame
df = pd.read_csv(data_path)

# Preview the first five rows (five is also the default of head())
print(df.head(n=5))

In [None]:
# Print a summary of the DataFrame: column names, non-null counts and dtypes
df.info()

In [None]:
# Display descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

## Exploring Dataset with Pandas

Examine the distribution of the continuous variable grouped by the target class.

In [None]:
import matplotlib.pyplot as plt

target = "Survived"
var = "Age"

# Draw one box per target class to compare how `var` is distributed in each class
df.boxplot(column=var, by=target)
plt.xlabel(target)
plt.ylabel(var)
plt.title(f"{var} Distribution by {target}")
# An empty suptitle suppresses the automatic "Boxplot grouped by Survived" title
plt.suptitle("")
plt.show()

In [None]:
var = "Age"
bins = 30
density = True
alpha = 0.3

# (target value, bar color, legend label) for each of the overlaid histograms
class_styles = [
    (0, "red", "Not Survived"),
    (1, "blue", "Survived"),
]

plt.figure(figsize=(10, 6))
for class_value, color, label in class_styles:
    # Values of `var` for the passengers belonging to this target class
    class_values = df.loc[df[target] == class_value, var]
    class_values.hist(bins=bins, density=density, alpha=alpha, color=color, label=label)

plt.xlabel(var)
plt.ylabel("Probability Density")
plt.title(f"{var} Distribution by {target}")
plt.legend()
plt.show()

Examine the distribution of the categorical variable grouped by the target class.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

var = "Pclass"

# Count passengers per (target class, category) pair, then reshape so that each
# row is a category of `var` and each column is a target class.
var_counts = df.groupby(target)[var].value_counts(normalize=False).unstack().transpose()

# Create the figure once and draw on its axes. Calling plt.figure() and then
# DataFrame.plot() without `ax` makes pandas open a second figure, which leaves
# an empty figure behind in the notebook output.
fig, ax = plt.subplots(figsize=(10, 6))
var_counts.plot(kind="bar", stacked=False, ax=ax, color=["red", "blue"], alpha=0.5)

ax.set_title("{} Distribution by {}".format(var, target))
ax.set_xlabel(var)
ax.set_ylabel("Number of Passengers")
ax.legend(title=target, labels=["No", "Yes"])
plt.show()

## Data Preprocessing

In [None]:
# Display the DataFrame as it is before any preprocessing step
df

In [None]:
# Check column dtypes and non-null counts to see which columns have missing values
df.info()

Eliminate variables that are not utilized as inputs or that contain numerous missing values.

In [None]:
# Columns removed from the DataFrame before modeling
drop_vars = ["Name", "PassengerId", "Ticket", "Cabin"]
df.drop(columns=drop_vars, inplace=True)

# Confirm which columns remain
df.info()

Impute the missing values:
* Continuous variables: mean value
* Categorical variables: most frequent value

In [None]:
# Mean imputation for the continuous variable
age_mean = df["Age"].mean()
df["Age"] = df["Age"].fillna(age_mean)

# Mode imputation for the categorical variable
# (mode() returns a Series; its first entry is used as the fill value)
embarked_mode = df["Embarked"].mode()[0]
df["Embarked"] = df["Embarked"].fillna(embarked_mode)

# Confirm that the non-null counts are now complete
df.info()

Encode the Categorical Variables.

In [None]:
# Display the DataFrame before the categorical columns are encoded
df

In [None]:
# One-hot encode "Embarked" first: if this cell is re-run after the column has
# already been replaced, the KeyError is raised here, before anything is modified.
var = "Embarked"
# dtype=int yields 0/1 indicator columns. Recent pandas versions default to bool,
# and mixing bool with numeric columns turns DataFrame.values into an object array.
one_hot = pd.get_dummies(df[var], prefix=var, dtype=int)
df = pd.concat([df.drop(columns=[var]), one_hot], axis=1)

# Encode the binary "Sex" column as integers. Series.map is used instead of
# Series.replace, whose implicit object-to-int downcast is deprecated in pandas 2.2.
# NOTE: map() turns any value that is missing from the mapping into NaN.
df["Sex"] = df["Sex"].map({"male": 0, "female": 1})

df

Split the data into training and test sets.

In [None]:
from sklearn.model_selection import train_test_split

# Split settings; random_state is fixed so that the split is reproducible
random_state = 100
shuffle = True
test_size_ratio = 0.25

# Hold out `test_size_ratio` of the rows as the test set
train_df, test_df = train_test_split(
    df,
    test_size=test_size_ratio,
    shuffle=shuffle,
    random_state=random_state,
)
print(train_df.shape, test_df.shape)

In [None]:
# Input columns: everything except the target column
train_features = train_df.drop(columns=target)
test_features = test_df.drop(columns=target)

# NumPy arrays consumed by the scikit-learn estimators
X_train = train_features.to_numpy()
y_train = train_df[target].to_numpy()

X_test = test_features.to_numpy()
y_test = test_df[target].to_numpy()

# Human-readable names used later when interpreting the fitted models
target_names = ["Not survived", "Survived"]
feature_names = train_features.columns

## Training & Validation

Models
* Logistic regression ([linear_model.LogisticRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html))
* Decision tree ([tree.DecisionTreeClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html))

Cross Validation
* K-folds cross validator ([model_selection.KFold](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html))
* Evaluate a score by CV ([model_selection.cross_val_score](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import KFold, cross_val_score

### K-Fold Cross Validation for a Single Model

You can change the scoring function by using the `scoring` parameter in `cross_val_score`.
* `accuracy`: Accuracy (**default**)
* `roc_auc`: Area under the receiver operating characteristic (ROC) curve
* `f1`: F1 score
* `precision`: Precision
* `recall`: Recall

In [None]:
# Candidate models: keep exactly one `model = ...` line active.
# Each estimator is seeded with `random_state` (like the train/test split and the
# KFold below) so that the cross-validation scores are reproducible across runs.
# model = LogisticRegression(solver="saga", max_iter=10000, penalty=None, random_state=random_state)
# model = LogisticRegression(max_iter=10000, penalty="l1", C=1.0, solver="saga", random_state=random_state) # with L1 regularization
# model = LogisticRegression(max_iter=10000, penalty="l2", C=1.0, solver="saga", random_state=random_state) # with L2 regularization
model = DecisionTreeClassifier(
    criterion="gini",
    max_depth=5,
    min_samples_split=2,
    min_impurity_decrease=0.0,
    random_state=random_state,
)  # Decision tree

# 5-fold cross-validation on the training set, scored by ROC AUC
kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
scores = cross_val_score(model, X_train, y_train, cv=kf, scoring="roc_auc")

print("Scores from each iteration:", scores)
print("Average score:", scores.mean())

### K-Fold Cross Validation using Grid Search

* Grid search over specified parameter values ([model_selection.GridSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html))

In [None]:
from sklearn.model_selection import GridSearchCV

# Seed the tree so that ties between equally good splits are resolved the same
# way on every run; otherwise the selected parameters can vary between runs.
model = DecisionTreeClassifier(min_samples_split=2, random_state=random_state)

# Define the hyperparameters and their possible values
param_grid = {
    "criterion": ["gini", "entropy"],
    "max_depth": [5, 10, 20, None],
    "min_impurity_decrease": [0.0, 0.05, 0.1],
}

# Evaluate every parameter combination with the same K-fold splitter, scored by ROC AUC
grid_search = GridSearchCV(model, param_grid, cv=kf, scoring="roc_auc")
grid_search.fit(X_train, y_train)

# Print the best parameters and the corresponding score
print("Best parameters: ", grid_search.best_params_)
print("Best CV score: {:.6f}".format(grid_search.best_score_))

## Evaluation

In [None]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# The hyperparameters are fixed here rather than read from a previous search.
# random_state makes the fitted tree, and therefore the reported test metrics,
# reproducible across runs.
model = DecisionTreeClassifier(criterion="entropy",
                               max_depth=5,
                               min_samples_split=2,
                               min_impurity_decrease=0.0,
                               random_state=random_state)

model.fit(X_train, y_train)

# Class probabilities, one column per class
y_prob = model.predict_proba(X_test)
print("Estimated probs:", y_prob[:10])

# Predicted class labels
y_cls = model.predict(X_test)
print("Estimated classes:", y_cls[:10])
print()

* Accuracy ([metrics.accuracy_score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html))
* F1 ([metrics.f1_score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html))
* ROC AUC ([metrics.roc_auc_score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html))

In [None]:
# Accuracy and F1 use the predicted labels; ROC AUC uses the probability of the
# positive class (second column of predict_proba).
positive_prob = y_prob[:, 1]
metric_values = [
    ("Accuracy:", accuracy_score(y_test, y_cls)),
    ("F1:", f1_score(y_test, y_cls)),
    ("ROC AUC:", roc_auc_score(y_test, positive_prob)),
]
for metric_label, metric_value in metric_values:
    print(metric_label, metric_value)

* Confusion Matrix ([metrics.confusion_matrix](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html))

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the actual classes, columns are the predicted classes
class_labels = ["Not-Survived", "Survived"]

conf_matrix = confusion_matrix(y_test, y_cls)
conf_matrix_df = pd.DataFrame(
    conf_matrix,
    index=[f"Actual {class_label}" for class_label in class_labels],
    columns=[f"Predicted {class_label}" for class_label in class_labels],
)
print(conf_matrix_df)

* ROC Curve ([metrics.roc_curve](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html))

In [None]:
from sklearn.metrics import roc_curve

# False/true positive rates computed from the positive-class probabilities
fpr, tpr, _ = roc_curve(y_test, y_prob[:, 1])

# ROC curve of the model
plt.plot(fpr, tpr, color="darkorange", linewidth=2)
# Dashed diagonal from (0, 0) to (1, 1) as a reference line
diagonal = [0, 1]
plt.plot(diagonal, diagonal, color="navy", linewidth=2, linestyle="--")

plt.title("ROC Curve")
plt.xlabel("1 - Specificity (FP Rate)")
plt.ylabel("Sensitivity (TP Rate)")
plt.show()

## Interpretation

### Linear model

In [None]:
# Logistic Regression without regularization.
# "saga" is a stochastic solver, so it is seeded with random_state to make the
# fitted coefficients reproducible across runs.
model = LogisticRegression(solver="saga", max_iter=10000, penalty=None, random_state=random_state)
model.fit(X_train, y_train)

# One row of coefficients labelled by feature name, plus the intercept
coef_df = pd.DataFrame(model.coef_, columns=feature_names)
coef_df["intercept"] = model.intercept_
coef_df

### Decision Tree
* Plot a decision tree ([tree.plot_tree](https://scikit-learn.org/stable/modules/generated/sklearn.tree.plot_tree.html))

In [None]:
from sklearn.tree import plot_tree

# Decision tree: a shallow tree (max_depth=3) keeps the plot readable.
# random_state makes the plotted tree reproducible across runs.
model = DecisionTreeClassifier(
    max_depth=3,
    min_samples_split=2,
    min_impurity_decrease=0.0,
    random_state=random_state,
)
model.fit(X_train, y_train)

plt.figure(figsize=(20, 10))
# feature_names is converted to a plain list because some scikit-learn releases
# only accept a list here and reject other sequences such as a pandas Index.
plot_tree(
    model,
    filled=True,
    impurity=True,
    feature_names=list(feature_names),
    class_names=target_names,
    rounded=True,
)
plt.show()