In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# I. Load dataset

In [None]:
# Read the training CSV; rows are indexed by "_id" and "date" is parsed as datetime on load.
train_csv_path = "../input/traffic-flow-data-in-ho-chi-minh-city-viet-nam/train.csv"
df = pd.read_csv(train_csv_path, index_col="_id", parse_dates=["date"])

# Quick sanity check: first rows followed by the (rows, columns) shape.
print(df.head(), df.shape, sep="\n")

In [None]:
# Keep only the columns relevant to this analysis.
cols = ["segment_id", "street_id", "street_name", "date", "weekday", 
        "length", "max_velocity", "street_level", "street_type", 
        "long_snode", "lat_snode", "long_enode", "lat_enode", "period", "LOS"]
# Take an explicit copy: later cells add new columns to `df`, and assigning into a
# column-subset of another frame triggers pandas' SettingWithCopyWarning.
df = df[cols].copy()

In [None]:
# Print the first rows of `df` as it stands after the column selection in the previous cell.
print(df.head())

# II. Feature enrichment and Feature selection

In [None]:
import datetime

# Rush-hour half-hour slots: 6h-8h in the morning and 16h-19h in the evening.
# Each entry is "period_<hour>_<minute>"; the list is compared against df["period"] below.
peaks = [
    f"period_{hour}_{minute}"
    for hour in (6, 7, 16, 17, 18)
    for minute in ("00", "30")
]

def is_special(date):
    """Tell whether `date` falls on one of the hard-coded special days.

    Only `date.day` and `date.month` are read, so the year is ignored and any
    object exposing those two attributes works.

    Returns:
        bool: True if (day, month) matches an entry in the table, else False.
    """
    # Fixed-date special days, stored as (day, month) pairs.
    special_days = {
        (1, 1), (14, 2), (8, 3), (30, 4),
        (1, 5), (1, 6), (2, 9), (20, 10),
        (20, 11), (24, 12), (25, 12),
    }
    return (date.day, date.month) in special_days

In [None]:
# Derive three 0/1 indicator columns:
#   is_weekend  - weekday value is 5 or 6
#   is_peak     - period is one of the rush-hour slots listed in `peaks`
#   special_day - date matches an entry of is_special()'s (day, month) table
df["is_weekend"] = df["weekday"].isin([5, 6]).astype(int)
df["is_peak"] = df["period"].isin(peaks).astype(int)
df["special_day"] = df["date"].apply(is_special).astype(int)
print(df.head())

In [None]:
# Model input columns, in the order they are fed to every estimator below.
features = [
    # identifiers
    "segment_id",
    "street_id",
    # calendar
    "weekday",
    # static road attributes
    "length",
    "street_level",
    "street_type",
    # start-node coordinates
    "long_snode",
    "lat_snode",
    # time slot
    "period",
    # engineered indicators
    "is_weekend",
    "is_peak",
    "special_day",
]

# III. Train models

## Extend ROC curves for multiclass classification by computing macro-average ROC curve & ROC area

In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from itertools import cycle

def classification_report_df(y_true, y_pred):
    """Compute one-vs-rest ROC curves and AUCs per class, plus a micro-average.

    The class set is taken from the distinct values of `y_true`. Both label
    vectors are binarized against it, so `y_pred` is treated as hard labels
    (each per-class "score" is 0 or 1).

    Args:
        y_true: array-like of ground-truth labels.
        y_pred: array-like of predicted labels, same length as `y_true`.

    Returns:
        (fpr, tpr, roc_auc): three dicts keyed by each class label and by
        "micro". `fpr`/`tpr` hold the arrays from `roc_curve`, `roc_auc` the
        corresponding area.
    """
    classes = np.unique(y_true)
    true = label_binarize(y_true, classes=classes)
    pred = label_binarize(y_pred, classes=classes)

    if len(classes) == 2:
        # label_binarize emits a single column (indicator of classes[1]) for a
        # two-class problem; expand to one column per class so the per-class
        # indexing below does not run out of bounds.
        true = np.hstack((1 - true, true))
        pred = np.hstack((1 - pred, pred))

    fpr, tpr, roc_auc = dict(), dict(), dict()
    for i, c in enumerate(classes):
        fpr[c], tpr[c], _ = roc_curve(true[:, i], pred[:, i])
        roc_auc[c] = auc(fpr[c], tpr[c])

    # Micro-average: pool every (sample, class) decision into one binary problem.
    fpr["micro"], tpr["micro"], _ = roc_curve(true.ravel(), pred.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    return fpr, tpr, roc_auc

def plot_multiclass_roc(y_true, y_pred, plot_title):
    """Plot per-class, micro-average and macro-average ROC curves in one figure.

    Args:
        y_true: array-like of ground-truth labels.
        y_pred: array-like of predicted labels.
        plot_title: title of the figure.

    The macro-average is the unweighted mean of the per-class curves,
    interpolated onto the union of their false-positive-rate grid points.
    """
    fpr, tpr, roc_auc = classification_report_df(y_true, y_pred)
    # Snapshot the real class labels as a list. `fpr.keys()` is a live view that
    # also holds "micro" (and "macro" once added below); using it directly would
    # skew the macro average and draw the two averages again as "classes".
    classes = [c for c in fpr if c != "micro"]

    all_fpr = np.unique(np.concatenate([fpr[c] for c in classes]))
    mean_tpr = np.zeros_like(all_fpr)
    for c in classes:
        mean_tpr += np.interp(all_fpr, fpr[c], tpr[c])
    mean_tpr /= len(classes)

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    plt.figure(figsize=(10, 8))
    plt.plot(fpr["micro"], tpr["micro"],
             label=f'micro-average ROC curve (area = {roc_auc["micro"]:0.2f})',
             color='deeppink', linestyle=':', linewidth=4)
    plt.plot(fpr["macro"], tpr["macro"],
             label=f'macro-average ROC curve (area = {roc_auc["macro"]:0.2f})',
             color='navy', linestyle=':', linewidth=4)

    # The three colors repeat when there are more than three classes.
    colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
    lw = 2
    for c, color in zip(classes, colors):
        plt.plot(fpr[c], tpr[c], color=color, lw=lw,
                 label=f'ROC curve of class {c} (area = {roc_auc[c]:0.2f})')
    # Diagonal reference line (TPR == FPR).
    plt.plot([0, 1], [0, 1], 'k--', lw=lw)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title(plot_title)
    plt.legend(loc="lower right")
    plt.show()

## Plot Feature Importances

In [None]:
def plot_feature_importances(features, feature_importances):
    """Draw a horizontal bar chart of feature importances, sorted ascending.

    Args:
        features: sequence of feature names, positionally aligned with
            `feature_importances`.
        feature_importances: array of importance values; it is indexed with an
            index array, so it must support NumPy-style fancy indexing.
    """
    order = np.argsort(feature_importances)
    bar_positions = range(len(order))
    ordered_names = [features[idx] for idx in order]

    plt.title("Feature Importances")
    plt.barh(bar_positions, feature_importances[order], color='b', align='center')
    plt.yticks(bar_positions, ordered_names)
    plt.xlabel("Relative Importance")
    plt.show()

## Helper functions

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, plot_confusion_matrix
from sklearn.dummy import DummyClassifier

def preprocess_dataset(X, y, preprocessor=None, resampler=None, test_size=0.2):
    """Split into train/validation sets, then optionally transform and resample.

    Args:
        X: feature table.
        y: target labels aligned with `X`.
        preprocessor: optional transformer; fitted on the training part only
            and then applied to the validation part.
        resampler: optional object with `fit_resample`; applied to the
            training part only, after preprocessing.
        test_size: share of the data held out for validation.

    Returns:
        (X_train, X_val, y_train, y_val)
    """
    # Fixed seed so every model in the notebook sees the same split.
    X_fit, X_hold, y_fit, y_hold = train_test_split(
        X, y, test_size=test_size, random_state=0
    )

    if preprocessor is not None:
        X_fit = preprocessor.fit_transform(X_fit)
        X_hold = preprocessor.transform(X_hold)

    if resampler is not None:
        X_fit, y_fit = resampler.fit_resample(X_fit, y_fit)

    return X_fit, X_hold, y_fit, y_hold

def train_and_validate(X_train, X_val, y_train, y_val, model, plot_title):
    """Fit `model` on the training split and report on the validation split.

    Prints a classification report, draws the confusion matrix and plots the
    multi-class ROC curves titled `plot_title`. `model` is fitted in place.

    NOTE(review): `plot_confusion_matrix` was deprecated in scikit-learn 1.0
    and removed in 1.2 - confirm the pinned scikit-learn version.
    """
    model.fit(X_train, y_train)
    val_predictions = model.predict(X_val)

    print(classification_report(y_val, val_predictions))
    plot_confusion_matrix(model, X_val, y_val)
    plot_multiclass_roc(y_val, val_predictions, plot_title)

def train_model(X, y, preprocessor=None, resampler=None, 
                model=DummyClassifier(strategy="most_frequent", random_state=0),
                plot_title="Extension ROC to multi-class", test_size=0.2):
    """Run the full experiment for one model: split, preprocess, fit, report.

    Args:
        X, y: features and target labels.
        preprocessor: optional transformer passed to `preprocess_dataset`.
        resampler: optional resampler passed to `preprocess_dataset`.
        model: estimator to fit; defaults to a most-frequent-class baseline.
            The default instance is created once at definition time and is
            shared by every call that omits `model`.
        plot_title: title of the ROC figure.
        test_size: share of the data held out for validation.
    """
    X_train, X_val, y_train, y_val = preprocess_dataset(
        X, y, preprocessor, resampler, test_size
    )
    train_and_validate(X_train, X_val, y_train, y_val, model, plot_title)

## Column preprocessor

In [None]:
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer

# Route columns by dtype: object-typed columns are treated as categorical,
# every other column as numerical.
cat_features = make_column_selector(dtype_include=object)
num_features = make_column_selector(dtype_exclude=object)

# Numerical columns are standardized. Categorical columns are one-hot encoded,
# and categories unseen during fit are ignored at transform time.
num_pipeline = Pipeline(steps=[
    ('numerical_scaler', StandardScaler()),
])
cat_pipeline = Pipeline(steps=[
    ('categorical_encoder', OneHotEncoder(handle_unknown="ignore")),
])

preprocessor = make_column_transformer(
    (num_pipeline, num_features),
    (cat_pipeline, cat_features),
)

# One-Against-All SVM

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

# One binary SVC per class, wrapped in an explicit one-vs-rest scheme.
oaa_svm = OneVsRestClassifier(estimator=SVC(random_state=0))
train_model(
    X=df[features],
    y=df["LOS"],
    preprocessor=preprocessor,
    model=oaa_svm,
    plot_title="One-Against-All Support Vector Classifier",
)

# K-Nearest neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-NN with library defaults on the scaled / one-hot-encoded features.
knn = KNeighborsClassifier()
train_model(
    X=df[features],
    y=df["LOS"],
    preprocessor=preprocessor,
    model=knn,
    plot_title="K-Nearest neighbors",
)

# Multi-Layer Perceptron classifier

In [None]:
from sklearn.neural_network import MLPClassifier

# Default MLP architecture with a raised iteration cap (max_iter=1000) and a fixed seed.
mlp = MLPClassifier(max_iter=1000, random_state=1)
train_model(
    X=df[features],
    y=df["LOS"],
    preprocessor=preprocessor,
    model=mlp,
    plot_title="Multi-Layer Perceptron",
)

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Plain decision tree, no resampling; the fixed seed makes the tree reproducible.
model = DecisionTreeClassifier(random_state=0)
train_model(
    X=df[features],
    y=df["LOS"],
    preprocessor=preprocessor,
    model=model,
    plot_title="Decision Tree",
)

# Decision Tree with SMOTE: Over-sampling

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier

# Decision tree trained on a SMOTE-oversampled training split.
# SMOTE is seeded like the split and the tree, so re-running the cell reproduces
# the same synthetic samples and therefore the same scores.
model = DecisionTreeClassifier(random_state=0)
train_model(df[features], df["LOS"], preprocessor, SMOTE(random_state=0), model, 
            "Decision Tree with SMOTE: Over-sampling")

# Decision Tree with Random Under-sampling

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.tree import DecisionTreeClassifier

# Randomly drop class "A" rows until 8000 remain; other classes are left untouched.
# Seeded so the retained subset, and with it the reported metrics, is reproducible.
resampler = RandomUnderSampler(sampling_strategy={"A":8000}, random_state=0)
model = DecisionTreeClassifier(random_state=0)

train_model(df[features], df["LOS"], preprocessor, resampler, model,
            "Decision Tree with Random Under-sampling")

# Hybrid: Oversampling & Undersampling

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

# Hybrid resampling: first SMOTE classes B-F up to 5000 samples each, then
# randomly under-sample class "A" down to 8000. Both steps are seeded so the
# resampled training set is identical on every run.
resampler = ImbPipeline(steps=[
    ('o', SMOTE(sampling_strategy={"B":5000, "C":5000, "D":5000, "E":5000, "F":5000},
                random_state=0)),
    ('u', RandomUnderSampler(sampling_strategy={"A":8000}, random_state=0)),
])
model = DecisionTreeClassifier(random_state=0)

train_model(df[features], df["LOS"], preprocessor, resampler, model,
            "Decision Tree with Oversampling & Undersampling")

# Random Forest with SMOTE

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

# Random forest on a SMOTE-oversampled training split.
# Both the forest and SMOTE are stochastic; seed them so the cell is reproducible.
forest = RandomForestClassifier(n_estimators=100, random_state=0)
train_model(df[features], df["LOS"], preprocessor, SMOTE(random_state=0),
            forest, "Random Forest with SMOTE")

# Cost-sensitive random forest classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Cost-sensitive forest: class_weight="balanced" re-weights classes instead of
# resampling. Seeded so repeated runs give the same forest.
forest = RandomForestClassifier(n_estimators=50, class_weight="balanced", random_state=0)

train_model(df[features], df["LOS"], preprocessor, model=forest,
            plot_title="Cost-sensitive Random Forest")

# CatBoost

In [None]:
from sklearn.model_selection import train_test_split
import catboost
import ipywidgets

def train_catboost(X, y, cat_features, model, test_size=0.2, plot_title="CatBoost"):
    """Fit a CatBoost model on a train split of (X, y) and report on the rest.

    Args:
        X: feature table; categorical columns are passed raw (no encoding).
        y: target labels aligned with `X`.
        cat_features: names of the columns CatBoost should treat as categorical.
        model: CatBoost estimator; fitted in place.
        test_size: share of the data held out for validation.
        plot_title: title of the ROC figure.

    Prints a classification report, draws the confusion matrix and plots the
    multi-class ROC curves for the validation split.
    """
    # Split the data that was actually passed in rather than the notebook
    # globals, so the function honours its own arguments.
    X_train, X_val, y_train, y_val = train_test_split(X, y,
                                                      test_size=test_size, random_state=1)
    train_pool = catboost.Pool(data=X_train, label=y_train, cat_features=cat_features)
    val_pool = catboost.Pool(data=X_val, label=y_val, cat_features=cat_features)
    # The validation pool is passed as eval_set for metric tracking during training.
    model.fit(train_pool, eval_set=val_pool)
    # Flatten in case predictions come back as a column vector (a no-op for 1-D
    # output); the metric helpers expect a flat label array.
    y_pred = np.ravel(model.predict(X_val))
    print(classification_report(y_val, y_pred))
    plot_confusion_matrix(model, X_val, y_val)
    plot_multiclass_roc(y_val, y_pred, plot_title=plot_title)

In [None]:
# Baseline CatBoost: unweighted multi-class loss, additionally tracking AUC and
# accuracy. Progress is logged every 100 iterations and a snapshot is written
# to "base_cat.bkp" (snapshot_interval=10).
base_cat_params = dict(
    iterations=1000,
    loss_function="MultiClass",
    verbose=100,
    random_seed=1,
    custom_loss=["AUC", "Accuracy"],
    save_snapshot=True,
    snapshot_file="base_cat.bkp",
    snapshot_interval=10,
)
base_cat = catboost.CatBoostClassifier(**base_cat_params)

# Columns handed to CatBoost as native categorical features.
cat_cols = ["segment_id", "street_id", "weekday", "street_type", "period"]
train_catboost(X=df[features], y=df["LOS"], cat_features=cat_cols, model=base_cat)

In [None]:
# Bar chart of the baseline CatBoost model's feature importances, labelled with `features`.
# NOTE(review): assumes the importances are ordered like `features` - confirm.
plot_feature_importances(features, base_cat.feature_importances_)

# Weighted CatBoost

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# "balanced" weights computed over the whole LOS column, stored as a
# {label: weight} mapping for the class_weights argument of the weighted models.
classes = df["LOS"].unique()
weights = compute_class_weight(class_weight="balanced", classes=classes, y=df["LOS"])
class_weights = {label: weight for label, weight in zip(classes, weights)}
display(class_weights)

In [None]:
# Cost-sensitive CatBoost: same setup as the baseline, but the loss is
# re-weighted per class with `class_weights` and only AUC is tracked as an
# extra metric. Snapshots go to "weighted_cat.bkp".
weighted_cat_params = dict(
    iterations=1000,
    loss_function="MultiClass",
    verbose=100,
    random_seed=1,
    custom_loss=["AUC"],
    class_weights=class_weights,
    save_snapshot=True,
    snapshot_file="weighted_cat.bkp",
    snapshot_interval=10,
)
weighted_cat = catboost.CatBoostClassifier(**weighted_cat_params)

# Columns handed to CatBoost as native categorical features.
cat_cols = ["segment_id", "street_id", "weekday", "street_type", "period"]
train_catboost(
    X=df[features],
    y=df["LOS"],
    cat_features=cat_cols,
    model=weighted_cat,
    plot_title="Cost-sensitive CatBoost",
)

In [None]:
# Bar chart of the class-weighted CatBoost model's feature importances, labelled with `features`.
# NOTE(review): assumes the importances are ordered like `features` - confirm.
plot_feature_importances(features, weighted_cat.feature_importances_)

# Hyperparameter Tuning CatBoost

In [None]:
# Search grid: 3 learning rates x 3 tree depths.
params = {
    "learning_rate": [0.03, 0.1, 0.3],
    "depth": [4, 6, 8]
}

tune_model = catboost.CatBoostClassifier(
    iterations = 1000,
    loss_function = "MultiClass",
    verbose = False,
    random_seed = 1,
    custom_loss = ["AUC"],
    class_weights = class_weights,
)
cat_cols = ["segment_id", "street_id", "weekday", "street_type", "period"]

# Tune on the same representation the other CatBoost cells train on: raw
# features with `cat_cols` handled natively, not the scaled / one-hot output of
# `preprocessor`. The split uses the same seed and test size as train_catboost,
# so the rows later used for validation are never seen by the search.
X_train, X_val, y_train, y_val = train_test_split(df[features], df["LOS"],
                                                  test_size=0.2, random_state=1)
tune_pool = catboost.Pool(data=X_train, label=y_train, cat_features=cat_cols)
results = tune_model.grid_search(params, tune_pool)
display(results["params"])

# Run Best CatBoost Model

In [None]:
# Final class-weighted CatBoost model with learning_rate and depth fixed by hand.
# NOTE(review): 0.1 / 6 are hard-coded rather than read from the grid-search
# `results` - confirm they match the search output.
best_clf_params = dict(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function="MultiClass",
    verbose=100,
    random_seed=1,
    custom_loss=["AUC"],
    class_weights=class_weights,
)
best_clf = catboost.CatBoostClassifier(**best_clf_params)

# Columns handed to CatBoost as native categorical features.
cat_cols = ["segment_id", "street_id", "weekday", "street_type", "period"]
train_catboost(X=df[features], y=df["LOS"], cat_features=cat_cols, model=best_clf)

In [None]:
# Bar chart of the final CatBoost model's feature importances, labelled with `features`.
# NOTE(review): assumes the importances are ordered like `features` - confirm.
plot_feature_importances(features, best_clf.feature_importances_)