In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# I. Load dataset

In [None]:
# Load the training CSV, using `_id` as the row index and parsing `date` as datetimes.
TRAIN_CSV_PATH = "../input/traffic-flow-data-in-ho-chi-minh-city-viet-nam/train.csv"
df = pd.read_csv(TRAIN_CSV_PATH, index_col="_id", parse_dates=["date"])

# Sanity check: first rows, then (n_rows, n_columns).
print(df.head())
print(df.shape)

In [None]:
# Keep only the columns used by the rest of the notebook (order is preserved).
id_cols = ["segment_id", "street_id", "street_name"]
time_cols = ["date", "weekday"]
street_cols = ["length", "max_velocity", "street_level", "street_type"]
node_cols = ["long_snode", "lat_snode", "long_enode", "lat_enode"]
tail_cols = ["period", "LOS"]

cols = id_cols + time_cols + street_cols + node_cols + tail_cols
df = df[cols]

In [None]:
# Preview the first rows after restricting the frame to the selected columns
print(df.head())

## Feature Enrichment

In [None]:
import datetime

# Period labels that the `is_peak` feature treats as rush hour.
# The list runs from period_6_00 to period_7_30 and from period_16_00 to period_18_30
# (original note: "6h-8h, 16h-19h" — presumably each label marks the start of a
# 30-minute slot; TODO confirm against the dataset description).
peaks = ["period_6_00", "period_6_30", 
         "period_7_00", "period_7_30",
         "period_16_00", "period_16_30", 
         "period_17_00", "period_17_30",
         "period_18_00", "period_18_30"]

def is_special(date):
    """Tell whether `date` matches one of the hard-coded (day, month) pairs.

    `date` only needs `.day` and `.month` attributes; the year is ignored.
    Returns True on a match, False otherwise.
    """
    # Each entry is (day, month).
    holidays = [(1,1), (14,2), (8,3), (30,4),
                (1,5), (1,6), (2,9), (20,10),
                (20,11), (24,12), (25,12)]
    return any(date.day == day and date.month == month
               for day, month in holidays)

In [None]:
# Derive three binary (0/1) indicator features.
# `isin` is vectorized, so it replaces the per-row Python lambdas while
# producing the same 0/1 integer columns.
# NOTE(review): weekday values 5 and 6 are taken to be the weekend days — confirm
# against the dataset's weekday encoding.
df["is_weekend"] = df["weekday"].isin([5, 6]).astype("int64")
df["is_peak"] = df["period"].isin(peaks).astype("int64")
# `is_special` works on one date at a time, so it is still applied element-wise.
df["special_day"] = df["date"].apply(lambda date: int(is_special(date)))
print(df.head())

# II. Simple EDA

* LOS is the target we want to classify
* Mostly data is categorical, except: velocity, max_velocity, long/lat of 2 nodes

## 1. Missing values

In [None]:
# Share of missing values per column, largest first.
missing_ratio = df.isna().sum() / df.shape[0]
missing_df = (
    missing_ratio
    .to_frame(name="missing_ratio")
    .sort_values(by="missing_ratio", ascending=False)
)
print(missing_df)

Many values are missing in `max_velocity`; the other columns are fine.

## 2. Categorical columns

In [None]:
def plot_cat_cols_with_target(data, cols, target):
    """Draw one grouped bar chart per column in `cols`.

    Each chart is the cross-tabulation of that column against `target`
    (one bar group per column value, one bar per target value). All figures
    are shown together once the loop has finished.
    """
    target_values = data[target]
    for column in cols:
        counts = pd.crosstab(data[column], target_values)
        counts.plot.bar(figsize=(20, 10), fontsize=18)
    plt.show()

# Columns treated as categorical for this part of the EDA.
cat_cols = [
    "weekday", "street_level", "street_type",
    "period", "is_weekend", "is_peak", "special_day",
]
plot_cat_cols_with_target(df, cat_cols, "LOS")

The patterns show some similar distributions of LOS labels

## 3. Numerical columns

In [None]:
def plot_num_cols_with_target(data, cols, target):
    """Draw one violin plot per column in `cols`, grouped by `target`.

    Every column gets its own 16x8 figure, shown as soon as it is drawn.
    """
    for column in cols:
        plt.figure(figsize=(16, 8))
        sns.violinplot(data=data, x=target, y=column)
        plt.show()

# 'max_velocity' is left out on purpose.
num_cols = ["length", "long_snode", "lat_snode", "long_enode", "lat_enode"]
plot_num_cols_with_target(df, num_cols, "LOS")

1. Outliers!!!
2. It is worth noting that the plots for **long_snode & long_enode** look similar to each other, as do those for **lat_snode & lat_enode**.

## 4. Label

In [None]:
def check_label(target_cols):
    """Show a bar chart of how often each value occurs in `target_cols`."""
    counts = target_cols.value_counts()
    counts.plot.bar(figsize=(12, 8))
    plt.show()

check_label(df["LOS"])

Dataset is imbalanced which is not good for classification.

## 5. Relationship

In [None]:
from sklearn.preprocessing import LabelEncoder

num_cols = ["length", "max_velocity", "long_snode", "lat_snode", "long_enode", "lat_enode"]
cat_cols = ["street_id", "segment_id", "weekday", "street_level", "street_type",
            "period", "is_weekend", "is_peak", "special_day"]

def plot_heatmap(data):
    """Plot an annotated correlation heatmap of LOS and the feature columns.

    Works on a copy of `data` restricted to "LOS" plus the module-level
    `num_cols` and `cat_cols`. The categorical columns and "LOS" are
    label-encoded to integers before `.corr()` is computed.
    """
    cols = ["LOS"] + num_cols + cat_cols
    encoded = data[cols].copy()

    # fit_transform refits on every call, so a fresh encoder per column is equivalent.
    for name in cat_cols + ["LOS"]:
        encoded[name] = LabelEncoder().fit_transform(encoded[name])

    corrmat = encoded[cols].corr()
    plt.figure(figsize=(12, 9))
    sns.heatmap(
        corrmat,
        cbar=True,
        annot=True,
        square=True,
        fmt='.2f',
        annot_kws={'size': 10},
        yticklabels=cols,
        xticklabels=cols,
    )
    plt.show()

plot_heatmap(df)

Some features may affect classification of LOS:

* None of the columns seems to be related to LOS; maybe these features aren't enough for classification.
* Some cells show that long_snode ~ long_enode and lat_snode ~ lat_enode are fully correlated. This is easy to understand: segments are connected by nodes, so the start node of one segment can also be the end node of another. Therefore, remove the end node's long/lat pair from each sample before training to avoid overweighting these features.
* street_level somewhat relates to street_type, fair enough!

The dataset is pretty dark: outliers, unrelated features, severe imbalance; but diamond cuts diamond, I will try to mine it.

# III. Implement metrics for evaluating multiclass classification

## Plot ROC curves for multiclass classification by computing macro-average ROC curve & ROC area

In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from itertools import cycle

def classification_report_df(y_true, y_pred):
    """Compute ROC points and AUC per class, plus the micro-average.

    Both label vectors are binarized with `label_binarize` against the unique
    values of `y_true`, so the "scores" given to `roc_curve` are 0/1
    indicators derived from predicted labels, not probabilities.

    Returns
    -------
    (fpr, tpr, roc_auc) : three dicts keyed by each class label and by "micro".
    """
    labels = np.unique(y_true)
    true_bin = label_binarize(y_true, classes=labels)
    pred_bin = label_binarize(y_pred, classes=labels)

    fpr, tpr, roc_auc = {}, {}, {}

    def _store(key, truth, scores):
        # Record the curve and its area under one dictionary key.
        fpr[key], tpr[key], _ = roc_curve(truth, scores)
        roc_auc[key] = auc(fpr[key], tpr[key])

    for column, label in enumerate(labels):
        _store(label, true_bin[:, column], pred_bin[:, column])

    # Micro-average: pool every (sample, class) decision into one binary problem.
    _store("micro", true_bin.ravel(), pred_bin.ravel())
    return fpr, tpr, roc_auc

def plot_multiclass_roc(y_true, y_pred, title="Extension ROC to multi-class"):
    """Plot per-class, micro-average and macro-average ROC curves.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels, passed on to `classification_report_df`.
    title : str
        Title of the figure.

    The macro-average curve is the mean of the per-class curves, each
    interpolated onto the union of all per-class false-positive-rate points.
    """
    fpr, tpr, roc_auc = classification_report_df(y_true, y_pred)

    # Snapshot the real class labels only. `fpr` also holds the "micro" entry
    # and receives "macro" below; a live `fpr.keys()` view would leak both
    # aggregates into the macro average and into the per-class plotting loop.
    classes = [c for c in fpr if c != "micro"]

    all_fpr = np.unique(np.concatenate([fpr[c] for c in classes]))
    mean_tpr = np.zeros_like(all_fpr)
    for c in classes:
        mean_tpr += np.interp(all_fpr, fpr[c], tpr[c])
    mean_tpr /= len(classes)

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    plt.figure(figsize=(12, 8))
    plt.plot(fpr["micro"], tpr["micro"],
             label=f'micro-average ROC curve (area = {roc_auc["micro"]:0.2f})',
             color='deeppink', linestyle=':', linewidth=4)
    plt.plot(fpr["macro"], tpr["macro"],
             label=f'macro-average ROC curve (area = {roc_auc["macro"]:0.2f})',
             color='navy', linestyle=':', linewidth=4)

    # Six colours so that up to six classes each get a distinct line colour.
    colors = cycle(['aqua', 'darkorange', 'cornflowerblue', 'green', 'red', 'purple'])
    lw = 2
    for c, color in zip(classes, colors):
        plt.plot(fpr[c], tpr[c], color=color, lw=lw,
                 label=f'ROC curve of class {c} (area = {roc_auc[c]:0.2f})')
    # Diagonal reference line.
    plt.plot([0, 1], [0, 1], 'k--', lw=lw)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title(title)
    plt.legend(loc="lower right")
    plt.show()

# IV. Try to mine the data in desperation

## Baseline classification model

In [None]:
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer

# Numeric branch: fill gaps with the median, then scale with RobustScaler.
num_pipeline = Pipeline(steps=[
    ('numerical_imputer', SimpleImputer(strategy="median")),
    ('numerical_scaler', RobustScaler()),
])
# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
cat_pipeline = Pipeline(steps=[
    ('categorical_imputer', SimpleImputer(strategy="most_frequent")),
    ('categorical_encoder', OneHotEncoder(handle_unknown="ignore")),
])

# Columns are routed by dtype: `object` columns go to the categorical branch,
# everything else goes to the numeric branch.
num_features = make_column_selector(dtype_exclude=object)
cat_features = make_column_selector(dtype_include=object)

preprocessor = make_column_transformer(
    (num_pipeline, num_features),
    (cat_pipeline, cat_features),
)

# Choose model
from sklearn.tree import DecisionTreeClassifier

# Baseline: shared preprocessing followed by a seeded decision tree.
baseline_tree = DecisionTreeClassifier(random_state=0)
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", baseline_tree),
])

In [None]:
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.metrics import classification_report, plot_confusion_matrix

def train_and_evaluate_model(X, y, model):
    """Fit `model` on an 80/20 hold-out split and report on the validation part.

    Draws the confusion matrix, prints the classification report and plots
    the multiclass ROC curves, in that order. The split uses random_state=0.
    """
    splits = train_test_split(X, y, test_size=0.2, random_state=0)
    X_fit, X_holdout, y_fit, y_holdout = splits

    model.fit(X_fit, y_fit)
    holdout_pred = model.predict(X_holdout)

    plot_confusion_matrix(model, X_holdout, y_holdout)
    print(classification_report(y_holdout, holdout_pred))
    plot_multiclass_roc(y_holdout, holdout_pred)

In [None]:
# Input columns for every model in the rest of the notebook.
features = [
    "weekday", "length", "street_level", "street_type",
    "long_snode", "lat_snode", "period",
]

train_and_evaluate_model(X=df[features], y=df["LOS"], model=model)

The model classifies A and B pretty well; the other classes are not classified well, either because the dataset was shown to be imbalanced towards 'A', or because classes C, D, E and F are not well distinguishable in the data.

### Why care about Imbalanced Classification? ([this blog](https://machinelearningmastery.com/what-is-imbalanced-classification/) for details and further reading)
* Most ML algorithms for classification were designed around the assumption of an equal number of examples for each class; therefore a model trained on imbalanced data will be biased towards the majority class, which is bad for generalization.
* In real world, we're mostly interested in minority class so it's useless if a model shows poor performance on minor population.

### 3 pieces of advice when addressing the imbalance problem:
1. Use classification metrics other than just 'accuracy', e.g. 'precision', 'recall', 'F1-score'.
2. Preprocess the raw data before feeding it into the model, e.g. data augmentation.
3. Use variant of models/ML algorithms that treat classification errors differently.

## SMOTE

The author suggests that one approach to addressing imbalanced datasets is to oversample the minority class. The simplest approach is to duplicate samples in the minority class, but the duplicates add no new information to the model.

Synthetic Minority Oversampling Technique (SMOTE): select examples that are close in the feature space, draw a line between them in the feature space, and create a new sample at a point along that line.

[SMOTE for Imbalanced Classification](https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/)

In [None]:
from imblearn.over_sampling import SMOTE

# Demonstrate how SMOTE changes the class distribution on the whole,
# preprocessed dataset (illustration only — no train/validation split here).
# The resampler is seeded so the synthetic samples are the same on every run.
# NOTE(review): SMOTE interpolates between neighbours, so one-hot encoded
# columns may receive fractional values; consider SMOTENC — confirm.
smote = SMOTE(random_state=0)
X, y = preprocessor.fit_transform(df[features]), df["LOS"]
print("Before:", X.shape, y.shape)
X, y = smote.fit_resample(X, y)
print("After:", X.shape, y.shape)
y.value_counts().plot.bar(figsize=(12, 8))
plt.show()

This is how SMOTE balances the distribution of classes. Now let's feed it into model.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

def preprocess_dataset(X, y, preprocessor, resampler, test_size=0.2):
    """Split, preprocess and rebalance a dataset for training.

    The preprocessor is fitted on the training part only and then applied to
    the validation part. Resampling is applied to the training part only, so
    the validation set keeps its original class distribution.

    Returns
    -------
    X_train, X_val, y_train, y_val
        Resampled training data and transformed, un-resampled validation data.
    """
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X, y, test_size=test_size, random_state=0
    )
    X_fit_t = preprocessor.fit_transform(X_fit)
    X_holdout_t = preprocessor.transform(X_holdout)
    X_balanced, y_balanced = resampler.fit_resample(X_fit_t, y_fit)
    return X_balanced, X_holdout_t, y_balanced, y_holdout

def train_and_validate(X_train, X_val, y_train, y_val, model, plot_title="Extension ROC to multi-class"):
    """Fit `model` on the training data and report on the validation data.

    Prints the classification report, draws the confusion matrix and plots
    the multiclass ROC curves titled `plot_title`, in that order.
    """
    model.fit(X_train, y_train)
    val_pred = model.predict(X_val)

    report = classification_report(y_val, val_pred)
    print(report)
    plot_confusion_matrix(model, X_val, y_val)
    plot_multiclass_roc(y_val, val_pred, plot_title)

### Normal SMOTE

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.tree import DecisionTreeClassifier

# Reuse the previously defined preprocessor; seed SMOTE so the synthetic
# training samples (and therefore the reported metrics) are reproducible.
X_train, X_val, y_train, y_val = preprocess_dataset(
    df[features], df["LOS"], preprocessor, SMOTE(random_state=0)
)
model = DecisionTreeClassifier(random_state=0)

train_and_validate(X_train, X_val, y_train, y_val, model, "Decision Tree with normal SMOTE")

### Weighting-SMOTE: Oversampling & Undersampling

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

# Oversample classes B-F to 5000 samples each with SMOTE, then undersample
# class A to 8000 samples. Both steps are seeded so the run is reproducible.
resampler = ImbPipeline(steps=[
    ('o', SMOTE(sampling_strategy={"B":5000, "C":5000, "D":5000, "E":5000, "F":5000},
                random_state=0)),
    ('u', RandomUnderSampler(sampling_strategy={"A":8000}, random_state=0)),
])
# Use the previous defined preprocessor
X_train, X_val, y_train, y_val = preprocess_dataset(df[features], df["LOS"], preprocessor, resampler)
model = DecisionTreeClassifier(random_state=0)

train_and_validate(X_train, X_val, y_train, y_val, model, "Decision Tree with Oversampling & Undersampling SMOTE")

### Adaptive Synthetic Sampling (ADASYN)

In [None]:
from imblearn.over_sampling import ADASYN

# Reuse the previously defined preprocessor; seed ADASYN so the synthetic
# training samples (and therefore the reported metrics) are reproducible.
X_train, X_val, y_train, y_val = preprocess_dataset(
    df[features], df["LOS"], preprocessor, ADASYN(random_state=0)
)
model = DecisionTreeClassifier(random_state=0)

train_and_validate(X_train, X_val, y_train, y_val, model, "Decision Tree with ADASYN")

### SMOTE Tomek Links

In [None]:
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks

# Resampler: SMOTE with Tomek Links (seeded for reproducibility)
resample = SMOTETomek(tomek=TomekLinks(sampling_strategy='majority'), random_state=0)

# Pass the SMOTETomek instance built above. The original passed `resampler`,
# i.e. the over/under-sampling pipeline from the "Weighting-SMOTE" cell, so
# SMOTE Tomek Links was never actually evaluated.
X_train, X_val, y_train, y_val = preprocess_dataset(df[features], df["LOS"], preprocessor, resample)
model = DecisionTreeClassifier(random_state=0)

train_and_validate(X_train, X_val, y_train, y_val, model, "Decision Tree with SMOTE Tomek Links")

In [None]:
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks

# NOTE(review): this cell repeats the "SMOTE Tomek Links" experiment with the
# default plot title — consider deleting one of the two cells.
# Resampler: SMOTE with Tomek Links (seeded for reproducibility)
resample = SMOTETomek(tomek=TomekLinks(sampling_strategy='majority'), random_state=0)

# Pass the SMOTETomek instance built above; the original passed the unrelated
# `resampler` left over from the "Weighting-SMOTE" cell.
X_train, X_val, y_train, y_val = preprocess_dataset(df[features], df["LOS"], preprocessor, resample)
model = DecisionTreeClassifier(random_state=0)

train_and_validate(X_train, X_val, y_train, y_val, model)

## Cost-sensitive random forest classifier

[Multi-class Imbalanced Classification](https://machinelearningmastery.com/multi-class-imbalanced-classification/)

[Cost-Sensitive Learning](https://machinelearningmastery.com/cost-sensitive-learning-for-imbalanced-classification/)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Per-class weights: class 'A' is down-weighted (0.8), every other class is
# up-weighted (1.5), so errors on the non-'A' classes cost more during training.
weights = {'F': 1.5, 'A': 0.8, 'B': 1.5, 'C': 1.5, 'D': 1.5, 'E': 1.5}
# Seed the forest so repeated runs of the notebook give the same metrics.
forest = RandomForestClassifier(n_estimators=50, class_weight=weights, random_state=0)
model = Pipeline([('preprocessor', preprocessor), ('classifier', forest)])

# No resampling here: the pipeline is trained on the raw (imbalanced) split.
X_train, X_val, y_train, y_val = train_test_split(df[features], df['LOS'], test_size=0.2, random_state=26)

train_and_validate(X_train, X_val, y_train, y_val, model)

## Random Forest with SMOTE Tomek Links

In [None]:
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks
from sklearn.ensemble import RandomForestClassifier

# Resampler: SMOTE with Tomek Links (seeded for reproducibility)
resample = SMOTETomek(tomek=TomekLinks(sampling_strategy='majority'), random_state=0)
# Model (seeded so repeated runs give the same metrics)
model = RandomForestClassifier(n_estimators=100, random_state=0)

# Pass the SMOTETomek instance built above; the original passed the unrelated
# `resampler` left over from the "Weighting-SMOTE" cell, so the plot title did
# not match the resampling that was actually applied.
X_train, X_val, y_train, y_val = preprocess_dataset(df[features], df["LOS"], preprocessor, resample)
train_and_validate(X_train, X_val, y_train, y_val, model, "Random Forest with SMOTE Tomek Links")

# V. Check model

Create a list of models to test

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.multiclass import OneVsRestClassifier

def get_models():
    """Build the candidate classifiers to compare.

    Returns an iterator of (model, name) pairs, in this order:
    LDA, one-vs-rest ("One-Against-All") SVC, and 3-nearest-neighbours.
    """
    candidates = {
        'LDA': LinearDiscriminantAnalysis(),
        'OAA-SVC': OneVsRestClassifier(SVC()),
        'KNN': KNeighborsClassifier(n_neighbors=3),
    }
    return zip(list(candidates.values()), list(candidates.keys()))

In [None]:
# NOTE(review): when cells run top to bottom, `resampler` here is the
# SMOTE + RandomUnderSampler pipeline from the "Weighting-SMOTE" cell —
# confirm this is the intended resampling for the comparison.
X_train, X_val, y_train, y_val = preprocess_dataset(df[features], df["LOS"], preprocessor, resampler)

# Densify once, outside the loop: the conversion does not depend on the model.
# The guard also keeps the cell working if the preprocessor returns a dense array.
X_train_dense = X_train.toarray() if hasattr(X_train, "toarray") else X_train
X_val_dense = X_val.toarray() if hasattr(X_val, "toarray") else X_val

for model, name in get_models():
    train_and_validate(X_train_dense, X_val_dense, y_train, y_val, model, name)

# VI. Reference for improvement

https://www.reddit.com/r/MachineLearning/comments/12evgi/classification_when_80_of_my_training_set_is_of/

https://www.tensorflow.org/tutorials/structured_data/imbalanced_data

[Weka's CostSensitiveClassifier](https://www.youtube.com/watch?v=LbZ9ROR1tQ0)

[Sample Imbalanced Multiclass Classification](https://machinelearningmastery.com/imbalanced-multiclass-classification-with-the-e-coli-dataset/)