In [None]:
%matplotlib inline

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import auc, roc_curve
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

import xgboost
import graphviz

# Seed passed as `random_state` to every split and estimator in this notebook.
RANDOM_SEED = 1212

## Data Loading

In [None]:
# Load the tab-separated training features and preview the first rows.
train = pd.read_csv('../data/small/train/orange_small_train.data', sep="\t")
train.head()

In [None]:
# Read the upselling labels (first column of a header-less file) as a
# categorical Series.
upselling_label = (
    pd.read_csv(
        "../data/small/labels/orange_small_train_upselling.labels", sep="\t", header=None
    )
    .iloc[:, 0]
    .astype("category")
)

# Relabel the categories positionally as 0 and 1.
# `rename_categories(..., inplace=True)` was removed in pandas 2.0, so the
# renamed Series is assigned back instead of being mutated in place.
upselling_label = upselling_label.cat.rename_categories([0, 1])

upselling_label.head()

## Dataframe Conversion

In [None]:
def convert_dataframe(DF):
    """Normalise column dtypes for modelling.

    Integer and float columns end up as ``float64``; every other column
    (strings, booleans, ...) is converted to ``category``.

    Parameters
    ----------
    DF : pandas.DataFrame
        Frame to convert. It is modified in place and also returned.

    Returns
    -------
    tuple
        ``(DF, df_types)`` where ``df_types`` maps each column name to the
        dtype that column has after conversion.
    """
    dtype_checks = pd.api.types
    df_types = {}

    for df_var_name in DF.columns:
        current_dtype = DF[df_var_name].dtype

        # Use pandas' dtype predicates instead of `dtype == int`: that comparison
        # only matches the platform-default integer width, so an int32 column
        # (or int64 on some platforms) would be misclassified as categorical.
        is_numeric = dtype_checks.is_integer_dtype(
            current_dtype
        ) or dtype_checks.is_float_dtype(current_dtype)

        # Plain `DF[name] = ...` replaces the column together with its dtype.
        # `DF.loc[:, name] = ...` writes into the existing array on pandas >= 2.0
        # and can silently keep the old dtype.
        if is_numeric:
            if current_dtype != float:
                DF[df_var_name] = DF[df_var_name].astype(float)
        else:
            DF[df_var_name] = DF[df_var_name].astype("category")

        # Record the dtype actually stored in the frame.
        df_types[df_var_name] = DF[df_var_name].dtype

    return DF, df_types

In [None]:
# Normalise dtypes; `train_types` is used below to separate numeric from
# categorical features.
train, train_types = convert_dataframe(train)

## Data Cleaning and Filling

In [None]:
def plot_missing_data(train):
    """Show a 50-bin histogram of the per-column fraction of missing values.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame whose columns are inspected for nulls.
    """
    # One value per column: number of nulls divided by the number of rows.
    missing_share = train.isnull().sum().div(train.shape[0])

    fig, ax = plt.subplots(figsize=(15, 5))
    ax.hist(missing_share, bins=50)
    ax.set_title("Missing data proportions on Train set")

    plt.show()

In [None]:
# Distribution of missing-value proportions before any filling.
plot_missing_data(train)

I'll keep as model features only the columns in which at most 15% of the values are missing.

In [None]:
# Fraction of null entries in each column.
train_missing = train.isnull().sum().div(train.shape[0])

# Keep the columns whose missing fraction does not exceed 0.15.
within_threshold = train_missing <= 0.15
features = train_missing.loc[within_threshold].index

print(features)
print(len(features))

Fill the missing numeric values with the mean of their respective columns.

In [None]:
# Retained features whose recorded dtype is float are treated as numeric.
numeric_features = []
for feat in features:
    if train_types[feat] == float:
        numeric_features.append(feat)

print(numeric_features)
print(len(numeric_features))

In [None]:
# Replace nulls in each numeric column with that column's mean.
# NOTE(review): the means are computed over all rows, before the
# train/validation/test split performed later in this notebook, so held-out
# rows contribute to the imputed values.
numeric_block = train[numeric_features]
column_means = numeric_block.mean()
train[numeric_features] = numeric_block.fillna(column_means)

In [None]:
# Same histogram after the numeric columns have been filled.
plot_missing_data(train)

I'll now remove the categorical features that have more than 400 categories.

In [None]:
# Retained features whose recorded dtype is not float are treated as categorical.
categorical_features = list(
    filter(lambda feat: train_types[feat] != float, features)
)

print(categorical_features)
print(len(categorical_features))

In [None]:
# Number of declared categories for each categorical column.
categorical_columns = train[categorical_features]
categorical_levels = categorical_columns.apply(lambda col: len(col.cat.categories))

# Keep only the columns with at most 400 categories.
has_few_levels = categorical_levels <= 400
categorical_features_filtered = categorical_levels[has_few_levels].index.tolist()

print(categorical_features_filtered)
print(len(categorical_features_filtered))

In [None]:
# Final feature list: numeric columns first, then the retained categorical ones.
features_filtered = [*numeric_features, *categorical_features_filtered]

print(features_filtered)
print(len(features_filtered))

Feature Selection

In [None]:
# Restrict the frame to the selected features (rebinds `train`).
train = train[features_filtered]

Getting dummy variables for the categorical columns

In [None]:
# One-hot encode the categorical columns as boolean indicator columns.
train = pd.get_dummies(train, dtype=bool)

Scaling the numeric features to the range $[0, 1]$

In [None]:
# Rescale every numeric column with a MinMaxScaler fit on the same data.
# NOTE(review): the scaler is fit on all rows, before the
# train/validation/test split performed later in this notebook, so held-out
# rows influence the scaling.
scaler = MinMaxScaler()
train[numeric_features] = scaler.fit_transform(train[numeric_features])

In [None]:
# Summary statistics of the scaled numeric columns.
train[numeric_features].describe()

## Train, Validation and Test split

In [None]:
# First hold out 20% of the rows as the test set.
x_train, x_test, y_train, y_test = train_test_split(
    train, upselling_label, test_size=0.2, random_state=RANDOM_SEED
)

# Then hold out 20% of the remaining rows as the validation set,
# rebinding x_train / y_train to what is left.
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.2, random_state=RANDOM_SEED
)

In [None]:
# Shapes of the train, validation and test feature frames, one per line.
print(*(part.shape for part in (x_train, x_val, x_test)), sep="\n")

## Model Training

### XGBoost Classifier

In [None]:
# XGBoost classifier: up to 2000 estimators of max depth 10 with a 0.01
# learning rate, seeded with the notebook-wide RANDOM_SEED.
xgb = xgboost.XGBClassifier(
    n_jobs=8,
    learning_rate=0.01,
    max_depth=10,
    n_estimators=2000,
    random_state=RANDOM_SEED,
)

In [None]:
# Early stopping and the evaluation metric are estimator parameters in
# xgboost >= 1.6; passing them to `fit()` was deprecated there and removed
# in 2.0, so they are set on the estimator right before fitting.
xgb.set_params(early_stopping_rounds=50, eval_metric="auc")

# Fit on the training split, monitoring AUC on the validation split.
xgb.fit(
    x_train,
    y_train,
    eval_set=[(x_val, y_val)],
    verbose=True,
)

In [None]:
# Class-probability scores on the test split (column 1 is plotted later).
y_score_xgb = xgb.predict_proba(x_test)

### GradientBoostingClassifier

In [None]:
# scikit-learn gradient boosting: 100 estimators of max depth 10 with a 0.01
# learning rate; `n_iter_no_change=10` turns on the estimator's own early
# stopping.
gbc = GradientBoostingClassifier(
    learning_rate=0.01,
    n_estimators=100,
    max_depth=10,
    random_state=RANDOM_SEED,
    n_iter_no_change=10,
    verbose=1
)

In [None]:
# Fit the gradient boosting model on the training split.
gbc.fit(x_train, y_train)

In [None]:
# Class-probability scores on the test split (column 1 is plotted later).
y_score_gdb = gbc.predict_proba(x_test)

### RandomForestClassifier

In [None]:
# Random forest: 2000 trees of max depth 10, seeded with RANDOM_SEED.
rfc = RandomForestClassifier(
    n_estimators=2000, max_depth=10, n_jobs=8, random_state=RANDOM_SEED, verbose=1
)

In [None]:
# Fit the random forest on the training split.
rfc.fit(x_train, y_train)

In [None]:
# Class-probability scores on the test split (column 1 is plotted later).
y_score_rfc = rfc.predict_proba(x_test)

### Support Vector Machine Classifier

In [None]:
# RBF-kernel SVM with probability estimates enabled so `predict_proba` works.
svc = SVC(
    gamma="auto",
    kernel="rbf",
    # `10e+5` is a float; scikit-learn's parameter validation requires an
    # integer `max_iter`, so cast it as the MLP cell in this notebook does.
    max_iter=int(10e+5),
    probability=True,
    random_state=RANDOM_SEED,
    verbose=1,
)

In [None]:
# Fit the RBF SVM on the training split.
svc.fit(x_train, y_train)

In [None]:
# Class-probability scores on the test split (column 1 is plotted later).
y_score_svc = svc.predict_proba(x_test)

### Linear Support Vector Machine Classifier

In [None]:
# Linear-kernel SVM with probability estimates enabled so `predict_proba` works.
lsvc = SVC(
    gamma="auto",
    kernel="linear",
    # `10e+5` is a float; scikit-learn's parameter validation requires an
    # integer `max_iter`, so cast it as the MLP cell in this notebook does.
    max_iter=int(10e+5),
    probability=True,
    random_state=RANDOM_SEED,
    verbose=1,
)

In [None]:
# Fit the linear SVM on the training split.
lsvc.fit(x_train, y_train)

In [None]:
# Class-probability scores on the test split (column 1 is plotted later).
y_score_lsvc = lsvc.predict_proba(x_test)

### Multi-Layer Perceptron Neural Network Classifier

In [None]:
# Multi-layer perceptron with four ReLU hidden layers of 1024, 256, 64 and 16
# units. Early stopping is enabled with a 0.2 validation fraction and a
# patience of 50 iterations.
mlp = MLPClassifier(
    hidden_layer_sizes=(2 ** 10, 2 ** 8, 2 ** 6, 2 ** 4),
    activation="relu",
    max_iter=int(10e+5),
    early_stopping=True,
    validation_fraction=0.2,
    n_iter_no_change=50,
    random_state=RANDOM_SEED,
    verbose=True,
)

In [None]:
# Fit the MLP on the training split.
mlp.fit(x_train, y_train)

In [None]:
# Class-probability scores on the test split (column 1 is plotted later).
y_score_mlp = mlp.predict_proba(x_test)

## AUC Comparison

In [None]:
def plot_roc_curve(y_true, y_score, model_name):
    """Draw the ROC curve of one model and report its AUC in the legend.

    Parameters
    ----------
    y_true : array-like
        True labels.
    y_score : 2-D array
        Per-class scores; column 1 is used as the score of the plotted class.
    model_name : str
        Name inserted into the figure title.
    """
    positive_scores = y_score[:, 1]
    fpr, tpr, _ = roc_curve(y_true, positive_scores)
    roc_auc = auc(fpr, tpr)

    line_width = 2
    curve_label = "ROC curve for UpSelling class (area = %0.2f)" % roc_auc

    fig, ax = plt.subplots(figsize=(10, 5))

    # Model curve, plus the diagonal as a visual reference.
    ax.plot(fpr, tpr, color="darkorange", lw=line_width, label=curve_label)
    ax.plot([0, 1], [0, 1], color="navy", lw=line_width, linestyle="--")

    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.0])
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.set_title("Receiver Operating Characteristic for %s model" % model_name)
    ax.legend(loc="lower right")

    plt.show()

In [None]:
# ROC curve of the XGBoost model on the test split.
plot_roc_curve(y_test, y_score_xgb, "xGBoost")

In [None]:
# ROC curve of the scikit-learn gradient boosting model on the test split.
plot_roc_curve(y_test, y_score_gdb, "GradientBoosting")

In [None]:
# ROC curve of the random forest on the test split.
plot_roc_curve(y_test, y_score_rfc, "RandomForest")

In [None]:
# ROC curve of the RBF SVM on the test split.
plot_roc_curve(y_test, y_score_svc, "SVC")

In [None]:
# ROC curve of the linear-kernel SVM on the test split.
plot_roc_curve(y_test, y_score_lsvc, "LinearSVC")

In [None]:
# ROC curve of the MLP on the test split.
plot_roc_curve(y_test, y_score_mlp, "MLP")