# Load libs

In [None]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Load and describe

In [None]:
# Read the in-vehicle coupon recommendation CSV into `df` and preview it.
csv_path = "../input/invehicle-coupon-recommendation/in-vehicle-coupon-recommendation.csv"
df = pd.read_csv(csv_path)
df

In [None]:
# Print a summary of the frame (columns, non-null counts, dtypes).
df.info()

# Preprocessing dataset

## Check for NaN

In [None]:
# Count missing values per column, then show the share of rows that are
# missing for every column that has at least one NaN.
df_na_sum = df.isna().sum()
has_missing = df_na_sum.gt(0)
df_na_sum.loc[has_missing].div(len(df))

## Drop `car` column since it has 99% `NaN`

In [None]:
# Remove the `car` column from `df` in place, then preview the result.
df.drop(columns=['car'], inplace=True)
df

## Drop rows with `NaN` in `CarryAway` and `RestaurantLessThan20`, since `NaN` is not allowed and these rows make up only a small percentage of the dataset

In [None]:
# Discard, in place, every row that is missing a value in either of these
# two columns, then preview the result.
required_columns = ["CarryAway", "RestaurantLessThan20"]
df.dropna(axis=0, subset=required_columns, inplace=True)
df

## Change all **object** columns to **category**

In [None]:
# Convert every object-dtype column to the pandas `category` dtype in a
# single `astype` call, leaving all other columns untouched.
object_columns = df.select_dtypes(include="object").columns
df = df.astype({name: "category" for name in object_columns})
df

## Visualize dataset

In [None]:
# One bar chart per feature column: a cross-tabulation of the feature's
# values against the target `Y`.
feature_names = df.columns.drop('Y')
for feature in feature_names:
    counts = pd.crosstab(df[feature], df['Y'])
    ax = counts.plot(kind='bar')
    ax.set_xlabel(feature)
    ax.set_ylabel('Y')
    plt.show()

## Check if dataset is balanced

In [None]:
# Relative frequency of each target value in `Y` (fractions rather than raw counts).
df["Y"].value_counts(normalize=True)

# Value encoding

In [None]:
# Show the dtype of every column before encoding.
df.dtypes

In [None]:
# Separate the features from the target column `Y`.
x = df.drop('Y', axis=1)
y = df['Y']

# One-hot encode every feature column into a dense array.
# scikit-learn renamed `sparse` to `sparse_output` (1.2) and later removed the
# old keyword; try the current name first and fall back for older releases.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)

feature_columns = x.columns
encoded = ohe.fit_transform(x)

# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; use whichever the installed version provides.
if hasattr(ohe, "get_feature_names_out"):
    encoded_names = ohe.get_feature_names_out(feature_columns)
else:
    encoded_names = ohe.get_feature_names(feature_columns)

# Keep the original row index so `x` stays aligned with `y`.
x = pd.DataFrame(encoded, index=y.index, columns=encoded_names)
x

# Split data into train and test sets (k-fold)

In [None]:
# Shuffled 5-fold splitter with a fixed seed so the folds are reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=7)

# Building and evaluating model

## Gradient Boosting classifier

In [None]:
# Gradient boosting model, evaluated on every fold produced by `kf`.
gbc = GradientBoostingClassifier(
    max_depth=7,
    min_samples_split=5,
    random_state=7
)

# Running totals over the folds: summed accuracy and summed confusion matrix.
gbc_acc = 0
# Integer dtype keeps the summed counts exact; with a float accumulator
# ConfusionMatrixDisplay formats the cells with '.2g' (e.g. 3.9e+03).
gbc_cfm = np.zeros((2, 2), dtype=int)

for train_index, test_index in kf.split(x):
    print("TRAIN:", train_index, "TEST:", test_index)
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # `fit` retrains the estimator from scratch on this fold's training rows.
    gbc.fit(x_train, y_train)
    y_pred = gbc.predict(x_test)
    gbc_acc = gbc_acc + accuracy_score(y_test, y_pred)
    # Pin the label order to the fitted classes so every fold returns a matrix
    # of the same shape; a smaller matrix would broadcast silently into the sum.
    gbc_cfm = gbc_cfm + confusion_matrix(y_test, y_pred, labels=gbc.classes_)

## Average accuracy score

In [None]:
# Mean accuracy: the per-fold accuracies summed in `gbc_acc`, divided by the number of folds.
gbc_acc/kf.get_n_splits()

## Combined confusion matrix

In [None]:
# Plot the confusion matrix summed over all folds for gradient boosting.
gbc_display = ConfusionMatrixDisplay(confusion_matrix=gbc_cfm)
gbc_display.plot()
plt.show()

## Random Forest classifier

In [None]:
# Random forest model, evaluated on every fold produced by `kf`.
rfc = RandomForestClassifier(
    max_depth=7,
    min_samples_split=5,
    random_state=7
)

# Running totals over the folds: summed accuracy and summed confusion matrix.
rfc_acc = 0
# Integer dtype keeps the summed counts exact; with a float accumulator
# ConfusionMatrixDisplay formats the cells with '.2g' (e.g. 3.9e+03).
rfc_cfm = np.zeros((2, 2), dtype=int)

for train_index, test_index in kf.split(x):
    print("TRAIN:", train_index, "TEST:", test_index)
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # `fit` retrains the estimator from scratch on this fold's training rows.
    rfc.fit(x_train, y_train)
    y_pred = rfc.predict(x_test)
    rfc_acc = rfc_acc + accuracy_score(y_test, y_pred)
    # Pin the label order to the fitted classes so every fold returns a matrix
    # of the same shape; a smaller matrix would broadcast silently into the sum.
    rfc_cfm = rfc_cfm + confusion_matrix(y_test, y_pred, labels=rfc.classes_)

## Average accuracy score

In [None]:
# Mean accuracy: the per-fold accuracies summed in `rfc_acc`, divided by the number of folds.
rfc_acc/kf.get_n_splits()

## Combined confusion matrix

In [None]:
# Plot the confusion matrix summed over all folds for the random forest.
rfc_display = ConfusionMatrixDisplay(confusion_matrix=rfc_cfm)
rfc_display.plot()
plt.show()

## SVM classifier

In [None]:
# Support vector classifier with default hyper-parameters, evaluated on every
# fold produced by `kf`.
svm = SVC(
    random_state=7
)

# Running totals over the folds: summed accuracy and summed confusion matrix.
svm_acc = 0
# Integer dtype keeps the summed counts exact; with a float accumulator
# ConfusionMatrixDisplay formats the cells with '.2g' (e.g. 3.9e+03).
svm_cfm = np.zeros((2, 2), dtype=int)

for train_index, test_index in kf.split(x):
    print("TRAIN:", train_index, "TEST:", test_index)
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # `fit` retrains the estimator from scratch on this fold's training rows.
    svm.fit(x_train, y_train)
    y_pred = svm.predict(x_test)
    svm_acc = svm_acc + accuracy_score(y_test, y_pred)
    # Pin the label order to the fitted classes so every fold returns a matrix
    # of the same shape; a smaller matrix would broadcast silently into the sum.
    svm_cfm = svm_cfm + confusion_matrix(y_test, y_pred, labels=svm.classes_)

## Average accuracy score

In [None]:
# Mean accuracy: the per-fold accuracies summed in `svm_acc`, divided by the number of folds.
svm_acc/kf.get_n_splits()

## Combined confusion matrix

In [None]:
# Plot the confusion matrix summed over all folds for the SVM.
svm_display = ConfusionMatrixDisplay(confusion_matrix=svm_cfm)
svm_display.plot()
plt.show()

# Conclusion
According to the accuracy scores and confusion matrices of the 3 models, Gradient Boosting performs better than SVM and Random Forest.