In [None]:
!pip install dataprep

## 1. Load libraries

In [None]:
import pandas as pd
import numpy as np
from dataprep.eda import create_report
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier, cv
from sklearn.metrics import accuracy_score, f1_score, precision_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

## 2. Read and visualize the data

In [None]:
# Read the training CSV into a DataFrame and preview its first rows.
tr = pd.read_csv('../input/customer-analytics/Train.csv')
tr.head()

> Using the dataprep package, we can run a basic EDA of the data.

In [None]:
# Build dataprep's EDA report for the raw training frame.
create_report(tr)

In [None]:
# Print the frame's column summary (dtypes and non-null counts).
tr.info()

#### 2-1. There was no difference in the number of calls by gender.

In [None]:
# Count how often each number of customer-care calls occurs, per gender.
# Each gender's value_counts() is computed once and reused for both axes.
fig = go.Figure(data=[
    go.Bar(name=label, x=call_counts.index, y=call_counts.values)
    for label, call_counts in (
        (name, tr.loc[tr.Gender == code, 'Customer_care_calls'].value_counts())
        for name, code in (('Female', 'F'), ('Male', 'M'))
    )
])

# Grouped bars place the two genders side by side for each call count.
fig.update_layout(
    barmode='group',
    title='Customer care calls by gender',
    xaxis_title='Customer_care_calls',
    yaxis_title='Count',
)
fig.show()

#### 2-2. There was no difference in cost of the product by gender.

In [None]:
# Cost of the product, one series per gender (same order as the labels).
group_labels = ['Female', 'Male']
colors = ['slategray', 'magenta']
hist_data = [
    tr.loc[tr.Gender == code, 'Cost_of_the_Product']
    for code in ('F', 'M')
]

# Overlay the two distributions; no bin_size is passed, so plotly's default applies.
fig = ff.create_distplot(hist_data, group_labels, colors=colors)
fig.show()

#### 2-3. There was no difference in weight of product by gender.

In [None]:
# Product weight, one series per gender (same order as the labels).
group_labels = ['Female', 'Male']
colors = ['slategray', 'magenta']
hist_data = [
    tr.loc[tr.Gender == code, 'Weight_in_gms']
    for code in ('F', 'M')
]

# Overlay the two distributions; no bin_size is passed, so plotly's default applies.
fig = ff.create_distplot(hist_data, group_labels, colors=colors)
fig.show()

#### 2-4. There was no difference in weight of product by shipment mode.

In [None]:
# Product weight, one series per shipment mode (same order as the labels).
group_labels = ['Ship', 'Flight', 'Road']
colors = ['Red', 'Blue', 'Green']
hist_data = [
    tr.loc[tr.Mode_of_Shipment == mode, 'Weight_in_gms']
    for mode in group_labels
]

# Overlay the three distributions; no bin_size is passed, so plotly's default applies.
fig = ff.create_distplot(hist_data, group_labels, colors=colors)
fig.show()

## 3. Data preprocessing

In [None]:
# Drop the first column by position (presumably a row identifier — confirm).
# NOTE: this reassigns `tr`, so re-running the cell removes one more column each time.
tr = tr.iloc[:,1:]

In [None]:
# Standardize the columns at positions 2, 3, 4, 5, 8 and 9
# (presumably the numeric features — confirm against the frame).
# The columns are replaced by label rather than written through .iloc, so the
# float results are not forced into the dtypes of the existing columns.
# NOTE(review): the scaler is fit on every row before the train/test split
# further down, so held-out rows influence the scaling statistics.
sc = StandardScaler()
scaled_cols = tr.columns[[2,3,4,5,8,9]]
tr[scaled_cols] = sc.fit_transform(tr[scaled_cols])

In [None]:
# Dummy-encode every column except the last one, which is later used as the label.
tr_dum = pd.get_dummies(tr.iloc[:,:-1])

In [None]:
# Re-attach the last column of `tr` to the encoded features, so it is again the last column of `train`.
train = pd.concat([tr_dum,tr.iloc[:,-1]], axis='columns')

In [None]:
# Preview the assembled modelling frame.
train.head()

In [None]:
# Stratified 70/30 split: every column but the last is a feature, the last is the label.
X_train, X_test, y_train, y_test = train_test_split(
    train.iloc[:, :-1],
    train.iloc[:, -1],
    test_size=0.3,
    stratify=train.iloc[:, -1],
    random_state=123,
)

## 4. Modeling

### 4-1. XGBoost

In [None]:
# Fit gradient-boosted trees on the training split and score them on the held-out split.
xgb = XGBClassifier(n_estimators=20, learning_rate=0.1, max_depth=50,
                    use_label_encoder=False, objective="binary:hinge")
xgb.fit(X_train, y_train)
pred = xgb.predict(X_test)

# scikit-learn metrics take (y_true, y_pred). Accuracy and binary F1 give the
# same value either way, but precision does not: with the arguments swapped
# the number printed as "Precision score" is actually recall.
Acc_score = accuracy_score(y_test, pred)
F1_score = f1_score(y_test, pred)
Precision_score = precision_score(y_test, pred)
print("Acc score : {:.2f}".format(Acc_score))
print("F1 score : {:.2f}".format(F1_score))
print("Precision score : {:.2f}".format(Precision_score))

### 4-2. Random forest with GridSearchCV

In [None]:
# Grid-search n_estimators and max_depth (5..15 each) with 4-fold CV on accuracy.
# The forest is seeded so that the selected best_params_ are the same on every
# run; an unseeded forest can pick a different winner each time.
rf = RandomForestClassifier(random_state=123)
rf_param_grid = {'n_estimators' : list(range(5,16)), 'max_depth' : list(range(5,16))}
rf_grid = GridSearchCV(estimator=rf,
                      param_grid=rf_param_grid,
                      scoring='accuracy',
                      cv=4,
                      return_train_score=True)

# The search runs on the full `train` frame (all columns but the last are features).
rf_grid.fit(train.iloc[:, :-1],train.iloc[:,-1])

In [None]:
import itertools
import matplotlib.pyplot as plt

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix; rows are shown as true labels, columns as
        predicted labels.
    classes : sequence
        Tick labels, used for both axes.
    normalize : bool, default False
        If True, each row is divided by its row sum before plotting, so both
        the colours and the cell text show per-true-class fractions.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap passed to ``plt.imshow``.
    """
    cm = np.asarray(cm)

    # Normalize BEFORE drawing so the heat-map colours, the printed numbers and
    # the text-colour threshold all refer to the same values.
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Rows with no samples stay at 0 instead of turning into NaN (0/0).
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # Fractions are shortened to two decimals; raw counts are shown as-is.
        cell_text = format(cm[i, j], '.2f') if normalize else cm[i, j]
        plt.text(j, i, cell_text,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
# Show the parameter combination the grid search selected (scored on accuracy).
rf_grid.best_params_

In [None]:
# Refit a forest with the hyper-parameters the grid search selected, instead of
# hard-coded values that go stale whenever the search result changes.
# random_state makes the refit reproducible.
rf_best = RandomForestClassifier(**rf_grid.best_params_, random_state=123)
rf_best.fit(train.iloc[:, :-1],train.iloc[:,-1])

In [None]:
# NOTE(review): predictions are made on the same rows rf_best was fit on, so they are in-sample.
pred = rf_best.predict(train.iloc[:, :-1])

## 5. Result (confusion matrix)

In [None]:
# Confusion matrix and accuracy of `pred` against the labels in `train`.
cnf_matrix_tra = confusion_matrix(train.iloc[:,-1], pred)
# accuracy_score returns a fraction in [0, 1]; scale it before labelling it '%'.
print(round(100 * accuracy_score(train.iloc[:,-1], pred)), '%')

class_names = [0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix_tra , classes=class_names, title='Confusion matrix')
plt.show()