# Import Library

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
import random 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn import metrics
from sklearn.inspection import permutation_importance
from xgboost import XGBClassifier
pd.set_option('display.max_columns', None)

# Import Data

In [None]:
# Kaggle input directory that holds the zipped competition CSV files.
path = '../input/walmart-recruiting-trip-type-classification/'
# Labelled training rows.
df = pd.read_csv(path + 'train.csv.zip')
# Unlabelled competition rows: must come from test.csv.zip, not from the
# training archive, otherwise `test` is just a second copy of `df`.
test = pd.read_csv(path + 'test.csv.zip')

In [None]:
# df = pd.read_csv('train.csv', encoding = "utf-8")
# test = pd.read_csv('test.csv', encoding = "utf-8")

# EDA!

- **TripType** - a categorical id representing the type of shopping trip the customer made. This is the ground truth that you are predicting. TripType_999 is an "other" category.

- **VisitNumber** - an id corresponding to a single trip by a single customer

- **Weekday** - the weekday of the trip

- **Upc** - the UPC number of the product purchased

- **ScanCount** - the number of the given item that was purchased. A negative value indicates a product return.

- **DepartmentDescription** - a high-level description of the item's department

- **FinelineNumber** - a more refined category for each of the products, created by Walmart

There are 5704 duplicated entries 

In [None]:
# Shape of the subset of rows that exactly repeat an earlier row.
is_repeat = df.duplicated()
df[is_repeat].shape

Because every column, including ScanCount, is identical in these rows, they are most likely true duplicates.

In [None]:
# Show every member of each duplicate group, first occurrences included.
in_duplicate_group = df.duplicated(keep=False)
df[in_duplicate_group]

Drop the duplicated rows

In [None]:
# Keep only the first occurrence of every fully identical row.
df = df.loc[~df.duplicated()]

Let's look at the data shape

In [None]:
# (rows, columns) of the current frame.
df.shape

In [None]:
# Preview the first rows.
df.head()

## Explore the responses 

There are 38 unique trip types

In [None]:
# Number of distinct trip types (a missing value would count as one).
df['TripType'].nunique(dropna=False)

The data is imbalanced across its 38 classes. Furthermore, the gap between the largest and the smallest class (max - min) is about 170k rows

In [None]:
plt.figure(figsize = (12, 10))

sn.set_style('whitegrid')
ax = sn.countplot(x = 'TripType', data = df, palette = 'mako')
ax = ax.set(title = 'The Frequent of Trip Type', ylabel = 'Counts', xlabel = 'Trip Type')

In [None]:
df['TripType'].value_counts().describe()

## Explore the predictors 

### ScanCount

No null in scancount

In [None]:
# Rows whose ScanCount is missing.
df[df['ScanCount'].isna()]

Histogram plot to see how the data is distributed

In [None]:
df['ScanCount'].plot.hist(bins=50)

In [None]:
df['ScanCount'].value_counts()

The most common quantity purchased per item per visit seems to be 1 or 2

### Returned Item

Items that were purchased and returned within the same visit should be removed

In [None]:
df.loc[df.duplicated(subset=['TripType', 'VisitNumber', 'Weekday', 'Upc', 'DepartmentDescription', 'FinelineNumber'], keep=False)]

In [None]:
df = df.groupby(['Upc', 'TripType', 'VisitNumber', 'Weekday',
            'DepartmentDescription', 'FinelineNumber'])['ScanCount'].sum().reset_index()

In [None]:
df.loc[df.duplicated(subset=['TripType', 'VisitNumber', 'Weekday',
                             'Upc', 'DepartmentDescription', 'FinelineNumber'], keep=False)]

In [None]:
df = df.loc[df['ScanCount'] != 0]

Items that were purchased and returned within the same visit have been removed

In [None]:
# How often each distinct scan count occurs in the current frame.
df['ScanCount'].value_counts()

The distribution of purchase quantities has not changed much

### VisitNumber

No null number

In [None]:
# Rows whose VisitNumber is missing.
df[df['VisitNumber'].isna()]

95674 unique visit numbers

In [None]:
len(df['VisitNumber'].unique())

In [None]:
number_item_purchase = pd.DataFrame()

**Create a series counting number of unique item purchase per visit**

In [None]:
number_item_purchase['unique_count'] = df.groupby(['VisitNumber'])['Upc'].count()

In [None]:
number_item_purchase['unique_count'].head()

In [None]:
number_item_purchase['unique_count'].plot.hist(bins=30, alpha=0.5)

In [None]:
number_item_purchase['unique_count'].describe()

Unsurprisingly for a hypermarket like Walmart, the average is around 7 unique items sold per trip.

**Create a series counting number of items purchase per visit**

In [None]:
number_item_purchase['item_sum'] = df.groupby(['VisitNumber'])['ScanCount'].sum()

In [None]:
number_item_purchase['item_sum'].head()

In [None]:
number_item_purchase['item_sum'].plot.hist(bins=30, alpha=0.5)

In [None]:
number_item_purchase['item_sum'].describe()

The average total number of items per trip is around 8.

Add additional info to number_item_purchase for future uses

In [None]:
number_item_purchase = number_item_purchase.reset_index().merge(df[['TripType', 'VisitNumber', 'Weekday']].drop_duplicates(), on='VisitNumber')

In [None]:
number_item_purchase.shape

In [None]:
number_item_purchase.head()

### Weekday

Let's look at which days of the week generate the most sales

In [None]:
# Net number of items sold per weekday, drawn as a bar chart.
weekday_totals = df.groupby('Weekday')['ScanCount'].sum()
weekday_totals.plot(kind='bar')

It looks like Sunday, Saturday, and Friday generate the most sales

### Department Description

There are 68 department descriptions

In [None]:
# Number of distinct departments (a missing value would count as one).
df['DepartmentDescription'].nunique(dropna=False)

Top ten department description sold

In [None]:
# value_counts sorts descending, so the first ten entries are the top ten.
department_counts = df['DepartmentDescription'].value_counts()
department_counts.iloc[:10]

Bottom ten department description sold

In [None]:
df['DepartmentDescription'].value_counts().tail(10)

In [None]:
plt.figure(figsize = (10, 30))

sn.set_style('whitegrid')
ax = sn.countplot(y='DepartmentDescription', data=df, palette='mako', order=df['DepartmentDescription'].value_counts().index)
sx = ax.set(title='The Frequent of Trip Type', xlabel='Counts', ylabel='Trip Type')

Produce and merchandise dominate the chart

### FinelineNumber

In [None]:
len(df['FinelineNumber'].unique())

In [None]:
df['FinelineNumber'].value_counts()

In [None]:
df.groupby(['DepartmentDescription', 'FinelineNumber'])['ScanCount']\
.sum().reset_index().sort_values(by='ScanCount', ascending=False).head(10)

Again produce and merchandise dominate the chart

The fineline number is too granular, so I will leave it out for now

# Feature Engineering

We need to transform the data into a machine-learning-ready form. First we need one trip per row; luckily, the per-visit table `number_item_purchase` already has that shape.

In [None]:
# Start the feature matrix from a copy of the per-visit aggregates.
X = number_item_purchase.copy()

In [None]:
# Preview the first rows.
X.head()

## Create feature from department detail

Now let's add each department's net scan count as a predictor

In [None]:
cat = df.groupby(['VisitNumber', 'DepartmentDescription'])['ScanCount'].sum().reset_index()
cat.head()

In [None]:
cat_sale = pd.pivot_table(cat, values=['ScanCount'], index=['VisitNumber'],
               columns=['DepartmentDescription'], aggfunc='sum')\
               .reset_index().fillna(0)
cat_sale.columns = cat_sale.columns.droplevel()
cat_sale = cat_sale.rename(columns={'': "VisitNumber"})

In [None]:
X = X.merge(cat_sale, on='VisitNumber')

## Encoding days

Change week day to numeric variable first

In [None]:
# Weekday names in Monday-first order, mapped to their position 0..6.
weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                 'Friday', 'Saturday', 'Sunday']
day_of_week = {name: position for position, name in enumerate(weekday_names)}

In [None]:
# Translate weekday names to integers; names missing from day_of_week
# become NaN.
day = X['Weekday'].map(day_of_week)

Change numeric days to sin and cos to make it cyclical

In [None]:
X['sin_day'] = np.sin(2*np.pi*day/7)
X['cos_day'] = np.cos(2*np.pi*day/7)
X = X.drop(['Weekday'], axis=1)

In [None]:
X.head()

## Dealing with response

Setting y to be equal to Trip type

In [None]:
data = X.copy()
y = X['TripType']
X = X.drop(['TripType', 'VisitNumber'], axis=1)

In [None]:
y

Before fitting any classifier, let's take one last detailed look at the relationship between the predictors and the response

# EDA 2

In [None]:
# Preview the labelled per-visit feature table.
data.head()

Let's look at response type 39 and 40

In [None]:
# First five visits labelled as trip type 39.
data[data['TripType'].eq(39)].head()

It looks like type 39 trips are grocery trips

In [None]:
# First five visits labelled as trip type 40.
data[data['TripType'].eq(40)].head()

Type 40 seems to be a grocery trip like type 39, but bulkier

In [None]:
# First five visits labelled as trip type 5.
data[data['TripType'].eq(5)].head()

Type 5 seems to be a pharmacy trip

The trips seem to be divided by what and how much the customer bought, and the fineline number may be too granular to be used without more preprocessing. Of course, grouping/clustering the fineline numbers together may end up being equivalent to using the department description.

# Modelling

From the last EDA, it seems the trip types may be a lot more clear-cut than expected. As the model tries to approximate human judgement, a non-linear tree-based model may be a better fit.

## Split test set for use as final evaluation

In [None]:
# Hold out 20% of the visits for the final evaluation; the fixed seed makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Baseline

We need a simple model as a reference point for evaluating how well the more complicated models perform

### Random model

We will use the class frequencies as a PMF that serves as the baseline for the dataset

In [None]:
# Display the training labels.
y_train

Order the classes by ascending label, as done by sklearn.metrics.log_loss

In [None]:
# Empirical class frequencies of the training labels, used as one constant
# probability vector for every sample.
# sort_index() orders the entries by ascending class label, which is the
# column order log_loss assumes for y_pred. Sorting on the index avoids
# depending on the column names that value_counts().reset_index() produces,
# which differ between pandas versions.
random_model = y_train.value_counts(normalize=True).sort_index().to_numpy()

Now assign the baseline probabilities to every sample in the training set

In [None]:
# Every training sample gets the same class-frequency vector.
n_train_rows = y_train.shape[0]
y_train_pred_prob = [random_model] * n_train_rows

### Model Evaluation

First using **cross entropy**

In [None]:
# Cross entropy of the constant-frequency baseline on the training labels.
log_loss(y_train, y_train_pred_prob)

As expected, the model does not perform well at all

Now test set

In [None]:
y_test_pred_prob = []
for i in range(y_test.shape[0]):
    y_test_pred_prob.append(random_model)

In [None]:
log_loss(y_test, y_test_pred_prob)

According to these probabilities, every visit is predicted as the class with the highest frequency, which is trip type 40

In [None]:
y_test_pred = []
for i in range(y_test.shape[0]):
    y_test_pred.append(40)

In [None]:
report = metrics.classification_report(y_test, y_test_pred, digits=3, output_dict=True)

In [None]:
report_df = pd.DataFrame(report).transpose()
report_df

This can be used as a template for evaluating other models

## Random forest

In [None]:
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)

In [None]:
y_test_pred_prob = rfc.predict_proba(X_test)
y_test_pred = rfc.predict(X_test)

### Cross entropy

In [None]:
# The probability columns follow rfc.classes_. Passing them explicitly keeps
# log_loss working when y_test happens to lack one of the rare classes.
log_loss(y_test, y_test_pred_prob, labels=rfc.classes_)

### Other metrics

In [None]:
# Per-class precision/recall/F1, shown with one row per class.
report = metrics.classification_report(
    y_test, y_test_pred, digits=3, output_dict=True)
report_df = pd.DataFrame(report).T
report_df

Unsurprisingly, Random Forest works better than the baseline model.

For multiclass classification, the model does better than expected; however, it seems to have trouble classifying the minority classes

### Feature importance

In [None]:
# Feature importances of the fitted forest, largest first.
importances = pd.DataFrame({
    'features': X_train.columns,
    'importance': rfc.feature_importances_,
})
importances = importances.sort_values(by='importance', ascending=False)

plt.figure(figsize=(12, 30))
sn.set_style('whitegrid')
importance_axes = sn.barplot(data=importances, x='importance', y='features', palette='mako')
ax = importance_axes.set(
    title='The Random Forest Feature Importance',
    xlabel='Importances(GINI)',
    ylabel='Features',
)

The model seems to learn from total item purchase and unique item purchase well; the more popular the department, the more critical the feature.

## XGBoost

In [None]:
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [None]:
xgb = XGBClassifier(objective='multi:softmax', booster='gbtree', eval_metric='mlogloss', num_class='38', importance_type='weight')
eval_set = [(X_val, y_val)]
xgb.fit(X_tr, y_tr, eval_set=eval_set, verbose=False)

In [None]:
y_test_pred_prob = xgb.predict_proba(X_test)
y_test_pred = xgb.predict(X_test)

### Cross entropy

In [None]:
# The probability columns follow the sorted labels the booster was trained
# on (y_tr). Passing them explicitly keeps log_loss working when y_test
# happens to lack one of the rare classes.
log_loss(y_test, y_test_pred_prob, labels=np.sort(y_tr.unique()))

### Other metrics

In [None]:
# Per-class precision/recall/F1, shown with one row per class.
report = metrics.classification_report(
    y_test, y_test_pred, digits=3, output_dict=True)
report_df = pd.DataFrame(report).T
report_df

XGBoost outperforms Random Forest, as expected. In general, gradient boosting seems to be a more powerful estimator than a bagging algorithm like Random Forest

Still, the model cannot deal with the minority classes; oversampling may be needed

### Feature importance

In [None]:
# Feature importances reported by the fitted XGBoost model, largest first.
importances = pd.DataFrame(xgb.feature_importances_, index=X_train.columns).reset_index()
importances.columns=['features', 'importance']
importances = importances.sort_values(by='importance', ascending=False)
plt.figure(figsize = (12, 30))
sn.set_style('whitegrid')
ax = sn.barplot( x='importance', y='features', data=importances, palette = 'mako')
# The classifier is configured with importance_type='weight', so label the
# chart as XGBoost weight importances rather than Random Forest Gini ones.
ax = ax.set(title = 'The XGBoost Feature Importance', xlabel = 'Importances (weight)', ylabel = 'Features')

The XGBoost feature importances are not that different from Random Forest's, yet XGBoost outperforms Random Forest, which is not surprising given that XGBoost is generally a lower-bias estimator.

A lot more can be done, but stock XGBoost seems to do an adequate job of estimating the problem.