# Loading Libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns
import gc
# Default seaborn palette; the plots below pick individual colours from it by index.
color = sns.color_palette()

%matplotlib inline

# Take a glimpse at the data sets

In [None]:
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output

# Show what the input directory contains before reading anything from it.
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

# Read the two order/product tables and the orders table, reporting each
# table's (rows, columns) shape right after it has been loaded.
op_prior_df = pd.read_csv(filepath_or_buffer="../input/order_products__prior.csv")
print(op_prior_df.shape)

op_train_df = pd.read_csv(filepath_or_buffer="../input/order_products__train.csv")
print(op_train_df.shape)

orders_df = pd.read_csv(filepath_or_buffer="../input/orders.csv")
print(orders_df.shape)

# Define what type of problem to solve

Based on the nature of the problem itself, it would be very tedious to solve it as a multi-classification problem. For example, if user 1 has purchased 3 items historically: 00001, 00002, 00003, then there are pow(2,3)=8 combinations to predict with. If user 1 has purchased 20 items, then there are pow(2,20)=1048576 combinations!

To make it simpler, we can reformulate the task as a binary classification problem. For each (order, product) pair we assign a label, reordered = 1 or 0. The products predicted as positive are then joined into a single line per order to build the submission file.

In this case, we can extract features order-wise (which relate to user behavior) and product-wise.

## Order Wise EDA

In [None]:
# Preview the first five rows of the orders table.
orders_df.head(n=5)

Any NA value?

In [None]:
# Number of missing values in each column of the orders table.
orders_df.isna().sum()

In [None]:
# First rows of the orders whose days_since_prior_order is missing.
orders_df[orders_df['days_since_prior_order'].isna()].head(n=5)

In [None]:
# Check that days_since_prior_order is missing exactly on the rows where
# order_number == 1. The two boolean masks are compared row by row; comparing
# the order_id values selected by each mask would only ever compare an id with
# itself and therefore could not reveal a mismatch.
(orders_df.days_since_prior_order.isnull() == (orders_df.order_number == 1)).all()

Looks like they all belong to order_number == 1 (which makes sense)

In [None]:
# Preview the first five rows of the prior order/product table.
op_prior_df.head(n=5)

In [None]:
# Number of missing values in each column of the prior order/product table.
op_prior_df.isna().sum()

In [None]:
# Preview the first five rows of the train order/product table.
op_train_df.head(n=5)

In [None]:
# Number of missing values in each column of the train order/product table.
op_train_df.isna().sum()

### Number of unique users in total

In [None]:
# Count the distinct user ids present in the orders table.
print(orders_df['user_id'].nunique())

### Count of prior, train, test in orders data set

In [None]:
# How many orders fall into each eval_set value.
order_eval_cnt = orders_df['eval_set'].value_counts()

In [None]:
print(order_eval_cnt)

# Bar chart of the number of orders per eval_set value.
# x and y are passed by keyword: current seaborn releases do not accept them
# as positional arguments.
plt.figure(figsize=(12,5))
sns.barplot(x=order_eval_cnt.index, y=order_eval_cnt.values, alpha=0.8, color=color[3])
plt.title("Count of eval_set in orders data set")
plt.xlabel("eval_set")
plt.ylabel("count")
plt.show()

206,209 = 75,000 + 131,209 
Each user's last order belongs to either the train or the test set.

### Number of orders per user

In [None]:
# Histogram of how many orders each user has placed.
fig, ax = plt.subplots(figsize=(12,5))
orders_per_user = orders_df['user_id'].value_counts()
orders_per_user.plot.hist(ax=ax, alpha=0.8, color=color[3])
ax.set_title("Number of orders per user")
ax.set_xlabel("Number of orders")
ax.set_ylabel("Frequency")
plt.show()

### Number of products per order

Concatenate op_prior_df and op_train_df into one data frame

In [None]:
# Stack the prior and train order/product rows into one frame with a fresh
# 0..n-1 index. pd.concat is used because DataFrame.append is no longer
# available in pandas 2.x.
op_concate_df = pd.concat([op_prior_df, op_train_df], ignore_index=True)
op_concate_df.shape

Are there any duplicate product_id within the same order?

In [None]:
# Count the distinct (order_id, product_id) pairs and compare with the actual
# number of rows; the comparison prints True when at least one pair is repeated.
op_dedup_len = op_concate_df[['order_id','product_id']].drop_duplicates().shape[0]
print(op_dedup_len != len(op_concate_df))

In [None]:
# Basket size per order, then how many orders have each basket size.
order_pd_cnt = op_concate_df.groupby('order_id').size().value_counts()

In [None]:
# Bar chart: number of orders (y) for each basket size (x).
# x and y are passed by keyword: current seaborn releases do not accept them
# as positional arguments.
plt.figure(figsize=(12,5))
sns.barplot(x=order_pd_cnt.index, y=order_pd_cnt.values, alpha=0.8, color=color[3])
plt.title("Number of products per order")
plt.xlabel("Number of products")
plt.ylabel("Frequency")
plt.show()

### Order Day of Week & Hour of Day
Order Day of Week (Looks like 0 and 1 are weekends with more orders)

In [None]:
# Number of orders placed on each day of the week.
order_dow_cnt = orders_df.order_dow.value_counts()

In [None]:
# Bar chart of order counts per day of week.
# x and y are passed by keyword: current seaborn releases do not accept them
# as positional arguments.
plt.figure(figsize=(12,5))
sns.barplot(x=order_dow_cnt.index, y=order_dow_cnt.values, alpha=0.8, color=color[3])
plt.title("Distribution of Order Day of Week")
plt.xlabel("Day of Week")
plt.ylabel("Frequency")
plt.show()

Order Hour of Day

In [None]:
# Number of orders placed in each hour of the day.
order_hour_cnt = orders_df.order_hour_of_day.value_counts()

In [None]:
# Bar chart of order counts per hour of day.
# x and y are passed by keyword: current seaborn releases do not accept them
# as positional arguments.
plt.figure(figsize=(12,5))
sns.barplot(x=order_hour_cnt.index, y=order_hour_cnt.values, alpha=0.8, color=color[3])
plt.title("Distribution of Order Hour of Day")
plt.xlabel("Hour of Day")
plt.ylabel("Frequency")
plt.show()

Frequency of Order Hour of Day by Day of Week

In [None]:
# Count orders for every (day of week, hour of day) pair, then reshape into a
# day-by-hour grid. pivot() is called with keyword arguments because pandas 2.x
# no longer accepts index/columns/values positionally.
dow_hour_df = orders_df.groupby(['order_dow','order_hour_of_day'])['order_number'].agg('count').reset_index()
dow_hour_df = dow_hour_df.pivot(index='order_dow', columns='order_hour_of_day', values='order_number')

In [None]:
# Heatmap of order counts: one row per day of week, one column per hour.
fig, ax = plt.subplots(figsize=(12,8))
sns.heatmap(dow_hour_df, ax=ax)

### Days Since Prior Order

In [None]:
# Number of orders for each value of days_since_prior_order.
days_prior_cnt = orders_df.days_since_prior_order.value_counts()

In [None]:
# Bar chart of order counts per days-since-prior-order value.
# x and y are passed by keyword: current seaborn releases do not accept them
# as positional arguments.
plt.figure(figsize=(12,5))
sns.barplot(x=days_prior_cnt.index, y=days_prior_cnt.values, alpha=0.8, color=color[3])
plt.title("Distribution of Days Since Prior Order")
plt.xlabel("Days Since Prior Order")
plt.ylabel("Frequency")
plt.show()

DOW by Days Since Prior Order

In [None]:
# Count orders for every (day of week, days since prior order) pair, then
# reshape into a grid. pivot() is called with keyword arguments because
# pandas 2.x no longer accepts index/columns/values positionally.
dow_daysprior_df = orders_df.groupby(['order_dow','days_since_prior_order'])['order_number'].agg('count').reset_index()
dow_daysprior_df = dow_daysprior_df.pivot(index='order_dow', columns='days_since_prior_order', values='order_number')

In [None]:
# Heatmap of order counts: day of week against days since the prior order.
fig, ax = plt.subplots(figsize=(12,8))
sns.heatmap(dow_daysprior_df, ax=ax)

It seems like a lot of people place recurring orders on weekends.

 

## Product Wise EDA

### Proportion of reordered products per order

In [None]:
# Drop the orders table and ask the garbage collector to reclaim its memory.
# NOTE: running this cell a second time raises NameError because the name is gone.
del orders_df
gc.collect()

In [None]:
# For every order: reordered items (sum) divided by all items (count).
prop_reorder = (
    op_concate_df.groupby('order_id')['reordered']
    .agg(['count', 'sum'])
    .pipe(lambda stats: stats['sum'] / stats['count'])
)

In [None]:
# Histogram of the per-order reorder proportion; the title is set on the axes
# that hist() returns.
prop_reorder.hist(
    figsize=(12,5), color=color[5], alpha=.8
).set_title('Distribution of proportion of reordered products')

In [None]:
# Drop the per-order proportions and ask the garbage collector to reclaim the memory.
del prop_reorder
gc.collect()

As we can see, a large share of the products in an order are reordered products.

### Is reorder probability related to add to cart order?

Distribution of add to cart order

In [None]:
# Histogram of the position at which a product was added to the cart; the
# title is set on the axes that hist() returns.
op_concate_df['add_to_cart_order'].hist(
    color=color[5], alpha=.8, figsize=(12,5), bins=145
).set_title('Distribution of add to cart order')

In [None]:
# Summary statistics of the add-to-cart position.
op_concate_df['add_to_cart_order'].describe()

Let's cut add_to_cart_order by quantile, then observe the probability of reordered by these groups.

In [None]:
# Bucket add_to_cart_order into four quantile bins, then compute for each bin
# the item count, the number of reordered items and their ratio.
add_to_cart_order_grp = pd.qcut(op_concate_df.add_to_cart_order, q=4)
p_reord_cart_ord = (
    op_concate_df.groupby(add_to_cart_order_grp)['reordered']
    .agg(['count', 'sum'])
    .assign(p_reorder=lambda stats: stats['sum'] / stats['count'])
)

In [None]:
# Drop the quantile-bin labels and ask the garbage collector to reclaim the memory.
del add_to_cart_order_grp
gc.collect()

In [None]:
# Bar chart of the reorder ratio for each add-to-cart quantile bin.
# x and y are passed by keyword: current seaborn releases do not accept them
# as positional arguments.
plt.figure(figsize=(12,5))
sns.barplot(x=p_reord_cart_ord.index, y=p_reord_cart_ord.p_reorder, color=color[5], alpha=.8)
plt.title('Probability of product reordered by add to cart order')

Reordered products are more likely to be added to the cart earlier.

### Merging products.csv, departments.csv, aisles.csv and order_products_XX.csv

In [None]:
# Read the aisle, department and product lookup tables, reporting each
# table's (rows, columns) shape right after it has been loaded.
aisles_df = pd.read_csv(filepath_or_buffer="../input/aisles.csv")
print(aisles_df.shape)

dpmt_df = pd.read_csv(filepath_or_buffer="../input/departments.csv")
print(dpmt_df.shape)

products_df = pd.read_csv(filepath_or_buffer="../input/products.csv")
print(products_df.shape)

In [None]:
# Attach the department columns and then the aisle columns to every product.
# Left joins keep every product row even when a lookup has no match.
products = (
    products_df
    .merge(dpmt_df, on='department_id', how='left')
    .merge(aisles_df, on='aisle_id', how='left')
)
products.head(n=5)

In [None]:
# The three lookup tables have been merged into `products`; drop them and ask
# the garbage collector to reclaim the memory.
del aisles_df, dpmt_df, products_df
gc.collect()

In [None]:
# Number of missing values in each column of the merged product table.
products.isna().sum()

**Does one aisle belong to different departments?**

In [None]:
# Reduce to the distinct (aisle, department) pairs, then count the
# occurrences of each department within each aisle.
aisel_dpmt = (
    products[['aisle', 'department']]
    .drop_duplicates()
    .groupby('aisle')['department']
    .value_counts()
)
aisel_dpmt.head(n=5)

In [None]:
# aisel_dpmt holds one entry per distinct (aisle, department) pair, so each
# value in it is 1 by construction and testing the values proves nothing.
# Count the entries per aisle instead: an aisle that sits in more than one
# department would have more than one entry.
(aisel_dpmt.groupby(level='aisle').size() == 1).all()

As we can see, an aisle can only belong to 1 department. Great.

**Number of aisles per department**

In [None]:
# Flatten the (aisle, department) counts into a plain table, then count how
# many aisle rows each department has.
aisel_dpmt = (
    pd.DataFrame(aisel_dpmt)
    .rename(columns={'department': 'count'})
    .reset_index()
)
num_aisel_dpmt = aisel_dpmt.groupby('department').size()

In [None]:
# Order departments from the most aisles to the fewest so the bar chart reads left to right.
num_aisel_dpmt = num_aisel_dpmt.sort_values(ascending=False)

In [None]:
# Bar chart of the number of aisles in each department.
# x and y are passed by keyword: current seaborn releases do not accept them
# as positional arguments.
plt.figure(figsize=(16,5))
sns.barplot(x=num_aisel_dpmt.index, y=num_aisel_dpmt.values, color=color[5], alpha=.8)
plt.xticks(rotation = 'vertical')

**Number of products per department**

In [None]:
# Number of (non-null) product names in each department, largest first.
pd_dpmt = (
    products.groupby('department')['product_name']
    .count()
    .sort_values(ascending=False)
)

In [None]:
# Bar chart of the number of products in each department.
# x and y are passed by keyword: current seaborn releases do not accept them
# as positional arguments.
plt.figure(figsize=(16,5))
sns.barplot(x=pd_dpmt.index, y=pd_dpmt.values, color=color[5], alpha=.8)
plt.xticks(rotation = 'vertical')

### Bestsellers ###

**Merging products and order_product_XX.csv**

In [None]:
# Join every order/product row with its product, department and aisle names.
# The join is performed slice by slice rather than in one call, presumably to
# keep the size of each individual merge small.
len_op_concat = len(op_concate_df)
indexes = np.linspace(0, len_op_concat, num=100, dtype=np.int32)

# Half-open positional slices [start, stop) cover each row exactly once.
# Label-based .loc slices include both end points, which would repeat the row
# sitting on every slice boundary.
merged_chunks = [
    pd.merge(left=op_concate_df.iloc[start:stop], right=products,
             on='product_id', how='left')
    for start, stop in zip(indexes[:-1], indexes[1:])
]

# The un-merged table is not needed once every slice has been joined.
del op_concate_df

# Concatenate once at the end: growing a frame inside the loop copies all
# accumulated rows on every iteration, and DataFrame.append is no longer
# available in pandas 2.x.
order_pd = pd.concat(merged_chunks, ignore_index=True)
del merged_chunks

In [None]:
# Preview the first five rows of the merged order/product table.
order_pd.head(n=5)

**Best selling products**

In [None]:
# Number of order lines per product name, most frequent first.
bestsellers = (
    order_pd.groupby('product_name')
    .size()
    .sort_values(ascending=False)
)

In [None]:
# Number of entries to show in the "top" charts and tables below.
top = 15
# Keep only the leading `top` products.
bestsellers = bestsellers.head(top)

In [None]:
# Bar chart of order-line counts for the best-selling products.
# x and y are passed by keyword: current seaborn releases do not accept them
# as positional arguments.
plt.figure(figsize=(16,5))
sns.barplot(x=bestsellers.index, y=bestsellers.values, color=color[5], alpha=.8)
plt.xticks(rotation = 'vertical')

Seems like organic foods are sold well.

**Which departments have the most selling products**

In [None]:
# Number of order lines per department, most frequent first, cut to `top` rows.
bestsellers_dpmt = (
    order_pd.groupby('department')
    .size()
    .sort_values(ascending=False)
    .head(top)
)

In [None]:
# Bar chart of order-line counts for the best-selling departments.
# x and y are passed by keyword: current seaborn releases do not accept them
# as positional arguments.
plt.figure(figsize=(16,5))
sns.barplot(x=bestsellers_dpmt.index, y=bestsellers_dpmt.values, color=color[5], alpha=.8)
plt.xticks(rotation = 'vertical')

**Which aisles have the most selling products**

In [None]:
# Number of order lines per aisle, most frequent first, cut to `top` rows.
bestsellers_aisle = (
    order_pd.groupby('aisle')
    .size()
    .sort_values(ascending=False)
    .head(top)
)

In [None]:
# Bar chart of order-line counts for the best-selling aisles.
# x and y are passed by keyword: current seaborn releases do not accept them
# as positional arguments.
plt.figure(figsize=(16,5))
sns.barplot(x=bestsellers_aisle.index, y=bestsellers_aisle.values, color=color[5], alpha=.8)
plt.xticks(rotation = 'vertical')

### Most reordered

**Most reordered products**

In [None]:
# Per product: reordered lines (sum), all lines (count) and their ratio,
# sorted by ratio with the highest first and cut to 2*top rows.
most_reorder = (
    order_pd.groupby('product_name')['reordered']
    .agg(['sum', 'count'])
    .assign(reordered=lambda stats: stats['sum'] / stats['count'])
    .sort_values('reordered', ascending=False)
    .head(2 * top)
)

In [None]:
# Display the table of products with the highest reorder ratio.
most_reorder

Seems like these products have a very high reorder ratio even though they do not have many sales in total.

How about those with large sales?

In [None]:
# Reorder ratio restricted to products with a large number of order lines.
min_order_lines = 10000  # a product must appear in more lines than this to be kept

# The ratio column is added with assign() on the filtered result, so a new
# frame is produced instead of writing a column into a filtered slice.
most_reorder = (
    order_pd.groupby('product_name')['reordered']
    .agg(['sum', 'count'])
    .loc[lambda stats: stats['count'] > min_order_lines]
    .assign(reordered=lambda stats: stats['sum'] / stats['count'])
    .sort_values('reordered', ascending=False)
    .head(top)
)

In [None]:
# Twin-axis bar chart: reorder ratio (left axis) next to the number of order
# lines (right axis) for each product in most_reorder.
fig = plt.figure(figsize=(16,5)) # Create matplotlib figure

ax = fig.add_subplot(111) # Create matplotlib axes
ax2 = ax.twinx() # Create another axes that shares the same x-axis as ax.

width = 0.4

# position=1 and position=0 align the two bar series on opposite sides of each
# tick so they sit side by side instead of overlapping.
most_reorder['reordered'].plot(kind='bar', color=color[5], ax=ax, width=width, position=1)
most_reorder['count'].plot(kind='bar', color=color[4], ax=ax2, width=width, position=0)

ax.set_ylabel('probability of being reordered')
ax2.set_ylabel('count of order that contains this product')
ax.set_ylim(.7,.875)

handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, labels, loc=2)

handles2, labels2 = ax2.get_legend_handles_labels()
ax2.legend(handles2, labels2, loc=0)

# Give the x-axis a text label; handing the whole index object to xlabel
# would use the index's string representation as the label.
ax.set_xlabel('product name')

plt.show()

It seems that even though some products do not have many historical sales, once a user has bought them they are very likely to be reordered.

**Which departments have the most reordered products**

In [None]:
# Per department: reordered lines (sum), all lines (count) and their ratio,
# sorted by ratio with the highest first.
most_reorder_dpmt = (
    order_pd.groupby('department')['reordered']
    .agg(['sum', 'count'])
    .assign(reordered=lambda stats: stats['sum'] / stats['count'])
    .sort_values('reordered', ascending=False)
)

In [None]:
# Twin-axis bar chart: reorder ratio (left axis) next to the number of order
# lines (right axis) for each department.
fig = plt.figure(figsize=(16,5)) # Create matplotlib figure

ax = fig.add_subplot(111) # Create matplotlib axes
ax2 = ax.twinx() # Create another axes that shares the same x-axis as ax.

width = 0.4

# position=1 and position=0 align the two bar series on opposite sides of each
# tick so they sit side by side instead of overlapping.
most_reorder_dpmt['reordered'].plot(kind='bar', color=color[5], ax=ax, width=width, position=1)
most_reorder_dpmt['count'].plot(kind='bar', color=color[4], ax=ax2, width=width, position=0)

ax.set_ylabel('probability of being reordered')
ax2.set_ylabel('count of order that contains products from this department')
ax.set_ylim(.3,.7)

handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, labels, loc=2)

handles2, labels2 = ax2.get_legend_handles_labels()
ax2.legend(handles2, labels2, loc=0)

# Give the x-axis a text label; handing the whole index object to xlabel
# would use the index's string representation as the label.
ax.set_xlabel('department')

plt.show()

Looks like produce and dairy eggs departments have both large sales and reordered demand.

**However, some departments (like pets) have a quite high reorder rate even though they don't have large sales in general.**

**Which aisles have the most reordered products**

In [None]:
# Per aisle: reordered lines (sum), all lines (count) and their ratio,
# sorted by ratio with the highest first.
most_reorder_aisle = (
    order_pd.groupby('aisle')['reordered']
    .agg(['sum', 'count'])
    .assign(reordered=lambda stats: stats['sum'] / stats['count'])
    .sort_values('reordered', ascending=False)
)

In [None]:
# Keep the `top` aisles with the highest reorder ratio and display them.
most_reorder_aisle = most_reorder_aisle.head(top)
most_reorder_aisle

In [None]:
# Twin-axis bar chart: reorder ratio (left axis) next to the number of order
# lines (right axis) for each aisle in most_reorder_aisle.
fig = plt.figure(figsize=(16,5)) # Create matplotlib figure

ax = fig.add_subplot(111) # Create matplotlib axes
ax2 = ax.twinx() # Create another axes that shares the same x-axis as ax.

width = 0.4

# position=1 and position=0 align the two bar series on opposite sides of each
# tick so they sit side by side instead of overlapping.
most_reorder_aisle['reordered'].plot(kind='bar', color=color[5], ax=ax, width=width, position=1)
most_reorder_aisle['count'].plot(kind='bar', color=color[4], ax=ax2, width=width, position=0)

ax.set_ylabel('probability of being reordered')
# This chart is grouped by aisle, so the right-hand label names aisles.
ax2.set_ylabel('count of order that contains products from this aisle')
ax.set_ylim(.6,.8)

handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, labels, loc=2)

handles2, labels2 = ax2.get_legend_handles_labels()
ax2.legend(handles2, labels2, loc=0)

# Give the x-axis a text label; handing the whole index object to xlabel
# would use the index's string representation as the label.
ax.set_xlabel('aisle')

plt.show()