# Instacart Market Basket Analysis

Import necessary modules

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn as sk
import sklearn.tree as tree
from IPython.display import Image  
#import pydotplus
import matplotlib.pyplot as plt
import squarify 

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Widen pandas' text display width so wide frames wrap less.
pd.set_option('display.width', 1000)

## Load datasets

In [None]:
# Read at most row_cap rows from each CSV that is capped (keeps the Kaggle
# kernel happy); departments and aisles are read in full.
row_cap = 2000000
orders = pd.read_csv('../input/orders.csv', nrows=row_cap)
products = pd.read_csv('../input/products.csv', nrows=row_cap)
departments = pd.read_csv('../input/departments.csv')
aisles = pd.read_csv('../input/aisles.csv')
order_products_prior = pd.read_csv('../input/order_products__prior.csv', nrows=row_cap)
order_products_train = pd.read_csv('../input/order_products__train.csv', nrows=row_cap)

In [None]:
# Preview the first two rows of each loaded table, one table per cell.
orders.head(2)

In [None]:
products.head(2)

In [None]:
departments.head(2)

In [None]:
aisles.head(2)

In [None]:
order_products_prior.head(2)

In [None]:
order_products_train.head(2)

## Data Preparation

We prepare a dataframe of merged datasets to help with the analysis

In [None]:
df_train = order_products_train.copy().merge(orders, left_on='order_id', right_on='order_id')

In [None]:
df_train.head(2)

In [None]:
df_prior = order_products_prior.copy().merge(orders, left_on='order_id', right_on='order_id')

In [None]:
df_prior.head(2)

<hr/>

## Finding 1: Consumer Purchase Behavior Analysis

### Summary of the finding: 
Consumer purchasing patterns show that healthy foods are purchased early in the day, whereas unhealthy foods such as ice cream and pizza are purchased later in the day.

### Validating the Findings:

In [None]:
orders.eval_set.unique()

In [None]:
order_set = orders.copy()
order_set = orders[orders.eval_set != 'test']

In [None]:
order_set.eval_set.unique()

In [None]:
order_set =orders.drop(['eval_set'], axis =1)

In [None]:
order_set.head(2)

In [None]:
len(order_set)

In [None]:
order_products = pd.concat([order_products_prior, order_products_train])

In [None]:
order_products.head()

In [None]:
len(order_products)

In [None]:
df = orders.merge(order_products, on='order_id')

In [None]:
# Line-item count per product, largest first; keep at most the top 2000 and
# attach the product details.
product_sizes = df.groupby('product_id').size().to_frame('Size')
Top_products = product_sizes.sort_values('Size', ascending=False).reset_index().head(2000)
Top_products = pd.merge(Top_products, products, on='product_id')

In [None]:
df = df.loc[df['product_id'].isin(Top_products.product_id)]

In [None]:
product_orders_by_hour = pd.DataFrame({'Count': df.groupby(['product_id', 'order_hour_of_day']).size()})\
.reset_index()

In [None]:
product_orders_by_hour.head(24)

In [None]:
product_orders_by_hour['pct'] = product_orders_by_hour.groupby('product_id')['Count'].apply(lambda x: x/x.sum()*100)

In [None]:
product_orders_by_hour.head(24)

###   Total orders based on order hour of day

In [None]:
# Total Count per hour of day, summed over the products in product_orders_by_hour.
order_count = product_orders_by_hour.groupby('order_hour_of_day')['Count'].sum()

In [None]:
# Move the hour from the index back into a column so it can be plotted as x.
order_count= order_count.reset_index()

In [None]:
# Bar chart of Count by hour of day.
# NOTE(review): seaborn renamed factorplot to catplot in 0.9 — confirm the target seaborn version.
sns.factorplot(x ='order_hour_of_day',y ='Count',  data = order_count, kind ='bar', aspect = 3)

###   Split orders by Early in the Day (Morning, <12pm) and Later in the Day (Afternoon, >=12pm)

In [None]:
def MeanHour(x):
    """Return the Count-weighted mean of order_hour_of_day for one group.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows with an ``order_hour_of_day`` column and a ``Count`` column.

    Returns
    -------
    float
        sum(hour * Count) / sum(Count).
    """
    hours = x['order_hour_of_day']
    counts = x['Count']
    # Series.sum() is vectorised; the builtin sum() would walk the Series
    # element by element in Python.
    return (hours * counts).sum() / counts.sum()

In [None]:
# Apply the MeanHour function to each product's rows and collect the results
# in a frame with the columns product_id and MeanHour.
# NOTE(review): this rebinds the name MeanHour from the function to the
# DataFrame, so re-running this cell without first re-running the function
# definition passes a DataFrame to apply() and fails.
MeanHour = pd.DataFrame({'MeanHour': product_orders_by_hour.groupby('product_id').apply(MeanHour)}).reset_index()

In [None]:
MeanHour.head(3)

In [None]:
Morning = MeanHour.sort_values('MeanHour')[:25]
Morning = Morning.merge(products, on='product_id')
Morning.head()

In [None]:
Late = MeanHour.sort_values('MeanHour', ascending=False)[:25]
Late = Late.merge(products, on='product_id')
Late.head()

#### Plot

In [None]:
# Create MorningPct table to get count of product_id with MeanHour
MorningPct = product_orders_by_hour.merge(Morning, on='product_id')
MorningPct=MorningPct.sort_values(['MeanHour', 'order_hour_of_day'])

In [None]:
# Create larePct table to get count of product_id with MeanHour
LatePct = product_orders_by_hour.merge(Late, on='product_id')
LatePct =LatePct.sort_values(['MeanHour', 'order_hour_of_day'], ascending=False)

In [None]:
# One product name per line, in order of first appearance; used as the text
# blocks drawn on the chart.
Morning_ProductName = '\n'.join(MorningPct['product_name'].unique())
Late_ProductName = '\n'.join(LatePct['product_name'].unique())

In [None]:
fig, ax = plt.subplots(figsize=(12, 8))

# One faint line per product: green for the morning set, red for the late set.
line_style = dict(x='order_hour_of_day', y='pct', ax=ax, legend=False, alpha=0.2, aa=True, linewidth=1.0)
MorningPct.groupby('product_id').plot(color='green', **line_style)
LatePct.groupby('product_id').plot(color='red', **line_style)
ax.margins(x=0.5, y=0.05)

label_font_size = 13
ax.set_xlabel('Hour of Day Ordered', fontsize=label_font_size)
ax.set_ylabel('Percent of Orders by Product', fontsize=label_font_size)

tick_font_size = 10
ax.tick_params(labelsize=tick_font_size)
ax.set_xticks(range(0, 25, 2))
ax.set_yticks(range(0, 16, 5))
# The x-limits reach past the ticks on both sides, where the name lists are drawn.
ax.set_xlim([-2, 28])

# Product names: morning list in the top-left corner, late list in the top-right.
text_font_size = 9
annotation_style = dict(verticalalignment='top', transform=ax.transAxes, fontsize=text_font_size)
ax.text(0.01, 1.0, Morning_ProductName, horizontalalignment='left', color='green', **annotation_style)
ax.text(0.99, 1.0, Late_ProductName, horizontalalignment='right', color='red', **annotation_style);

## Conclusion

Consumer purchasing patterns show that healthy foods are purchased early in the day, whereas unhealthy foods such as
ice cream and pizza are purchased later in the day.

## Finding 2 : Product Variety in Basket and User Buying Behaviour Over Time

### Summary of the Finding:
Instacart onboards thousands of customers weekly and they try their best to get new shoppers comfortable with the platform so they can get up and running quickly and maximize their earnings. But do these customers continue to use the platform over time? 

From our analysis, we found that as the basket size increases, there are more reordered products (roughly >50%) in the basket. This suggests that product variety decreases over time as users try fewer new products. However, we also found that with each order a user places, the number of items in the basket also decreases.

### Validating the Finding:

Using the prior set, we get the basket size for each order and count the number of items that are reordered or new.

In [None]:
# Per order in df_prior: 'Order Size' is the largest add_to_cart_order value and
# 'Count of Reordered Items' is the sum of the reordered column.
# assumes reordered is a 0/1 flag, so its sum is a count — TODO confirm
order_size_reorder = df_prior.groupby(['order_id']).agg({'add_to_cart_order':'max','reordered':'sum'})\
                    .rename(columns={'reordered':'Count of Reordered Items', 'add_to_cart_order': 'Order Size'})

In [None]:
order_size_reorder.head(2)

Calculate the average number of reordered items for each order (basket) size.

In [None]:
avg_order_size = order_size_reorder.groupby('Order Size').agg({'Count of Reordered Items': 'mean'}).reset_index()

In [None]:
sns.factorplot(x='Order Size', y='Count of Reordered Items', data=avg_order_size, aspect=4, kind='point')

In [None]:
order_size_reorder.corr()

There is a strong positive correlation between the order size and number of reordered items in an order. This suggests that there is less variety in customers' purchased products as the order size grows.

We then look at customers' order size pattern over time.

In [None]:
# Work on a copy so order_size_reorder itself stays unchanged.
order_size_by_order_number = order_size_reorder.copy()

In [None]:
order_size_by_order_number

In [None]:
# Bring in the order-level columns (including order_number) from orders.
# On the left side, order_id is the index produced by the earlier groupby.
order_size_by_order_number = order_size_by_order_number.merge(orders, left_on='order_id', right_on='order_id')

In [None]:
order_size_by_order_number.head()

In [None]:
order_size_by_order_number = order_size_by_order_number.groupby('order_number').agg({'Order Size': 'mean'}) \
                            .reset_index()

In [None]:
order_size_by_order_number['order_number_bin'] = pd.cut(order_size_by_order_number['order_number'],\
                            bins=[0,10,20,30,40,50,60,70,80,90,100])

In [None]:
sns.factorplot(x='order_number_bin', y='Order Size', data=order_size_by_order_number, aspect=4)

From the above chart we can see an increase in order size (total number of items in the order) from the first to the 30th order. Then the curve of the trend shifts downward after the 30th order.

#### Why is there a decline in basket size?

Analyzing the first 60 orders of users.

In [None]:
first60_orders = df_prior.copy().loc[df_prior.order_number <= 60]

In [None]:
first60_orders = first60_orders.merge(products, left_on='product_id', right_on='product_id')
first60_orders = first60_orders.merge(departments, left_on='department_id', right_on='department_id')
first60_orders = first60_orders.merge(aisles, left_on='aisle_id', right_on='aisle_id')

In [None]:
first60_orders

Drop unnecessary columns

In [None]:
# Remove the columns that the following steps do not read.
unused_columns = ['eval_set', 'order_dow', 'order_hour_of_day', 'department_id', 'aisle_id']
first60_orders = first60_orders.drop(unused_columns, axis=1)

Get the count of items of each order by aisle

In [None]:
# For each (order_number, aisle) pair in first60_orders:
#   count              - number of line items
#   avg_days_prior     - mean days_since_prior_order
#   reordered          - mean of the reordered column
#   avg_cart_position  - mean add_to_cart_order
#   product_name, department - first value in the group
# Rows are sorted by order_number and then count, both descending, before the
# group keys are moved back into columns.
order_number_by_aisle = first60_orders.groupby(['order_number', 'aisle']) \
        .agg({'order_number':'count', 'days_since_prior_order':'mean','reordered':'mean', \
              'add_to_cart_order':'mean','product_name':'first', 'department':'first'}) \
        .rename(columns={'order_number':'count', 'days_since_prior_order':'avg_days_prior', \
                         'add_to_cart_order':'avg_cart_position'}) \
        .sort_values(by=['order_number','count'], ascending=False).reset_index()

In [None]:
# Display only: the same table with the largest counts first.
order_number_by_aisle.sort_values(by='count', ascending=False)

Get aisles with largest and smallest spread between orders 1 and 60

In [None]:
# Per aisle: spread = largest minus smallest 'count' across its order numbers;
# department is the first value in the group.
order_size_change = order_number_by_aisle.groupby('aisle')\
                    .agg({'count': lambda x: x.max()-x.min(), 'department': 'first'}) \
                    .rename(columns={'count':'spread'})

In [None]:
order_size_change

The 3 aisles/departments with the largest drop in orders from order 1 to 60.

In [None]:
# The three aisles with the largest spread.
order_size_change.nlargest(3, 'spread')

The 3 aisles/departments with the smallest drop in orders from order 1 to 60.

In [None]:
# The three aisles with the smallest spread.
order_size_change.nsmallest(3, 'spread')

In [None]:
# First three rows of order_number_by_aisle for every order_number.
# assumes order_number_by_aisle is still sorted by count descending within each
# order_number, so these are the three highest-count aisles — TODO confirm
top3_aisle_order_number = order_number_by_aisle.groupby('order_number').head(3)

In [None]:
top3_aisle_order_number

In [None]:
# Bar chart of count per order_number, one bar colour per aisle.
sns.factorplot(x='order_number', y='count', hue='aisle', data=top3_aisle_order_number, kind='bar', aspect=3,\
               legend_out=False)

#### We then calculate the order total for each order a user places to estimate Instacart's earnings per order

Get all the unique department in the first 60 orders by users

In [None]:
# Distinct department values present in order_number_by_aisle.
order_number_by_aisle.department.unique()

In [None]:
order_number_by_aisle.head()

Estimate a user's average spend per order by assigning an estimated dollar amount per item to each department.
<br/><br/>
<i>Price estimates are based on data from https://www.titlemax.com/discovery-center/home-and-family/cost-of-common-groceries-10-years//</i>

In [None]:
# Estimated price in USD of one item, keyed by department name.
# Each department appears exactly once; a repeated key in a dict literal is
# silently overwritten, which makes later edits error-prone.
dept_price_est = {
    'produce': 2.00,
    'dairy eggs': 3.00,
    'beverages': 2.00,
    'snacks': 3.00,
    'bakery': 3.00,
    'babies': 5.00,
    'deli': 3.00,
    'frozen': 5.00,
    'dry goods pasta': 3.00,
    'meat seafood': 5.00,
    'canned goods': 2.00,
    'pantry': 3.00,
    'breakfast': 3.00,
    'missing': 2.00,
    'international': 5.00,
    'household': 7.00,
    'pets': 10.00,
    'other': 3.00,
    'personal care': 4.00,
    'alcohol': 2.00,
    'bulk': 2.00,
}

Get the item count and dollar estimation for orders

In [None]:
# item_count: number of product_name values per (user_id, order_number, department).
orders_by_dept = first60_orders.groupby(['user_id','order_number', 'department']) \
        .agg({'product_name':'count'}) \
        .rename(columns={'product_name':'item_count'}).reset_index()

In [None]:
orders_by_dept.sort_values(by='user_id')

In [None]:
# Look up the estimated unit price of each row's department.
# A department that is missing from dept_price_est raises KeyError here.
orders_by_dept['amount'] = orders_by_dept['department'].apply(lambda x: dept_price_est[x])

In [None]:
# Turn the unit price into a row total: price times number of items.
orders_by_dept['amount'] = orders_by_dept['amount']*orders_by_dept['item_count']

In [None]:
# Mean item_count and mean amount per (order_number, department).
orders_by_dept_data = orders_by_dept.groupby(['order_number', 'department']).agg({'item_count':'mean', \
                                                                                  'amount':'mean'}).reset_index()

In [None]:
orders_by_dept_data

Calculate the order total for each order

In [None]:
# One row per (user_id, order_number): total item_count and total amount
# across departments; the amount total is renamed to item_total.
user_orders = orders_by_dept.groupby(['user_id', 'order_number']).agg({'item_count':'sum', 'amount':'sum'})\
                .rename(columns={'amount': 'item_total'}).reset_index()

In [None]:
user_orders

For every order, Instacart charges a delivery fee of \$5.99 (except for first order on the platform) and a 10% service fee for orders more than \$12

In [None]:
# Delivery fee: 5.99 on every order after a user's first, otherwise nothing.
is_repeat_order = user_orders['order_number'] > 1
user_orders['delivery_fee'] = np.where(is_repeat_order, 5.99, 0)

# Service fee: 10% of the item total once the total reaches 12.
# NOTE(review): the accompanying text says "more than $12" while this includes
# exactly 12 — confirm which boundary is intended.
reaches_threshold = user_orders['item_total'] >= 12
user_orders['service_fee'] = np.where(reaches_threshold, user_orders['item_total'] * 0.1, 0)

# What the customer pays: items plus both fees.
user_orders['order_total'] = user_orders[['item_total', 'delivery_fee', 'service_fee']].sum(axis=1)

Assuming that instacart makes a 5% margin from all items sold via the platform from the retailers and another 5% off delivery fee,

In [None]:
user_orders['instacart_margin'] = user_orders['item_total'] * 0.05
user_orders['instacart_earnings'] = user_orders['instacart_margin'] + user_orders['service_fee']\
                                    + (user_orders['service_fee'] * 0.05)

In [None]:
user_orders

Calculate the average item count and order total for each order number

In [None]:
# Mean item_total, order_total and instacart_earnings for every order_number.
avg_total_by_order_number = user_orders.groupby('order_number') \
                            .agg({'item_total': 'mean', 'order_total': 'mean', 'instacart_earnings':'mean'}) \
                            .reset_index()

In [None]:
avg_total_by_order_number

In [None]:
sns.set(font_scale=2)
fig, ax = plt.subplots(figsize=(30,10))
# Give both lines a label: plt.legend() only lists artists that carry one, so
# without labels the legend is empty and the two series cannot be told apart.
plt.plot(avg_total_by_order_number.order_number, avg_total_by_order_number.order_total, color='red',
         label='Order Total')
plt.plot(avg_total_by_order_number.order_number, avg_total_by_order_number.item_total, color='orange',
         label='Item Total')
plt.title("Order Total (USD) by Order Number", loc='center', fontsize=14, fontweight=0, color='black')
plt.xlabel("Order Number")
plt.ylabel("Amount (USD)")
plt.legend()
# Reference line at order number 31 (this x-axis is numeric).
plt.axvline(31,linestyle='--')

In [None]:
sns.set(font_scale=1)
# Point plot of the mean estimated Instacart earnings for each order number.
sns.factorplot(x='order_number', y='instacart_earnings', data=avg_total_by_order_number, aspect=4, color='green')
# The title names the plotted quantity; it previously repeated the
# "Order Total" title of the preceding chart.
plt.title("Instacart Earnings (USD) by Order Number", loc='center', fontsize=14, fontweight=0, color='black')
plt.xlabel("Order Number")
plt.ylabel("Amount (USD)")
# This x-axis is categorical: categories sit at positions 0, 1, 2, ..., so
# order number 31 is drawn at position 30.
# assumes order numbers start at 1 with no gaps — TODO confirm
plt.axvline(31 - 1,linestyle='--')

In [None]:
# Replace missing avg_days_prior values with 0.
# fillna() returns a new Series, so the result has to be assigned back;
# calling it without assignment leaves the column unchanged.
order_number_by_aisle['avg_days_prior'] = order_number_by_aisle['avg_days_prior'].fillna(0)

Calculate average interval of days between the orders

In [None]:
# Mean of the per-aisle avg_days_prior values for every order_number
# (each aisle row counts equally, regardless of its item count).
avg_days_prior_by_order = order_number_by_aisle.groupby('order_number').agg({'avg_days_prior':'mean'})

In [None]:
# Running total of those means over increasing order_number.
avg_days_prior_by_order['days_cumsum'] = avg_days_prior_by_order['avg_days_prior'].cumsum()

In [None]:
avg_days_prior_by_order

In [None]:
# Plot the cumulative days (days_cumsum) against order_number; reset_index()
# turns order_number back into a column for seaborn.
sns.factorplot(x='order_number', y='days_cumsum', data=avg_days_prior_by_order.reset_index(), aspect=4)
plt.title("Day Intervals Between Orders", loc='center', fontsize=14, fontweight=0, color='black')
plt.xlabel("Order Number")
plt.ylabel("No. of Days")

From the above chart, we can see that users typically have 1.5 years from when they first join Instacart and place their first order until the day they place their 60th order. 

### Conclusion

From the above analysis, we can see that from order number 1 to 60, the count of the top 3 ordered items has declined by >50%, thus reducing the basket size over time. We assume that the decline may be caused by poor produce quality, which leads users to look for alternatives for their produce purchases, and by the higher prices they pay for produce on Instacart (view customer price comparison here: https://imgur.com/a/VfQb6). However, users still continue to use the platform for other products such as frozen juice, baby accessories and beauty products.
<br/> With the current earnings pattern, if Instacart does not have any other source of revenue, the company would need to onboard new customers every 3-6 months to sustain the business. If this pattern continues, Instacart is unlikely to survive for long.

<hr/>

## Finding 3 : Position in Cart Affects Reorder Probability of Products

### Summary of the finding: 
The earlier the product's position in the cart, the higher the chance that the product is reordered.

### Validating the Finding:

Using the df_train dataframe, we first calculate the average cart position of a product and its reordered status.

In [None]:
# Mean cart position for every (product_id, reordered) combination in df_train.
product_reorder = df_train.groupby(['product_id', 'reordered'])[['add_to_cart_order']].mean()

Since a cart position can only be a whole number, we round up the averaged cart position calculated above.

In [None]:
import math
product_reorder['add_to_cart_order'] = product_reorder['add_to_cart_order'].apply(lambda x: math.ceil(x))

In [None]:
product_reorder.reset_index(inplace = True)

### Plot

Chart to show the Count of Reorder or No Reorder for every cart position. Since the occurrence of cart positions > 34 is low, we omit them from the charts below.

### Decision Tree

Plotting the Decision Tree to predict a product's reorder by its position in the cart

In [None]:
# Features: every column of product_reorder except product_id and the target.
X = product_reorder.drop(['product_id', 'reordered'],axis=1)

In [None]:
# Target: the reordered column.
Y = product_reorder.reordered

In [None]:
# Decision tree classifier limited to a depth of 2.
dt = tree.DecisionTreeClassifier(max_depth=2)

In [None]:
dt.fit(X,Y)

In [None]:
dt_feature_names = list(X.columns)
# export_graphviz expects class_names in ascending order of the class labels,
# which is the order of dt.classes_. Y.unique() returns the labels in order of
# first appearance instead, which can swap the names shown in the tree.
dt_target_names = [str(label) for label in dt.classes_]
# NOTE(review): the file written here contains Graphviz DOT text even though
# its name ends in .png — consider renaming it to .dot.
tree.export_graphviz(dt, out_file='add-to-cart-order-tree.png',
    feature_names=dt_feature_names, class_names=dt_target_names,
    filled=True)
#graph = pydotplus.graph_from_dot_file('add-to-cart-order-tree.png')
#Image(graph.create_png())

Validating the Decision Tree

In [None]:
reordered_count = product_reorder.groupby(['add_to_cart_order','reordered']) \
                    .agg({'reordered':'count'}) \
                    .rename(columns={'reordered':'reorder_count'})

In [None]:
reordered_count['add_to_cart_order_total'] = reordered_count.groupby(level=0)['reorder_count'].transform('sum')

In [None]:
reordered_count['reorder_prob'] = reordered_count['reorder_count'] / reordered_count['add_to_cart_order_total']

In [None]:
reordered_count.head(2)

In [None]:
reordered_count.reset_index(inplace=True)

In [None]:
# reorder_count by cart position, one line per reordered value; only cart
# positions below 35 are plotted.
sns.factorplot(y='reorder_count', x='add_to_cart_order', hue='reordered', \
               data=reordered_count.loc[reordered_count.add_to_cart_order < 35] \
               ,aspect=3, legend_out=False)

Chart to show the Reorder Probability vs Cart Position.

In [None]:
# Bucket the cart positions into (0, 8.5], (8.5, 9.5], (9.5, 10.5] and (10.5, 100].
reordered_count['add_to_cart_order_bin'] = pd.cut(reordered_count['add_to_cart_order'],\
                            bins=[0,8.5,9.5,10.5, 100])

In [None]:
# Bar chart of reorder_prob per cart-position bucket, split by reordered value.
sns.factorplot(y='reorder_prob', x='add_to_cart_order_bin', hue='reordered', data=reordered_count, kind='bar',aspect=3)

#### What's being placed to cart in the top 9 positions?

In [None]:
top9_cart_position_products = df_train.loc[df_train.add_to_cart_order <= 9.5]

In [None]:
top9_cart_position_products = top9_cart_position_products.merge(products, left_on='product_id', right_on='product_id') 

In [None]:
top9_cart_position_products = top9_cart_position_products.merge(departments, left_on='department_id', right_on='department_id') 

In [None]:
top9_cart_position_products.drop(['aisle_id', 'department_id', 'product_id'], axis=1, inplace=True)

In [None]:
top9_cart_position_products.sort_values(by='add_to_cart_order').head(5)

In [None]:
top9_cart_position_products.groupby(['department', 'product_name']) \
                                .agg({'department': 'count'}).rename(columns={'department':'product_count'})

In [None]:
top9_cart_position_dept = top9_cart_position_products.groupby(['department']) \
                                .agg({'department': 'count'}).rename(columns={'department':'product_count'})

In [None]:
top9_cart_position_dept.reset_index(inplace=True)

In [None]:
top9_cart_position_dept.head(2)

In [None]:
# Treemap in which each rectangle is sized by a department's product_count.
fig = plt.gcf()
ax = fig.add_subplot()
fig.set_size_inches(10, 8)
# Colours passed to squarify for the rectangles (11 entries).
colors = ["#e0301e", "#602320","#a32020","#eb8c00","#dc6900","#4d7358","#e39e54", "#e8d174", "#326ada","#433e90","#a19c9c"]
#colors = ['#dc6900', "#ffeead", "#ff6f69", "#ffcc5c", "#602320", "#e0301e", "#c1242f","#65c25e", "#6d7371", "#b9bab6"]
squarify.plot(sizes=top9_cart_position_dept.product_count, label=top9_cart_position_dept.department, alpha=.6, color=colors)
plt.title("Products Added to Cart Prior 9.5th Position by Departments",fontsize=18,fontweight="bold")
# Hide the axes; the rectangle coordinates carry no meaning for the reader.
plt.axis('off')
plt.show()

In [None]:
# The five (department, product_name) pairs with the most line items.
product_counts = top9_cart_position_products.groupby(['department', 'product_name']).size()
product_counts.reset_index(name='product_count').sort_values('product_count', ascending=False).head(5)

### Conclusion

Items added to cart at position <= 9.5 have a higher chance of reorder as they are mostly produce or products with a shorter shelf life.

<hr/>

## Finding 4 : Purchase pattern of weekly and monthly customers
    

In [None]:
product_df=products.copy().merge(departments, on ='department_id',how ='left').merge(aisles, on ='aisle_id',how ='left')
product_df.head()

In [None]:
PO =df_train.merge(product_df , on ='product_id', how ='left')

In [None]:
Top_Aisle = PO.copy()

In [None]:
sns.distplot(Top_Aisle['days_since_prior_order'].fillna(0).astype(int));

In [None]:
user_days_since= Top_Aisle.groupby(['user_id', 'order_id']).apply(lambda x: x.iloc[0]['days_since_prior_order']).rename('days_since').reset_index()

In [None]:
user_days_since = user_days_since.groupby('user_id').apply(lambda x: x['days_since'].mean()).rename('mean_time').reset_index()

In [None]:
user_days_since.head()

In [None]:
weekly_users = user_days_since[user_days_since['mean_time'] < 8]

In [None]:
monthly_users = user_days_since[user_days_since['mean_time'] > 21]

In [None]:
monthly_data = Top_Aisle.merge(monthly_users, on='user_id',how='inner')

In [None]:
weekly_data = Top_Aisle.merge(weekly_users, on='user_id',how='inner')

In [None]:
WeeklyDF=weekly_data.groupby('department').size().rename('counts').reset_index().sort_values('counts', ascending=False)
WeeklyDF.head()

In [None]:
MonthlyDF=monthly_data.groupby('department').size().rename('counts').reset_index().sort_values('counts', ascending=False)
MonthlyDF.head()

#### Departments shopped by monthly users

In [None]:
# Bar chart of line-item counts per department for monthly users.
sns.factorplot(x ='department',y ='counts',  data = MonthlyDF, kind ='bar', aspect = 5)

#### Departments shopped by weekly users

In [None]:
# Bar chart of line-item counts per department for weekly users.
sns.factorplot(x ='department',y ='counts',  data = WeeklyDF, kind ='bar', aspect = 5)

In [None]:
# Work on a copy of PO so the joined table itself stays unchanged.
reorder_by_dept = PO.copy()
reorder_by_dept.head()

Get the average days interval for orders by department

In [None]:
# Per department_id: the department name (first value in the group), the mean
# days_since_prior_order and the mean of the reordered column.
reorder_by_dept = reorder_by_dept.groupby('department_id')\
                    .agg({'department':'first', 'days_since_prior_order':'mean', 'reordered':'mean'})

In [None]:
reorder_by_dept.head()

Since days are only in whole numbers, we round up the calculated mean days_since_prior_order

In [None]:
# Round every department's mean interval up to the next whole day.
reorder_by_dept['days_since_prior_order'] = reorder_by_dept['days_since_prior_order'].map(math.ceil)

Plot a bar chart to see the pattern of days interval for each department

In [None]:
# Bar chart of the rounded mean days_since_prior_order for each department.
sns.factorplot(x='department', y='days_since_prior_order', data=reorder_by_dept, kind='bar', aspect=4)

### Conclusion :

It is evident from both graphs that weekly and monthly users tend to purchase from the same departments. Household items have the longest interval between orders and baby-related items have the shortest interval between orders.