## Instacart dataset exploratory
* Instacart kaggle : https://www.kaggle.com/c/instacart-market-basket-analysis#prizes
* blog post : https://tech.instacart.com/3-million-instacart-orders-open-sourced-d40d29ead6f2
* data dictionary : https://gist.github.com/jeremystan/c3b39d947d9b88b3ccff3147dbcf6c6b
* dataset file list 


In [1]:
merge_order_product_dsimport pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
pd.options.display.max_rows = 20
%matplotlib inline
sns.set(style="whitegrid", palette="colorblind", font_scale=1, rc={'font.family':'NanumGothic'} )
#sns.set(style="whitegrid", palette="colorblind", font_scale=1, rc={'font.family':'AppleGothic'} )

def toReadable(v):
    """Format a number as a short human-readable string.

    Floats are first rounded to two decimals. Values below 1,000 are
    stringified unchanged; larger values are scaled to thousands (``K``) or
    millions (``M``) and shown with one decimal.

    Args:
        v: int or float to format.

    Returns:
        str such as ``'999'``, ``'1.5K'`` or ``'2.5M'``.
    """
    value = round(v, 2) if isinstance(v, float) else v

    if value < 1000:
        return str(value)

    thousands = round(value / 1000, 1)
    # Values just under one million round up to 1000.0 thousands; promote
    # them to the next unit instead of printing '1000.0K'.
    if thousands < 1000:
        return str(thousands) + 'K'
    return str(round(value / 1000000, 1)) + 'M'

---
## Load Dataset

In [None]:
# Read the three raw tables: orders, prior order lines, product catalogue.
raw_order_ds, order_product_ds, product_ds = [
    pd.read_csv(csv_path)
    for csv_path in ('../input/orders.csv',
                     '../input/order_products__prior.csv',
                     '../input/products.csv')
]

# Number of product lines in each order, indexed by order_id.
order_product_cnt_ds = (
    order_product_ds.groupby('order_id')['product_id']
    .count()
    .to_frame('product_cnt')
)

# Attach the per-order product count; the default inner join keeps only
# orders that have at least one line in the prior order-products table.
order_ds = raw_order_ds.merge(
    order_product_cnt_ds,
    left_on='order_id',
    right_index=True,
)

### Dataset Summary
Let's look at the simple stats of a dataset

In [None]:
# Headline counts for the joined order table and the prior order lines.
total_user = order_ds.user_id.unique().size
total_order = order_ds.shape[0]
total_ordered_product = order_product_ds.shape[0]
unique_products = order_product_ds.product_id.unique().size

orders_per_user = total_order / total_user
lines_per_product = total_ordered_product / unique_products

print("total user = {}".format(toReadable(total_user)))
print("total order = {} ({} orders per a user )".format(
    toReadable(total_order), toReadable(orders_per_user)))
print("total product = ", toReadable(unique_products))
print("total ordered product  = {} ({} orders per a product )".format(
    toReadable(total_ordered_product), toReadable(lines_per_product)))

---
# Weekly-Hour Analysis
* Day of week index [0, 1, ..., 6] is assumed to be 'Sun Mon Tue Wed Thu Fri Sat'

In [None]:
# Spanish day abbreviations indexed by order_dow, following the assumption
# stated above that 0 = Sunday ... 6 = Saturday.
index2day = "Dom Lun Mar Mie Jue Vie Sab".split()

In [None]:
def drawWeekHour(ds, values, aggfunc=len, title=None, figsize=(18, 5), cmap=None):
    """Draw a day-of-week by hour-of-day heatmap of an aggregated column.

    Args:
        ds: DataFrame with 'order_dow' and 'order_hour_of_day' columns.
        values: name of the column to aggregate in each (day, hour) cell.
        aggfunc: aggregation applied to each cell (defaults to ``len``).
        title: optional plot title.
        figsize: matplotlib figure size.
        cmap: optional colormap passed to ``sns.heatmap``.
    """
    grid = ds.pivot_table(
        index='order_dow',
        columns='order_hour_of_day',
        values=values,
        aggfunc=aggfunc,
    ).fillna(0)
    # Replace numeric day indices with their labels from index2day.
    grid.index = [index2day[dow] for dow in grid.index]
    sns.set(style="whitegrid", palette="colorblind", font_scale=1)

    plt.figure(figsize=figsize)
    sns.heatmap(grid, annot=True, fmt="1.1f", linewidths=.5, cmap=cmap)
    plt.xlabel("Hora")
    plt.ylabel("Día de la semana")
    if title:
        plt.title(title, fontsize=15)

### Weekly-hour heatmap: Orders Heatmap
* Most orders are Mondays and Sundays from 9 to 16 
* Why are there lots of orders on Monday morning?

In [None]:
# Order count per (day, hour) cell, expressed in thousands.
drawWeekHour(
    order_ds,
    values='order_id',
    aggfunc=lambda orders: len(orders)/1000,
    title="Total Order Frequency(unit:1k)",
)

### Why are there so many orders on Sun/Mon (9-16)?
Many users? Many orders per one user?
* Let's look at unique users heatmap
* similar to orders heatmap!!

In [None]:
# Unique users (in thousands) per (day, hour) slot, and their average.
slot_user_counts = (
    order_ds.groupby(['order_dow', 'order_hour_of_day'])['user_id']
    .agg(lambda ids: len(ids.unique())/1000)
)
avg_users = round(slot_user_counts.mean(), 2)
drawWeekHour(
    order_ds,
    values='user_id',
    aggfunc=lambda x: len(x.unique())/1000,
    title="Total Unique Users(unit:1k) / Avg Users= {}k".format(avg_users),
)

* Let's look at orders per one user.
* There is no clear effect, except Saturday 8-10H.
 * Regular orders or common lifestyle?

In [None]:
# Orders per unique user in each (day, hour) cell. The title previously
# repeated the unique-users heatmap title, which mislabeled this plot.
drawWeekHour(order_ds, values='user_id', title="Orders per User"
             , aggfunc=lambda x: len(x)/len(x.unique()))

#### Which products do people order at 9:00 on Monday?
Let's look at top 5 product at 9:00 each day.

In [None]:
# One row per ordered product, enriched with order and product attributes.
merge_order_product_ds = (
    order_product_ds
    .merge(order_ds, on='order_id')
    .merge(product_ds, on='product_id')
)

In [None]:
# Order lines placed at 09:00, grouped by day of week.
hour_9_order_product_ds = merge_order_product_ds[merge_order_product_ds.order_hour_of_day==9]
# Group by a scalar key: a one-element list makes pandas >= 2 yield 1-tuple
# group keys, which cannot be assigned to a column as a scalar downstream.
grouped = hour_9_order_product_ds.groupby('order_dow')

In [None]:
topn = 5
hour_9_popluar_product = []
for (dow, rows) in grouped:
    # Unwrap 1-tuple keys produced when the groupby key was given as a list.
    if isinstance(dow, tuple):
        dow = dow[0]
    # Most-ordered products for this day; copy so adding columns does not
    # write into a slice of the aggregated frame.
    sub_ds = (rows.groupby('product_id', as_index=False)
                  .agg({'order_id': len})
                  .sort_values('order_id', ascending=False)
                  .head(topn)
                  .copy())
    sub_ds['dow'] = dow
    # Rank by the rows actually present: a day may have fewer than topn products.
    sub_ds['rank'] = list(range(len(sub_ds)))
    hour_9_popluar_product.append(sub_ds)

# pd.options.display.max_rows=200
hour_9_popluar_product_ds = pd.concat(hour_9_popluar_product).sort_values(['rank','dow']).merge(product_ds, on='product_id')\
.pivot(index='dow',columns='rank',values='product_name')
# Label each row by its own day index rather than assuming all 7 days exist.
hour_9_popluar_product_ds.index = [index2day[dow] for dow in hour_9_popluar_product_ds.index]

* Hmm, people seem to be eating more bananas on Monday morning.

In [None]:
# Display the table: rows are days of week, columns are rank positions,
# cells are product names.
hour_9_popluar_product_ds

### Bestseller  20 product

In [None]:
def topItemEachGroup(ds, group_name, sort_name, topn):
    """Return the top rows of each group with a 1-based 'rank' column.

    Args:
        ds: input DataFrame.
        group_name: column (or columns) to group by.
        sort_name: column to sort by, descending, within each group.
        topn: maximum number of rows to keep per group.

    Returns:
        DataFrame of the kept rows (groups in groupby order) with an added
        'rank' column. A group with fewer than ``topn`` rows keeps all of its
        rows; an empty input yields an empty frame that still has 'rank'.
    """
    concat_list = []
    for (key, rows) in ds.groupby(group_name):
        # Copy so adding 'rank' does not write into a slice of ``ds``.
        sub_ds = rows.sort_values(sort_name, ascending=False).head(topn).copy()
        # Rank by the rows actually kept; a group may be smaller than topn.
        sub_ds['rank'] = list(range(1, len(sub_ds) + 1))
        concat_list.append(sub_ds)

    if not concat_list:
        # pd.concat raises on an empty list; return an empty, same-shaped frame.
        empty_ds = ds.iloc[:0].copy()
        empty_ds['rank'] = pd.Series(dtype='int64')
        return empty_ds

    return pd.concat(concat_list)

In [None]:
def drawRankTrend(pivot_ds, ylabel='Rank'):
    """Plot how each column's rank changes along the index.

    Args:
        pivot_ds: DataFrame whose index is the x-axis, whose columns are
            series names, and whose values are ranks (1 = best; NaN where a
            series is unranked).
        ylabel: label for the y-axis.
    """
    sns.set(style="whitegrid", palette="colorblind", font_scale=1.3)

    index_max = pivot_ds.index.max()
    rank_max = pivot_ds.max().max()
    # Flip ranks so that rank 1 is drawn at the top (vectorised; NaN stays NaN).
    pivot_ds = rank_max - pivot_ds + 1
    pivot_ds.plot(marker='o', figsize=(16,12), cmap='Dark2', xticks=pivot_ds.index, legend=None )

    # Relabel the flipped axis with the original rank numbers.
    plt.yticks(np.arange(rank_max,0,-1), np.arange(1,rank_max+1))
    # Name each line just right of its last point. ``items`` is used because
    # ``iteritems`` no longer exists in pandas 2.x.
    for name, rank in pivot_ds.loc[index_max].sort_values(ascending=False).dropna().items():
        plt.text(index_max*1.01,rank,name)
    plt.ylabel(ylabel)
    plt.show()
    

### Bestseller  20 product Hour Of Day Trend

In [None]:
# Order-line count per (product, hour), then the 20 best sellers per hour,
# reshaped to hour x product with rank as the cell value.
hour_product_ds = (
    merge_order_product_ds
    .groupby(['product_name', 'order_hour_of_day'], as_index=False)
    .agg({'order_id': len})
)
hour_top_product_ds = topItemEachGroup(
    hour_product_ds, 'order_hour_of_day', 'order_id', 20)
hour_top_product_pivot_ds = hour_top_product_ds.pivot(
    index='order_hour_of_day',
    columns='product_name',
    values='rank',
)

* The products ranked 1-6 are stable

In [None]:
# Rank trend of the hourly top-20 products.
drawRankTrend(pivot_ds=hour_top_product_pivot_ds)

### Bestseller  20 product Day Of Week Trend

In [None]:
# Same ranking as the hourly trend, but per day of week.
dow_product_ds = (
    merge_order_product_ds
    .groupby(['product_name', 'order_dow'], as_index=False)
    .agg({'order_id': len})
)
rank_ds = topItemEachGroup(dow_product_ds, 'order_dow', 'order_id', 20)
rank_pivot_ds = rank_ds.pivot(
    index='order_dow',
    columns='product_name',
    values='rank',
)

In [None]:
# Rank trend of the day-of-week top-20 products.
drawRankTrend(pivot_ds=rank_pivot_ds)

### When do people put a lot of products in your cart?

* There is clear effect of day-hour-heatmap. Weekends and night(21-23H) time are high.
* It seems like there are lots of shopping hours on the weekend and night.

In [None]:
# Mean number of products per order in each (day, hour) cell.
drawWeekHour(
    order_ds,
    values='product_cnt',
    aggfunc=lambda counts: np.mean(counts),
    title="Product cnt per a order",
    cmap='YlGn',
)

### How often do you reorder? 
* days_since_prior_order heatmap 
* The smaller value, order more often

In [None]:
# Mean days since the previous order in each (day, hour) cell.
drawWeekHour(
    order_ds,
    values='days_since_prior_order',
    aggfunc=lambda days: np.mean(days),
    title="prior orders",
    cmap='YlGn',
)

In [None]:
# Larger fonts for the line charts that follow.
sns.set(
    style="whitegrid",
    palette="colorblind",
    font_scale=1.4,
    rc={'font.family': 'NanumGothic'},
)

---
## Reorder Analysis
###  The larger the order number, the shorter days it takes to order again..
* order number = the order sequence number of each user

In [None]:
# Overall mean gap between consecutive orders, in days.
avg_days_since_prior = round(order_ds.days_since_prior_order.mean(), 2)
print("Avg days_since_prior_order {} Days".format(avg_days_since_prior))

In [None]:
# Mean reorder gap and mean basket size as a function of order sequence number.
per_order_number_ds = order_ds.groupby('order_number').agg(
    {'days_since_prior_order': np.mean, 'product_cnt': np.mean})
per_order_number_ds.plot(
    figsize=(16,6),
    title="Order sequence # vs day_since_prior_order",
    marker='o',
)
plt.tight_layout()
plt.show()

### The shorter days_since_prior_order, the higher the rate of reorder.

In [None]:
# Rebuild the order-line table joined with order attributes only
# (this replaces the earlier version that also carried product columns).
merge_order_product_ds = pd.merge(order_product_ds, order_ds, on='order_id')

In [None]:
# Count order lines per (days since prior order, reordered flag) ...
since_days_counts = (
    merge_order_product_ds
    .groupby(['days_since_prior_order', 'reordered'])
    .agg({'product_id': len})
    .reset_index()
)
# ... and spread the reordered flag (0/1) into columns.
reordered_since_days_ds = since_days_counts.pivot(
    index='days_since_prior_order',
    columns='reordered',
    values='product_id',
)
since_days_totals = reordered_since_days_ds.sum(axis=1)
reordered_since_days_ds['reorder_rate'] = reordered_since_days_ds[1] / since_days_totals
# Overall rate = all reordered lines / all lines.
avg_reordered_rate = round(
    reordered_since_days_ds[1].sum() / reordered_since_days_ds[[0, 1]].sum().sum(),
    2,
)

In [None]:
# Reorder rate against days since the prior order.
since_days_rate_ds = reordered_since_days_ds[['reorder_rate']]
since_days_rate_ds.plot(kind='line', marker='o', figsize=(16,6))
plt.title("Reordered Rate (Avg {})".format(avg_reordered_rate), fontsize=20)
plt.tight_layout()
plt.show()

### Larger order number, higher re-order rate?


In [None]:
# Count order lines per (order sequence number, reordered flag) ...
order_num_counts = (
    merge_order_product_ds
    .groupby(['order_number', 'reordered'])
    .agg({'product_id': len})
    .reset_index()
)
# ... and spread the reordered flag (0/1) into columns.
reordered_order_num_ds = order_num_counts.pivot(
    index='order_number',
    columns='reordered',
    values='product_id',
)
order_num_totals = reordered_order_num_ds.sum(axis=1)
reordered_order_num_ds['reorder_rate'] = reordered_order_num_ds[1] / order_num_totals
avg_reordered_rate = round(
    reordered_order_num_ds[1].sum() / reordered_order_num_ds[[0, 1]].sum().sum(),
    2,
)
# Missing (order_number, flag) combinations become 0 after the rate is computed.
reordered_order_num_ds.fillna(0, inplace=True)

### It looks like log curve!!!! 
* When the order number is 40 or more, the reorder rate exceeds 80%!!!

In [None]:
# Reorder rate against order sequence number.
order_num_rate_ds = reordered_order_num_ds[['reorder_rate']]
order_num_rate_ds.plot(kind='line', marker='o', figsize=(16,6))
plt.title("Reordered Rate (Avg {})".format(avg_reordered_rate), fontsize=20)
plt.show()

### Which products have a high reorder rate?

In [None]:
# Per product: number of order lines, number of reordered lines, and the
# number of distinct users who ordered it.
product_reorder_ds = merge_order_product_ds.groupby(['product_id']).agg({
    'order_id': len,
    'reordered': lambda flags: len(flags[flags > 0]),
    'user_id': lambda users: len(users.unique()),
})

In [None]:
# Give the aggregated columns descriptive names.
convert_colnames = {
    'user_id': 'unique_users',
    'reordered': 'reorder',
    'order_id': 'total_order',
}
product_reorder_ds.columns = list(map(convert_colnames.__getitem__, product_reorder_ds.columns))

In [None]:
# Derived ratios, rounded to two decimals.
reorder_share = product_reorder_ds.reorder / product_reorder_ds.total_order
product_reorder_ds['reorder_rate'] = reorder_share.round(2)
orders_each_user = product_reorder_ds.total_order / product_reorder_ds.unique_users
product_reorder_ds['orders_per_user'] = orders_each_user.round(2)
# Bring in product attributes, matching the product_id index to the catalogue.
product_reorder_ds = product_reorder_ds.merge(
    product_ds,
    left_index=True,
    right_on='product_id',
)

####  Popular reorder-rate product top 20.( more than 1000 ordered product )
* Most milk and banana products.

In [None]:
# Products with more than 1000 order lines, ordered by reorder rate (top 20 shown).
popular_products_ds = product_reorder_ds[product_reorder_ds.total_order > 1000]
display_cols = ['product_name', 'total_order', 'reorder_rate', 'aisle_id', 'orders_per_user']
popular_products_ds.sort_values('reorder_rate', ascending=False)[display_cols].head(20)

#### Top 20 Reorder-rate Aisle, Bottom 20 reorder-rate Aisle 

In [None]:
# product_reorder_ds.groupby('aisle_id').agg({'product_name':                                           lambda x: })
from collections import defaultdict
import operator

def popularWords(names, topn=2):
    """Return the most frequent words across a collection of names.

    Each name is split on whitespace; one-character tokens are ignored.

    Args:
        names: iterable of strings (e.g. a pandas Series of product names).
        topn: number of words to keep.

    Returns:
        The ``topn`` most frequent words joined by single spaces, most
        frequent first; ties keep first-seen order.
    """
    counts = defaultdict(int)
    for name in names:
        for token in name.split():
            if len(token) > 1:
                counts[token] += 1

    # sorted() is stable, so equally frequent words stay in first-seen order.
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    return " ".join(word for word, _ in ranked[:topn])

In [None]:
# Per aisle: its most common product-name words and its number of products.
aisle_ds = product_ds.groupby('aisle_id').agg({
    'product_name': popularWords,
    'product_id': lambda ids: len(ids.unique()),
})
# aisle_ds.columns = ['products','product_names']

In [None]:
# Sum order and reorder counts per aisle, then derive the aisle reorder rate.
aisle_order_stat_ds = product_reorder_ds.groupby('aisle_id').agg(
    {'total_order': sum, 'reorder': sum})
aisle_rate = aisle_order_stat_ds.reorder / aisle_order_stat_ds.total_order
aisle_order_stat_ds['reorder_rate'] = aisle_rate.round(2)
# Attach the aisle keywords and sort from highest to lowest reorder rate.
aisle_order_stat_ds = aisle_order_stat_ds.merge(
    aisle_ds, left_index=True, right_index=True)
aisle_order_stat_ds = aisle_order_stat_ds.sort_values('reorder_rate', ascending=False)

* The top aisles are fresh products with short expiration dates
* The bottom aisles are products with long expiration dates

In [None]:
sns.set(
    style="whitegrid",
    palette="colorblind",
    font_scale=1.4,
    rc={'font.family': 'NanumGothic'},
)

# Two bar charts side by side sharing the y-axis: best and worst 20 aisles.
f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)

top20_ds = aisle_order_stat_ds.set_index('product_name')[['reorder_rate']].head(20)
top20_title = 'Top 20 reorder rate Aisle (avg={})'.format(
    toReadable(top20_ds.reorder_rate.mean()))
top20_ds.plot(kind='bar', figsize=(16,6), alpha=.7, ax=ax1, title=top20_title)

bottom20_ds = aisle_order_stat_ds.set_index('product_name')[['reorder_rate']].tail(20)
bottom20_title = 'Bottom 20 reorder rate Aisle (avg={})'.format(
    toReadable(bottom20_ds.reorder_rate.mean()))
bottom20_ds.plot(kind='bar', figsize=(16,6), alpha=.7, ax=ax2, title=bottom20_title)
plt.show()

### Correlation high:  Total_order - Reorder_rate

In [None]:
from scipy.stats import spearmanr

# Spearman rank correlation between reorder rate and order volume. It is
# computed explicitly because jointplot's `stat_func` argument no longer
# exists in current seaborn releases.
rho, p_value = spearmanr(product_reorder_ds.reorder_rate, product_reorder_ds.total_order)

# x/y are passed by keyword and `height` replaces the old `size` argument,
# as required by current seaborn.
g = sns.jointplot(x="reorder_rate", y="total_order", kind="reg", marker='.', ylim=(0,100000), height=8, ratio=8
                  , data=product_reorder_ds)
# Show the statistic on the joint axes, where stat_func used to annotate it.
g.ax_joint.annotate("spearmanr = {:.2f}; p = {:.2g}".format(rho, p_value),
                    xy=(.05, .95), xycoords='axes fraction')

---
## Clustering similar products by users' order information
* Training product2vec using word2vec 
 * word = product_id
 * sentence = user's order = [product_id1, product_id2, ... ]
* clustering by trained product vector
* Use only products ordered at least 200 times

In [None]:
# (order_id, product_id) pairs, ordered so that each order's lines are
# contiguous and follow the add-to-cart sequence.
order_product_list = merge_order_product_ds\
    .sort_values(['user_id','order_id','add_to_cart_order'])[['order_id','product_id']]\
    .values.tolist()

# Build one "sentence" (list of product-id strings) per order.
product_corpus = []
sentence = []
# Guard against an empty table instead of raising IndexError.
new_order_id = order_product_list[0][0] if order_product_list else None
for (order_id, product_id) in order_product_list:
    if new_order_id != order_id:
        # The order id changed: close the sentence of the previous order.
        product_corpus.append(sentence)
        sentence = []
        new_order_id = order_id
    sentence.append(str(product_id))
# Flush the last order, which the loop never appends because no further
# order-id change follows it.
if sentence:
    product_corpus.append(sentence)

In [None]:
from gensim.models import Word2Vec

# Train 100-dimensional product embeddings; products seen fewer than 200
# times are dropped by min_count.
# NOTE(review): `size=` is the pre-4.0 gensim keyword (later renamed
# `vector_size`) - confirm the installed gensim version.
model100D = Word2Vec(product_corpus, window=6, size=100, workers=4, min_count=200)
# The cells below refer to the trained model as `model`, which was otherwise
# only bound by the commented-out load line.
model = model100D
# model100D.save('./resource/prod2vec.100d.model')
# model = Word2Vec.load('./resource/prod2vec.100d.model')

In [None]:
def toProductName(id):
    """Return the product name for a product id.

    Raises IndexError when the id is not present in ``product_ds``.
    """
    matching_names = product_ds.loc[product_ds.product_id == id, 'product_name']
    return matching_names.values.tolist()[0]
toProductName(24852)

In [None]:
def most_similar_readable(model, product_id):
    """List a product and its nearest neighbours as (name, similarity) pairs.

    Args:
        model: trained Word2Vec model whose vocabulary is product-id strings.
        product_id: id of the query product.

    Returns:
        List of (product_name, similarity); the query itself comes first
        with similarity 1.0, followed by ``model.wv.most_similar`` results.
    """
    neighbours = model.wv.most_similar(str(product_id))

    readable = [(toProductName(int(product_id)), 1.0)]
    for neighbour_id, score in neighbours:
        readable.append((toProductName(int(neighbour_id)), score))
    return readable

### What is the most similar?
* most similar to banana(24852) is ..

In [None]:
# Nearest neighbours of product 24852.
banana_neighbours = most_similar_readable(model, 24852)
pd.DataFrame(banana_neighbours, columns=['product','similarity'])

* most similar to Drinking Water(27845) is ..

In [None]:
# Nearest neighbours of product 27845.
water_neighbours = most_similar_readable(model, 27845)
pd.DataFrame(water_neighbours, columns=['product','similarity'])

* most similar to Organic Whole Milk(40939) is .. 

In [None]:
# Nearest neighbours of product 40939.
milk_neighbours = most_similar_readable(model, 40939)
pd.DataFrame(milk_neighbours, columns=['product','similarity'])

In [None]:
# Nearest neighbours of product 48697.
neighbours_48697 = most_similar_readable(model, 48697)
pd.DataFrame(neighbours_48697, columns=['product','similarity'])

### Product2Vec works well !!! 

* Create 500 clusters as similar products
* using kmeans ( k=500 )

In [None]:
import kmeans

In [None]:
def clustering(model, k=500, delta=0.00000001, maxiter=200):
    """Cluster the model's product vectors with ``kmeans.kmeanssample``.

    Args:
        model: trained Word2Vec model (vectors read from ``model.wv.syn0``,
            vocabulary from ``model.wv.index2word``).
        k: number of clusters.
        delta: forwarded to ``kmeans.kmeanssample``.
        maxiter: forwarded to ``kmeans.kmeanssample``.

    Returns:
        Tuple ``(centres, index2cid, dist, clustered_ds, prod2cid)`` where
        the first three are the raw ``kmeanssample`` results, ``clustered_ds``
        is a DataFrame of (product_id, cid, dist) sorted by cid then dist,
        and ``prod2cid`` maps each vocabulary entry to its cluster id.
    """
    vectors = model.wv.syn0
    vocabulary = model.wv.index2word

    centres, index2cid, dist = kmeans.kmeanssample(
        vectors, k,
        metric='cosine',
        delta=delta,
        nsample=0,
        maxiter=maxiter,
    )

    assignments = list(zip(vocabulary, index2cid, dist))
    clustered_ds = pd.DataFrame(assignments, columns=['product_id', 'cid', 'dist'])
    clustered_ds = clustered_ds.sort_values(['cid', 'dist'], ascending=True)

    prod2cid = dict(zip(vocabulary, index2cid))

    return (centres, index2cid, dist, clustered_ds, prod2cid)

In [None]:
# Cluster the trained product vectors with the default settings.
centres, index2cid, dist, clustered_ds, prod2cid = clustering(model=model)

In [None]:
# Vocabulary entries are strings; convert product ids back to numbers so
# they can be matched against product_ds.
clustered_ds['product_id'] = pd.to_numeric(clustered_ds['product_id'])

In [None]:
def idToProductDesc(id):
    """Return ``[product_name, aisle_id]`` for a product id.

    Raises IndexError when the id is not present in ``product_ds``.
    """
    matching_rows = product_ds.loc[product_ds.product_id == id, ['product_name', 'aisle_id']]
    return matching_rows.values.tolist()[0]
    
def getProductNames(product_id_list):
    """Map product ids to their ``[product_name, aisle_id]`` descriptions."""
    descriptions = []
    for raw_id in product_id_list:
        descriptions.append(idToProductDesc(int(raw_id)))
    return descriptions

# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.parse` is loaded.
import urllib.parse
def printClusterMembers(cluster_id, topn=10):
    """Print the first ``topn`` products of a cluster with an image-search link.

    Members are taken from ``clustered_ds`` in its current row order.

    Args:
        cluster_id: cluster id to look up in ``clustered_ds.cid``.
        topn: maximum number of members to print.
    """
    members = getProductNames(clustered_ds[clustered_ds.cid==cluster_id].product_id[:topn].tolist())
    for member in members:
        # member is [product_name, aisle_id]; the name is URL-encoded for the query.
        print("{aisle} / {name}  https://www.google.co.kr/search?tbm=isch&q={q}".format( 
            aisle=member[1], name=member[0], q=urllib.parse.quote_plus(member[0]) ) 
        )

### Clustered Result
### Let's look at clustered product 

* Cluster ID = 0 th

In [None]:
# The heading above and the 0/100/200/... sampling both refer to cluster 0;
# the call previously printed cluster 1.
printClusterMembers(0, topn=10)

* Cluster ID = 100 th

In [None]:
# First ten members of cluster 100.
printClusterMembers(cluster_id=100, topn=10)

* Cluster ID = 200 th

In [None]:
# First ten members of cluster 200.
printClusterMembers(cluster_id=200, topn=10)

* Cluster ID = 300 th

In [None]:
# First ten members of cluster 300.
printClusterMembers(cluster_id=300, topn=10)

* Cluster ID = 400 th

In [None]:
# First ten members of cluster 400.
printClusterMembers(cluster_id=400, topn=10)

* Cluster ID = 499 th

In [None]:
# First ten members of cluster 499.
printClusterMembers(cluster_id=499, topn=10)

### It looks goooood!! 

----
### Order time trend of clustered product.

* Extract representative keywords from each cluster.
* Representative keywords : 3 words and max 15 letters
* Sort by popular order hour 

In [None]:
# Three most frequent product-name words for every cluster id.
clustered_named_ds = clustered_ds.merge(product_ds, on='product_id')
clusterIdToKeywords = {}
for cid, sub_ds in clustered_named_ds.groupby('cid'):
    clusterIdToKeywords[cid] = popularWords(sub_ds.product_name, 3)

#### Hour of Day Trend Per cluster 

In [None]:
# Order-line count per product (rows) and hour of day (columns).
product_hod_ds = merge_order_product_ds.pivot_table(
    index='product_id',
    columns='order_hour_of_day',
    values='order_id',
    aggfunc=len,
    fill_value=0,
)

# Sum the hourly counts per cluster, find each cluster's busiest hour, and
# keep the cluster ids ordered by that hour.
cluster_hod_ds = (
    clustered_ds
    .merge(product_hod_ds, left_on='product_id', right_index=True)
    .groupby('cid')
    .sum()[np.arange(0,24)]
)
orderByHotHour = cluster_hod_ds.idxmax(axis=1).sort_values().index

In [None]:
sns.set(style="whitegrid", palette="colorblind", font_scale=1, rc={'font.family':'NanumGothic'} )

def drawHODCluster(ncols, nrows, startClusterNumber, step):
    """Draw a grid of per-cluster hour-of-day order profiles.

    One subplot is drawn per cluster, walking ``orderByHotHour`` from position
    ``startClusterNumber`` in increments of ``step``. Each product in the
    cluster is drawn as a line normalised by its own maximum hourly count.

    Args:
        ncols: number of subplot columns.
        nrows: number of subplot rows.
        startClusterNumber: first position in ``orderByHotHour`` to draw.
        step: position increment between consecutive subplots.
    """
    fig, axes = plt.subplots(ncols=ncols, nrows = nrows, figsize=(ncols*2.5,nrows*2), sharex=True, sharey=True)
    # Stop at the number of clusters actually available rather than a fixed
    # 500, so indexing cannot run past the end of orderByHotHour.
    cluster_total = len(orderByHotHour)

    for cid, ax  in enumerate(axes.flatten()):
        cid = startClusterNumber + (cid*step)
        if cid>=cluster_total:
            break
        # Translate the position into the actual cluster id.
        cid = orderByHotHour[cid]

        product_id_list = clustered_ds[clustered_ds.cid==cid].product_id.values
        tmp_ds = product_hod_ds.loc[product_id_list].T
        # idxmax returns the hour label (argmax returns a position), which is
        # what the label-based x-axis of the line plot needs.
        hot_hour = tmp_ds.sum(axis=1).idxmax()
        normalized_ds =(tmp_ds/tmp_ds.max())
        title = "{cid}th {n} products \n({keyword})".format(cid=cid, n=normalized_ds.shape[1],  keyword=clusterIdToKeywords[cid][:23])
        # Red lines when the busiest hour is before 13h, black otherwise.
        normalized_ds.plot(linewidth=.3, legend=False, alpha=.4, ax=ax, title=title, color='r' if hot_hour<13 else 'k')
        ax.plot((hot_hour,hot_hour),(1,0), '-.', linewidth=1, color='b')
        ax.text(hot_hour,0,"{h}h(hot)".format(h=hot_hour),color='b')

    fig.tight_layout()

In [None]:
# One figure per page of 6x4 subplots, showing every third cluster position.
ncols, nrows = 6, 4
step = 3
for n in range(0, 500, ncols * nrows * step):
    drawHODCluster(ncols, nrows, n, step)

#### Day of Week Trend Per cluster 

In [None]:
# Order-line count per product (rows) and day of week (columns).
product_dow_ds = merge_order_product_ds.pivot_table(index='product_id', columns='order_dow', values='order_id', aggfunc=len, fill_value=0)

# Cluster ids ordered by their busiest day. All seven day columns (0..6) are
# selected; arange(0, 6) left out day 6, so it could never be the hot day.
orderByHotDay = clustered_ds.merge(product_dow_ds, left_on='product_id', right_index=True)\
    .groupby('cid').sum()[np.arange(0,7)].idxmax(axis=1).sort_values().index

In [None]:
def drawDOWCluster(ncols, nrows, startClusterNumber, step):
    """Draw a grid of per-cluster day-of-week order profiles.

    One subplot is drawn per cluster, walking ``orderByHotDay`` from position
    ``startClusterNumber`` in increments of ``step``. Each product in the
    cluster is drawn as bars normalised by its own maximum daily count.

    Args:
        ncols: number of subplot columns.
        nrows: number of subplot rows.
        startClusterNumber: first position in ``orderByHotDay`` to draw.
        step: position increment between consecutive subplots.
    """
    sns.set(style="whitegrid", palette="colorblind", font_scale=1, rc={'font.family':'NanumGothic'} )
    week_day = "Sun Mon Tue Wed Thu Fri Sat".split()
    fig, axes = plt.subplots(ncols=ncols, nrows = nrows, figsize=(ncols*2.5,nrows*2), sharex=True, sharey=True)
    # Stop at the number of clusters actually available rather than a fixed
    # 500, so indexing cannot run past the end of orderByHotDay.
    cluster_total = len(orderByHotDay)

    for cid, ax  in enumerate(axes.flatten()):
        cid = startClusterNumber + (cid*step)
        if cid>=cluster_total:
            break
        # Translate the position into the actual cluster id.
        cid = orderByHotDay[cid]    
        product_id_list = clustered_ds[clustered_ds.cid==cid].product_id.values
        tmp_ds = product_dow_ds.loc[product_id_list].T
        # Position of the busiest day; bars are drawn at positions 0..6, so a
        # position is needed here. Taken from the raw values so the result
        # does not depend on the pandas version's Series.argmax semantics.
        hot_day = int(np.argmax(tmp_ds.sum(axis=1).values))
        normalized_ds =(tmp_ds/tmp_ds.max())
        normalized_ds.index = week_day
        title = "{cid}th \n({keyword})".format(cid=cid, keyword=clusterIdToKeywords[cid][:23])
        # Red bars when the busiest day is at position 0 or 6 (Sun/Sat in week_day).
        normalized_ds.plot(kind='bar', linewidth=.1, legend=False, alpha=.4, ax=ax, title=title, color='r' if hot_day in(0,6) else 'k')
        ax.plot((hot_day,hot_day),(1,0), '-.', linewidth=2, color='b')
        # ax.text(hot_day+.3,-.5,"{h}".format(h=week_day[hot_day]),color='b')
    
    fig.tight_layout()

In [None]:
# One figure per page of 6x4 subplots, showing every third cluster position.
ncols, nrows = 6, 4
step = 3
for n in range(0, 500, ncols * nrows * step):
    drawDOWCluster(ncols, nrows, n, step)