## https://www.kaggle.com/c/instacart-market-basket-analysis

- aisles.csv       

- order_products__prior.csv  

- orders.csv    

- sample_submission.csv

- departments.csv

- order_products__train.csv

- products.csv

### instructions:

> order_products_prior give the order information of all users in the history. order_products_train give the current order information of some users. You need to predict the current order of rest of users.

In [1]:
import numpy as np
import pandas as pd
import pickle

from tqdm import *

In [2]:
DATA_ROOT = "/media/felipe/ssd_vol/instacart/"
PICKLE_ROOT = DATA_ROOT+"pickles/"

In [3]:
aisles_df = pd.read_csv(DATA_ROOT+"aisles.csv")

In [4]:
last_items_ordered_df = pd.read_csv(DATA_ROOT+"order_products__train.csv")

In [5]:
orders_df = pd.read_csv(DATA_ROOT+"/orders.csv")

In [6]:
departments_df = pd.read_csv(DATA_ROOT+"/departments.csv")

In [7]:
products_df = pd.read_csv(DATA_ROOT+"/products.csv")

In [8]:
prior_items_ordered_df = pd.read_csv(DATA_ROOT+"order_products__prior.csv")

## aisles

In [9]:
aisles_df.sample(5)

Unnamed: 0,aisle_id,aisle
84,85,food storage
42,43,buns rolls
44,45,candy chocolate
32,33,kosher foods
30,31,refrigerated


## departments

In [10]:
departments_df.head()

Unnamed: 0,department_id,department
0,1,frozen
1,2,other
2,3,bakery
3,4,produce
4,5,alcohol


## products

In [11]:
products_df.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [12]:
len(products_df)

49688

In [13]:
max(products_df["product_id"])

49688

## orders

In [14]:
orders_df.sample(10)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
518605,2142489,31241,prior,10,4,12,2.0
581611,423443,35038,prior,37,4,0,5.0
2418852,647634,145550,prior,20,1,14,0.0
2624026,2488674,157994,prior,18,2,20,8.0
1119412,1825504,67342,prior,12,0,17,30.0
1832762,1946913,110051,prior,4,6,14,10.0
674869,1202357,40696,prior,1,2,11,
765345,2644977,46006,prior,7,2,17,30.0
2343259,939619,141066,prior,26,6,19,5.0
278583,3190946,16833,prior,12,3,21,4.0


In [15]:
orders_df.groupby('eval_set').count()

Unnamed: 0_level_0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
eval_set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
prior,3214874,3214874,3214874,3214874,3214874,3008665
test,75000,75000,75000,75000,75000,75000
train,131209,131209,131209,131209,131209,131209


In [16]:
len(orders_df[orders_df['eval_set']=='test']['order_id'].unique())

75000

In [17]:
last_items_ordered_df.sample(10)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
795969,1960456,32265,2,1
1159530,2866121,31663,22,1
160794,392937,4920,4,1
1367458,3380687,26620,6,1
157835,385685,45066,5,1
1030393,2547202,26062,5,1
619392,1524104,13838,22,1
147520,360060,29180,6,0
232586,568873,29487,11,0
693833,1705657,24852,9,1


### users_df, derived

In [18]:
users_df = orders_df[["user_id"]]

In [19]:
users_df = users_df.drop_duplicates().reset_index().drop('index',axis=1)

In [20]:
users_df.head()

Unnamed: 0,user_id
0,1
1,2
2,3
3,4
4,5


## denormalizing

In [21]:
last_items_ordered_df = pd.merge(last_items_ordered_df,orders_df,on='order_id',how='left')

In [22]:
prior_items_ordered_df = pd.merge(prior_items_ordered_df,orders_df,on='order_id',how='left').drop('order_id',axis=1)

In [23]:
unique_user_ids = set(orders_df.groupby('user_id').groups.keys())

In [24]:
len(unique_user_ids)

206209

In [25]:
unique_product_ids = set(products_df['product_id'].values)

In [26]:
len(unique_product_ids)

49688

## what is the static reorder_factor for each user?

I.e. what percentage of products is reordered?

> since we need data from both the prior as from the last dataframes, we need to calculate sum and counts separately, because mean of means would not be correctly weighted.


In [27]:
users_df['reorder_cl'] = last_items_ordered_df.groupby('user_id')['reordered'].count()
users_df['reorder_sl'] = last_items_ordered_df.groupby('user_id')['reordered'].sum()

users_df['reorder_cp'] = prior_items_ordered_df.groupby('user_id')['reordered'].count()
users_df['reorder_sp'] = prior_items_ordered_df.groupby('user_id')['reordered'].sum()


users_df['user_reorder_factor'] = (users_df['reorder_sl'] + users_df['reorder_sp']) / (users_df['reorder_cl'] + users_df['reorder_cp'])

users_df.drop(['reorder_cl','reorder_sl','reorder_cp','reorder_sp'],axis=1,inplace=True)

In [28]:
users_df.head()

Unnamed: 0,user_id,user_reorder_factor
0,1,
1,2,0.728571
2,3,0.464602
3,4,
4,5,


In [39]:
to_save = users_df[["user_id","user_reorder_factor"]]
to_save.set_index("user_id",inplace=True)
pickle.dump(to_save.to_dict()['user_reorder_factor'],open(PICKLE_ROOT+"static_user_reorder_factors.p","wb"))

## now do the same for each product

In [29]:
# last orders
products_df['reorder_cl'] = last_items_ordered_df.groupby('product_id')['reordered'].count()
products_df['reorder_sl'] = last_items_ordered_df.groupby('product_id')['reordered'].sum()

# prior orders
products_df['reorder_cp'] = prior_items_ordered_df.groupby('product_id')['reordered'].count()
products_df['reorder_sp'] = prior_items_ordered_df.groupby('product_id')['reordered'].sum()

products_df['product_reorder_factor'] = (products_df['reorder_sl'] + products_df['reorder_sp']) / (products_df['reorder_cl'] + products_df['reorder_cp'])

products_df.drop(['reorder_cl','reorder_sl','reorder_cp','reorder_sp'],axis=1,inplace=True)

In [31]:
to_save = products_df[["product_id","product_reorder_factor"]]
to_save.set_index("product_id",inplace=True)
pickle.dump(to_save.to_dict()['product_reorder_factor'],open(PICKLE_ROOT+"static_product_reorder_factors.p","wb"))

### what are all products ever ordered by each user?

>  must use both prior and last orders


In [None]:
products_prior = prior_items_ordered_df[["product_id","user_id"]]
products_last = last_items_ordered_df[["product_id","user_id"]]

products_prior = pd.concat([products_prior,products_last])
products_by_user_df = products_prior.groupby('user_id')['product_id'].value_counts().to_frame()

In [None]:
grouped = prior_items_ordered_df[["product_id","user_id"]].groupby('user_id')['product_id']

In [None]:
groups = grouped.groups

In [None]:
del(groups)

In [None]:
del(grouped)

In [None]:
all_products_previously_ordered_by_each_user = dict()

for user_id in tqdm(unique_user_ids):
    prior_products = groups[user_id].values
    
    uniq = set(prior_products.tolist())
    
    all_products_previously_ordered_by_each_user[user_id] = uniq

In [None]:
all_products_previously_ordered_by_each_user[1]

In [None]:
pickle.dump(all_products_previously_ordered_by_each_user,open(PICKLE_ROOT+"all_products_previously_ordered_by_each_user.p","wb"))

In [None]:
max(all_products_previously_ordered_by_each_user.keys())