## https://www.kaggle.com/c/instacart-market-basket-analysis

- aisles.csv       

- order_products__prior.csv  

- orders.csv    

- sample_submission.csv

- departments.csv

- order_products__train.csv

- products.csv

### instructions:

> order_products_prior give the order information of all users in the history. order_products_train give the current order information of some users. You need to predict the current order of rest of users.

In [1]:
import numpy as np
import pandas as pd
import pickle

from tqdm import *

In [2]:
DATA_ROOT = "/media/felipe/ssd_vol/instacart/"
PICKLE_ROOT = DATA_ROOT+"pickles/"

In [3]:
aisles_df = pd.read_csv(DATA_ROOT+"aisles.csv")

In [4]:
last_items_ordered_df = pd.read_csv(DATA_ROOT+"order_products__train.csv")

In [5]:
orders_df = pd.read_csv(DATA_ROOT+"/orders.csv")

In [6]:
departments_df = pd.read_csv(DATA_ROOT+"/departments.csv")

In [7]:
products_df = pd.read_csv(DATA_ROOT+"/products.csv")

In [8]:
prior_items_ordered_df = pd.read_csv(DATA_ROOT+"order_products__prior.csv")

## aisles

In [9]:
aisles_df.sample(5)

Unnamed: 0,aisle_id,aisle
12,13,prepared meals
51,52,frozen breakfast
106,107,chips pretzels
7,8,bakery desserts
41,42,frozen vegan vegetarian


## departments

In [10]:
departments_df.head()

Unnamed: 0,department_id,department
0,1,frozen
1,2,other
2,3,bakery
3,4,produce
4,5,alcohol


## products

In [11]:
products_df.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [12]:
len(products_df)

49688

In [13]:
max(products_df["product_id"])

49688

## orders

In [14]:
orders_df.sample(10)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
2736887,2576163,164913,prior,9,3,21,1.0
593716,1729161,35777,prior,7,4,22,30.0
2655666,2714969,159846,prior,10,3,17,21.0
409650,1565,24659,prior,76,2,19,3.0
100591,2265142,6074,prior,16,6,13,0.0
1683053,2003857,101019,prior,24,0,13,21.0
862451,1599959,51828,prior,3,5,10,30.0
199169,835672,12060,prior,1,2,11,
2106827,1522270,126706,prior,8,0,13,3.0
2171914,2273341,130695,prior,3,3,5,30.0


In [15]:
orders_df.groupby('eval_set').count()

Unnamed: 0_level_0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
eval_set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
prior,3214874,3214874,3214874,3214874,3214874,3008665
test,75000,75000,75000,75000,75000,75000
train,131209,131209,131209,131209,131209,131209


In [16]:
len(orders_df[orders_df['eval_set']=='test']['order_id'].unique())

75000

In [17]:
last_items_ordered_df.sample(10)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
1329281,3284556,45447,2,0
368780,900251,3710,3,1
1089195,2691618,12060,2,1
815693,2012447,42193,1,1
1131579,2796457,47209,1,1
440485,1077506,4066,17,1
374851,915439,43183,6,0
739476,1820157,16387,1,1
76459,188755,28699,8,1
1004101,2480724,33754,21,0


### users_df, derived

In [18]:
users_df = orders_df[["user_id"]]

In [19]:
users_df = users_df.drop_duplicates().reset_index().drop('index',axis=1)

In [20]:
users_df.head()

Unnamed: 0,user_id
0,1
1,2
2,3
3,4
4,5


## denormalizing

In [21]:
last_items_ordered_df = pd.merge(last_items_ordered_df,orders_df,on='order_id',how='left')

In [22]:
prior_items_ordered_df = pd.merge(prior_items_ordered_df,orders_df,on='order_id',how='left')

In [23]:
unique_user_ids = set(orders_df.groupby('user_id').groups.keys())

In [24]:
len(unique_user_ids)

206209

In [25]:
unique_product_ids = set(products_df['product_id'].values)

In [26]:
len(unique_product_ids)

49688

## what is the static reorder_factor for each user?

I.e. what percentage of products is reordered?

> since we need data from both the prior as from the last dataframes, we need to calculate sum and counts separately, because mean of means would not be correctly weighted.


In [27]:
users_df['reorder_cl'] = last_items_ordered_df.groupby('user_id')['reordered'].count()
users_df['reorder_sl'] = last_items_ordered_df.groupby('user_id')['reordered'].sum()

users_df['reorder_cp'] = prior_items_ordered_df.groupby('user_id')['reordered'].count()
users_df['reorder_sp'] = prior_items_ordered_df.groupby('user_id')['reordered'].sum()


users_df['user_reorder_factor'] = (users_df['reorder_sl'] + users_df['reorder_sp']) / (users_df['reorder_cl'] + users_df['reorder_cp'])

users_df.drop(['reorder_cl','reorder_sl','reorder_cp','reorder_sp'],axis=1,inplace=True)

In [28]:
users_df.head()

Unnamed: 0,user_id,user_reorder_factor
0,1,
1,2,0.728571
2,3,0.464602
3,4,
4,5,


In [29]:
to_save = users_df[["user_id","user_reorder_factor"]]
to_save.set_index("user_id",inplace=True)
pickle.dump(to_save.to_dict()['user_reorder_factor'],open(PICKLE_ROOT+"static_user_reorder_factors.p","wb"))

## now do the same for each product

In [30]:
# last orders
products_df['reorder_cl'] = last_items_ordered_df.groupby('product_id')['reordered'].count()
products_df['reorder_sl'] = last_items_ordered_df.groupby('product_id')['reordered'].sum()

products_df['product_reorder_factor'] = (products_df['reorder_sl']) / (products_df['reorder_cl'])

products_df.drop(['reorder_cl','reorder_sl'],axis=1,inplace=True)

In [31]:
products_df.head(10)

Unnamed: 0,product_id,product_name,aisle_id,department_id,product_reorder_factor
0,1,Chocolate Sandwich Cookies,61,19,
1,2,All-Seasons Salt,104,13,0.644737
2,3,Robust Golden Unsweetened Oolong Tea,94,7,0.25
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,1.0
4,5,Green Chile Anytime Sauce,5,13,0.636364
5,6,Dry Nose Oil,11,11,1.0
6,7,Pure Coconut Water With Orange,98,7,
7,8,Cut Russet Potatoes Steam N' Mash,116,1,1.0
8,9,Light Strawberry Blueberry Yogurt,120,16,0.538462
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,0.4


In [32]:
product_factors = products_df[["product_id","product_reorder_factor"]]
product_factors.set_index("product_id",inplace=True)
pickle.dump(product_factors.to_dict()['product_reorder_factor'],open(PICKLE_ROOT+"static_product_reorder_factors.p","wb"))

In [33]:
product_factors.head()

Unnamed: 0_level_0,product_reorder_factor
product_id,Unnamed: 1_level_1
1,
2,0.644737
3,0.25
4,1.0
5,0.636364


### what are all products ever ordered by each user in the test set?

>  ONLY THE LAST ORDER MADE BY THE USER (except the order in the test set, obviously)


In [34]:
users_in_test_set = orders_df[orders_df['eval_set']=='test'][["user_id","order_id","order_number"]]

In [35]:
users_in_test_set.head(10)

Unnamed: 0,user_id,order_id,order_number
38,3,2774568,13
44,4,329954,6
53,6,1528013,4
96,11,1376945,8
102,12,1356845,6
152,15,2161313,23
159,16,1416320,7
217,19,1735923,10
222,20,1980631,5
272,22,139655,16


In [36]:
last_order_number_by_users_in_test_set = users_in_test_set.groupby("user_id")["order_number"].max().to_frame().reset_index()
last_order_number_by_users_in_test_set.head(10)

Unnamed: 0,user_id,order_number
0,3,13
1,4,6
2,6,4
3,11,8
4,12,6
5,15,23
6,16,7
7,19,10
8,20,5
9,22,16


In [37]:
second_last_order_number_for_users_in_test_set = last_order_number_by_users_in_test_set
second_last_order_number_for_users_in_test_set["order_number"] = second_last_order_number_for_users_in_test_set["order_number"]-1
second_last_order_number_for_users_in_test_set.head(10)

Unnamed: 0,user_id,order_number
0,3,12
1,4,5
2,6,3
3,11,7
4,12,5
5,15,22
6,16,6
7,19,9
8,20,4
9,22,15


In [38]:
second_to_last_orders_for_users_in_test_set = pd.merge(second_last_order_number_for_users_in_test_set,orders_df,on=('user_id','order_number'),how='left')
second_to_last_orders_for_users_in_test_set.head(10)
second_to_last_orders_for_users_in_test_set.drop(["order_number","eval_set","order_dow","order_hour_of_day","days_since_prior_order"],axis=1,inplace=True)

In [39]:
second_to_last_orders_for_users_in_test_set.head(10)

Unnamed: 0,user_id,order_id
0,3,1402502
1,4,2557754
2,6,998866
3,11,1468214
4,12,221248
5,15,487368
6,16,2000615
7,19,86918
8,20,2741696
9,22,2647850


In [40]:
products_prior = prior_items_ordered_df[["product_id","order_id","reordered"]]
products_last = last_items_ordered_df[["product_id","order_id","reordered"]]
all_products_and_orders = pd.concat([products_prior,products_last])

In [41]:
last_products_for_users_in_test_set = pd.merge(second_to_last_orders_for_users_in_test_set,all_products_and_orders,on='order_id',how='left')

In [42]:
last_products_for_users_in_test_set.sample(10)

Unnamed: 0,user_id,order_id,product_id,reordered
433880,114093,2307482,24850,0
69129,18444,469617,10782,0
484266,127597,1820917,27796,1
756616,199724,2728671,1700,1
547759,144336,2612443,29447,1
610187,160856,1402440,25513,0
741182,195788,855514,27715,1
409919,107873,90895,37849,0
42778,11541,1450035,45478,0
683637,180257,1107655,18370,0


## here is the difference: only consider stuff where reordered=1

In [43]:
last_products_for_users_in_test_set = last_products_for_users_in_test_set.query('reordered == 1')
last_products_for_users_in_test_set.drop(['order_id',"reordered"],axis=1,inplace=True)

In [44]:
last_products_for_users_in_test_set.head(10)

Unnamed: 0,user_id,product_id
0,3,39190
1,3,18599
2,3,23650
3,3,21903
4,3,47766
5,3,24810
14,11,33572
15,11,27959
18,11,8309
27,12,13176


In [45]:
orders_in_test_set = orders_df[orders_df["eval_set"]=="test"].drop(["eval_set","order_number","order_dow","order_hour_of_day","days_since_prior_order"],axis=1)

In [46]:
orders_in_test_set.head(10)

Unnamed: 0,order_id,user_id
38,2774568,3
44,329954,4
53,1528013,6
96,1376945,11
102,1356845,12
152,2161313,15
159,1416320,16
217,1735923,19
222,1980631,20
272,139655,22


In [47]:
submission = pd.merge(   
    pd.merge(
        orders_in_test_set,
        last_products_for_users_in_test_set,
        on='user_id',
        how='left'
    ).drop("user_id",axis=1),
    product_factors.reset_index(),
    on='product_id',
    how='left'
).fillna(
    -1 ## to keep the NaN
).groupby(
    "order_id"
)["product_id"].apply(
     lambda prod_ids: ' '.join([str(int(i)) for i in sorted(set(prod_ids)) ])
).to_frame().reset_index().replace(
    to_replace='-1', 
    value='None' ## adjust the NaNs
).sort_values("order_id")

In [48]:
submission.rename(columns={"product_id":"products"}).to_csv("just-repeat-sorted.csv",index=False,encoding='utf-8')