## https://www.kaggle.com/c/instacart-market-basket-analysis

- aisles.csv       

- order_products__prior.csv  

- orders.csv    

- sample_submission.csv

- departments.csv

- order_products__train.csv

- products.csv

### instructions:

> order_products_prior give the order information of all users in the history. order_products_train give the current order information of some users. You need to predict the current order of rest of users.

In [1]:
import numpy as np
import pandas as pd
import pickle

from tqdm import *

In [2]:
DATA_ROOT = "/media/felipe/ssd_vol/instacart/"
PICKLE_ROOT = DATA_ROOT+"pickles/"

In [3]:
aisles_df = pd.read_csv(DATA_ROOT+"aisles.csv")

In [4]:
last_items_ordered_df = pd.read_csv(DATA_ROOT+"order_products__train.csv")

In [5]:
orders_df = pd.read_csv(DATA_ROOT+"/orders.csv")

In [6]:
departments_df = pd.read_csv(DATA_ROOT+"/departments.csv")

In [7]:
products_df = pd.read_csv(DATA_ROOT+"/products.csv")

In [8]:
prior_items_ordered_df = pd.read_csv(DATA_ROOT+"order_products__prior.csv")

## aisles

In [9]:
aisles_df.sample(5)

Unnamed: 0,aisle_id,aisle
133,134,specialty wines champagnes
8,9,pasta sauce
109,110,pickled goods olives
120,121,cereal
106,107,chips pretzels


## departments

In [10]:
departments_df.head()

Unnamed: 0,department_id,department
0,1,frozen
1,2,other
2,3,bakery
3,4,produce
4,5,alcohol


## products

In [11]:
products_df.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [12]:
len(products_df)

49688

In [13]:
max(products_df["product_id"])

49688

## orders

In [14]:
orders_df.sample(10)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
8489,1826055,525,prior,8,3,6,15.0
487304,1801845,29387,prior,23,1,10,7.0
1931291,818309,115988,prior,2,5,12,15.0
3257234,90351,196309,prior,40,1,16,9.0
2693017,2800131,162198,prior,15,4,10,7.0
1421147,1083507,85367,prior,6,5,11,7.0
2688875,2085203,161943,prior,25,0,16,7.0
1101021,3067385,66311,prior,20,0,21,10.0
2925364,1628370,176481,prior,23,3,11,5.0
2137055,242879,128533,prior,29,3,13,2.0


In [15]:
orders_df.groupby('eval_set').count()

Unnamed: 0_level_0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
eval_set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
prior,3214874,3214874,3214874,3214874,3214874,3008665
test,75000,75000,75000,75000,75000,75000
train,131209,131209,131209,131209,131209,131209


In [16]:
len(orders_df[orders_df['eval_set']=='test']['order_id'].unique())

75000

In [17]:
last_items_ordered_df.sample(10)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
1160653,2868883,34458,6,1
1036456,2562175,39275,11,0
1043921,2582540,23054,11,1
769170,1893026,34637,2,0
822197,2028060,14142,6,1
987819,2439033,7058,8,0
257133,627271,32553,4,1
483436,1184974,22124,2,1
851784,2103003,28204,7,1
71499,176435,27275,9,0


### users_df, derived

In [18]:
users_df = orders_df[["user_id"]]

In [19]:
users_df = users_df.drop_duplicates().reset_index().drop('index',axis=1)

In [20]:
users_df.head()

Unnamed: 0,user_id
0,1
1,2
2,3
3,4
4,5


## denormalizing

In [21]:
last_items_ordered_df = pd.merge(last_items_ordered_df,orders_df,on='order_id',how='left')

In [22]:
prior_items_ordered_df = pd.merge(prior_items_ordered_df,orders_df,on='order_id',how='left').drop('order_id',axis=1)

In [23]:
unique_user_ids = set(orders_df.groupby('user_id').groups.keys())

In [24]:
len(unique_user_ids)

206209

In [25]:
unique_product_ids = set(products_df['product_id'].values)

In [26]:
len(unique_product_ids)

49688

## what is the static reorder_factor for each user?

I.e. what percentage of products is reordered?

> since we need data from both the prior as from the last dataframes, we need to calculate sum and counts separately, because mean of means would not be correctly weighted.


In [27]:
users_df['reorder_cl'] = last_items_ordered_df.groupby('user_id')['reordered'].count()
users_df['reorder_sl'] = last_items_ordered_df.groupby('user_id')['reordered'].sum()

users_df['reorder_cp'] = prior_items_ordered_df.groupby('user_id')['reordered'].count()
users_df['reorder_sp'] = prior_items_ordered_df.groupby('user_id')['reordered'].sum()


users_df['user_reorder_factor'] = (users_df['reorder_sl'] + users_df['reorder_sp']) / (users_df['reorder_cl'] + users_df['reorder_cp'])

users_df.drop(['reorder_cl','reorder_sl','reorder_cp','reorder_sp'],axis=1,inplace=True)

In [28]:
users_df.head()

Unnamed: 0,user_id,user_reorder_factor
0,1,
1,2,0.728571
2,3,0.464602
3,4,
4,5,


In [29]:
to_save = users_df[["user_id","user_reorder_factor"]]
to_save.set_index("user_id",inplace=True)
pickle.dump(to_save.to_dict()['user_reorder_factor'],open(PICKLE_ROOT+"static_user_reorder_factors.p","wb"))

## now do the same for each product

In [30]:
# last orders
products_df['reorder_cl'] = last_items_ordered_df.groupby('product_id')['reordered'].count()
products_df['reorder_sl'] = last_items_ordered_df.groupby('product_id')['reordered'].sum()

# prior orders
products_df['reorder_cp'] = prior_items_ordered_df.groupby('product_id')['reordered'].count()
products_df['reorder_sp'] = prior_items_ordered_df.groupby('product_id')['reordered'].sum()

products_df['product_reorder_factor'] = (products_df['reorder_sl'] + products_df['reorder_sp']) / (products_df['reorder_cl'] + products_df['reorder_cp'])

products_df.drop(['reorder_cl','reorder_sl','reorder_cp','reorder_sp'],axis=1,inplace=True)

In [63]:
product_factors = products_df[["product_id","product_reorder_factor"]]
product_factors.set_index("product_id",inplace=True)
pickle.dump(to_save.to_dict()['product_reorder_factor'],open(PICKLE_ROOT+"static_product_reorder_factors.p","wb"))

In [84]:
product_factors.head()

Unnamed: 0_level_0,product_reorder_factor
product_id,Unnamed: 1_level_1
1,
2,0.614627
3,0.138298
4,0.738516
5,0.458689


### what are all products ever ordered by each user in the test set?

>  must use both prior and last orders


In [92]:
users_in_test_set = orders_df[orders_df['eval_set']=='test'][["user_id","order_id"]]

In [96]:
users_in_test_set

Unnamed: 0,user_id,order_id
38,3,2774568
44,4,329954
53,6,1528013
96,11,1376945
102,12,1356845
152,15,2161313
159,16,1416320
217,19,1735923
222,20,1980631
272,22,139655


In [34]:
products_prior = prior_items_ordered_df[["product_id","user_id"]]
products_last = last_items_ordered_df[["product_id","user_id"]]

all_products_and_users = pd.concat([products_prior,products_last])

In [35]:
products_and_users_in_test_set = pd.merge(users_in_test_set,all_products_and_users,on='user_id',how='left')

In [36]:
products_and_users_in_test_set

Unnamed: 0,user_id,product_id
0,3,38596
1,3,21903
2,3,248
3,3,40604
4,3,8021


In [1]:
THRESHOLD = 0.60

submission = pd.merge(
    pd.merge(
        products_and_users_in_test_set,
        product_factors.reset_index(),
        on='product_id',
        how='left'
    ).query(
        "product_reorder_factor > {}".format(THRESHOLD)
    ).groupby(
        'user_id'
    )["product_id"].apply(
     lambda prod_ids: ' '.join([str(int(i)) for i in sorted(set(prod_ids))])
    ).to_frame().reset_index(),
    users_in_test_set,
    how='right',
    on='user_id'
).sort_values(
"order_id"
).fillna("None")

NameError: name 'pd' is not defined

In [2]:
submission = submission.reset_index().drop(["index","user_id"],axis=1).rename(columns={"product_id":"products"})

NameError: name 'submission' is not defined

In [163]:
cols = submission.columns.tolist()

In [164]:
submission = submission[cols[-1:]+cols[:-1]]

In [165]:
submission.to_csv("submission.csv",index=False)