## https://www.kaggle.com/c/instacart-market-basket-analysis

- aisles.csv       

- order_products__prior.csv  

- orders.csv    

- sample_submission.csv

- departments.csv

- order_products__train.csv

- products.csv

### instructions:

> order_products__prior gives the historical orders of all users. order_products__train gives the current order of some users. You need to predict the current order of the remaining users.

### in addition to using the last reordered products by each user

### also calculate the reorder factor for each product and each user, among the products he/she has ever RE-ordered. Then take the top N products, sorted by reorder factor, in last month only

In [1]:
import numpy as np
import pandas as pd
import pickle

from tqdm import *

In [2]:
# Directory holding the competition CSV files; note the trailing slash.
DATA_ROOT = "/media/felipe/ssd_vol/instacart/"
# Directory under DATA_ROOT where pickled artefacts are written.
PICKLE_ROOT = "".join((DATA_ROOT, "pickles/"))

In [3]:
aisles_df = pd.read_csv(DATA_ROOT+"aisles.csv")

In [None]:
last_items_ordered_df = pd.read_csv(DATA_ROOT+"order_products__train.csv")

In [4]:
# DATA_ROOT already ends with "/", so the file name is appended without a
# leading slash (same form as the aisles / order_products loads).
orders_df = pd.read_csv(DATA_ROOT+"orders.csv")

In [5]:
# DATA_ROOT already ends with "/", so the file name is appended without a
# leading slash (same form as the aisles / order_products loads).
departments_df = pd.read_csv(DATA_ROOT+"departments.csv")

In [6]:
# DATA_ROOT already ends with "/", so the file name is appended without a
# leading slash (same form as the aisles / order_products loads).
products_df = pd.read_csv(DATA_ROOT+"products.csv")

In [7]:
prior_items_ordered_df = pd.read_csv(DATA_ROOT+"order_products__prior.csv")

## aisles

In [8]:
aisles_df.sample(5)

Unnamed: 0,aisle_id,aisle
95,96,lunch meat
92,93,breakfast bakery
114,115,water seltzer sparkling water
81,82,baby accessories
40,41,cat food care


## departments

In [9]:
departments_df.head()

Unnamed: 0,department_id,department
0,1,frozen
1,2,other
2,3,bakery
3,4,produce
4,5,alcohol


## products

In [10]:
products_df.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [11]:
len(products_df)

49688

In [12]:
max(products_df["product_id"])

49688

## orders

In [13]:
orders_df.sample(10)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
2192409,3065648,131995,prior,10,3,13,6.0
2729867,2680944,164471,prior,4,3,16,5.0
2424131,671341,145871,prior,14,5,15,6.0
1553850,3241134,93325,prior,41,4,7,4.0
3051077,75897,184039,prior,7,4,18,10.0
2620301,517596,157751,prior,11,4,14,12.0
1979772,1630848,118884,prior,49,4,17,4.0
2165570,239153,130292,prior,15,4,16,8.0
390847,2454542,23474,test,9,2,14,15.0
558223,1505128,33649,train,8,4,16,14.0


In [14]:
orders_df.groupby('eval_set').count()

Unnamed: 0_level_0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
eval_set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
prior,3214874,3214874,3214874,3214874,3214874,3008665
test,75000,75000,75000,75000,75000,75000
train,131209,131209,131209,131209,131209,131209


In [15]:
len(orders_df[orders_df['eval_set']=='test']['order_id'].unique())

75000

In [16]:
last_items_ordered_df.sample(10)

NameError: name 'last_items_ordered_df' is not defined

### users_df, derived

In [17]:
users_df = orders_df[["user_id"]]

In [18]:
# Collapse to one row per user and renumber the rows 0..n-1, discarding the
# old row labels instead of materialising them as an 'index' column first.
users_df = users_df.drop_duplicates().reset_index(drop=True)

In [19]:
users_df.head()

Unnamed: 0,user_id
0,1
1,2
2,3
3,4
4,5


## denormalizing

In [None]:
# last_items_ordered_df = pd.merge(last_items_ordered_df,orders_df,on='order_id',how='left')

In [None]:
# prior_items_ordered_df = pd.merge(prior_items_ordered_df,orders_df,on='order_id',how='left')

## what is the static reorder_factor for each Product, for each product ever re-ordered by each user, in the test set?

In [None]:
orders_df.head()

In [20]:
# One row per test-set order, keeping only its user_id.
# The orders table already carries user_id on every row, and users_df holds
# nothing but the de-duplicated user_id column, so the former left join
# against users_df contributed no information; select the column directly.
# reset_index(drop=True) reproduces the fresh 0..n-1 index the merge returned.
users_in_test_set_df = (
    orders_df.loc[orders_df["eval_set"] == "test", ["user_id"]]
    .reset_index(drop=True)
)

In [21]:
users_in_test_set_df.head()

Unnamed: 0,user_id
0,3
1,4
2,6
3,11
4,12


In [22]:
# Denormalise: attach the order-level columns from orders_df to every prior
# line item. The left join keeps every line item row.
prior_orders_and_items_df = prior_items_ordered_df.merge(
    orders_df,
    how='left',
    on='order_id',
)

In [24]:
# Narrow the denormalised prior line items down to users that have a test
# order; the left join is driven by the test-user list.
prior_orders_and_items_df = users_in_test_set_df.merge(
    prior_orders_and_items_df,
    how='left',
    on='user_id',
)


# Of those, keep only the line items flagged as re-orders.
prior_orders_and_reordered_items_df = prior_orders_and_items_df.query('reordered == 1')

In [25]:
prior_orders_and_reordered_items_df.head(20)

Unnamed: 0,user_id,order_id,product_id,add_to_cart_order,reordered,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
1,3,444309,21903,2,1,prior,2,3,19,9.0
5,3,444309,17668,6,1,prior,2,3,19,9.0
8,3,444309,32402,9,1,prior,2,3,19,9.0
9,3,521107,39190,1,1,prior,11,0,18,11.0
10,3,521107,47766,2,1,prior,11,0,18,11.0
11,3,521107,21903,3,1,prior,11,0,18,11.0
12,3,521107,43961,4,1,prior,11,0,18,11.0
13,3,521107,17668,5,1,prior,11,0,18,11.0
14,3,676467,18599,1,1,prior,10,3,16,17.0
15,3,676467,17668,2,1,prior,10,3,16,17.0


In [None]:
# One-hot encode the product_id of every re-ordered line item: one indicator
# column per distinct product_id, one row per line item. sparse=True asks
# pandas to store the indicator columns as sparse arrays.
labels_df = pd.get_dummies(prior_orders_and_reordered_items_df["product_id"],sparse=True)

In [33]:
labels_df

Unnamed: 0,34,196,651,694,1090,1359,1529,1654,1747,1819,...,48697,48745,48988,49175,49191,49215,49236,49383,49424,49683
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Peek at the re-ordered prior line items of test-set users.
# The frame is assigned as prior_orders_and_reordered_items_df; the longer
# "..._by_users_in_test_set" name used here before is not assigned anywhere
# in the visible notebook and raised NameError.
prior_orders_and_reordered_items_df.head()

In [None]:
# Number of line items in each prior order of one sample user (63469).
# Uses prior_orders_and_items_df: the "..._by_users_in_test_set" name this
# cell referenced before is not assigned anywhere in the visible notebook.
# The earlier join to products_df, the sort by order_number and the column
# drop did not influence a per-order row count, so they are omitted.
(
    prior_orders_and_items_df
    .query("user_id == 63469")
    .groupby("order_id")["order_id"]
    .count()
)

In [None]:
# df.groupby('A').agg({'B': ['min', 'max'], 'C': 'sum'})

In [None]:
# Per prior order: how many of its line items were re-orders
# (sum of the 0/1 `reordered` flag). Result is a one-column frame,
# indexed by order_id, with column "num_reordered".
# Uses prior_orders_and_items_df: the "..._by_users_in_test_set" name this
# cell referenced before is not assigned anywhere in the visible notebook.
# The earlier join to products_df / sort / column drop did not affect the
# per-order sum and are omitted.
num_reordered = (
    prior_orders_and_items_df
    .groupby("order_id")["reordered"]
    .sum()
    .to_frame("num_reordered")
)

num_reordered.head()

In [None]:
# For each order_number (1st order, 2nd order, ...): number of re-ordered
# line items ('sum' of the 0/1 flag) and total number of line items ('count').
# Columns come back as a two-level index: ('reordered', 'sum') and
# ('reordered', 'count').
# Uses prior_orders_and_items_df: the "..._by_users_in_test_set" name this
# cell referenced before is not assigned anywhere in the visible notebook.
# The join to products_df / sort / column drop did not affect these
# aggregates and are omitted; the 'sum' string alias replaces np.sum.
reorders_by_order_number = (
    prior_orders_and_items_df
    .groupby(["order_number"])
    .agg({'reordered': ['sum', 'count']})
)

In [None]:
reorders_by_order_number['reordered','sum'].to_frame()

In [None]:
# Share of line items that are re-orders, per order_number:
# re-ordered item count divided by total item count.
reorders_by_order_number["ratio"] = reorders_by_order_number[('reordered', 'sum')].div(
    reorders_by_order_number[('reordered', 'count')]
)

In [None]:
# Move order_number out of the index into a regular column, then compute the
# correlation between the order number and the per-order-number reorder ratio.
corr_df =  reorders_by_order_number.reset_index()
corr_df["order_number"].corr(corr_df["ratio"])

In [None]:
# `num_ordered` was used here without ever being assigned in the visible
# notebook. Build it as the counterpart of num_reordered: total number of
# line items per prior order, indexed by order_id.
# NOTE(review): assumes this is what num_ordered was meant to hold - confirm.
num_ordered = (
    prior_orders_and_items_df
    .groupby("order_id")["reordered"]
    .count()
    .to_frame("num_ordered")
)
# Fraction of each order's line items that were re-orders; both operands are
# indexed by order_id, so the division aligns per order.
reorder_fraction = num_reordered["num_reordered"] / num_ordered["num_ordered"]

In [None]:
reorder_fraction.to_frame()

In [None]:
# corr_df is the per-order_number aggregate and has no
# 'days_since_prior_order' or 'order_dow' column, so the previous lookup
# raised KeyError. Both columns exist on the order-level table, so correlate
# them there (one observation per order; rows where days_since_prior_order
# is missing are ignored by Series.corr).
# NOTE(review): assumes an order-level correlation was intended - confirm.
orders_df['days_since_prior_order'].corr(orders_df['order_dow'])

In [None]:
# last orders
products_df['reorder_cl'] = last_items_ordered_df.groupby('product_id')['reordered'].count()
products_df['reorder_sl'] = last_items_ordered_df.groupby('product_id')['reordered'].sum()

products_df['product_reorder_factor'] = (products_df['reorder_sl']) / (products_df['reorder_cl'])

products_df.drop(['reorder_cl','reorder_sl'],axis=1,inplace=True)

In [None]:
products_df.head(10)

In [None]:
# Frame of reorder factors indexed by product_id. set_index returns a new
# frame, so nothing is modified in place on a column slice of products_df.
product_factors = products_df[["product_id", "product_reorder_factor"]].set_index("product_id")

# Persist as a plain {product_id: reorder_factor} dict. The context manager
# guarantees the file is flushed and closed even if pickling fails.
with open(PICKLE_ROOT+"static_product_reorder_factors.p", "wb") as factors_file:
    pickle.dump(product_factors["product_reorder_factor"].to_dict(), factors_file)

In [None]:
product_factors.head()