## https://www.kaggle.com/c/instacart-market-basket-analysis

- aisles.csv       

- order_products__prior.csv  

- orders.csv    

- sample_submission.csv

- departments.csv

- order_products__train.csv

- products.csv

### instructions:

> order_products_prior give the order information of all users in the history. order_products_train give the current order information of some users. You need to predict the current order of rest of users.

In [1]:
import numpy as np
import pandas as pd
import pickle

from tqdm import *

In [2]:
DATA_ROOT = "/media/felipe/ssd_vol/instacart/"
PICKLE_ROOT = DATA_ROOT+"pickle/"

In [3]:
aisles_df = pd.read_csv(DATA_ROOT+"aisles.csv")

In [4]:
last_items_ordered_df = pd.read_csv(DATA_ROOT+"order_products__train.csv")

In [5]:
orders_df = pd.read_csv(DATA_ROOT+"/orders.csv")

In [6]:
departments_df = pd.read_csv(DATA_ROOT+"/departments.csv")

In [7]:
products_df = pd.read_csv(DATA_ROOT+"/products.csv")

In [8]:
prior_items_ordered_df = pd.read_csv(DATA_ROOT+"order_products__prior.csv")

## aisles

In [9]:
aisles_df.sample(5)

Unnamed: 0,aisle_id,aisle
13,14,tofu meat alternatives
28,29,honeys syrups nectars
102,103,ice cream toppings
19,20,oral hygiene
20,21,packaged cheese


## departments

In [10]:
departments_df.head()

Unnamed: 0,department_id,department
0,1,frozen
1,2,other
2,3,bakery
3,4,produce
4,5,alcohol


## products

In [11]:
products_df.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [12]:
len(products_df)

49688

## orders

In [13]:
orders_df.sample(10)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
520921,2073290,31403,prior,2,1,21,8.0
3252982,328148,196080,prior,3,4,19,19.0
1806763,3249707,108506,prior,15,1,14,16.0
77075,348983,4705,prior,48,6,11,18.0
2345560,2604624,141190,prior,43,1,12,1.0
1017800,1892103,61174,prior,9,0,20,6.0
348737,2337249,21096,prior,6,0,11,14.0
3033180,3210775,183016,prior,1,3,11,
856701,1801466,51471,prior,9,5,17,8.0
1808613,3249971,108603,prior,10,5,18,30.0


In [14]:
orders_df.groupby('eval_set').count()

Unnamed: 0_level_0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
eval_set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
prior,3214874,3214874,3214874,3214874,3214874,3008665
test,75000,75000,75000,75000,75000,75000
train,131209,131209,131209,131209,131209,131209


In [15]:
len(orders_df[orders_df['eval_set']=='test']['order_id'].unique())

75000

In [16]:
last_items_ordered_df

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1
5,1,13176,6,0
6,1,47209,7,0
7,1,22035,8,1
8,36,39612,1,0
9,36,19660,2,1


## denormalizing

In [17]:
last_items_ordered_df = pd.merge(last_items_ordered_df,orders_df,on='order_id',how='left')

In [18]:
prior_items_ordered_df = pd.merge(prior_items_ordered_df,orders_df,on='order_id',how='left').drop('order_id',axis=1)

In [19]:
unique_user_ids = set(orders_df.groupby('user_id').groups.keys())

In [20]:
unique_user_ids

{1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185

In [21]:
len(unique_user_ids)

206209

In [22]:
unique_product_ids = set(products_df['product_id'].values)

In [23]:
len(unique_product_ids)

49688

## what is the static reorder_factor for each user?

I.e. what percentage of products is reordered?

> i must do count and them sum because the mean is split into the two datasets, and mean of means wouldn't be right

In [24]:
dict_prior = prior_items_ordered_df.groupby('user_id').agg({'reordered': [np.size, np.sum]}).to_dict()

In [25]:
dict_last = last_items_ordered_df.groupby('user_id').agg({'reordered': [np.size, np.sum]}).to_dict()

In [26]:
dict_count_prior= dict_prior[('reordered','size')]
dict_sum_prior = dict_prior[('reordered','sum')]

In [27]:
dict_count_last= dict_last[('reordered','size')]
dict_sum_last = dict_last[('reordered','sum')]

In [28]:
static_user_reorder_factors = dict()

for user_id in unique_user_ids:
    
    user_factor = 0
    
    count_prior = dict_count_prior.get(user_id,0)
    sum_prior = dict_sum_prior.get(user_id,0)
    
    count_last = dict_count_last.get(user_id,0)
    sum_last = dict_sum_last.get(user_id,0)

    denominator = (count_prior+count_last)
    
    if denominator == 0:
        static_reorder_factor = 0
    else:
        static_reorder_factor = (sum_prior+sum_last) / denominator
    
    static_user_reorder_factors[user_id] = static_reorder_factor

## now do the same for each product

In [29]:
dict_prior = prior_items_ordered_df.groupby('product_id').agg({'reordered': [np.size, np.sum]}).to_dict()

In [30]:
dict_last = last_items_ordered_df.groupby('product_id').agg({'reordered': [np.size, np.sum]}).to_dict()

In [31]:
dict_count_prior= dict_prior[('reordered','size')]
dict_sum_prior = dict_prior[('reordered','sum')]

In [32]:
dict_count_last= dict_last[('reordered','size')]
dict_sum_last = dict_last[('reordered','sum')]

In [33]:
static_product_reorder_factors = dict()

for product_id in unique_product_ids:
      
    count_prior = dict_count_prior.get(product_id,0)
    sum_prior = dict_sum_prior.get(product_id,0)
    
    count_last = dict_count_last.get(product_id,0)
    sum_last = dict_sum_last.get(product_id,0)

    denominator = (count_prior+count_last)
    
    if denominator == 0:
        static_reorder_factor = 0
    else:
        static_reorder_factor = (sum_prior+sum_last) / denominator
    
    static_product_reorder_factors[product_id] = static_reorder_factor

### what are all products ever ordered by each user?

> remember to use both prior and last orders


In [34]:
all_products_ordered_by_user = dict()

for user_id in tqdm(unique_user_ids):
    prior_products = prior_items_ordered_df[prior_items_ordered_df['user_id']==user_id]['product_id'].values
    last_products = last_items_ordered_df[last_items_ordered_df['user_id']==user_id]['product_id'].values
    
    uniq = set(prior_products.tolist()+last_products.tolist())
    
    all_products_ordered_by_user[user_id] = uniq
    
pickle.dump(all_products_ordered_by_user,open(PICKLE_ROOT+"all_products_ordered_by_user.p","wb"))            

  5%|▌         | 11119/206209 [04:51<1:22:46, 39.28it/s]

KeyboardInterrupt: 

            5%|▌         | 11119/206209 [05:10<1:30:43, 35.84it/s]

In [None]:
max(all_products_ordered_by_user.keys())

In [None]:
def make_prediction(user_id,threshold):
    """
    returns a list of product_ids
    """
    
    user_reorder_factor = static_user_reorder_factors[user_id]
    
    product_ids_ordered_by_user = all_products_ordered_by_user[user_id]
    
    product_factors = [static_product_reorder_factors[i] for i in product_ids_ordered_by_user]
    
    pred = list()
    
    for product_id in all_products_ordered_by_user[user_id]:
        weighted = user_reorder_factor * static_product_reorder_factors[product_id]
    
        if weighted > threshold:
            pred.append(product_id)
    
    
    return pred

### build submission

In [None]:
test_orders = orders_df[orders_df['eval_set']=='test']

In [None]:
test_orders_and_users = test_orders[["order_id","user_id"]].values

In [None]:
test_orders_and_users.sort(axis=0)    

### first submission: chance of each product being re-ordered = user_factor * product_factor

In [None]:
THRESHOLD = 0.30

submission_data = []

for row in test_orders_and_users:
    order_id = row[0]
    user_id = row[1]
    
    predicted_products = make_prediction(user_id,THRESHOLD)
    
    label_string = " ".join(str(prod_id) for prod_id in predicted_products)
    
    d = {'order_id':order_id, 'products': label_string if label_string else 'None'}
    
    submission_data.append(d)

df = pd.DataFrame(data=submission_data)

df.to_csv('submission.csv',index=False)
