## https://www.kaggle.com/c/instacart-market-basket-analysis

- aisles.csv       

- order_products__prior.csv  

- orders.csv    

- sample_submission.csv

- departments.csv

- order_products__train.csv

- products.csv

### instructions:

> `order_products__prior` gives the historical order information of all users. `order_products__train` gives the current order information of some users. You need to predict the current order of the remaining users.

In [1]:
import numpy as np
import pandas as pd
import pickle

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score

from tqdm import *

In [2]:
# Folder holding the competition CSV files. It ends with "/", so file names
# can be appended by plain string concatenation.
DATA_ROOT = "/media/felipe/ssd_vol/instacart/"
# Sub-folder of DATA_ROOT from which the pickled lookup tables are loaded.
PICKLE_ROOT = DATA_ROOT+"pickles/"

In [3]:
# Load the aisles table with pandas' default dtypes.
aisles_df = pd.read_csv(DATA_ROOT+"aisles.csv")

In [4]:
# Explicit narrow dtypes for the train order lines (presumably chosen to
# reduce memory use).
train_dtypes = {
    'order_id': np.int32,
    'product_id': np.int32,
    'add_to_cart_order': np.int16,
    'reordered': np.int8,
}
last_items_ordered_df = pd.read_csv(
    DATA_ROOT + "order_products__train.csv",
    dtype=train_dtypes,
)

In [5]:
# Load the orders table with explicit narrow dtypes.
# DATA_ROOT already ends with "/", so the file name is appended without a
# leading slash (the original produced ".../instacart//orders.csv").
# days_since_prior_order stays a float type because it contains missing values.
orders_df = pd.read_csv(DATA_ROOT+"orders.csv",dtype={'order_id': np.int32, 
                                                           'user_id': np.int32, 
                                                           'order_number': np.int32, 
                                                           'order_dow': np.int8, 
                                                           'order_hour_of_day': np.int8, 
                                                           'days_since_prior_order': np.float16})

In [6]:
# Load the departments table.
# DATA_ROOT already ends with "/", so the file name is appended without a leading slash.
departments_df = pd.read_csv(DATA_ROOT+"departments.csv")

In [7]:
# Load the products table.
# DATA_ROOT already ends with "/", so the file name is appended without a leading slash.
products_df = pd.read_csv(DATA_ROOT+"products.csv")

In [8]:
# Explicit narrow dtypes for the prior order lines (presumably chosen to
# reduce memory use).
prior_dtypes = {
    'order_id': np.int32,
    'product_id': np.int32,
    'add_to_cart_order': np.int16,
    'reordered': np.int8,
}
prior_items_ordered_df = pd.read_csv(
    DATA_ROOT + "order_products__prior.csv",
    dtype=prior_dtypes,
)

In [9]:
# Check that the requested column dtypes were applied on load.
prior_items_ordered_df.dtypes

order_id             int32
product_id           int32
add_to_cart_order    int16
reordered             int8
dtype: object

## aisles

In [10]:
# Show five randomly chosen rows of the aisles table.
aisles_df.sample(5)

Unnamed: 0,aisle_id,aisle
73,74,dish detergents
133,134,specialty wines champagnes
45,46,mint gum
24,25,soap
114,115,water seltzer sparkling water


## departments

In [11]:
# Show the first rows of the departments table.
departments_df.head()

Unnamed: 0,department_id,department
0,1,frozen
1,2,other
2,3,bakery
3,4,produce
4,5,alcohol


## products

In [12]:
# Show the first rows of the products table.
products_df.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [13]:
# Number of rows in the products table.
len(products_df)

49688

In [14]:
# Array of every product id in the products table; used further down as the
# label set the MultiLabelBinarizer is fitted on.
all_labels = products_df['product_id'].values

In [15]:
# Largest product id present in the products table.
products_df["product_id"].max()

49688

## orders

In [16]:
# Show ten randomly chosen rows of the orders table.
orders_df.sample(10)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
2159303,1684920,129886,test,15,4,20,6.0
320421,3147443,19403,train,7,0,16,29.0
1216656,1787890,73135,prior,3,0,11,4.0
716773,3142401,43160,prior,6,4,16,3.0
1540165,312662,92510,prior,26,3,11,1.0
1542601,257064,92649,prior,2,1,15,14.0
1840564,2867694,110506,prior,10,0,8,7.0
2471461,3271233,148728,prior,4,3,21,6.0
2064285,790129,124109,prior,1,1,12,
875904,974326,52651,prior,5,2,12,30.0


In [17]:
# Count the non-null values of every column for each eval_set value.
orders_df.groupby('eval_set').count()

Unnamed: 0_level_0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
eval_set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
prior,3214874,3214874,3214874,3214874,3214874,3008665
test,75000,75000,75000,75000,75000,75000
train,131209,131209,131209,131209,131209,131209


In [18]:
# Number of distinct order ids among the orders flagged as 'test'.
orders_df.loc[orders_df['eval_set'] == 'test', 'order_id'].nunique()

75000

In [19]:
# Show ten randomly chosen rows of the train order lines.
last_items_ordered_df.sample(10)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
201439,493809,49131,3,1
191307,468538,47209,7,0
1004839,2482721,21938,18,1
810369,1998739,25890,11,1
489353,1201659,41844,11,1
381471,930826,41708,4,0
222842,546053,43726,6,1
1164730,2878303,16848,7,1
768340,1891233,8518,21,0
461321,1129829,47209,5,1


## denormalizing

In [20]:
# Attach the order-level columns from orders_df to every train order line
# (left join on order_id keeps every order line).
last_items_ordered_df = last_items_ordered_df.merge(
    orders_df,
    how='left',
    on='order_id',
)

In [21]:
# Attach the order-level columns to every prior order line (left join on
# order_id), then discard the order_id column itself.
prior_items_ordered_df = (
    prior_items_ordered_df
    .merge(orders_df, how='left', on='order_id')
    .drop('order_id', axis=1)
)

In [22]:
# Set of every distinct user id in the orders table.
# Series.unique() is used instead of groupby(...).groups, which would build a
# row-index array per user only to throw everything but the keys away.
unique_user_ids = set(orders_df['user_id'].unique())

In [23]:
# Number of distinct users.
len(unique_user_ids)

206209

In [24]:
# Set of every distinct product id in the products table.
unique_product_ids = set(products_df['product_id'].values)

In [25]:
# Number of distinct products.
len(unique_product_ids)

49688

In [26]:
# Load the pre-computed lookup of previously ordered products, keyed by user id.
# The file is opened in a `with` block so the handle is closed after loading.
# NOTE: pickle can execute arbitrary code while loading; only load files you
# produced yourself.
with open(PICKLE_ROOT+"all_products_previously_ordered_by_each_user.p","rb") as pickle_file:
    all_products_previously_ordered_by_each_user = pickle.load(pickle_file)

In [27]:
# Load the pre-computed per-user reorder factors (indexed by user id).
# The file is opened in a `with` block so the handle is closed after loading.
# NOTE: pickle can execute arbitrary code while loading; only load files you
# produced yourself.
with open(PICKLE_ROOT+"static_user_reorder_factors.p","rb") as pickle_file:
    static_user_reorder_factors = pickle.load(pickle_file)

In [28]:
# Load the pre-computed per-product reorder factors (indexed by product id).
# The file is opened in a `with` block so the handle is closed after loading.
# NOTE: pickle can execute arbitrary code while loading; only load files you
# produced yourself.
with open(PICKLE_ROOT+"static_product_reorder_factors.p","rb") as pickle_file:
    static_product_reorder_factors = pickle.load(pickle_file)

In [37]:
# Spot-check the reorder factor stored under key 8.
static_product_reorder_factors[8]

0.41935483870967744

In [41]:
# Display the whole per-user lookup of previously ordered product ids.
all_products_previously_ordered_by_each_user

{1: {196,
  10258,
  10326,
  12427,
  13032,
  13176,
  14084,
  17122,
  25133,
  26088,
  26405,
  30450,
  35951,
  38928,
  39657,
  41787,
  46149,
  49235},
 2: {23,
  79,
  1559,
  2002,
  2573,
  3151,
  4071,
  4957,
  5212,
  5322,
  5450,
  5869,
  5907,
  7781,
  7963,
  8138,
  8296,
  8479,
  9124,
  9681,
  10305,
  12000,
  12258,
  13176,
  13351,
  13742,
  14306,
  14553,
  15841,
  16521,
  16589,
  16797,
  17224,
  17758,
  17872,
  18523,
  18961,
  19051,
  19057,
  19156,
  19240,
  20084,
  20574,
  20785,
  21150,
  21227,
  21376,
  21709,
  22124,
  22474,
  22559,
  22825,
  22829,
  22963,
  24768,
  24852,
  24954,
  24990,
  27344,
  27413,
  27737,
  27966,
  28874,
  28918,
  30489,
  30908,
  32052,
  32139,
  32792,
  33276,
  33754,
  33957,
  34688,
  35917,
  36287,
  36735,
  37646,
  38656,
  39877,
  39928,
  40198,
  40571,
  41787,
  42342,
  42356,
  44303,
  45066,
  45613,
  45948,
  46676,
  46886,
  47144,
  47209,
  47526,
  47553,
  

In [42]:
def make_prediction(user_id,threshold):
    """
    Predict which previously ordered products a user will order again.

    Only the products found in
    ``all_products_previously_ordered_by_each_user[user_id]`` are considered.
    A product is kept when the user's static reorder factor multiplied by the
    product's static reorder factor is strictly greater than ``threshold``.

    The function reads the module-level lookups
    ``static_user_reorder_factors``, ``static_product_reorder_factors`` and
    ``all_products_previously_ordered_by_each_user``; a ``KeyError`` is raised
    when ``user_id`` or one of the user's products is missing from them.

    returns a list of product_ids
    """
    user_reorder_factor = static_user_reorder_factors[user_id]

    return [
        product_id
        for product_id in all_products_previously_ordered_by_each_user[user_id]
        if user_reorder_factor * static_product_reorder_factors[product_id] > threshold
    ]

### get validation data

In [43]:
# The orders flagged as 'train' serve as the validation set.
is_train_order = orders_df['eval_set'] == 'train'
validation_orders = orders_df[is_train_order]
validation_order_ids = validation_orders["order_id"].values

In [45]:
# Map every validation order id to the array of product ids actually ordered.
#
# The order lines are grouped by order_id in a single pass; filtering the whole
# frame once per validation order would rescan every row for each order.
products_by_order = {
    grouped_order_id: product_ids.values
    for grouped_order_id, product_ids in last_items_ordered_df.groupby("order_id")["product_id"]
}

# Empty array with the product_id dtype, used for orders that have no rows.
no_products = last_items_ordered_df["product_id"].values[:0]

actual = dict()

for order_id in tqdm(validation_order_ids):
    actual[order_id] = products_by_order.get(order_id, no_products)
    


  0%|          | 0/131209 [00:00<?, ?it/s][A
  0%|          | 66/131209 [00:00<03:18, 659.43it/s][A
  0%|          | 145/131209 [00:00<03:09, 693.18it/s][A
  0%|          | 221/131209 [00:00<03:04, 711.36it/s][A
  0%|          | 299/131209 [00:00<02:59, 728.48it/s][A
  0%|          | 377/131209 [00:00<02:56, 741.98it/s][A
  0%|          | 455/131209 [00:00<02:54, 751.23it/s][A
  0%|          | 536/131209 [00:00<02:50, 767.72it/s][A
  0%|          | 617/131209 [00:00<02:47, 779.07it/s][A
  1%|          | 700/131209 [00:00<02:45, 790.54it/s][A
  1%|          | 780/131209 [00:01<02:44, 790.71it/s][A
  1%|          | 862/131209 [00:01<02:43, 796.66it/s][A
  1%|          | 942/131209 [00:01<02:43, 795.61it/s][A
  1%|          | 1024/131209 [00:01<02:42, 800.28it/s][A
  1%|          | 1105/131209 [00:01<02:42, 799.59it/s][A
  1%|          | 1185/131209 [00:01<02:42, 798.92it/s][A
  1%|          | 1267/131209 [00:01<02:41, 802.17it/s][A
  1%|          | 1348/131209 [00:01<02

### test predictions on validation data

In [46]:
# Fit the binarizer on a single sample that contains every product id, so that
# each product id becomes one known class for later transform() calls.
mlb = MultiLabelBinarizer()
mlb.fit_transform([all_labels])

array([[1, 1, 1, ..., 1, 1, 1]])

In [47]:
# Remove the prior order lines frame from the namespace.
del prior_items_ordered_df

In [48]:
# 2-D array with one row per validation order: column 0 is order_id, column 1 is user_id.
validation_orders_and_users = validation_orders[["order_id","user_id"]].values

In [49]:
# Cut-off passed to make_prediction as its threshold argument.
THRESHOLD = 0.25

# Predicted product ids for every validation order, keyed by order id.
predicted = {}

for order_id, user_id in validation_orders_and_users:
    predicted[order_id] = make_prediction(user_id, THRESHOLD)


In [50]:
# Score a random subset of 2000 validation orders, drawn without replacement.
sample_ids = np.random.choice(validation_order_ids, replace=False, size=2000)

f1s = []

for order_id in sample_ids:
    # Encode the true and the predicted products as flat indicator vectors
    # over the classes mlb was fitted on.
    truth_vector = mlb.transform([actual[order_id]]).ravel()
    guess_vector = mlb.transform([predicted[order_id]]).ravel()

    f1s.append(f1_score(truth_vector, guess_vector))

# Mean F1 over the sampled orders.
print(np.mean(f1s))

  'precision', 'predicted', average, warn_for)


0.0522413875688


### build submission

In [51]:
# Orders flagged as 'test' are the ones a submission must cover.
is_test_order = orders_df['eval_set'] == 'test'
test_orders = orders_df[is_test_order]

In [52]:
# 2-D array with one row per test order: column 0 is order_id, column 1 is user_id.
test_orders_and_users = test_orders[["order_id","user_id"]].values

In [53]:
# Order the rows by order_id (column 0) while keeping each row intact.
# An in-place ndarray.sort(axis=0) sorts every column independently, which
# would detach each order_id from its user_id and make every prediction use
# the wrong user.
test_orders_and_users = test_orders_and_users[test_orders_and_users[:, 0].argsort()]

In [54]:
# One dict per test order, holding the order id and its predicted products.
submission_data = []

for order_id, user_id in test_orders_and_users:
    # Space-separated product ids; an empty prediction is written as the
    # literal string 'None'.
    products_field = " ".join(map(str, make_prediction(user_id, THRESHOLD))) or 'None'

    submission_data.append({'order_id': order_id, 'products': products_field})

df = pd.DataFrame(data=submission_data)

# Write the submission file without the DataFrame index column.
df.to_csv('submission.csv', index=False)


In [None]:
# Display the list of submission rows built above.
submission_data