## https://www.kaggle.com/c/instacart-market-basket-analysis

- aisles.csv       

- order_products__prior.csv  

- orders.csv    

- sample_submission.csv

- departments.csv

- order_products__train.csv

- products.csv

### instructions:

> order_products_prior give the order information of all users in the history. order_products_train give the current order information of some users. You need to predict the current order of rest of users.

In [68]:
import numpy as np
import pandas as pd
import pickle

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score

from tqdm import *

In [2]:
DATA_ROOT = "/media/felipe/ssd_vol/instacart/"
PICKLE_ROOT = DATA_ROOT+"pickles/"

In [5]:
aisles_df = pd.read_csv(DATA_ROOT+"aisles.csv")

In [54]:
last_items_ordered_df = pd.read_csv(DATA_ROOT+"order_products__train.csv")

In [7]:
orders_df = pd.read_csv(DATA_ROOT+"/orders.csv")

In [8]:
departments_df = pd.read_csv(DATA_ROOT+"/departments.csv")

In [9]:
products_df = pd.read_csv(DATA_ROOT+"/products.csv")

In [10]:
prior_items_ordered_df = pd.read_csv(DATA_ROOT+"order_products__prior.csv")

## aisles

In [11]:
aisles_df.sample(5)

Unnamed: 0,aisle_id,aisle
34,35,poultry counter
27,28,red wines
17,18,bulk dried fruits vegetables
57,58,frozen breads doughs
115,116,frozen produce


## departments

In [12]:
departments_df.head()

Unnamed: 0,department_id,department
0,1,frozen
1,2,other
2,3,bakery
3,4,produce
4,5,alcohol


## products

In [13]:
products_df.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [14]:
len(products_df)

49688

In [15]:
all_labels = products_df['product_id'].values

In [16]:
max(products_df["product_id"])

49688

## orders

In [17]:
orders_df.sample(10)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
1400420,1944729,84139,prior,6,6,8,9.0
3410955,1182755,205600,prior,6,0,16,14.0
156029,2163713,9383,train,9,1,13,30.0
19070,843517,1185,prior,5,6,10,24.0
1071380,2573149,64542,prior,36,5,16,17.0
3319134,1284511,200057,test,8,2,18,30.0
897000,1457448,53862,prior,47,2,23,12.0
1512601,3398689,90907,prior,4,3,16,16.0
3017063,3365821,182076,prior,8,0,15,7.0
2519766,444124,151649,prior,21,0,9,21.0


In [18]:
orders_df.groupby('eval_set').count()

Unnamed: 0_level_0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
eval_set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
prior,3214874,3214874,3214874,3214874,3214874,3008665
test,75000,75000,75000,75000,75000,75000
train,131209,131209,131209,131209,131209,131209


In [19]:
len(orders_df[orders_df['eval_set']=='test']['order_id'].unique())

75000

In [20]:
last_items_ordered_df.sample(10)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
1143012,2823405,30391,7,1
55401,135821,44142,14,0
659168,1621444,3843,6,1
1284277,3172170,7781,6,1
786842,1937094,47626,3,1
485611,1191598,41834,13,1
915314,2258643,28196,1,1
977575,2410699,47630,10,0
1280203,3162945,8048,3,1
1362949,3368452,36011,1,1


## denormalizing

In [55]:
last_items_ordered_df = pd.merge(last_items_ordered_df,orders_df,on='order_id',how='left')

In [22]:
prior_items_ordered_df = pd.merge(prior_items_ordered_df,orders_df,on='order_id',how='left').drop('order_id',axis=1)

In [23]:
unique_user_ids = set(orders_df.groupby('user_id').groups.keys())

In [24]:
len(unique_user_ids)

206209

In [25]:
unique_product_ids = set(products_df['product_id'].values)

In [26]:
len(unique_product_ids)

49688

In [27]:
all_products_previously_ordered_by_each_user = pickle.load(open(PICKLE_ROOT+"all_products_previously_ordered_by_each_user.p","rb"))

In [28]:
static_user_reorder_factors = pickle.load(open(PICKLE_ROOT+"static_user_reorder_factors.p","rb"))

In [29]:
static_product_reorder_factors = pickle.load(open(PICKLE_ROOT+"static_product_reorder_factors.p","rb"))

In [30]:
def make_prediction(user_id,threshold):
    """
    returns a list of product_ids
    """
    
    user_reorder_factor = static_user_reorder_factors[user_id]
    
    product_ids_ordered_by_user = all_products_previously_ordered_by_each_user[user_id]
    
    product_factors = [static_product_reorder_factors[i] for i in product_ids_ordered_by_user]
    
    pred = list()
    
    for product_id in all_products_previously_ordered_by_each_user[user_id]:
        weighted = user_reorder_factor * static_product_reorder_factors[product_id]
    
        if weighted > threshold:
            pred.append(product_id)
    
    
    return pred

### get validation data

In [62]:
validation_orders = orders_df[orders_df['eval_set']=='train']
validation_order_ids = validation_orders["order_id"].values

In [65]:
actual = dict()

for order_id in tqdm(validation_order_ids):
    ordered_items = last_items_ordered_df[last_items_ordered_df["order_id"]==order_id]["product_id"].values

    actual[order_id]= ordered_items
    

100%|██████████| 131209/131209 [03:09<00:00, 691.50it/s]


### test predictions on validation data

In [80]:
mlb = MultiLabelBinarizer()
mlb.fit_transform([all_labels])

array([[1, 1, 1, ..., 1, 1, 1]])

In [31]:
del(prior_items_ordered_df)

In [33]:
validation_orders_and_users = validation_orders[["order_id","user_id"]].values

In [84]:
THRESHOLD = 0.25

predicted = dict()

for row in validation_orders_and_users:
    order_id = row[0]
    user_id = row[1]
    
    predicted_products = make_prediction(user_id,THRESHOLD)
    
    predicted[order_id]= predicted_products


In [91]:
sample_ids = np.random.choice(validation_order_ids,size=2000,replace=False)


f1s = list()

for order_id in sample_ids:
    actual_binary_labels = mlb.transform([actual[order_id]]).ravel()
    predicted_binary_labels= mlb.transform([predicted[order_id]]).ravel()
  
    f1 = f1_score(actual_binary_labels,predicted_binary_labels)
    
    f1s.append(f1)
    
print(np.array(f1s).mean())    

  'precision', 'predicted', average, warn_for)


0.173002579583


### build submission

In [86]:
test_orders = orders_df[orders_df['eval_set']=='test']

In [87]:
test_orders_and_users = test_orders[["order_id","user_id"]].values

In [88]:
test_orders_and_users.sort(axis=0)    

In [89]:
submission_data = []

for row in test_orders_and_users:
    order_id = row[0]
    user_id = row[1]
    
    predicted_products = make_prediction(user_id,THRESHOLD)
    
    label_string = " ".join(str(prod_id) for prod_id in predicted_products)
    
    d = {'order_id':order_id, 'products': label_string if label_string else 'None'}
    
    submission_data.append(d)

df = pd.DataFrame(data=submission_data)

df.to_csv('submission.csv',index=False)


In [None]:
submission_data