## https://www.kaggle.com/c/instacart-market-basket-analysis

- aisles.csv       

- order_products__prior.csv  

- orders.csv    

- sample_submission.csv

- departments.csv

- order_products__train.csv

- products.csv

### instructions:

> order_products_prior give the order information of all users in the history. order_products_train give the current order information of some users. You need to predict the current order of rest of users.

In [1]:
import numpy as np
import pandas as pd
import pickle

from tqdm import *

In [7]:
DATA_ROOT = "/media/felipe/ssd_vol/instacart/"
PICKLE_ROOT = DATA_ROOT+"pickles/"

In [3]:
aisles_df = pd.read_csv(DATA_ROOT+"aisles.csv")

In [4]:
last_items_ordered_df = pd.read_csv(DATA_ROOT+"order_products__train.csv")

In [5]:
orders_df = pd.read_csv(DATA_ROOT+"/orders.csv")

In [8]:
departments_df = pd.read_csv(DATA_ROOT+"/departments.csv")

In [9]:
products_df = pd.read_csv(DATA_ROOT+"/products.csv")

In [10]:
prior_items_ordered_df = pd.read_csv(DATA_ROOT+"order_products__prior.csv")

## aisles

In [11]:
aisles_df.sample(5)

Unnamed: 0,aisle_id,aisle
1,2,specialty cheeses
122,123,packaged vegetables fruits
81,82,baby accessories
105,106,hot dogs bacon sausage
44,45,candy chocolate


## departments

In [12]:
departments_df.head()

Unnamed: 0,department_id,department
0,1,frozen
1,2,other
2,3,bakery
3,4,produce
4,5,alcohol


## products

In [13]:
products_df.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [14]:
len(products_df)

49688

In [37]:
max(products_df["product_id"])

49688

## orders

In [15]:
orders_df.sample(10)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
2393447,2604564,144023,prior,4,3,8,4.0
1845846,955406,110802,prior,19,1,15,7.0
2771378,417795,167077,prior,6,2,11,9.0
3398956,3037081,204831,prior,6,4,11,30.0
2945289,3177063,177728,prior,1,1,14,
3000071,1306085,181009,prior,33,5,20,8.0
2672931,2777108,160913,prior,10,4,16,18.0
2693257,1518097,162212,prior,5,4,12,9.0
993269,2900805,59696,prior,5,2,17,4.0
1586355,596253,95241,prior,2,5,12,6.0


In [16]:
orders_df.groupby('eval_set').count()

Unnamed: 0_level_0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
eval_set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
prior,3214874,3214874,3214874,3214874,3214874,3008665
test,75000,75000,75000,75000,75000,75000
train,131209,131209,131209,131209,131209,131209


In [17]:
len(orders_df[orders_df['eval_set']=='test']['order_id'].unique())

75000

In [19]:
last_items_ordered_df.sample(10)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
1246417,3079001,15121,5,1
1066284,2636197,38480,5,1
960998,2370686,41540,1,1
981654,2422191,24307,4,1
820559,2023905,2452,10,0
716218,1760682,34358,5,1
1007210,2488407,13643,14,0
327530,797686,37646,11,1
1313742,3245330,15290,8,1
1314641,3247780,33065,1,1


## denormalizing

In [20]:
last_items_ordered_df = pd.merge(last_items_ordered_df,orders_df,on='order_id',how='left')

In [21]:
prior_items_ordered_df = pd.merge(prior_items_ordered_df,orders_df,on='order_id',how='left').drop('order_id',axis=1)

In [22]:
unique_user_ids = set(orders_df.groupby('user_id').groups.keys())

In [24]:
len(unique_user_ids)

206209

In [25]:
unique_product_ids = set(products_df['product_id'].values)

In [26]:
len(unique_product_ids)

49688

## what is the static reorder_factor for each user?

I.e. what percentage of products is reordered?

In [27]:
static_user_reorder_factors = prior_items_ordered_df.groupby('user_id').agg({'reordered': np.mean}).to_dict()['reordered']

## now do the same for each product

In [28]:
static_product_reorder_factors = prior_items_ordered_df.groupby('product_id').agg({'reordered': np.mean}).to_dict()['reordered']

### what are all products ever ordered by each user?

In [43]:
all_products_previously_ordered_by_each_user = dict()

for user_id in tqdm(unique_user_ids):
    prior_products = prior_items_ordered_df[prior_items_ordered_df['user_id']==user_id]['product_id'].values
    
    uniq = set(prior_products.tolist())
    
    all_products_previously_ordered_by_each_user[user_id] = uniq

100%|██████████| 206209/206209 [1:27:42<00:00, 42.00it/s]


FileNotFoundError: [Errno 2] No such file or directory: '/media/felipe/ssd_vol/instacart/pickles/all_products_previously_ordered_by_each_user.p'

In [45]:
pickle.dump(all_products_previously_ordered_by_each_user,open(PICKLE_ROOT+"all_products_previously_ordered_by_each_user.p","wb"))

In [49]:
max(all_products_previously_ordered_by_each_user.keys())

206209

In [59]:
def make_prediction(user_id,threshold):
    """
    returns a list of product_ids
    """
    
    user_reorder_factor = static_user_reorder_factors[user_id]
    
    product_ids_ordered_by_user = all_products_previously_ordered_by_each_user[user_id]
    
    product_factors = [static_product_reorder_factors[i] for i in product_ids_ordered_by_user]
    
    pred = list()
    
    for product_id in all_products_previously_ordered_by_each_user[user_id]:
        weighted = user_reorder_factor * static_product_reorder_factors[product_id]
    
        if weighted > threshold:
            pred.append(product_id)
    
    
    return pred

### build submission

In [60]:
test_orders = orders_df[orders_df['eval_set']=='test']

In [61]:
test_orders_and_users = test_orders[["order_id","user_id"]].values

In [62]:
test_orders_and_users.sort(axis=0)    

In [63]:
THRESHOLD = 0.30

submission_data = []

for row in test_orders_and_users:
    order_id = row[0]
    user_id = row[1]
    
    predicted_products = make_prediction(user_id,THRESHOLD)
    
    label_string = " ".join(str(prod_id) for prod_id in predicted_products)
    
    d = {'order_id':order_id, 'products': label_string if label_string else 'None'}
    
    submission_data.append(d)

df = pd.DataFrame(data=submission_data)

df.to_csv('submission.csv',index=False)


In [None]:
submission_data

### make labels (seq of products)

In [None]:
labels = np.zeros()

### first submission: chance of each product being re-ordered = user_factor * product_factor