## https://www.kaggle.com/c/instacart-market-basket-analysis

- aisles.csv       

- order_products__prior.csv  

- orders.csv    

- sample_submission.csv

- departments.csv

- order_products__train.csv

- products.csv

### instructions:

> `order_products__prior` gives the order history of all users. `order_products__train` gives the current (most recent) order for some of the users. You need to predict the current order for the remaining users.

In [1]:
import numpy as np
import pandas as pd
import pickle

from sklearn.metrics import f1_score
from tqdm import *

In [2]:
# Directory holding the competition CSV files. The value ends with "/" so
# file names can be appended directly (DATA_ROOT+"aisles.csv").
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
DATA_ROOT = "/media/felipe/ssd_vol/instacart/"
# Sub-directory under DATA_ROOT; presumably for pickled intermediates — not used in the cells shown here.
PICKLE_ROOT = DATA_ROOT+"pickles/"

In [5]:
aisles_df = pd.read_csv(DATA_ROOT+"aisles.csv")

In [6]:
last_items_ordered_df = pd.read_csv(DATA_ROOT+"order_products__train.csv")

In [7]:
# DATA_ROOT already ends with "/", so the file name is appended without a
# leading slash (avoids a "//" in the path, consistent with the other loads).
orders_df = pd.read_csv(DATA_ROOT+"orders.csv")

In [8]:
# DATA_ROOT already ends with "/", so the file name is appended without a
# leading slash (avoids a "//" in the path, consistent with the other loads).
departments_df = pd.read_csv(DATA_ROOT+"departments.csv")

In [9]:
# DATA_ROOT already ends with "/", so the file name is appended without a
# leading slash (avoids a "//" in the path, consistent with the other loads).
products_df = pd.read_csv(DATA_ROOT+"products.csv")

In [10]:
prior_items_ordered_df = pd.read_csv(DATA_ROOT+"order_products__prior.csv")

## aisles

In [11]:
aisles_df.sample(5)

Unnamed: 0,aisle_id,aisle
59,60,trash bags liners
125,126,feminine care
130,131,dry pasta
26,27,beers coolers
60,61,cookies cakes


## departments

In [12]:
departments_df.head()

Unnamed: 0,department_id,department
0,1,frozen
1,2,other
2,3,bakery
3,4,produce
4,5,alcohol


## products

In [13]:
products_df.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [14]:
len(products_df)

49688

In [15]:
max(products_df["product_id"])

49688

## orders

In [16]:
orders_df.sample(10)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
426721,283457,25636,prior,1,3,16,
981448,1757189,59009,prior,12,6,16,7.0
384558,530201,23121,prior,11,1,15,10.0
1686509,2822888,101228,prior,6,4,11,8.0
1866967,2364437,112035,prior,9,0,11,19.0
156145,2703981,9396,train,18,0,15,7.0
2724723,652798,164118,prior,46,1,11,9.0
2544056,1491721,153066,prior,21,3,7,29.0
2840839,588694,171427,prior,3,3,18,12.0
94129,2395840,5695,prior,4,4,23,5.0


In [98]:
orders_df[orders_df['eval_set']=='train'].sort_values('order_id').head(10)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
1868044,1,112108,train,4,4,10,9.0
1322255,36,79431,train,23,6,18,30.0
709772,38,42756,train,6,6,16,24.0
284948,96,17227,train,7,6,20,30.0
941403,98,56463,train,41,3,8,14.0
2079114,112,125030,train,5,5,14,26.0
3022377,170,182389,train,7,0,13,14.0
1644272,218,98711,train,12,0,21,17.0
848638,226,51011,train,4,0,12,30.0
2597184,349,156353,train,9,3,16,30.0


In [17]:
orders_df.groupby('eval_set').count()

Unnamed: 0_level_0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
eval_set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
prior,3214874,3214874,3214874,3214874,3214874,3008665
test,75000,75000,75000,75000,75000,75000
train,131209,131209,131209,131209,131209,131209


In [18]:
last_items_ordered_df.sample(10)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
197907,484514,35898,15,1
754521,1857587,45106,2,0
825868,2036339,30183,2,1
519305,1275388,21616,14,1
379427,925844,3454,33,0
651520,1603023,3990,8,0
653088,1606785,27695,7,0
612338,1507172,41844,10,1
1072972,2652127,9275,18,1
484212,1187354,4516,7,1


### users_df, derived

In [19]:
users_df = orders_df[["user_id"]]

In [20]:
users_df = users_df.drop_duplicates().reset_index().drop('index',axis=1)

In [21]:
users_df.head()

Unnamed: 0,user_id
0,1
1,2
2,3
3,4
4,5


In [39]:
user_ids_in_validation_set = orders_df[orders_df["eval_set"]=="train"].drop_duplicates()[["user_id"]]

In [82]:
user_ids_in_validation_set.head(20)

Unnamed: 0,user_id
10,1
25,2
49,5
74,7
78,8
82,9
88,10
115,13
129,14
200,17


## function

In [22]:
def make_validation_actual():
    """Build the ground-truth basket for every order in the validation set.

    Reads the module-level ``last_items_ordered_df`` (loaded from
    ``order_products__train.csv``) and groups its rows by ``order_id``.

    An earlier version first left-joined this frame against the ``train``
    rows of ``orders_df``.  A left join keeps every left-hand row, both the
    grouping key and the grouped column come from the left frame, and
    duplicates are removed by the ``set`` below, so that join could not
    change the result and has been dropped.

    Returns:
        dict: ``order_id`` -> sorted list of the distinct ``product_id``
        values recorded for that order.
    """
    return (
        last_items_ordered_df
        .groupby("order_id")["product_id"]
        .apply(lambda prod_ids: sorted(set(prod_ids)))
        .to_dict()
    )


In [23]:
def make_validation_predictions_last_reordered():
    """Predict each validation order from the order placed just before it.

    For every user with a ``train`` row in ``orders_df``, the order whose
    ``order_number`` is one less than the validation order's is looked up,
    and its rows in ``prior_items_ordered_df`` with ``reordered == 1`` become
    the prediction.

    Returns:
        dict: validation ``order_id`` -> sorted list of distinct predicted
        ``product_id`` values, or the string ``"None"`` when no such
        product exists for the user.
    """
    is_validation = orders_df["eval_set"] == "train"
    validation_orders = orders_df.loc[
        is_validation, ["user_id", "order_id", "order_number"]
    ]

    # order_number of the validation order, one row per user
    validation_order_numbers = (
        validation_orders.groupby("user_id")["order_number"]
        .max()
        .to_frame()
        .reset_index()
    )

    # Step back one order and fetch that order's row from orders_df.
    previous_order_numbers = validation_order_numbers.assign(
        order_number=validation_order_numbers["order_number"] - 1
    )
    previous_orders = previous_order_numbers.merge(
        orders_df, on=["user_id", "order_number"], how="left"
    )

    # Products of that order, restricted to rows with reordered == 1.
    reordered_rows = previous_orders.merge(
        prior_items_ordered_df, on="order_id", how="left"
    ).query("reordered == 1")
    products_per_user = (
        reordered_rows.groupby("user_id")["product_id"]
        .apply(lambda prod_ids: sorted(set(prod_ids)))
        .to_frame()
        .reset_index()
    )

    # Right join so every validation order gets an entry; users without any
    # qualifying product end up as NaN and are turned into "None".
    per_order = products_per_user.merge(
        validation_orders, on="user_id", how="right"
    )[["order_id", "product_id"]]

    return per_order.set_index("order_id").fillna("None").to_dict()["product_id"]
    

In [24]:
def make_validation_predictions_last_TWO_reordered():
    """Predict each validation order from the two orders placed before it.

    For every user with a ``train`` row in ``orders_df``, the orders whose
    ``order_number`` is one and two less than the validation order's are
    looked up.  The prediction is the union of their rows in
    ``prior_items_ordered_df`` with ``reordered == 1``.

    Returns:
        dict: validation ``order_id`` -> sorted list of distinct predicted
        ``product_id`` values, or the string ``"None"`` when no such
        product exists for the user.
    """
    is_validation = orders_df["eval_set"] == "train"
    validation_orders = orders_df.loc[
        is_validation, ["user_id", "order_id", "order_number"]
    ]

    # order_number of the validation order, one row per user
    validation_order_numbers = (
        validation_orders.groupby("user_id")["order_number"]
        .max()
        .to_frame()
        .reset_index()
    )

    # Fetch the orders 1 and 2 steps before the validation order.
    earlier_orders = []
    for steps_back in (1, 2):
        shifted_numbers = validation_order_numbers.assign(
            order_number=validation_order_numbers["order_number"] - steps_back
        )
        earlier_orders.append(
            shifted_numbers.merge(
                orders_df, on=["user_id", "order_number"], how="left"
            )
        )
    last_2_orders_for_each_user = pd.concat(earlier_orders)

    # Products of those orders, restricted to rows with reordered == 1.
    reordered_rows = last_2_orders_for_each_user.merge(
        prior_items_ordered_df, on="order_id", how="left"
    ).query("reordered == 1")
    products_per_user = (
        reordered_rows.groupby("user_id")["product_id"]
        .apply(lambda prod_ids: sorted(set(prod_ids)))
        .to_frame()
        .reset_index()
    )

    # Right join so every validation order gets an entry; users without any
    # qualifying product end up as NaN and are turned into "None".
    per_order = products_per_user.merge(
        validation_orders, on="user_id", how="right"
    )[["order_id", "product_id"]]

    return per_order.set_index("order_id").fillna("None").to_dict()["product_id"]

In [25]:
def make_validation_predictions_last_THREE_reordered():
    """Predict each validation order from the three orders placed before it.

    For every user with a ``train`` row in ``orders_df``, the orders whose
    ``order_number`` is one, two and three less than the validation order's
    are looked up.  The prediction is the union of their rows in
    ``prior_items_ordered_df`` with ``reordered == 1``.

    Returns:
        dict: validation ``order_id`` -> sorted list of distinct predicted
        ``product_id`` values, or the string ``"None"`` when no such
        product exists for the user.
    """
    is_validation = orders_df["eval_set"] == "train"
    validation_orders = orders_df.loc[
        is_validation, ["user_id", "order_id", "order_number"]
    ]

    # order_number of the validation order, one row per user
    validation_order_numbers = (
        validation_orders.groupby("user_id")["order_number"]
        .max()
        .to_frame()
        .reset_index()
    )

    # Fetch the orders 1, 2 and 3 steps before the validation order.
    earlier_orders = []
    for steps_back in (1, 2, 3):
        shifted_numbers = validation_order_numbers.assign(
            order_number=validation_order_numbers["order_number"] - steps_back
        )
        earlier_orders.append(
            shifted_numbers.merge(
                orders_df, on=["user_id", "order_number"], how="left"
            )
        )
    last_orders_for_each_user = pd.concat(earlier_orders)

    # Products of those orders, restricted to rows with reordered == 1.
    reordered_rows = last_orders_for_each_user.merge(
        prior_items_ordered_df, on="order_id", how="left"
    ).query("reordered == 1")
    products_per_user = (
        reordered_rows.groupby("user_id")["product_id"]
        .apply(lambda prod_ids: sorted(set(prod_ids)))
        .to_frame()
        .reset_index()
    )

    # Right join so every validation order gets an entry; users without any
    # qualifying product end up as NaN and are turned into "None".
    per_order = products_per_user.merge(
        validation_orders, on="user_id", how="right"
    )[["order_id", "product_id"]]

    return per_order.set_index("order_id").fillna("None").to_dict()["product_id"]

In [135]:
def make_validation_predictions_last_AND_second_last_reordered():
    """Predict products that show up in both of the two preceding orders.

    For every user with a ``train`` row in ``orders_df``, the orders whose
    ``order_number`` is one and two less than the validation order's are
    looked up.  Their rows in ``prior_items_ordered_df`` with
    ``reordered == 1`` are counted per ``(user_id, product_id)``, and only
    pairs counted exactly twice are kept as predictions.

    Returns:
        dict: validation ``order_id`` -> sorted list of distinct predicted
        ``product_id`` values (as ints), or the string ``"None"`` when the
        user has no qualifying product.
    """
    is_validation = orders_df["eval_set"] == "train"
    validation_orders = orders_df.loc[
        is_validation, ["user_id", "order_id", "order_number"]
    ]

    # order_number of the validation order, one row per user
    validation_order_numbers = (
        validation_orders.groupby("user_id")["order_number"]
        .max()
        .to_frame()
        .reset_index()
    )

    # Fetch the orders 1 and 2 steps before the validation order.
    earlier_orders = []
    for steps_back in (1, 2):
        shifted_numbers = validation_order_numbers.assign(
            order_number=validation_order_numbers["order_number"] - steps_back
        )
        earlier_orders.append(
            shifted_numbers.merge(
                orders_df, on=["user_id", "order_number"], how="left"
            )
        )
    last_2_orders_for_each_user = pd.concat(earlier_orders)

    # Count, per (user, product), how many reordered rows the two orders hold
    # and keep the pairs that were counted twice.
    reordered_rows = last_2_orders_for_each_user.merge(
        prior_items_ordered_df, on="order_id", how="left"
    ).query("reordered == 1")
    rows_per_user_product = reordered_rows.groupby(
        ["user_id", "product_id"]
    )[["order_id"]].count()
    users_and_product_ids_in_both_last_orders = (
        rows_per_user_product.query("order_id == 2")
        .reset_index()
        .drop("order_id", axis=1)
    )

    # Left join keeps validation orders of users without any qualifying
    # product; their product_id is NaN.
    order_ids_and_products = orders_df[is_validation].merge(
        users_and_product_ids_in_both_last_orders, on="user_id", how="left"
    )

    def format_result(product_ids):
        # A NaN means the left join found nothing for this order.
        if pd.isnull(product_ids).any():
            return "None"
        # The join turned the ids into floats; cast back to int.
        return sorted({int(prod_id) for prod_id in product_ids})

    return (
        order_ids_and_products.groupby("order_id")["product_id"]
        .apply(format_result)
        .to_dict()
    )

In [100]:
def evaluate_f1(actual_dict, predicted_dict, num_samples):
    """Return the mean per-order F1 score over a random sample of orders.

    The F1 of one order is ``2 * TP / (|actual| + |predicted|)`` computed on
    the sets of product ids, which is the binary F1 of the corresponding
    0/1 indicator vectors.

    An earlier version built indicator vectors of length
    ``len(products_df)`` and indexed them by product id.  Product ids start
    at 1, so the largest id fell one past the end of the vector and raised
    ``IndexError``.  Working on sets avoids that, needs no global product
    table, and is much cheaper than scoring two dense vectors per order.

    Args:
        actual_dict: order id -> iterable of actual product ids, or the
            string ``"None"`` for an empty basket.
        predicted_dict: order id -> iterable of predicted product ids, or
            the string ``"None"``.  Must have exactly the keys of
            ``actual_dict``.
        num_samples: number of distinct orders to sample (without
            replacement).

    Returns:
        The mean of the sampled per-order F1 scores (NumPy float).

    Raises:
        AssertionError: if the two dicts do not share the same key set.
        ValueError: raised by ``np.random.choice`` when ``num_samples``
            exceeds the number of orders.
    """
    assert(len(actual_dict)==len(predicted_dict))

    keys_actual = set(actual_dict.keys())
    keys_pred = set(predicted_dict.keys())

    assert(keys_actual == keys_pred)

    def as_product_set(products):
        # "None" is encoded as the pseudo product 0 so that a "None"
        # prediction for a "None" basket counts as a perfect match.
        if isinstance(products, str) and products == "None":
            return {0}
        return set(products)

    sampled_keys = np.random.choice(
        np.array(list(keys_actual)), size=num_samples, replace=False
    )

    running_f1 = []
    for key in sampled_keys:
        actual_products = as_product_set(actual_dict[key])
        predicted_products = as_product_set(predicted_dict[key])

        true_positives = len(actual_products & predicted_products)
        denominator = len(actual_products) + len(predicted_products)

        # Both sets empty -> no positives at all; score it 0.0.
        running_f1.append(
            2.0 * true_positives / denominator if denominator else 0.0
        )

    return np.array(running_f1).mean()

In [140]:
# validation

actual = make_validation_actual()
predicted = make_validation_predictions_last_TWO_reordered()

In [142]:
evaluate_f1(actual,predicted,3000)

0.2885161166246904

In [None]:
# Test orders with the descriptive columns removed.
orders_in_test_set = orders_df[orders_df["eval_set"] == "test"].drop(
    columns=[
        "eval_set",
        "order_number",
        "order_dow",
        "order_hour_of_day",
        "days_since_prior_order",
    ]
)

# One row per test order: who placed it and its position in the user's history.
users_in_test_set = orders_df.loc[
    orders_df["eval_set"] == "test", ["user_id", "order_id", "order_number"]
]
last_order_number_by_users_in_test_set = (
    users_in_test_set.groupby("user_id")["order_number"]
    .max()
    .to_frame()
    .reset_index()
)

# Order placed one step before the test order.
second_last_order_numbers_by_users_in_test_set = last_order_number_by_users_in_test_set.assign(
    order_number=last_order_number_by_users_in_test_set["order_number"] - 1
)
second_last_orders_by_users_in_test_set = second_last_order_numbers_by_users_in_test_set.merge(
    orders_df, on=["user_id", "order_number"], how="left"
)

# Order placed two steps before the test order.
third_last_order_numbers_by_users_in_test_set = second_last_order_numbers_by_users_in_test_set.assign(
    order_number=second_last_order_numbers_by_users_in_test_set["order_number"] - 1
)
third_last_orders_by_users_in_test_set = third_last_order_numbers_by_users_in_test_set.merge(
    orders_df, on=["user_id", "order_number"], how="left"
)

# Order placed three steps before the test order.
fourth_last_order_numbers_by_users_in_test_set = third_last_order_numbers_by_users_in_test_set.assign(
    order_number=third_last_order_numbers_by_users_in_test_set["order_number"] - 1
)
fourth_last_orders_by_users_in_test_set = fourth_last_order_numbers_by_users_in_test_set.merge(
    orders_df, on=["user_id", "order_number"], how="left"
)

# Stack the three look-back orders of every user into one frame.
last_orders_for_each_user = pd.concat(
    [
        second_last_orders_by_users_in_test_set,
        third_last_orders_by_users_in_test_set,
        fourth_last_orders_by_users_in_test_set,
    ]
)

In [None]:
# Build the submission frame: for every test order, the space-separated,
# sorted product ids that have reordered == 1 in the user's look-back orders.
# Users without any such product get the string "None".
submission = (
    last_orders_for_each_user
    .merge(prior_items_ordered_df, on="order_id", how="left")
    .query("reordered == 1")
    .groupby("user_id")["product_id"]
    .apply(lambda prod_ids: " ".join(map(str, sorted(set(prod_ids)))))
    .to_frame()
    .reset_index()
    # Right join keeps every test order, even without a prediction.
    .merge(users_in_test_set, on="user_id", how="right")[["order_id", "product_id"]]
    .fillna("None")
    .sort_values("order_id")
)

In [None]:
submission.head(100)

In [None]:
# Write the predictions to a CSV in the current working directory, with the
# prediction column renamed from "product_id" to "products" and no index column.
submission.rename(columns={"product_id":"products"}).to_csv("just-repeat-sorted-last-3.csv",index=False,encoding='utf-8')