In [1]:
import pandas as pd
import numpy as np
from pycaret.classification import *

In [2]:
# Order/product line items, one file per split named in the file name.
orders_p = pd.read_csv('./data/order_products__prior.csv')
orders_tr = pd.read_csv('./data/order_products__train.csv')

# Order-level table and the product catalogue.
orders = pd.read_csv('./data/orders.csv')
products = pd.read_csv('./data/products.csv')

# Not loaded in this notebook:
# aisles = pd.read_csv('./data/aisles.csv')
# dept = pd.read_csv('./data/departments.csv')

In [3]:
def merge_products(x):
    """Collapse the items of one order into a single space-separated string.

    Args:
        x: the items of one order (e.g. a pandas Series of product ids); it must
            support ``astype('str')`` and iteration

    Returns:
        str: every item cast to ``str`` and joined with a single space
    """
    items_as_text = x.astype('str')
    return " ".join(item for item in items_as_text)

In [4]:
# Directory prefix that every submission CSV below is written under.
path = './data/submit/'

### Baseline (using last order per user)

In [5]:
# Previous order id of the *same* user (NaN for a user's first row). Shifting
# inside each user_id group stops the preceding user's final order id from
# leaking into the next user's first row, which a plain shift(1) does.
# Assumes each user's rows are already in chronological order — TODO confirm
# against orders.csv.
orders['last_order'] = orders.groupby('user_id')['order_id'].shift(1)

In [6]:
# Partition the order table by its eval_set label, one frame per label.
prior_order, train_order, test_order = (
    orders.query(expr)
    for expr in ('eval_set == "prior"', 'eval_set == "train"', 'eval_set == "test"')
)

In [7]:
# Attach the line items of the order referenced by last_order to every test
# order; the left join keeps test orders that find no match.
baseline = pd.merge(
    left=test_order,
    right=orders_p,
    how='left',
    left_on='last_order',
    right_on='order_id',
)

In [8]:
# One row per test order (order_id_x), its product ids joined into one string.
baseline_df = (
    baseline[['order_id_x', 'product_id']]
    .groupby(['order_id_x'])['product_id']
    .aggregate(merge_products)
    .reset_index()
)

In [9]:
# Relabel the two columns to the names used in the submission file, then show
# the frame's shape as the cell output.
baseline_df = baseline_df.set_axis(['order_id', 'products'], axis='columns')
baseline_df.shape

(75000, 2)

In [10]:
# Write the baseline submission under `path`, leaving the row index out.
baseline_df.to_csv(
    path_or_buf=path + 'base_model.csv',
    index=False,
)

### Classification Models

In [11]:
# Directory the saved models are loaded from (read by submit_data).
load = './data/save_model/'
# Table that is passed to submit_data as the dataset to score.
df_submit = pd.read_csv('./data/df_submit.csv')

In [12]:
def submit_data(model_name, dataset, prob):
    """Load a saved model, score ``dataset`` and build the Kaggle submission frame.

    Args:
        model_name: name of the saved model; it is appended to the module-level
            ``load`` directory and handed to ``load_model``
        dataset: test dataset to score; the prediction output must carry
            ``user_id`` and ``product_id`` columns
        prob: probability threshold forwarded to ``predict_model`` to decide
            whether the prediction is 0 or 1

    Returns:
        df: dataframe with the columns ``order_id`` and ``products``, built from
            the module-level ``test_order``; ``products`` holds the predicted
            product ids joined by spaces, or ``'None'`` when nothing was
            predicted for the order's user
    """
    model = load_model(load + model_name)
    pred = predict_model(model, data=dataset, probability_threshold=prob)
    pred.index = dataset.index
    # Keep only the rows predicted as reordered.
    submit = pred.loc[pred['Label'] == 1].reset_index()
    submit['product_id'] = submit['product_id'].astype('str')
    df = pd.merge(test_order[['order_id', 'user_id']], submit, on=['user_id'], how='left')
    # Users without any positive prediction leave the left join with a missing
    # product_id. Filling it with 0 would submit the non-existent product "0";
    # the submission format expects the literal string 'None' for an empty order.
    df['product_id'] = df['product_id'].fillna('None')
    df = df.groupby(['order_id'])['product_id'].aggregate(merge_products).reset_index()
    df.columns = ['order_id', 'products']
    print(df.shape)
    return df

##### Decision Tree

In [13]:
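# NOTE: stale example, kept for reference only. submit_data now takes
# (model_name, dataset, prob) and loads the model itself.
# dt = load_model('./data/dt')
# submit_data(dt, df_submit).to_csv('./data/dt.csv', index=False)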

##### Random Forest

In [14]:
# Score with the saved 'rfc' model at a 0.25 threshold and write the submission.
submit_data(model_name='rfc', dataset=df_submit, prob=.25).to_csv(
    path_or_buf=path + 'rfc.csv',
    index=False,
)

(75000, 2)


##### Light Gradient Boosting 

In [15]:
# Score with the saved 'lgbm' model at a 0.25 threshold and write the submission.
submit_data(model_name='lgbm', dataset=df_submit, prob=.25).to_csv(
    path_or_buf=path + 'lightgbm.csv',
    index=False,
)

(75000, 2)


##### Gradient Boosting

In [16]:
# Score with the saved 'gbc' model at a 0.25 threshold and write the submission.
submit_data(model_name='gbc', dataset=df_submit, prob=.25).to_csv(
    path_or_buf=path + 'gbc.csv',
    index=False,
)

(75000, 2)


##### Logistic Regression

In [17]:
# Score with the saved 'lr' model at a 0.25 threshold and write the submission.
submit_data(model_name='lr', dataset=df_submit, prob=.25).to_csv(
    path_or_buf=path + 'lr.csv',
    index=False,
)

(75000, 2)


##### XGBoost

In [18]:
# Score with the saved 'xgb' model at a 0.25 threshold and write the submission.
submit_data(model_name='xgb', dataset=df_submit, prob=.25).to_csv(
    path_or_buf=path + 'xgb.csv',
    index=False,
)

(75000, 2)
