# Data Submission

This notebook streamlines the steps for submitting predictions to Kaggle for final F1 scoring.

In [1]:
import pandas as pd
from pycaret.classification import *

In [2]:
# Load the raw CSV tables from ./data into DataFrames.
# `orders` is split by its eval_set column further down; `orders_p` is only
# referenced by the commented-out baseline cells in this notebook.
orders = pd.read_csv('./data/orders.csv')
products = pd.read_csv('./data/products.csv')
orders_p = pd.read_csv('./data/order_products__prior.csv')
orders_tr = pd.read_csv('./data/order_products__train.csv')

In [3]:

def merge_products(x):
    """Collapse the predicted reordered items of one order into a single line.

    Args:
        x: pandas Series holding the items that will be reordered

    Returns:
        one string with every item converted to text and separated from the
        next by a single space (an empty string when ``x`` is empty)
    """
    item_tokens = x.astype('str').tolist()
    return " ".join(item_tokens)

In [4]:
# Directory where the submission CSV files are written. File names are
# appended by plain string concatenation, so the trailing slash is required.


path = './data/submit/'

In [5]:
# Split the orders table by the value of its eval_set column.
# test_order is the subset submit_data uses to map each user_id to the
# order_id that must appear in the submission file.
prior_order = orders.query('eval_set == "prior"')
train_order = orders.query('eval_set == "train"')
test_order = orders.query('eval_set == "test"')

### Baseline (using last order per user)

In [6]:
# baseline = orders.copy()
# baseline['last_order'] = baseline['order_id'].shift(1)

In [7]:
# test_order_base = baseline.query('eval_set == "test"')

In [8]:
# baseline = pd.merge(test_order_base, orders_p, left_on='last_order',
#                     right_on='order_id', how='left')

In [9]:
# baseline_df = baseline[['order_id_x', 'product_id']]
# baseline_df = baseline_df.groupby(['order_id_x'])['product_id'].aggregate(
#     merge_products).reset_index()

In [10]:
# baseline_df.columns = ['order_id', 'products']
# baseline_df.shape

In [11]:
# baseline_df.to_csv(path + 'base_model.csv', index=False)

### Classification Models

In [12]:
# Feature table that is scored by every model below.
df_submit = pd.read_csv('./data/df_submit.csv')
# Directory the saved models are loaded from; submit_data appends the model
# name to it, so the trailing slash is required.
load = './data/save_model/'

In [13]:
# Preview the first rows of the feature table to check its columns.
df_submit.head()

Unnamed: 0,user_id,product_id,order_number,last_purchase,reordered,lag,product_appear,buy_cnt
0,3,248,2,0.0,0,0.0,0.5,0.0
1,3,1005,10,0.0,0,98.0,0.1,1.0
2,3,1819,7,0.0,2,13.0,0.428571,0.0
3,3,7503,3,0.0,0,21.0,0.333333,0.0
4,3,8021,2,0.0,0,0.0,0.5,0.0


In [14]:

def submit_data(model_name, dataset, prob):
    """Load a saved model and build the Kaggle submission frame from it.

    Args:
        model_name: name of the saved model; it is appended to the
            module-level ``load`` directory to locate the model
        dataset: test dataset to score; it must contain the ``user_id``
            and ``product_id`` columns
        prob: probability threshold to determine whether the prediction
            is 0 or 1

    Returns:
        df: dataframe with one row per order in ``test_order`` and the
        columns ``order_id`` and ``products``. ``products`` holds the
        predicted product ids separated by spaces, or the literal text
        ``None`` when no product is predicted for the order.
    """
    model = load_model(load + model_name)
    pred = predict_model(model, data=dataset, probability_threshold=prob)
    pred.index = dataset.index

    # Keep only the rows predicted as positive, and only the two columns the
    # submission needs, so the merge below does not carry unused features.
    submit = pred.loc[pred['Label'] == 1, ['user_id', 'product_id']].copy()
    submit['product_id'] = submit['product_id'].astype('str')

    # Left join so that every test order survives, including the orders of
    # users for whom nothing was predicted.
    df = pd.merge(test_order[['order_id', 'user_id']], submit, on=['user_id'],
                  how='left')

    # Orders without any predicted product must be submitted as an explicit
    # "None". The previous fillna(0) wrote "0", which is not a product id.
    df['product_id'] = df['product_id'].fillna('None')

    df = df.groupby(['order_id'])['product_id'].aggregate(
        merge_products).reset_index()
    df.columns = ['order_id', 'products']
    print(df.shape)
    return df

##### Random Forest

In [15]:
# tuned model
# Score df_submit with the saved 'rfc_tune' model at a 0.17 probability
# threshold and write the submission to rfc_tune.csv under `path`.

submit_data('rfc_tune', df_submit, .17).to_csv(
    path+'rfc_tune.csv', index=False)

##### Light Gradient Boosting 

In [16]:
# Score df_submit with the saved 'lgbm_1' model at a 0.17 probability
# threshold and write the submission to lightgbm.csv under `path`.
submit_data('lgbm_1', df_submit, .17).to_csv(path+'lightgbm.csv', index=False)

In [17]:
# tuned model
# Score df_submit with the saved 'lgbm_tune' model at a 0.17 probability
# threshold and write the submission to lgbm_tune.csv under `path`.

submit_data('lgbm_tune', df_submit, .17).to_csv(
    path+'lgbm_tune.csv', index=False)

##### Gradient Boosting

In [12]:
# Score df_submit with the saved 'gbc' model at a 0.17 probability threshold
# and write the submission to gbc.csv under `path`.
submit_data('gbc', df_submit, .17).to_csv(path+'gbc.csv', index=False)

(75000, 2)


In [17]:
# tuned model
# Score df_submit with the saved 'gbc_tune' model at a 0.17 probability
# threshold and write the submission to gbc_tune.csv under `path`.

submit_data('gbc_tune', df_submit, .17).to_csv(
    path+'gbc_tune.csv', index=False)

(75000, 2)


##### XGBoost

In [15]:
# Score df_submit with the saved 'xgb_tune' model at a 0.17 probability
# threshold and write the submission to xgb_tune.csv under `path`.
submit_data('xgb_tune', df_submit, .17).to_csv(
    path+'xgb_tune.csv', index=False)

(75000, 2)
