In [2]:
import pandas as pd
import numpy as np
import os
import warnings
import gc
import pickle
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

COLS = ["user_id", "product_id", 

        "order_number", "last_order_number", "order_dow", "order_hour_of_day", "days_since_prior_order",
        "days_since_prev_order_item", "products_ordered_in_this_order", ## Current order Features

        "total_orders_uxp", "usual_cart_position_uxp", "streak_chance_uxp", "item_n2_chance_uxp", "item_n3_chance_uxp", 
        "item_n4_chance_uxp", "item_n5_chance_uxp", "average_reorder_time_uxp", ## Uxp Features

        "reorder_ratio_product", "total_purchases_product", "usual_position_in_cart_product", "avg_cart_size_product",
        "usual_buying_time_product", "usual_buying_day_product", "one_shot_ratio_product", "unique_users_product", ## Prod Features

        "total_orders_user", "avg_time_between_orders_user", "usual_buying_time_user", "usual_buying_day_user", "total_unique_products_bought_user",
        "first_product_reordered_user", "complete_new_order_user", "average_product_per_order_user", "average_reorder_ratio_per_order_user", ## User Features
        
        "reordered"]

FEATURES = [
    "order_number", "last_order_number", "order_dow", "order_hour_of_day", "days_since_prior_order",
    "days_since_prev_order_item", ## Current order Features

    "total_orders_uxp", "usual_cart_position_uxp", "streak_chance_uxp", "item_n2_chance_uxp", "item_n3_chance_uxp", 
    "item_n4_chance_uxp", "item_n5_chance_uxp", "average_reorder_time_uxp", ## Uxp Features

    "reorder_ratio_product", "total_purchases_product", "usual_position_in_cart_product", "avg_cart_size_product",
    "usual_buying_time_product", "usual_buying_day_product", "one_shot_ratio_product", "unique_users_product", ## Prod Features

    "total_orders_user", "avg_time_between_orders_user", "usual_buying_time_user", "usual_buying_day_user", "total_unique_products_bought_user",
    "first_product_reordered_user", "complete_new_order_user", "average_product_per_order_user", "average_reorder_ratio_per_order_user", ## User Features
]

In [3]:
def generateUserFeatures(df):
    user_base_features = df.groupby("user_id").agg(
        total_orders = ("order_number", "max"), ## app liking
        avg_time_between_orders = ("days_since_prior_order", "mean"), ## app loyalty
        usual_buying_time = ("order_hour_of_day", "median"), ## buying pattern
        usual_buying_day = ("order_dow", "median"), ## buying pattern
        total_unique_products_bought = ("product_id", "nunique"), ## new product recommendation
    ).reset_index()

    first_product_reordered = df[((df.cart_order == 1) & (df.order_number != 1))].groupby("user_id")["reordered"].mean().reset_index(name="first_product_reordered")
    user_base_features = user_base_features.merge(first_product_reordered, on="user_id", how="left")

    temp = df[df.order_number != 1].groupby(["user_id", "order_id"])["reordered"].sum().reset_index()
    temp["complete_new_order"] = (temp["reordered"] == 0).astype(int)
    new_order = temp.groupby("user_id")["complete_new_order"].mean().reset_index()
    user_base_features = user_base_features.merge(new_order, on="user_id", how="left")

    user_order_features = df[df.order_number != 1].groupby(["user_id", "order_number"]).agg(
        product_per_order = ("product_id", "count"), ## buying nature
        reorder_ratio_per_order = ("reordered", "mean"),  ## buying nature
    ).reset_index()

    user_order_features = user_order_features.groupby("user_id").agg(
        average_product_per_order = ("product_per_order", "median"), ## buying nature
        average_reorder_ratio_per_order = ("reorder_ratio_per_order", "mean"), ## buying nature
    ).reset_index()

    user_features = user_base_features.merge(user_order_features, on="user_id", how="left")

    del user_order_features, temp, new_order, first_product_reordered, user_base_features
    gc.collect()
    user_features.columns = [col + "_user" if col != "user_id" else col for col in user_features.columns]
    return user_features

def generateProductFeatures(df):
    product_base_features = df.groupby("product_id").agg(
        reorder_ratio = ("reordered", "mean"), ## Product popularity
        total_purchases = ("order_id", "count"), ## Overall interest
        usual_position_in_cart = ("cart_order", "median"), ## Type of product [Needs/Wants/Last minute impulse]
        avg_cart_size = ("cart_size", "mean"), ## Part of staple order or not
        usual_buying_time = ("order_hour_of_day", "median"), ## buying pattern
        usual_buying_day = ("order_dow", "median"), ## buying pattern
    ).reset_index()

    user_product_base_features = df.groupby(["user_id", "product_id"]).agg(
        total_orders = ("order_id", "nunique"), ## Staple/single shots
        usual_cart_position = ("cart_order", "median")).reset_index() ## User's view on this product (Need/Want),

    single_order_ratio = (user_product_base_features[user_product_base_features['total_orders'] == 1].groupby('product_id').size() / user_product_base_features.groupby('product_id').size()).reset_index(name="one_shot_ratio")
    unique_users = user_product_base_features.groupby('product_id').size().reset_index(name="unique_users")
    product_features = product_base_features.merge(single_order_ratio, on="product_id", how="left")
    product_features = product_features.merge(unique_users, on="product_id", how="left")
    product_features["one_shot_ratio"] = product_features["one_shot_ratio"].fillna(0)

    product_features.columns = [col+"_product" if col != "product_id" else col for col in product_features.columns if col]

    del product_base_features, single_order_ratio, unique_users
    gc.collect()
    return product_features

def generateUXPFeatures(df):
    user_product_features = df.groupby(["user_id", "product_id"]).agg(
        total_orders = ("order_id", "nunique"), ## Staple/single shots
        usual_cart_position = ("cart_order", "median"), ## User's view on this product (Need/Want),

        ### Reorder chances within n orders
        streak_chance = ("item_in_every_order", "mean"),
        item_n2_chance = ("item_in_every_2_order", "mean"),
        item_n3_chance = ("item_in_every_3_order", "mean"),
        item_n4_chance = ("item_in_every_4_order", "mean"),
        item_n5_chance = ("item_in_every_5_order", "mean"),
        
        average_reorder_time = ("days_since_prev_order_item", "median")).reset_index() 

    user_product_features["average_reorder_time"] = user_product_features["average_reorder_time"].fillna(0)
    user_product_features.columns = [col+"_uxp" if ((col != "user_id") & (col != "product_id")) else col for col in user_product_features.columns if col]
    return user_product_features

def generateOderFeatures(df):
    ## User - Product usage (Days since last order of this item by this user)
    df["last_order_number"] = df.groupby(['user_id', 'product_id'])["order_number"].shift(1)

    temp = df.drop_duplicates(subset=["user_id", "order_number"])[["user_id", "order_number", "days_since_prior_order"]]
    temp["days_since_first_order"] = temp.groupby(["user_id"])["days_since_prior_order"].cumsum()
    temp["last_order_number"] = temp["order_number"]

    df = df.merge(temp[["user_id", "order_number", "days_since_first_order"]], left_on=["user_id", "order_number"], right_on=["user_id", "order_number"], how="left")
    df = df.rename(columns={"days_since_first_order": "curr_order_days_since_first_order"})

    df = df.merge(temp[["user_id", "last_order_number", "days_since_first_order"]], left_on=["user_id", "last_order_number"], right_on=["user_id", "last_order_number"], how="left")
    df = df.rename(columns={"days_since_first_order": "prev_order_days_since_first_order"})

    df["curr_order_days_since_first_order"] = df["curr_order_days_since_first_order"].fillna(0)
    df["prev_order_days_since_first_order"] = df["reordered"]*(df["last_order_number"]/df["last_order_number"])*df["prev_order_days_since_first_order"].fillna(0)
    df.loc[:,"days_since_prev_order_item"] = df["curr_order_days_since_first_order"] - df["prev_order_days_since_first_order"]

    df = df.drop(["curr_order_days_since_first_order", "prev_order_days_since_first_order"], axis=1)

    df["last_order_num_diff"] = df["order_number"] - df["last_order_number"]
    df["item_in_every_order"] = ((df["order_number"] - df["last_order_number"]) == 1).astype(int)
    df["item_in_every_2_order"] = ((df["order_number"] - df["last_order_number"]) == 2).astype(int)
    df["item_in_every_3_order"] = ((df["order_number"] - df["last_order_number"]) == 3).astype(int)
    df["item_in_every_4_order"] = ((df["order_number"] - df["last_order_number"]) == 4).astype(int)
    df["item_in_every_5_order"] = ((df["order_number"] - df["last_order_number"]) == 4).astype(int)

    order_features = df.groupby("order_id").agg(
        cart_size = ("product_id", "count"), 
        reorder_ratio = ("reordered", "mean"),
    ).reset_index()
    df = df.merge(order_features, left_on="order_id", right_on="order_id")

    del temp
    gc.collect()

    return df

In [4]:
def materializeFeatures(prior):
    user_features = generateUserFeatures(prior)
    product_features = generateProductFeatures(prior)
    uxp_features = generateUXPFeatures(prior)
    data = pd.merge(uxp_features.merge(product_features, on="product_id"), user_features, on="user_id")
    del user_features, product_features, uxp_features
    gc.collect()
    
    return data

def createDataset(prior, currentOrder, cols=COLS):
    data = materializeFeatures(prior)
    currentOrder["products_ordered_in_this_order"] = currentOrder["cart_order"] - 1
    currentOrder = currentOrder[["user_id", "product_id", "order_number", "order_dow", 
                       "order_hour_of_day", "days_since_prior_order", "days_since_prev_order_item", 
                       "last_order_number", "products_ordered_in_this_order", "reordered"]]
    
    df = data.merge(currentOrder, on=["user_id", "product_id"], how="left")

    df = df[COLS]
    del prior, currentOrder
    gc.collect()

    return df

def preprocess(PRIOR_ORDERS, CURRENT_ORDER, col):
    PRIOR_USER_PRODUCT_ORDERS = PRIOR_ORDERS.drop_duplicates(subset=["user_id", "product_id"])[["user_id", "product_id"]]
    CURRENT_USER_PRODUCT_ORDERS = CURRENT_ORDER[["user_id", "product_id", "reordered"]]
    CURRENT_ORDER_MERGED = PRIOR_USER_PRODUCT_ORDERS.merge(CURRENT_USER_PRODUCT_ORDERS, on=["user_id", "product_id", ], how="left").sort_values(by=["user_id"]).reset_index(drop=True)
    CURRENT_ORDER_MERGED["reordered"] = CURRENT_ORDER_MERGED["reordered"].fillna(0)
    reordered = CURRENT_ORDER_MERGED["reordered"]
    CURRENT_ORDER_MERGED["reordered"] = 1
    CURRENT_ORDER_MERGED = CURRENT_ORDER_MERGED.merge(CURRENT_ORDER.drop_duplicates(["user_id"])[["order_id", "user_id", "order_number", "order_dow", "order_hour_of_day", "days_since_prior_order", "max_onb", "train", "test"]], on="user_id", how="left")

    ORDERS = pd.concat([PRIOR_ORDERS, CURRENT_ORDER_MERGED], ignore_index=True)
    ORDERS = generateOderFeatures(ORDERS)

    PRIORS = ORDERS[ORDERS.order_number < ORDERS[col]].drop(["train", "test"], axis=1).reset_index(drop=True)
    CURRENT = ORDERS[ORDERS.order_number == ORDERS[col]].drop(["train", "test"], axis=1).reset_index(drop=True)
    CURRENT["reordered"] = reordered
    CURRENT["cart_order"] = CURRENT["cart_order"].fillna(1)

    del PRIOR_USER_PRODUCT_ORDERS, CURRENT_USER_PRODUCT_ORDERS, CURRENT_ORDER_MERGED, ORDERS, reordered
    gc.collect()
    return PRIORS, CURRENT

def setup(data, trainMinOrders=3):
    max_onb = data.groupby("user_id")["order_number"].max().reset_index(name="max_onb")
    data = data.merge(max_onb, on="user_id")

    data["train"] = np.maximum(data["max_onb"] - 1, trainMinOrders) ## Atleast 3 orders required in prior
    data["test"] = np.maximum(data["max_onb"], trainMinOrders+1) ### Last order as test set

    train_priors = data[data.order_number < data.train].reset_index(drop=True) 
    train = data[data.order_number == data.train].reset_index(drop=True)

    test_priors = data[data.order_number < (data.train + 1)].reset_index(drop=True)
    test = data[data.order_number == (data.train + 1)].reset_index(drop=True)

    train_priors, train = preprocess(train_priors, train, "train")
    test_priors, test = preprocess(test_priors, test, "test")

    trainDataset = createDataset(train_priors, train)
    testDataset = createDataset(test_priors, test)

    del train_priors, test_priors, train, test, data, max_onb, 
    return trainDataset, testDataset

In [5]:
def trainModel(trainingDf, features=FEATURES):
    X = trainingDf[features]
    y = trainingDf["reordered"]

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Convert the dataset into a DMatrix object, which is a data structure that XGBoost uses for optimized data handling
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X, label=y)

    # Set the parameters for the XGBoost model
    params = {
        'max_depth': 3, 
        'eta': 0.1, 
        'objective': 'binary:logistic',
        'eval_metric': 'logloss'
    }

    # Train the XGBoost model
    num_round = 100
    bst = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

    # Make predictions on the test set
    y_pred_prob = bst.predict(dtest)
    y_pred = np.where(y_pred_prob > 0.5, 1, 0)

    # Evaluate the model
    conf_matrix = confusion_matrix(y, y_pred)
    f1_score = f1_score(y, y_pred)
    print(f"F1 Score = {round(f1_score, 3)}")
    
    disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix,
                              display_labels=bst.classes_)
    disp.plot()

    # Plot the feature importance
    plt.figure(figsize=(10, 8))
    xgb.plot_importance(bst, importance_type='weight')
    plt.title('Feature Importance')
    plt.show()

    return bst

def testModel(data, model, features=FEATURES):

    X = data[features]
    y = data["reordered"]
    
    dtest = xgb.DMatrix(X, label=y)
    y_pred_prob = model.predict(dtest)
    y_pred = np.where(y_pred_prob > 0.5, 1, 0)

    # Evaluate the model
    conf_matrix = confusion_matrix(y, y_pred)

    print("Confusion Matrix:")
    print(conf_matrix)

In [6]:
og_df = pd.read_csv("../Dataset/TrainingDataset.csv", index_col=0)
meta_data = pd.read_csv("../Dataset/Metadata.csv", index_col=0)

In [19]:
total_users = og_df.user_id.nunique()

df = og_df[og_df.user_id == 2]

In [20]:
train, test = setup(df, 3)


In [21]:
train

Unnamed: 0,user_id,product_id,order_number,last_order_number,order_dow,order_hour_of_day,days_since_prior_order,days_since_prev_order_item,products_ordered_in_this_order,total_orders_uxp,...,total_orders_user,avg_time_between_orders_user,usual_buying_time_user,usual_buying_day_user,total_unique_products_bought_user,first_product_reordered_user,complete_new_order_user,average_product_per_order_user,average_reorder_ratio_per_order_user,reordered
0,2,23,13,8.0,4,11,30.0,102.0,0.0,1,...,12,15.464968,10.0,2.0,87,0.727273,0.0,14.0,0.513314,0.0
1,2,1559,13,12.0,4,11,30.0,30.0,0.0,5,...,12,15.464968,10.0,2.0,87,0.727273,0.0,14.0,0.513314,0.0
2,2,2002,13,11.0,4,11,30.0,58.0,0.0,4,...,12,15.464968,10.0,2.0,87,0.727273,0.0,14.0,0.513314,0.0
3,2,2573,13,10.0,4,11,30.0,88.0,0.0,1,...,12,15.464968,10.0,2.0,87,0.727273,0.0,14.0,0.513314,0.0
4,2,3151,13,12.0,4,11,30.0,30.0,0.0,1,...,12,15.464968,10.0,2.0,87,0.727273,0.0,14.0,0.513314,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,2,48099,13,9.0,4,11,30.0,94.0,0.0,1,...,12,15.464968,10.0,2.0,87,0.727273,0.0,14.0,0.513314,0.0
83,2,48110,13,8.0,4,11,30.0,102.0,0.0,2,...,12,15.464968,10.0,2.0,87,0.727273,0.0,14.0,0.513314,0.0
84,2,48210,13,5.0,4,11,30.0,156.0,0.0,1,...,12,15.464968,10.0,2.0,87,0.727273,0.0,14.0,0.513314,0.0
85,2,49273,13,9.0,4,11,30.0,94.0,0.0,1,...,12,15.464968,10.0,2.0,87,0.727273,0.0,14.0,0.513314,0.0


In [26]:
df[df.order_number == 13]

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,cart_order,reordered
229,3268552,2,13,4,11,30.0,42342,1,0
230,3268552,2,13,4,11,30.0,30908,2,0
231,3268552,2,13,4,11,30.0,79,3,0
232,3268552,2,13,4,11,30.0,5869,4,0
233,3268552,2,13,4,11,30.0,44303,5,0
234,3268552,2,13,4,11,30.0,16521,6,0
235,3268552,2,13,4,11,30.0,39877,7,0
236,3268552,2,13,4,11,30.0,19057,8,0
237,3268552,2,13,4,11,30.0,45948,9,0


In [27]:
df[df.order_number == 14]

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,cart_order,reordered
238,839880,2,14,3,10,13.0,24852,1,1
239,839880,2,14,3,10,13.0,16589,2,1
240,839880,2,14,3,10,13.0,1559,3,1
241,839880,2,14,3,10,13.0,19156,4,1
242,839880,2,14,3,10,13.0,18523,5,1
243,839880,2,14,3,10,13.0,22825,6,0
244,839880,2,14,3,10,13.0,27413,7,0
245,839880,2,14,3,10,13.0,33754,8,1
246,839880,2,14,3,10,13.0,21709,9,1
247,839880,2,14,3,10,13.0,47209,10,1


In [28]:
test[test.product_id == 22825]

Unnamed: 0,user_id,product_id,order_number,last_order_number,order_dow,order_hour_of_day,days_since_prior_order,days_since_prev_order_item,products_ordered_in_this_order,total_orders_uxp,...,total_orders_user,avg_time_between_orders_user,usual_buying_time_user,usual_buying_day_user,total_unique_products_bought_user,first_product_reordered_user,complete_new_order_user,average_product_per_order_user,average_reorder_ratio_per_order_user,reordered


In [21]:
TRAIN = pd.DataFrame()
TEST = pd.DataFrame()

start = 0
end = 10000

while(start < total_users):
    print(f"{start} to {end}")
    df = og_df[((og_df.user_id >= start) & (og_df.user_id < end))]
    train, test = setup(df, 3)
    TRAIN = pd.concat([TRAIN, train], ignore_index=True)
    TEST = pd.concat([TEST, test], ignore_index=True)
    del train, test
    gc.collect()

    start += 10000
    end += 10000

0 to 10000
10000 to 20000
20000 to 30000
30000 to 40000
40000 to 50000
50000 to 60000
60000 to 70000
70000 to 80000
80000 to 90000
90000 to 100000
100000 to 110000
110000 to 120000
120000 to 130000
130000 to 140000
140000 to 150000
150000 to 160000
160000 to 170000
170000 to 180000
180000 to 190000
190000 to 200000
200000 to 210000


In [22]:
total_users

206209

In [26]:
with open("../FeatureStore/TrainingDataset_AllOrders.pickle", "wb") as file:
    pickle.dump(TRAIN, file)

with open("../FeatureStore/TestDataset_AllOrders.pickle", "wb") as file:
    pickle.dump(TEST, file)

In [52]:
model = trainModel(TRAIN)

NotFittedError: need to call fit or load_model beforehand

In [None]:
model.classes_

In [41]:
TEST_2 = TEST.dropna()

In [43]:
testModel(TEST_2, model)

Confusion Matrix:
[[10594500   308724]
 [ 1089520    31949]]


AttributeError: 'Booster' object has no attribute 'classes_'

In [1]:
df

NameError: name 'df' is not defined