In [1]:
from implicit.nearest_neighbours import tfidf_weight
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime
from pathlib import Path
from numpy import bincount, log, sqrt

import scipy.sparse as sparse
import implicit
import pandas as pd
import numpy as np
import pickle
import time
import heapq

In [8]:
# Order line items (the "prior" and "train" files) and the orders table itself
df_order_products_prior = pd.read_csv("datasets/order_products__prior.csv")
df_order_products_train = pd.read_csv("datasets/order_products__train.csv")
df_orders = pd.read_csv("datasets/orders.csv")

# Product catalogue
df_products = pd.read_csv("datasets/products.csv")

# Left-join the product columns onto every prior order line (keyed on `product_id`),
# so each line item keeps its row even if the product is missing from the catalogue
df_merged_order_products_prior = df_order_products_prior.merge(
    df_products,
    how="left",
    on="product_id",
)

In [3]:
# (order_id, user_id) for every order in the "prior" eval set
current_order_user_df = df_orders.loc[(df_orders.eval_set == "prior")].reset_index()
current_order_user_df = current_order_user_df[["order_id", "user_id"]]

# Sanity check: the orders table and the prior line items must cover the same
# number of distinct orders
assert len(current_order_user_df["order_id"].unique()) == len(df_order_products_prior["order_id"].unique())

# How many prior order lines each product appears in
df_product_frequency = df_order_products_prior["product_id"].value_counts()

# Group product_id for each order into a list of products.
# The result is stored under its own name: the raw `df_order_products_prior`
# (with its `product_id` column) is read again further down the notebook, so it
# must not be overwritten here or a top-to-bottom run breaks.
df_order_products_prior_grouped = (
    df_order_products_prior[["order_id", "product_id"]]
    .groupby("order_id")["product_id"]
    .apply(list)
    .reset_index()
    .rename(columns={"product_id": "products"})
)

# Both frames have two columns, so equal `.size` means equal row counts
assert current_order_user_df.size == df_order_products_prior_grouped.size

df_prior_user_products = pd.merge(current_order_user_df, df_order_products_prior_grouped, on="order_id")
df_prior_user_products = df_prior_user_products[["user_id", "products"]]

# Concatenate each user's per-order lists into one flat list, keeping row order.
# A single-pass flatten replaces `agg(sum)`, which re-copies the growing list
# for every order of a user.
df_prior_user_products = (
    df_prior_user_products
    .groupby("user_id")["products"]
    .apply(lambda baskets: [product for basket in baskets for product in basket])
    .reset_index()
)

  df_prior_user_products = df_prior_user_products.groupby("user_id")["products"].agg(sum).reset_index()


In [4]:
# Preview: one row per user with the flattened list of product ids from their prior orders
df_prior_user_products

Unnamed: 0,user_id,products
0,1,"[196, 14084, 12427, 26088, 26405, 196, 10258, ..."
1,2,"[32792, 47766, 20574, 12000, 48110, 22474, 165..."
2,3,"[9387, 17668, 15143, 16797, 39190, 47766, 2190..."
3,4,"[36606, 7350, 35469, 2707, 42329, 7160, 1200, ..."
4,5,"[15349, 21413, 48775, 28289, 8518, 11777, 3171..."
...,...,...
206204,206205,"[27845, 28745, 3896, 49235, 21137, 37067, 3873..."
206205,206206,"[13817, 24099, 47011, 17734, 38530, 47011, 385..."
206206,206207,"[47766, 3397, 3469, 2450, 45965, 20583, 30233,..."
206207,206208,"[34213, 27966, 33000, 23579, 27845, 20995, 430..."


In [5]:
# (order_id, user_id) for every order in the "train" eval set
df_order_user_current = (
    df_orders
    .loc[df_orders["eval_set"] == "train", ["order_id", "user_id"]]
    .reset_index(drop=True)
)

# Sanity check #1: `df_order_user_current` and `df_order_products_train` should
# contain the same number of unique order ids
n_orders_in_orders_table = df_order_user_current["order_id"].nunique(dropna=False)
n_orders_in_line_items = df_order_products_train["order_id"].nunique(dropna=False)
assert n_orders_in_orders_table == n_orders_in_line_items

# Collapse the train line items to one row per order holding its list of products
df_order_products_test = (
    df_order_products_train
    .groupby("order_id")["product_id"]
    .apply(list)
    .rename("products")
    .reset_index()
)

# Sanity check #2: both frames have two columns, so equal `.size` means they
# hold the same number of records before they are merged
assert df_order_products_test.size == df_order_user_current.size

# Join on order id and keep only the user and their list of products
df_user_products_test = df_order_user_current.merge(df_order_products_test, on="order_id")[
    ["user_id", "products"]
]

In [6]:
# Preview: one row per "train" order, giving the user and the list of product ids in that order
df_user_products_test

Unnamed: 0,user_id,products
0,1,"[196, 25133, 38928, 26405, 39657, 10258, 13032..."
1,2,"[22963, 7963, 16589, 32792, 41787, 22825, 1364..."
2,5,"[15349, 19057, 16185, 21413, 20843, 20114, 482..."
3,7,"[12053, 47272, 37999, 13198, 43967, 40852, 176..."
4,8,"[15937, 5539, 10960, 23165, 22247, 4853, 27104..."
...,...,...
131204,206199,"[29429, 8898, 6128, 6701, 38855, 38341, 43821,..."
131205,206200,"[22828, 13176, 39190, 34243, 8174, 27451, 1559..."
131206,206203,"[15693, 37188, 21469, 41007, 2482, 14050, 2638..."
131207,206205,"[27845, 21137, 28745, 22035, 24852, 46886, 314..."


In [9]:
# Consider only "prior" orders and keep just `order_id` and `user_id` from `df_orders`
df_order_user_prior = df_orders.loc[df_orders["eval_set"] == "prior", ["order_id", "user_id"]]

# Attach the user to every prior order line by joining on `order_id`, then drop
# `order_id` so only (user_id, product_id) remains.
# NOTE: this reads the `product_id` column of `df_order_products_prior`, i.e. it
# needs the raw line-item frame, not a per-order grouped version of it.
df_merged = df_order_user_prior.merge(
    df_order_products_prior[["order_id", "product_id"]],
    on="order_id",
)
df_user_product_prior = df_merged[["user_id", "product_id"]]

# Number of prior order lines per (user, product) pair, stored as `quantity`
df_user_product_prior = (
    df_user_product_prior
    .groupby(["user_id", "product_id"])
    .size()
    .reset_index(name="quantity")
)

# Categorical ids expose integer codes (`.cat.codes`) for each distinct id
df_user_product_prior = df_user_product_prior.astype(
    {"user_id": "category", "product_id": "category"}
)

In [10]:
# Preview: (user_id, product_id) pairs with `quantity` = number of prior order lines for that pair
df_user_product_prior

Unnamed: 0,user_id,product_id,quantity
0,1,196,10
1,1,10258,9
2,1,10326,1
3,1,12427,10
4,1,13032,3
...,...,...,...
13307948,206209,43961,3
13307949,206209,44325,1
13307950,206209,48370,1
13307951,206209,48697,1


In [11]:
# Integer category codes of the ids: products index the rows, users the columns
product_codes = df_user_product_prior["product_id"].cat.codes.copy()
user_codes = df_user_product_prior["user_id"].cat.codes.copy()

# Sparse (products x users) matrix whose stored values are the purchase quantities
product_user_matrix = sparse.coo_matrix(
    (df_user_product_prior["quantity"], (product_codes, user_codes))
)

In [25]:
# Flip to (users x products): one row per user code, one column per product code
user_product_matrix = product_user_matrix.transpose()

In [28]:
def tfidf_weight(tf):
    """
    Apply TF-IDF weighting to a term-frequency matrix.

    The input is converted to COO format. Every stored entry is replaced by
    ``sqrt(value) * idf[column]``, where
    ``idf[column] = log(n_rows / (1 + stored entries in that column))``.

    Parameters
    ----------
    tf : anything ``coo_matrix`` accepts (sparse matrix, dense array, nested list)
        Term-frequency matrix.

    Returns
    -------
    coo_matrix
        Matrix of the same shape holding the weighted values.
    """
    weighted = coo_matrix(tf)

    # Inverse document frequency per column; the `1 +` keeps the ratio finite
    n_rows = float(weighted.shape[0])
    entries_per_column = bincount(weighted.col)
    idf = log(n_rows / (1 + entries_per_column))

    # Dampen raw counts with a square root, then scale each entry by its column's IDF
    damped_tf = sqrt(weighted.data)
    weighted.data = damped_tf * idf[weighted.col]
    return weighted

In [29]:
# Weight the (users x products) counts, then convert the result to
# Compressed Sparse Row format
tf_idf = tfidf_weight(user_product_matrix).tocsr()

In [36]:
# Inspect the first row of the TF-IDF weighted CSR matrix
tf_idf[0]

(1, 49677)

In [37]:
# Selecting one user to test
target_user_id = 1

# Rows of `tf_idf` follow the categorical codes of `user_id` that were used to
# build the sparse matrix. Resolve the row through the categories instead of
# assuming user ids are contiguous and start at 1 (`target_user_id - 1`).
# Raises KeyError if the user id is not among the categories.
target_user_row = df_user_product_prior["user_id"].cat.categories.get_loc(target_user_id)

# Fetch row of target user
target_user = tf_idf[target_user_row]

# Cosine similarity of every user row against the target user's row;
# `dense_output=False` keeps the result sparse when the inputs are sparse
similarities = cosine_similarity(tf_idf, target_user, dense_output=False)