In [None]:
import time

import numpy as np
import pandas as pd

In [None]:
# small = ""
small = "small_"

In [20]:
metadata_path = (f"preprocessed_data/{small}product_metadata.csv")
train_orders_path = f"preprocessed_data/{small}train_orders.npy"
validation_orders_path = f"preprocessed_data/{small}validation_orders.npy"
test_orders_path = f"preprocessed_data/{small}test_orders.npy"

orders_path = f"preprocessed_data/{small}order_data.csv"

In [21]:
if small:
    MIN_USER_TRANSACTIONS = 0
    MIN_PRODUCT_TRANSACTIONS = 500
    SUBSET_OF_USERS = 0.1
else:
    MIN_USER_TRANSACTIONS = 0
    MIN_PRODUCT_TRANSACTIONS = 0
    SUBSET_OF_USERS = 1.0
    
np.random.seed(0)
    
print(f"Min. User Transactions: {MIN_USER_TRANSACTIONS}, Min. Product Transactions: {MIN_PRODUCT_TRANSACTIONS}, Subset of Users: {SUBSET_OF_USERS}")

Min. User Transactions: 0, Min. Product Transactions: 500, Subset of Users: 0.1


In [22]:
order_products_train_df = pd.read_csv("data/order_products__train.csv")
order_products_prior_df = pd.read_csv("data/order_products__prior.csv")
order_products_df = pd.concat(
    [order_products_train_df, order_products_prior_df]
)[["order_id", "product_id"]]
print(f"Dimensions of concatenated data frame: {order_products_df.shape} \n")

del order_products_train_df, order_products_prior_df

order_products_df.head()

Dimensions of concatenated data frame: (33819106, 2) 



Unnamed: 0,order_id,product_id
0,1,49302
1,1,11109
2,1,10246
3,1,49683
4,1,43633


In [23]:
orders_df = pd.read_csv("data/orders.csv")[
    ["order_id", "user_id", "order_number", "eval_set"]
]
# remove the orders belonging to the test set, as we don't have those transactions
orders_df = orders_df[orders_df.eval_set != "test"]
orders_df.head()

Unnamed: 0,order_id,user_id,order_number,eval_set
0,2539329,1,1,prior
1,2398795,1,2,prior
2,473747,1,3,prior
3,2254736,1,4,prior
4,431534,1,5,prior


### Filter out orders from users with less than MIN_USER_TRANSACTIONS transactions

In [24]:
# Filter out orders from users with less than MIN_USER_TRANSACTIONS transactions
tmp = orders_df['user_id'].value_counts()
user_transactions_count = tmp.reset_index()
print(f"Dimensions of orders data frame before filter: {orders_df.shape} \n")
user_transactions_count.columns = ['user_id', 'count']
# Select a random subset of users if subset_user is specified (for rapid prototyping)
user_transactions_count['select'] = (np.random.rand(user_transactions_count.shape[0]) < SUBSET_OF_USERS)
orders_df = pd.merge(orders_df, user_transactions_count.loc[(user_transactions_count['select']) & (user_transactions_count['count']>=MIN_USER_TRANSACTIONS)], on = 'user_id')
print(f"Dimensions of orders data frame after filter: {orders_df.shape} \n")
orders_df = orders_df.drop(labels="select", axis=1)
orders_df.head()

Dimensions of orders data frame before filter: (3346083, 4) 

Dimensions of orders data frame after filter: (338929, 6) 



Unnamed: 0,order_id,user_id,order_number,eval_set,count
0,1737705,17,1,prior,41
1,1681401,17,2,prior,41
2,2680214,17,3,prior,41
3,3197376,17,4,prior,41
4,3237467,17,5,prior,41


In [25]:
# Remove orders from users with less than MIN_USER_TRANSACTIONS transactions
order_products_df = order_products_df.merge(orders_df[["order_id"]], on="order_id")
print(order_products_df.shape)
order_products_df.head()

(3430305, 2)


Unnamed: 0,order_id,product_id
0,349,33000
1,349,11361
2,349,27695
3,349,47672
4,349,45633


###  Filter out orders less than MIN_PRODUCT_TRANSACTIONS transactions

In [26]:
# Load and merge products, departments, aisles
products = pd.read_csv("data/products.csv")
departments = pd.read_csv("data/departments.csv")
products = pd.merge(products, departments, on="department_id")
aisles = pd.read_csv("data/aisles.csv")
products = pd.merge(products, aisles, on="aisle_id")

del departments, aisles

In [27]:
# Filter out orders less than MIN_PRODUCT_TRANSACTIONS transactions
product_transactions_count = order_products_df['product_id'].value_counts()

print(f"Dimensions of order product data frame before filter: {order_products_df.shape} \n")
order_products_df = order_products_df[order_products_df["product_id"].isin(product_transactions_count.index[product_transactions_count >= MIN_PRODUCT_TRANSACTIONS].values)]
print(f"Dimensions of order product data frame after filter: {order_products_df.shape} \n")

print(f"Dimensions of product metadata data frame before filter: {products.shape} \n")
products = products[products["product_id"].isin(product_transactions_count.index[product_transactions_count >= MIN_PRODUCT_TRANSACTIONS].values)]
print(f"Dimensions of product metadata data frame after filter: {products.shape} \n")

Dimensions of order product data frame before filter: (3430305, 2) 

Dimensions of order product data frame after filter: (1964262, 2) 

Dimensions of product metadata data frame before filter: (49688, 6) 

Dimensions of product metadata data frame after filter: (1189, 6) 



In [28]:
# Remove products with less than MIN_PRODUCT_TRANSACTIONS transactions
order_products_df = order_products_df.merge(orders_df, on="order_id")
print(order_products_df.shape)
order_products_df.head()

(1964262, 6)


Unnamed: 0,order_id,product_id,user_id,order_number,eval_set,count
0,349,33000,156353,9,train,9
1,349,27695,156353,9,train,9
2,349,47672,156353,9,train,9
3,349,45633,156353,9,train,9
4,349,11520,156353,9,train,9


In [29]:
# turn sequence of products into list of products for each order for each user
order_products_df = order_products_df.groupby(["eval_set", "user_id", "order_number", "order_id"])["product_id"].apply(lambda x: [f"product_{k}" for k in x])
order_products_df = order_products_df.reset_index()
order_products_df.head()

Unnamed: 0,eval_set,user_id,order_number,order_id,product_id
0,prior,17,1,1737705,[product_47141]
1,prior,17,2,1681401,"[product_42356, product_16797]"
2,prior,17,4,3197376,[product_47141]
3,prior,17,6,2616505,[product_47141]
4,prior,17,8,2430354,[product_47141]


In [13]:
# Get the indexes for the test set (last transaction) and validation set (second-to-last) transaction
temp_users = order_products_df.groupby('user_id')
test_set_index = []
validation_set_index = []
for (k, d) in temp_users:
    if len(d) > 1:
        test_set_index.append(d.index[-1])
    if len(d) > 2:
        validation_set_index.append(d.index[-2])

In [14]:
order_products_df.loc[:, "eval_set"] = "train"
order_products_df.loc[validation_set_index, "eval_set"] = "validation"
order_products_df.loc[test_set_index, "eval_set"] = "test"

In [15]:
train_data = order_products_df[['user_id', 'product_id']].loc[order_products_df['eval_set'] == 'train']
validation_data = order_products_df[['user_id', 'product_id']].loc[order_products_df['eval_set'] == 'validation']
test_data = order_products_df[['user_id', 'product_id']].loc[order_products_df['eval_set'] == 'test']

In [16]:
print(f'Shape: {train_data.shape}, Average Basket Size (train): {np.mean(train_data["product_id"].apply(lambda x: len(x)))}')
print(f'Shape: {validation_data.shape}, Average Basket Size (validation): {np.mean(validation_data["product_id"].apply(lambda x: len(x)))}')
print(f'Shape: {test_data.shape}, Average Basket Size (test): {np.mean(test_data["product_id"].apply(lambda x: len(x)))}')

Shape: (275571, 2), Average Basket Size (train): 6.239985339531373
Shape: (20025, 2), Average Basket Size (validation): 6.018576779026217
Shape: (20414, 2), Average Basket Size (test): 6.083129225041638


In [32]:
order_products_df[["eval_set", "user_id", "order_id", "order_number", "product_id"]].head(10)

Unnamed: 0,eval_set,user_id,order_id,order_number,product_id
0,prior,17,1737705,1,[product_47141]
1,prior,17,1681401,2,"[product_42356, product_16797]"
2,prior,17,3197376,4,[product_47141]
3,prior,17,2616505,6,[product_47141]
4,prior,17,2430354,8,[product_47141]
5,prior,17,2373492,9,[product_49131]
6,prior,17,805025,11,"[product_47141, product_16797, product_9387]"
7,prior,17,912404,12,[product_47141]
8,prior,17,603534,14,[product_38444]
9,prior,17,1719551,16,[product_47141]


In [35]:
products.reset_index(drop=True).head(10)

Unnamed: 0,product_id,product_name,aisle_id,department_id,department,aisle
0,22362,Original Rice Krispies Treats,61,19,snacks,cookies cakes
1,40063,Gluten Free Chocolate Chip Cookies,61,19,snacks,cookies cakes
2,40199,Chocolate Chip Cookies,61,19,snacks,cookies cakes
3,45374,Newman O's Creme Filled Chocolate Cookies,61,19,snacks,cookies cakes
4,45866,Fig Newmans Fruit Filled Cookies,61,19,snacks,cookies cakes
5,5322,Gluten Free Dark Chocolate Chunk Chewy with a ...,3,19,snacks,energy granola bars
6,10753,Peanut Butter Bar,3,19,snacks,energy granola bars
7,14778,Organic Chocolate Chip Chewy Granola Bars,3,19,snacks,energy granola bars
8,16254,ZBar Organic Chocolate Brownie Energy Snack,3,19,snacks,energy granola bars
9,17224,Oats & Honey Gluten Free Granola,3,19,snacks,energy granola bars


In [42]:
list(products["department"].unique())

['snacks',
 'pantry',
 'beverages',
 'frozen',
 'personal care',
 'dairy eggs',
 'household',
 'babies',
 'meat seafood',
 'dry goods pasta',
 'breakfast',
 'canned goods',
 'produce',
 'missing',
 'international',
 'deli',
 'alcohol',
 'bakery',
 'bulk']

In [None]:
products.to_csv(metadata_path, index=False)
order_products_df.to_csv(orders_path, index=False)

### Item2Vec
Saving data as [user_id, product_id, product_id, product_id ...]

In [None]:
np.save(train_orders_path, (train_data["user_id"].apply(lambda x: [f"user_{x}"]) + train_data["product_id"]).values)
np.save(validation_orders_path, (validation_data["user_id"].apply(lambda x: [f"user_{x}"]) + validation_data["product_id"]).values)
np.save(test_orders_path, (test_data["user_id"].apply(lambda x: [f"user_{x}"]) + test_data["product_id"]).values)
# data = np.load('preprocessed_data/orders.npy', allow_pickle=True)

In [43]:
test = pd.read_csv("preprocessed_data/order_data.csv")

In [48]:
test[test['eval_set'] == 'train']

Unnamed: 0,eval_set,user_id,order_number,product_id
0,train,1,1,"['product_196', 'product_14084', 'product_1242..."
1,train,1,2,"['product_196', 'product_10258', 'product_1242..."
2,train,1,3,"['product_196', 'product_12427', 'product_1025..."
3,train,1,4,"['product_196', 'product_12427', 'product_1025..."
4,train,1,5,"['product_196', 'product_12427', 'product_1025..."
...,...,...,...,...
3214868,train,206209,8,"['product_38167', 'product_43961', 'product_19..."
3214869,train,206209,9,"['product_38167', 'product_23892', 'product_68..."
3214870,train,206209,10,"['product_9405', 'product_6846', 'product_1570..."
3214871,train,206209,11,"['product_6846', 'product_9405', 'product_4121..."
