In [2]:
import random
import os
import threading
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, wait

from tqdm.notebook import tqdm, trange
from pathlib import Path
from random import randint
from ui.python.Layout import Layout
import numpy as np
import plotly.express as px
import pandas as pd

In [3]:
MAX_WORKERS = 24

# Preprocessing

In [4]:
df = pd.read_csv('./../data/datasets/ECommerce_consumer behaviour.csv')
df

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,department_id,department,product_name
0,2425083,49125,1,2,18,,17,1,0,13,pantry,baking ingredients
1,2425083,49125,1,2,18,,91,2,0,16,dairy eggs,soy lactosefree
2,2425083,49125,1,2,18,,36,3,0,16,dairy eggs,butter
3,2425083,49125,1,2,18,,83,4,0,4,produce,fresh vegetables
4,2425083,49125,1,2,18,,83,5,0,4,produce,fresh vegetables
...,...,...,...,...,...,...,...,...,...,...,...,...
2019496,3390742,199430,16,3,18,5.0,83,8,0,4,produce,fresh vegetables
2019497,458285,128787,42,2,19,3.0,115,1,1,7,beverages,water seltzer sparkling water
2019498,458285,128787,42,2,19,3.0,32,2,1,4,produce,packaged produce
2019499,458285,128787,42,2,19,3.0,32,3,1,4,produce,packaged produce


In [7]:
df = df[['order_id', 'order_number', 'department', 'department_id', 'product_id', 'product_name']]
df

Unnamed: 0,order_id,order_number,department,department_id,product_id,product_name
0,2425083,1,pantry,13,17,baking ingredients
1,2425083,1,dairy eggs,16,91,soy lactosefree
2,2425083,1,dairy eggs,16,36,butter
3,2425083,1,produce,4,83,fresh vegetables
4,2425083,1,produce,4,83,fresh vegetables
...,...,...,...,...,...,...
2019496,3390742,16,produce,4,83,fresh vegetables
2019497,458285,42,beverages,7,115,water seltzer sparkling water
2019498,458285,42,produce,4,32,packaged produce
2019499,458285,42,produce,4,32,packaged produce


In [20]:
# number of products per department
df.groupby('department').agg({'product_name': 'count'}).sort_values('product_name', ascending=False)

Unnamed: 0_level_0,product_name
department,Unnamed: 1_level_1
produce,588996
dairy eggs,336915
snacks,180692
beverages,168126
frozen,139536
pantry,116262
bakery,72983
canned goods,66053
deli,65176
dry goods pasta,54054


In [24]:
most_popular_categories = df.groupby('department').agg({'product_name': 'count'}).sort_values('product_name', ascending=False).head(13)
most_popular_categories

Unnamed: 0_level_0,product_name
department,Unnamed: 1_level_1
produce,588996
dairy eggs,336915
snacks,180692
beverages,168126
frozen,139536
pantry,116262
bakery,72983
canned goods,66053
deli,65176
dry goods pasta,54054


In [13]:
# group products by department and display
df.groupby('department').agg({'product_name': lambda x: list(set(x))})

Unnamed: 0_level_0,product_name
department,Unnamed: 1_level_1
alcohol,"[spirits, specialty wines champagnes, white wi..."
babies,"[diapers wipes, baby bath body care, baby acce..."
bakery,"[tortillas flat bread, bakery desserts, breakf..."
beverages,"[soft drinks, juice nectars, energy sports dri..."
breakfast,"[granola, hot cereal pancake mixes, cereal, br..."
bulk,"[bulk grains rice dried goods, bulk dried frui..."
canned goods,"[canned meat seafood, soup broth bouillon, can..."
dairy eggs,"[milk, yogurt, refrigerated pudding desserts, ..."
deli,"[tofu meat alternatives, prepared soups salads..."
dry goods pasta,"[grains rice dried goods, instant foods, fresh..."


In [36]:
dep = df.groupby('department').agg({'product_name': lambda x: list(set(x))})
dep['names_count'] = dep['product_name'].apply(lambda x: len(x))
dep.sort_values('names_count', ascending=False)

Unnamed: 0_level_0,product_name,names_count
department,Unnamed: 1_level_1,Unnamed: 2_level_1
personal care,"[facial care, cold flu allergy, first aid, ora...",17
pantry,"[condiments, pickled goods olives, preserved d...",12
frozen,"[ice cream ice, frozen meat seafood, frozen ve...",11
snacks,"[candy chocolate, mint gum, trail mix snack mi...",11
dairy eggs,"[milk, yogurt, refrigerated pudding desserts, ...",10
household,"[more household, cleaning products, dish deter...",10
beverages,"[soft drinks, juice nectars, energy sports dri...",8
meat seafood,"[seafood counter, poultry counter, hot dogs ba...",7
alcohol,"[spirits, specialty wines champagnes, white wi...",5
canned goods,"[canned meat seafood, soup broth bouillon, can...",5


In [37]:
best_df = dep.join(most_popular_categories, on='department', how='left', rsuffix='_r', lsuffix='_l')

In [38]:
# rename columns
best_df.rename(columns={'product_name_l': 'product_names', 'product_name_r': 'product_count'}, inplace=True)

In [40]:
best_df.sort_values('names_count', ascending=False)

Unnamed: 0_level_0,product_names,names_count,product_count
department,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
personal care,"[facial care, cold flu allergy, first aid, ora...",17,
pantry,"[condiments, pickled goods olives, preserved d...",12,116262.0
frozen,"[ice cream ice, frozen meat seafood, frozen ve...",11,139536.0
snacks,"[candy chocolate, mint gum, trail mix snack mi...",11,180692.0
dairy eggs,"[milk, yogurt, refrigerated pudding desserts, ...",10,336915.0
household,"[more household, cleaning products, dish deter...",10,46446.0
beverages,"[soft drinks, juice nectars, energy sports dri...",8,168126.0
meat seafood,"[seafood counter, poultry counter, hot dogs ba...",7,44271.0
alcohol,"[spirits, specialty wines champagnes, white wi...",5,
canned goods,"[canned meat seafood, soup broth bouillon, can...",5,66053.0


In [41]:
best_df = best_df.dropna()
best_df

Unnamed: 0_level_0,product_names,names_count,product_count
department,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bakery,"[tortillas flat bread, bakery desserts, breakf...",5,72983.0
beverages,"[soft drinks, juice nectars, energy sports dri...",8,168126.0
breakfast,"[granola, hot cereal pancake mixes, cereal, br...",4,44605.0
canned goods,"[canned meat seafood, soup broth bouillon, can...",5,66053.0
dairy eggs,"[milk, yogurt, refrigerated pudding desserts, ...",10,336915.0
deli,"[tofu meat alternatives, prepared soups salads...",5,65176.0
dry goods pasta,"[grains rice dried goods, instant foods, fresh...",5,54054.0
frozen,"[ice cream ice, frozen meat seafood, frozen ve...",11,139536.0
household,"[more household, cleaning products, dish deter...",10,46446.0
meat seafood,"[seafood counter, poultry counter, hot dogs ba...",7,44271.0


In [44]:
selected_departments = best_df.index.to_list()
selected_departments

['bakery',
 'beverages',
 'breakfast',
 'canned goods',
 'dairy eggs',
 'deli',
 'dry goods pasta',
 'frozen',
 'household',
 'meat seafood',
 'pantry',
 'produce',
 'snacks']