In [1]:
pip install tqdm

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
from gensim.models import Word2Vec 
import matplotlib.pyplot as plt
%matplotlib inline

import warnings;
warnings.filterwarnings('ignore')

In [6]:
df = pd.read_excel('Online_Retail.xlsx')
# df.head()
print(df)

       InvoiceNo StockCode                          Description  Quantity  \
0         536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1         536365     71053                  WHITE METAL LANTERN         6   
2         536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3         536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4         536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   
5         536365     22752         SET 7 BABUSHKA NESTING BOXES         2   
6         536365     21730    GLASS STAR FROSTED T-LIGHT HOLDER         6   
7         536366     22633               HAND WARMER UNION JACK         6   
8         536366     22632            HAND WARMER RED POLKA DOT         6   
9         536367     84879        ASSORTED COLOUR BIRD ORNAMENT        32   
10        536367     22745           POPPY'S PLAYHOUSE BEDROOM          6   
11        536367     22748            POPPY'S PLAYHOUSE KITCHEN         6   

In [3]:
df.isnull().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

In [4]:
df.dropna(inplace=True)

In [19]:
df['StockCode']= df['StockCode'].astype(str)


In [20]:
customers = df["CustomerID"].unique().tolist()
# print(customers)
len(customers)

4373

In [21]:
random.shuffle(customers)

# extract 90% of customer ID's
customers_train = [customers[i] for i in range(round(0.9*len(customers)))]
# print(customers_train)
# split data into train and validation set
train_df = df[df['CustomerID'].isin(customers_train)]
validation_df = df[~df['CustomerID'].isin(customers_train)]
# print(train_df)

In [22]:
purchases_train = []

# populate the list with the product codes
for i in tqdm(customers_train):
    temp = train_df[train_df["CustomerID"] == i]["StockCode"].tolist()
    purchases_train.append(temp)

# print(purchases_train)

100%|██████████| 3936/3936 [00:04<00:00, 977.86it/s] 


In [23]:
purchases_val = []

# populate the list with the product codes
for i in tqdm(validation_df['CustomerID'].unique()):
    temp = validation_df[validation_df["CustomerID"] == i]["StockCode"].tolist()
    purchases_val.append(temp)

100%|██████████| 437/437 [00:00<00:00, 1298.08it/s]


In [25]:
model = Word2Vec(window = 10, sg = 1, hs = 0,
                 negative = 10, # for negative sampling
                 alpha=0.03, min_alpha=0.0007,
                 seed = 14)

model.build_vocab(purchases_train, progress_per=200)

model.train(purchases_train, total_examples = model.corpus_count, 
            epochs=10, report_delay=1)

print(model)

Word2Vec(vocab=3174, size=100, alpha=0.03)


In [27]:
products = train_df[["StockCode", "Description"]]

# remove duplicates
products.drop_duplicates(inplace=True, subset='StockCode', keep="last")

# create product-ID and product-description dictionary
products_dict = products.groupby('StockCode')['Description'].apply(list).to_dict()

print(products_dict)

{'20906': [nan], '21925': ['UNION STRIPE CUSHION COVER '], '90060B': ['FIRE POLISHED GLASS NECKL GOLD'], '22596': ['CHRISTMAS STAR WISH LIST CHALKBOARD'], '23489': ['VINTAGE BELLS GARLAND'], '90011D': ['PURPLE CRYSTAL DROP EARRINGS'], '90059A': ['DIAMANTE HAIR GRIP PACK/2 CRYSTAL'], '20963': ['APPLE BATH SPONGE'], '35599D': ['PINK AND WHITE CHRISTMAS TREE 120CM'], '20700': ['GREEN CAT FLORAL CUSHION COVER '], '85049H': ['URBAN BLACK RIBBONS '], '46776b': ['WOVEN BERRIES CUSHION COVER '], '22173': ['METAL 4 HOOK HANGER FRENCH CHATEAU'], '22705': ['WRAP GREEN PEARS '], '84709B': [nan], '35958': [nan], '20795': ['LARGE BLUE PROVENCAL CERAMIC BALL'], '23465': ['TUSCAN VILLA BIRD FEEDER'], '90034': ['WHITE SILVER NECKLACE SHELL GLASS'], '51014A': ['FEATHER PEN,HOT PINK'], '21369': ['MIRRORED WALL ART SPLODGES'], '20964': [nan], '84424A': [nan], '72140E': ['BEST DAD CANDLE LETTERS'], '23843': ['PAPER CRAFT , LITTLE BIRDIE'], '21195': [nan], '22670': ['FRENCH WC SIGN BLUE METAL'], '22641': ['

In [28]:
def similar_products(v, n = 6):
    
    # extract most similar products for the input vector
    ms = model.similar_by_vector(v, topn= n+1)[1:]
    
    # extract name and similarity score of the similar products
    new_ms = []
    for j in ms:
        pair = (products_dict[j[0]][0], j[1])
        new_ms.append(pair)
        
    return new_ms  

In [29]:
similar_products(model['90019A'])

[('AMBER DROP EARRINGS W LONG BEADS', 0.7780001163482666),
 ('ANT COPPER RED BOUDICCA BRACELET', 0.7681774497032166),
 ('SILVER M.O.P ORBIT DROP EARRINGS', 0.765012800693512),
 ('PINK BOUDICCA LARGE BRACELET', 0.7550866007804871),
 ('GREEN HEART OF GLASS BRACELET', 0.7452159523963928),
 ('DROP DIAMANTE EARRINGS PURPLE', 0.7441219091415405)]

In [30]:
def aggregate_vectors(products):
    product_vec = []
    for i in products:
        try:
            product_vec.append(model[i])
        except KeyError:
            continue
        
    return np.mean(product_vec, axis=0)

In [33]:
aggregate_vectors(purchases_val[0])

array([-0.02008149, -0.06712858,  0.04870657,  0.13385847, -0.25122377,
       -0.02050675, -0.19403677,  0.1565021 , -0.08546602,  0.0300641 ,
        0.11597387, -0.13039754, -0.06811432,  0.02605054, -0.02776088,
        0.00572286,  0.15436076,  0.15501931, -0.07332736,  0.11599042,
       -0.1905461 ,  0.07765587,  0.11867723, -0.02748243, -0.2597612 ,
       -0.25847027, -0.04965959, -0.0176734 ,  0.10683104, -0.02971852,
        0.17366052,  0.06784417,  0.3021359 ,  0.09905026, -0.13317439,
       -0.07610453,  0.10448904, -0.11490462, -0.09752823, -0.01962013,
        0.08885534, -0.23626906, -0.24586074,  0.06600986,  0.03345012,
        0.51480776,  0.09869163,  0.30410892, -0.00581542,  0.02266225,
        0.04189283, -0.10721178,  0.05539206,  0.07599442, -0.00176217,
       -0.02936422,  0.30737686, -0.03586278, -0.06513169,  0.30690938,
       -0.10457037, -0.1324463 , -0.11254132,  0.11759341,  0.12893815,
       -0.13116501,  0.25694048,  0.10833333, -0.35292163, -0.17

In [34]:
similar_products(aggregate_vectors(purchases_val[0]))

[('JUMBO STORAGE BAG SKULLS', 0.7265976667404175),
 ('LUNCH BAG RED RETROSPOT', 0.7186723351478577),
 ('LUNCH BAG CARS BLUE', 0.7134215235710144),
 ('JUMBO STORAGE BAG SUKI', 0.7103363275527954),
 ('STRAWBERRY CHARLOTTE BAG', 0.7051469683647156),
 ('JUMBO  BAG BAROQUE BLACK WHITE', 0.7047221064567566)]