In [27]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
import matplotlib.pyplot as plt
from gensim.models import Word2Vec

import warnings;
warnings.filterwarnings('ignore')

In [6]:
df=pd.read_excel('Online Retail.xlsx')

In [7]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    541909 non-null  object        
 1   StockCode    541909 non-null  object        
 2   Description  540455 non-null  object        
 3   Quantity     541909 non-null  int64         
 4   InvoiceDate  541909 non-null  datetime64[ns]
 5   UnitPrice    541909 non-null  float64       
 6   CustomerID   406829 non-null  float64       
 7   Country      541909 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 33.1+ MB


In [10]:
df['CustomerID'].describe()

count    406829.000000
mean      15287.690570
std        1713.600303
min       12346.000000
25%       13953.000000
50%       15152.000000
75%       16791.000000
max       18287.000000
Name: CustomerID, dtype: float64

## Data Cleaning

In [11]:
# check for the missing values
df.isnull().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

In [12]:
df.dropna(inplace=True)

In [14]:
# Let's check if the rows corresponds to missing values have been dropped
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 406829 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    406829 non-null  object        
 1   StockCode    406829 non-null  object        
 2   Description  406829 non-null  object        
 3   Quantity     406829 non-null  int64         
 4   InvoiceDate  406829 non-null  datetime64[ns]
 5   UnitPrice    406829 non-null  float64       
 6   CustomerID   406829 non-null  float64       
 7   Country      406829 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 27.9+ MB


In [15]:
print(df['CustomerID'].value_counts())

17841.0    7983
14911.0    5903
14096.0    5128
12748.0    4642
14606.0    2782
           ... 
15070.0       1
15753.0       1
17065.0       1
16881.0       1
16995.0       1
Name: CustomerID, Length: 4372, dtype: int64


In [17]:
customers = df['CustomerID'].unique().tolist()
print("Total number of Customers:", len(customers))

Total number of Customers: 4372


In [25]:
df['StockCode']=df['StockCode'].astype(str)

In [23]:
random.shuffle(customers)

train_samples = [customers[i] for i in range(round(0.9*len(customers)))]
train_df = df[df['CustomerID'].isin(train_samples)]
validation_df = df[~df['CustomerID'].isin(train_samples)]

In [28]:
#capture purchase history of the customer
purchase_train = []
for i in tqdm(train_samples):
    temp = train_df[train_df['CustomerID']==i]["StockCode"].tolist()
    purchase_train.append(temp)

100%|█| 3935/39


In [49]:
purchase_val=[]
for i in tqdm(validation_df['CustomerID'].unique()):
    temp = validation_df[validation_df['CustomerID']==i]["StockCode"].tolist()
    purchase_val.append(temp)

100%|█| 437/437


In [53]:
# word2vec training
model= Word2Vec(window=10,sg=1,hs=0,
               negative=10,alpha=0.03,min_alpha=0.0007,
               seed=14)
model.build_vocab(purchase_train, progress_per=200)
model.train(purchase_train, total_examples = model.corpus_count, epochs=10, report_delay=1)

(3671408, 3709730)

In [54]:
model.init_sims(replace=True)

In [56]:
print(model)

Word2Vec(vocab=3182, vector_size=100, alpha=0.03)


In [71]:
#extract all vectors
X=model.wv.vectors
X.shape

(3182, 100)

In [80]:
products = train_df[["StockCode","Description"]]

In [83]:
print("Length of products before removing Duplicates:", len(products))

# remove duplicates

products.drop_duplicates(inplace=True,subset='StockCode',keep="last")

product_dict=products.groupby('StockCode')['Description'].apply(list).to_dict()

Length of products before removing Duplicates: 370973


In [87]:
product_dict

{10002: ['INFLATABLE POLITICAL GLOBE '],
 10080: ['GROOVY CACTUS INFLATABLE'],
 10120: ['DOGGY RUBBER'],
 10125: ['MINI FUNKY DESIGN TAPES'],
 10133: ['COLOURING PENCILS BROWN TUBE'],
 10135: ['COLOURING PENCILS BROWN TUBE'],
 11001: ['ASSTD DESIGN RACING CAR PEN'],
 15030: ['FAN BLACK FRAME '],
 15034: ['PAPER POCKET TRAVELING FAN '],
 15036: ['ASSORTED COLOURS SILK FAN'],
 15039: ['SANDALWOOD FAN'],
 16008: ['SMALL FOLDING SCISSOR(POINTED EDGE)'],
 16010: ['FOLDING CAMPING SCISSOR W/KNIF & S'],
 16011: ['ANIMAL STICKERS'],
 16012: ['FOOD/DRINK SPONGE STICKERS'],
 16014: ['SMALL CHINESE STYLE SCISSOR'],
 16015: ['MEDIUM CHINESE STYLE SCISSOR'],
 16016: ['LARGE CHINESE STYLE SCISSOR'],
 16033: ['MINI HIGHLIGHTER PENS'],
 16043: ['POP ART PUSH DOWN RUBBER '],
 16045: ['POPART WOODEN PENCILS ASST'],
 16046: ['TEATIME PEN CASE & PENS'],
 16048: ['TEATIME ROUND PENCIL SHARPENER '],
 16049: ['TEATIME GEL PENS ASST'],
 16052: ['TEATIME PUSH DOWN RUBBER'],
 16054: ['POPART RECT PENCIL SHARPEN

In [91]:
product_dict['84029E']

['RED WOOLLY HOTTIE WHITE HEART.']

In [105]:
def similar_products(v, n = 6):
    
    # extract most similar products for the input vector
    ms = model.wv.similar_by_vector(v, topn= n+1)[1:]
    
    # extract name and similarity score of the similar products
    new_ms = []
    for j in ms:
        pair = (product_dict[j[0]][0], j[1])
        new_ms.append(pair)
        
    return new_ms

In [106]:
similar_products(model.wv['90019A'])

[('AMBER DROP EARRINGS W LONG BEADS', 0.7685103416442871),
 ('SILVER M.O.P ORBIT DROP EARRINGS', 0.7673244476318359),
 ('BLUE MURANO TWIST BRACELET', 0.747230052947998),
 ('GOLD/M.O.P PENDANT ORBIT NECKLACE', 0.7436476945877075),
 ('PINK BOUDICCA LARGE BRACELET', 0.7393186688423157),
 ('WHITE VINT ART DECO CRYSTAL NECKLAC', 0.7365745306015015)]

In [122]:
def aggregate_vectors(products):
    product_vec = []
    for i in products:
        try:
            product_vec.append(model.wv[i])
        except KeyError:
            continue
        
    return np.mean(product_vec, axis=0)

In [123]:
aggregate_vectors(purchase_val[0]).shape


(100,)

In [125]:
similar_products(aggregate_vectors(purchase_val[0]))


[('MINI PAINT SET VINTAGE ', 0.7128187417984009),
 ('SPACEBOY LUNCH BOX ', 0.703393816947937),
 ('SET/6 RED SPOTTY PAPER CUPS', 0.6899441480636597),
 ('ROUND SNACK BOXES SET OF 4 FRUITS ', 0.6898894309997559),
 ('PLASTERS IN TIN SPACEBOY', 0.6882485151290894),
 ('RETROSPOT PARTY BAG + STICKER SET', 0.6870256662368774)]