# Product Recommender - Collaborative Filtering

In [1]:
import joblib
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

## Load Retail Dataset

In [9]:
# Download the retail transactions CSV and keep every column except the first
# one (presumably a saved row index -- confirm against the raw file).
RETAIL_CSV_URL = 'https://www.dropbox.com/s/njanlcj51k6dlr5/retail.csv?dl=1'

df = pd.read_csv(RETAIL_CSV_URL).iloc[:, 1:]

In [10]:
df.shape

# The raw dataset has 541,909 rows and 8 columns.

(541909, 8)

In [11]:
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


## Data Preparation

###  Handle NaNs in CustomerID field

In [12]:
# Count the missing values in each column.
df.isna().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

In [14]:
# Number of line items (rows) recorded under each invoice number.

df['InvoiceNo'].value_counts()

573585     1114
581219      749
581492      731
580729      721
558475      705
           ... 
C538794       1
573816        1
577068        1
C577889       1
554551        1
Name: InvoiceNo, Length: 25900, dtype: int64

In [15]:
# Line items per invoice, restricted to rows whose CustomerID is missing.

df.loc[df['CustomerID'].isna(), 'InvoiceNo'].value_counts()

573585     1114
581219      749
581492      731
580729      721
558475      705
           ... 
573115        1
C553355       1
577829        1
549346        1
553163        1
Name: InvoiceNo, Length: 3710, dtype: int64

In [17]:
# Drop every row that has no CustomerID; inplace=True modifies df itself
# rather than returning a new frame.
df.dropna(subset=['CustomerID'], inplace = True)

In [18]:
df.shape

# After dropping the rows with a missing CustomerID, 406,829 rows remain.

(406829, 8)

In [19]:
# Cast CustomerID to integer (it was read as float, e.g. 17850.0) and
# StockCode to string, using a single astype mapping.
df = df.astype({'CustomerID': 'int', 'StockCode': 'str'})

In [20]:
# Preview the first rows again; CustomerID now displays as an integer.
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom


In [21]:
# Number of distinct customers and distinct items (stock codes).

n_customers = df['CustomerID'].nunique()
n_items = df['StockCode'].nunique()
n_customers, n_items

(4372, 3684)

## Customer-Item Matrix

In [51]:
# Build the customer-item matrix: one row per customer, one column per stock
# code, each cell holding the summed Quantity for that customer/item pair.

customer_item_matrix = df.pivot_table(
    values='Quantity',
    index='CustomerID',
    columns='StockCode',
    aggfunc='sum',
)

In [52]:
customer_item_matrix.shape

# Each row is a unique customer and each column is a unique item.

(4372, 3684)

In [67]:
# First rows of the matrix from customer 15000 onward; customer 15000 is the
# example customer used in the cells that follow.

customer_item_matrix.loc[15000:].head()

# NaN means the customer didn't buy the corresponding item.

StockCode,10002,10080,10120,10123C,10124A,10124G,10125,10133,10135,11001,...,90214Y,90214Z,BANK CHARGES,C2,CRUK,D,DOT,M,PADS,POST
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15004,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
15005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
15006,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [54]:
# Show only the items that customer 15000 bought, i.e. the non-NaN entries
# of that customer's row.

customer_item_matrix.loc[15000].dropna()

StockCode
21002      2.0
21034      3.0
21114     10.0
21156      2.0
21261      1.0
          ... 
47591D     2.0
84030E     2.0
84879      4.0
85049A     3.0
85049G     2.0
Name: 15000, Length: 76, dtype: float64

In [56]:
# To study purchase patterns a binary matrix is often more useful than raw
# quantities: a cell becomes 1 when the customer's summed quantity for the
# item is positive and 0 otherwise (missing values and non-positive totals
# both map to 0).
# A vectorised comparison is used instead of an element-wise applymap call,
# which is slow on a matrix this size and deprecated in recent pandas releases.

customer_item_matrix = customer_item_matrix.gt(0).astype('int64')

customer_item_matrix.loc[15000:].head()

StockCode,10002,10080,10120,10123C,10124A,10124G,10125,10133,10135,11001,...,90214Y,90214Z,BANK CHARGES,C2,CRUK,D,DOT,M,PADS,POST
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15004,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
15005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
15006,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [57]:
customer_item_matrix.loc[15000].sum()

# Customer 15000 bought 76 different items (the matrix is binary, so the row
# sum counts the items purchased).

76

##  User-based Collaborative Filtering

###  User-to-User Similarity Matrix

In [41]:
# Pairwise cosine similarity between customers, computed on the rows of the
# customer-item matrix. Both axes are labelled with the customer IDs at
# construction so the result can be looked up by CustomerID straight away.

user_user_sim_matrix = pd.DataFrame(
    cosine_similarity(customer_item_matrix),
    index=customer_item_matrix.index,
    columns=customer_item_matrix.index,
)
user_user_sim_matrix.shape

(4372, 4372)

In [60]:
# Preview the first rows of the customer-to-customer similarity matrix.
user_user_sim_matrix.head()

CustomerID,12346,12347,12348,12349,12350,12352,12353,12354,12355,12356,...,18273,18274,18276,18277,18278,18280,18281,18282,18283,18287
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12347,0.0,1.0,0.063022,0.04613,0.047795,0.038814,0.0,0.025876,0.136641,0.094742,...,0.0,0.0,0.054656,0.0,0.032844,0.062318,0.0,0.113776,0.109364,0.012828
12348,0.0,0.063022,1.0,0.024953,0.051709,0.027995,0.0,0.027995,0.118262,0.146427,...,0.0,0.0,0.118262,0.0,0.0,0.0,0.0,0.0,0.170905,0.083269
12349,0.0,0.04613,0.024953,1.0,0.056773,0.138314,0.0,0.030737,0.032461,0.144692,...,0.0,0.0,0.0,0.0,0.039014,0.0,0.0,0.067574,0.137124,0.030475
12350,0.0,0.047795,0.051709,0.056773,1.0,0.031846,0.0,0.0,0.0,0.033315,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.044866,0.0


In [78]:
# Label both axes of the similarity matrix with the customer IDs taken from
# the customer-item matrix. set_axis relabels rows and columns directly, so no
# temporary 'CustomerID' column has to be inserted among the integer column
# labels and then promoted to the index.
customer_ids = customer_item_matrix.index

user_user_sim_matrix = (
    user_user_sim_matrix
    .set_axis(customer_ids, axis='index')
    .set_axis(customer_ids, axis='columns')
)

user_user_sim_matrix

CustomerID,12346,12347,12348,12349,12350,12352,12353,12354,12355,12356,...,18273,18274,18276,18277,18278,18280,18281,18282,18283,18287
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12346,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
12347,0.0,1.000000,0.063022,0.046130,0.047795,0.038814,0.0,0.025876,0.136641,0.094742,...,0.0,0.0,0.054656,0.000000,0.032844,0.062318,0.000000,0.113776,0.109364,0.012828
12348,0.0,0.063022,1.000000,0.024953,0.051709,0.027995,0.0,0.027995,0.118262,0.146427,...,0.0,0.0,0.118262,0.000000,0.000000,0.000000,0.000000,0.000000,0.170905,0.083269
12349,0.0,0.046130,0.024953,1.000000,0.056773,0.138314,0.0,0.030737,0.032461,0.144692,...,0.0,0.0,0.000000,0.000000,0.039014,0.000000,0.000000,0.067574,0.137124,0.030475
12350,0.0,0.047795,0.051709,0.056773,1.000000,0.031846,0.0,0.000000,0.000000,0.033315,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.044866,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18280,0.0,0.062318,0.000000,0.000000,0.000000,0.000000,0.0,0.041523,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,0.105409,1.000000,0.119523,0.000000,0.000000,0.000000
18281,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.049629,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.119523,1.000000,0.000000,0.046613,0.000000
18282,0.0,0.113776,0.000000,0.067574,0.000000,0.037905,0.0,0.000000,0.160128,0.079305,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.017800,0.000000
18283,0.0,0.109364,0.170905,0.137124,0.044866,0.072870,0.0,0.113354,0.034204,0.093170,...,0.0,0.0,0.017102,0.043602,0.000000,0.000000,0.046613,0.017800,1.000000,0.096334


### Making Recommendations

In [58]:
# The 20 highest similarity scores in customer 15000's row. The first entry is
# customer 15000 itself (similarity 1.0); customer 13917 is the most similar
# other customer.

similarity_to_15000 = user_user_sim_matrix.loc[15000]
similarity_to_15000.sort_values(ascending=False).head(20)

CustomerID
15000    1.000000
13917    0.258093
14518    0.229416
13914    0.220755
16574    0.216777
13889    0.207322
13529    0.207322
18041    0.203841
16669    0.202863
17059    0.201383
15744    0.198680
16245    0.194822
16345    0.193892
14506    0.192748
16561    0.191706
15450    0.188484
14233    0.188484
16360    0.186896
15411    0.186627
13558    0.186171
Name: 15000, dtype: float64

In [61]:
# Stock codes whose entry in customer 15000's row of the matrix is non-zero.
items_bought_by_15000 = set(
    customer_item_matrix.columns[customer_item_matrix.loc[15000].ne(0).to_numpy()]
)

items_bought_by_15000

{'21002',
 '21034',
 '21114',
 '21156',
 '21261',
 '21380',
 '21481',
 '21484',
 '21577',
 '21580',
 '21754',
 '21755',
 '21808',
 '21823',
 '21824',
 '22077',
 '22078',
 '22082',
 '22089',
 '22090',
 '22110',
 '22118',
 '22119',
 '22121',
 '22138',
 '22271',
 '22367',
 '22430',
 '22568',
 '22569',
 '22577',
 '22578',
 '22579',
 '22582',
 '22583',
 '22584',
 '22585',
 '22604',
 '22617',
 '22643',
 '22731',
 '22732',
 '22733',
 '22734',
 '22865',
 '22866',
 '22867',
 '22899',
 '22945',
 '23263',
 '23264',
 '23265',
 '23266',
 '23298',
 '23388',
 '23389',
 '23390',
 '23393',
 '23394',
 '23395',
 '23396',
 '23398',
 '23399',
 '23418',
 '23419',
 '23433',
 '23541',
 '23542',
 '35970',
 '47590A',
 '47590B',
 '47591D',
 '84030E',
 '84879',
 '85049A',
 '85049G'}

In [74]:
# Stock codes whose entry in customer 13917's row of the matrix is non-zero.
items_bought_by_13917 = set(
    customer_item_matrix.columns[customer_item_matrix.loc[13917].ne(0).to_numpy()]
)

items_bought_by_13917

{'21089',
 '21385',
 '21484',
 '22110',
 '22161',
 '22231',
 '22457',
 '22487',
 '22577',
 '22578',
 '22595',
 '22865',
 '22867',
 '23263',
 '23264',
 '23266'}

In [75]:
# Items that customer 15000 bought and customer 13917 has not bought.
items_to_recommend_to_13917 = items_bought_by_15000.difference(items_bought_by_13917)
items_to_recommend_to_13917

{'21002',
 '21034',
 '21114',
 '21156',
 '21261',
 '21380',
 '21481',
 '21577',
 '21580',
 '21754',
 '21755',
 '21808',
 '21823',
 '21824',
 '22077',
 '22078',
 '22082',
 '22089',
 '22090',
 '22118',
 '22119',
 '22121',
 '22138',
 '22271',
 '22367',
 '22430',
 '22568',
 '22569',
 '22579',
 '22582',
 '22583',
 '22584',
 '22585',
 '22604',
 '22617',
 '22643',
 '22731',
 '22732',
 '22733',
 '22734',
 '22866',
 '22899',
 '22945',
 '23265',
 '23298',
 '23388',
 '23389',
 '23390',
 '23393',
 '23394',
 '23395',
 '23396',
 '23398',
 '23399',
 '23418',
 '23419',
 '23433',
 '23541',
 '23542',
 '35970',
 '47590A',
 '47590B',
 '47591D',
 '84030E',
 '84879',
 '85049A',
 '85049G'}

In [76]:
# Descriptions of the items recommended for customer 13917: they were bought
# by the similar customer 15000 but not by 13917. A stock code is listed more
# than once when it appears in the data with several distinct descriptions.

recommended_rows = df['StockCode'].isin(items_to_recommend_to_13917)
(
    df.loc[recommended_rows, ['StockCode', 'Description']]
    .drop_duplicates()
    .set_index('StockCode')
)

Unnamed: 0_level_0,Description
StockCode,Unnamed: 1_level_1
84879,ASSORTED COLOUR BIRD ORNAMENT
21754,HOME BUILDING BLOCK WORD
21755,LOVE BUILDING BLOCK WORD
85049A,TRADITIONAL CHRISTMAS RIBBONS
84030E,ENGLISH ROSE HOT WATER BOTTLE
...,...
23542,70'S ALPHABET WALL ART
23542,WALL ART 70'S ALPHABET
23388,WOODLAND MINI RUCKSACK
23541,"WALL ART ,PUDDINGS"


##  Item-based Collaborative Filtering

### Item-to-Item Similarity Matrix

In [90]:
# Item-to-item cosine similarity. The customer-item matrix is transposed so
# that each row is one item, and both axes of the result are labelled with the
# stock codes at construction. This avoids the repeated transposes and the
# temporary 'StockCode' column that would otherwise be needed for labelling.
stock_codes = customer_item_matrix.columns

item_item_sim_matrix = pd.DataFrame(
    cosine_similarity(customer_item_matrix.T),
    index=stock_codes,
    columns=stock_codes,
)

item_item_sim_matrix

StockCode,10002,10080,10120,10123C,10124A,10124G,10125,10133,10135,11001,...,90214Y,90214Z,BANK CHARGES,C2,CRUK,D,DOT,M,PADS,POST
StockCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10002,1.000000,0.000000,0.094868,0.091287,0.0,0.000000,0.090351,0.063246,0.098907,0.095346,...,0.000000,0.0,0.000000,0.029361,0.0,0.0,0.0,0.059423,0.0,0.070057
10080,0.000000,1.000000,0.000000,0.000000,0.0,0.000000,0.032774,0.045883,0.047836,0.000000,...,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.017244,0.0,0.000000
10120,0.094868,0.000000,1.000000,0.115470,0.0,0.000000,0.057143,0.060000,0.041703,0.060302,...,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.075165,0.0,0.000000
10123C,0.091287,0.000000,0.115470,1.000000,0.0,0.000000,0.164957,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000
10124A,0.000000,0.000000,0.000000,0.000000,1.0,0.447214,0.063888,0.044721,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
D,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000
DOT,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.104257,0.150756,...,0.000000,0.0,0.000000,0.000000,0.0,0.0,1.0,0.000000,0.0,0.000000
M,0.059423,0.017244,0.075165,0.000000,0.0,0.000000,0.075165,0.067648,0.054855,0.101983,...,0.000000,0.0,0.071307,0.000000,0.0,0.0,0.0,1.000000,0.0,0.045793
PADS,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.050000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,1.0,0.000000


###  Making Recommendations

In [97]:
# The ten stock codes with the highest similarity to item 15030; the first
# entry is item 15030 itself.

similarity_to_15030 = item_item_sim_matrix.loc['15030']
top_10_similar_items = similarity_to_15030.sort_values(ascending=False).head(10).index.tolist()
top_10_similar_items

['15030',
 '84746',
 '84569C',
 '21167',
 '85179C',
 '17084A',
 '90059B',
 '16169P',
 '90059C',
 '79336']

In [98]:
# Descriptions of the ten most similar items, listed in order of similarity.
item_descriptions = (
    df.loc[df['StockCode'].isin(top_10_similar_items), ['StockCode', 'Description']]
    .drop_duplicates()
    .set_index('StockCode')
)
item_descriptions.loc[top_10_similar_items]

Unnamed: 0_level_0,Description
StockCode,Unnamed: 1_level_1
15030,FAN BLACK FRAME
84746,PINK EASTER HENS+FLOWER
84569C,PACK 4 FLOWER/BUTTERFLY PATCHES
21167,WHITE SAGE INCENSE
85179C,PINK BITTY LIGHT CHAIN
17084A,TRANQUILITY MASALA INCENSE
90059B,DIAMANTE HAIR GRIP PACK/2 BLACK DIA
16169P,WRAP GREEN RUSSIAN FOLKART
90059C,DIAMANTE HAIR GRIP PACK/2 MONTANA
79336,LIGHT PINK FLOCK GLASS CANDLEHOLDER
