In [1]:
%pip install -U implicit
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
from implicit.als import AlternatingLeastSquares
from implicit.evaluation import mean_average_precision_at_k
import matplotlib.image as mpimg
import matplotlib.pyplot as plt


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [69]:
#Takes 30 seconds to run
# Download manually from here: https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/data?select=transactions_train.csv
# And https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/data?select=articles.csv
# Load article dataset
article_df = pd.read_csv("~/Downloads/articles.csv")

tx = pd.read_csv("~/Downloads/transactions_train.csv", encoding="utf-8", dtype={'article_id': str}, parse_dates=['t_dat'])
tx.head(10)

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2
5,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687001,0.016932,2
6,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221001,0.020322,2
7,2018-09-20,00083cda041544b2fbb0e0d2905ad17da7cf1007526fb4...,688873012,0.030492,1
8,2018-09-20,00083cda041544b2fbb0e0d2905ad17da7cf1007526fb4...,501323011,0.053373,1
9,2018-09-20,00083cda041544b2fbb0e0d2905ad17da7cf1007526fb4...,598859003,0.045746,2


In [113]:
# For the sake of compute time, we will significantly reduce our dataset:
print(len(tx))
cut_all_before = tx['t_dat'].max() - pd.Timedelta(days=10)
tx = tx[(tx['t_dat'] >= cut_all_before)]
df = tx
print(len(tx))

31788324
376104


In [115]:
# replace user id and article id with unique integer for indexing the collaborative filter matrix
USERS = df.customer_id.unique().tolist()
ITEMS = df.article_id.unique().tolist()

USER_IDS = dict(list(enumerate(USERS)))
ITEM_IDS = dict(list(enumerate(ITEMS)))

USER_MAP = {u: uidx for uidx, u in USER_IDS.items()}
ITEM_MAP = {i: iidx for iidx, i in ITEM_IDS.items()}

df['customer_id'] = df['customer_id'].map(USER_MAP)
df['ITEM'] = df['article_id'].map(ITEM_MAP)
print(df.head(10))


              t_dat  customer_id  article_id     price  sales_channel_id  ITEM
31412220 2020-09-12            0  0640021012  0.050831                 2     0
31412221 2020-09-12            0  0621381012  0.033881                 2     1
31412222 2020-09-12            0  0880017001  0.025407                 2     2
31412223 2020-09-12            0  0880017001  0.025407                 2     2
31412224 2020-09-12            1  0875272011  0.050831                 2     3
31412225 2020-09-12            1  0875272012  0.050831                 2     4
31412226 2020-09-12            2  0929744001  0.042356                 2     5
31412227 2020-09-12            2  0865799005  0.033881                 2     6
31412228 2020-09-12            2  0456163060  0.033881                 2     7
31412229 2020-09-12            2  0933989002  0.042356                 2     8


In [116]:
print(len(ITEMS), len(USERS))

20902 101147


In [117]:
# Let's partition 3 days of data into the validation dataset and rest into training
validation_cut = df['t_dat'].max() - pd.Timedelta(days=3)

df_train = df[(df['t_dat'] < validation_cut)]
df_val = df[df['t_dat'] >= validation_cut]
print(len(df_train), len(df_val))

242823 133281


In [118]:
# Create sparse matrix. Rows are customers, columns are items. 
# A 1 in position x,y indicates customer x purchased item y

row = df_train['customer_id'].values
col = df_train['ITEM'].values
data = np.ones(df_train.shape[0])
s_train = coo_matrix((data, (row, col)), shape=(len(USERS), len(ITEMS))).tocsr()


row = df_val['customer_id'].values
col = df_val['ITEM'].values
data = np.ones(df_val.shape[0])
s_val = coo_matrix((data, (row, col)), shape=(len(USERS), len(ITEMS))).tocsr()


In [119]:
# 1 minute.
model = AlternatingLeastSquares(factors= 60, iterations=12, regularization=0.01)
model.fit(s_train)


  0%|          | 0/12 [00:00<?, ?it/s]

In [120]:
map12 = mean_average_precision_at_k(model, s_train, s_val, K=12)
print(map12)

  0%|          | 0/39843 [00:00<?, ?it/s]

0.0059512877006151115


In [59]:
# If we have the time we can do a parameter search
best_map12 = 0
for factors in [40, 50, 60, 100, 200, 500, 1000]:
    for iterations in [3, 12, 14, 15, 20]:
        for regularization in [0.01]:
            model = AlternatingLeastSquares(factors=factors, iterations=iterations, regularization=regularization)
            model.fit(s_train)
            map12 = mean_average_precision_at_k(model, s_train, s_val, K=12)

            print(map12, factors, iterations, regularization)
            if map12 > best_map12:
                best_map12 = map12
                best_params = {'factors': factors, 'iterations': iterations, 'regularization': regularization}
                print(f"Best MAP@12 found. Updating: {best_params}")

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/39843 [00:00<?, ?it/s]

0.0018621606172493828 40 3 0.01
Best MAP@12 found. Updating: {'factors': 40, 'iterations': 3, 'regularization': 0.01}


  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/39843 [00:00<?, ?it/s]

0.0020898579406907637 40 12 0.01
Best MAP@12 found. Updating: {'factors': 40, 'iterations': 12, 'regularization': 0.01}


  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/39843 [00:00<?, ?it/s]

0.0021225378464011318 40 14 0.01
Best MAP@12 found. Updating: {'factors': 40, 'iterations': 14, 'regularization': 0.01}


  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/39843 [00:00<?, ?it/s]

0.0021500889354063735 40 15 0.01
Best MAP@12 found. Updating: {'factors': 40, 'iterations': 15, 'regularization': 0.01}


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/39843 [00:00<?, ?it/s]

0.0021409801446639516 40 20 0.01


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/39843 [00:00<?, ?it/s]

0.0018312644853235788 50 3 0.01


  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/39843 [00:00<?, ?it/s]

0.002315525731637842 50 12 0.01
Best MAP@12 found. Updating: {'factors': 50, 'iterations': 12, 'regularization': 0.01}


  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/39843 [00:00<?, ?it/s]

0.002246585246497552 50 14 0.01


  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/39843 [00:00<?, ?it/s]

0.0021981560312543122 50 15 0.01


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/39843 [00:00<?, ?it/s]

0.0022292905433562893 50 20 0.01


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/39843 [00:00<?, ?it/s]

0.0019970714826944656 60 3 0.01


  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/39843 [00:00<?, ?it/s]

0.0023285720302941466 60 12 0.01
Best MAP@12 found. Updating: {'factors': 60, 'iterations': 12, 'regularization': 0.01}


  0%|          | 0/14 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [121]:
def submit(model, csr_train, submission_name="submissions.csv"):
    preds = []
    batch_size = 2000
    to_generate = np.arange(len(USERS))
    for startidx in range(0, len(to_generate), batch_size):
        batch = to_generate[startidx : startidx + batch_size]
        ids, scores = model.recommend(batch, csr_train[batch], N=12, filter_already_liked_items=True)
        for i, userid in enumerate(batch):
            customer_id = USER_IDS[userid]
            user_items = ids[i]
            article_ids = [ITEM_IDS[item_id] for item_id in user_items]
            preds.append((customer_id, ' '.join(article_ids), scores[i]))

    df_preds = pd.DataFrame(preds, columns=['customer_id', 'prediction', 'scores'])
    df_preds.to_csv(submission_name, index=False)
    
    display(df_preds.head())
    print(df_preds.shape)
    
    return df_preds

In [122]:
df_preds = submit(model, s_train)

Unnamed: 0,customer_id,prediction,scores
0,0,0918525001 0909371001 0706016003 0889550002 04...,"[0.042417042, 0.042009294, 0.02727838, 0.02633..."
1,1,0857163001 0158340001 0919273002 0863646001 08...,"[0.021883525, 0.021758948, 0.017639263, 0.0149..."
2,2,0865799006 0898713001 0863583001 0896169002 09...,"[0.12547427, 0.10886144, 0.10778493, 0.1058113..."
3,3,0870328003 0898573003 0909371001 0902388001 08...,"[0.018557673, 0.012407387, 0.011843513, 0.0114..."
4,4,0910601002 0751592001 0915526002 0917447002 09...,"[0.021416845, 0.015621647, 0.0109662, 0.010435..."


(101147, 3)


In [124]:
"""
Let's check recommendations for the customer.
First take the customer_id and then acquire what he has bought in the past
Starting from the products purchased in the past, the model suggests 10 items sorted by decreasing score (where the first is the most recommended)
"""
ID = 1025
customer_ID = df_preds.customer_id[ID]
items_bought = tx.loc[tx['customer_id'] == customer_ID].article_id.unique().tolist()
items_suggested = df_preds.prediction[ID].split()
scores = df_preds.scores[ID]

print(customer_ID)
print(items_bought)
print(items_suggested)
print(scores)

1025
['0897108001', '0790368006', '0905492001', '0919829001', '0807882001', '0822115004', '0904584002', '0897189002', '0946387001', '0923340001']
['0923340002', '0942187001', '0881942001', '0857163001', '0711053003', '0894780001', '0803757004', '0929275001', '0910601003', '0931769001', '0806388001', '0923037001']
[0.12966013 0.08276729 0.08248056 0.08058612 0.08053263 0.07302035
 0.07193971 0.07045572 0.06553063 0.06474035 0.06264269 0.0581349 ]


In [128]:
article_df.loc[article_df['article_id'] == 853097001]



Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
92270,853097001,853097,Patricia cotton blouse,258,Blouse,Garment Upper body,1010016,Solid,10,White,...,Blouse,A,Ladieswear,1,Ladieswear,6,Womens Casual,1010,Blouses,Longer-style blouse in airy cotton poplin with...


In [129]:
article_df.loc[article_df['article_id'] == 923340002]


Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
104467,923340002,923340,Lion top,254,Top,Garment Upper body,1010016,Solid,10,White,...,Jersey fancy,A,Ladieswear,1,Ladieswear,15,Womens Everyday Collection,1005,Jersey Fancy,Fitted top in jersey with an asymmetric neckli...
