In [18]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
    #for filename in filenames:
        #print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing Libraries

In [19]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Import Data

In [20]:
articles_df = pd.read_csv("/kaggle/input/h-and-m-personalized-fashion-recommendations/articles.csv")
customers_df = pd.read_csv("/kaggle/input/h-and-m-personalized-fashion-recommendations/customers.csv")
transactions_train_df = pd.read_csv("/kaggle/input/h-and-m-personalized-fashion-recommendations/transactions_train.csv")

# Customer Meta Data

Lets see how the data in customer dataframe looks like

In [21]:
print(f"Number of rows in customers is {customers_df.shape[0]}")
print(f"Number of columns in customers is {customers_df.shape[1]}")
print("\nInformation of Customers:")
customers_df.info()

Number of rows in customers is 1371980
Number of columns in customers is 7

Information of Customers:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1371980 entries, 0 to 1371979
Data columns (total 7 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   customer_id             1371980 non-null  object 
 1   FN                      476930 non-null   float64
 2   Active                  464404 non-null   float64
 3   club_member_status      1365918 non-null  object 
 4   fashion_news_frequency  1355969 non-null  object 
 5   age                     1356119 non-null  float64
 6   postal_code             1371980 non-null  object 
dtypes: float64(3), object(4)
memory usage: 73.3+ MB


In [22]:
#lets see first 5 rows of customer dataframe
customers_df.head(5)

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,,,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,,,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,,,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,,,ACTIVE,NONE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...


In [23]:
#Handling Missing values
#so the unique values in both Active and FN are 1 or Nan. Assuming the missing values are not active customers, we can impute them with '0'
customers_df['FN'] = customers_df['FN'].fillna(0)
customers_df['Active'] = customers_df['Active'].fillna(0)

In [24]:
customers_df['FN'].isna().sum(), customers_df['Active'].isna().sum()

(0, 0)

In [25]:
#we need to handle missing values for columns "club_member_status" and "fashion_news_frequency"
#As we don't have information on category they belong to lets replace the missing values with "UNKNOWN"
customers_df["club_member_status"] = customers_df["club_member_status"].fillna("UNKNOWN")
customers_df["fashion_news_frequency"] = customers_df["fashion_news_frequency"].fillna("UNKNOWN")

In [26]:
customers_df["age"].isna().sum()

15861

lets impute nan with mean values

In [27]:
mean_age = round(customers_df["age"].mean())
mean_age

36

In [28]:
#customers_df["age"].fillna(mean_age, inplace = True)
customers_df.fillna({"age": mean_age}, inplace=True)
customers_df["age"].isna().sum()

0

Though Age is a numerical column we can convert that into categorical column by making groups. Categories we are going to consider are:
* 0-20
* 20-40
* 40-60
* 60-80
* 80-100

In [29]:
bins = [0, 20, 40, 60, 80, 100]
labels = ['0-20', '20-40', '40-60', '60-80', '80-100']
#customers_df["age"] = pd.cut(customers_df["age"], 5)
customers_df["age"] = pd.cut(customers_df["age"] , bins = bins, labels = labels, right = False )

In [30]:
customers_df["age"].unique()

['40-60', '20-40', '60-80', '0-20', '80-100']
Categories (5, object): ['0-20' < '20-40' < '40-60' < '60-80' < '80-100']

There by we have imputed all missing values in customers_df

In [31]:
#As we might not be using postal code in our prediction dropping postal code column
customers_df = customers_df.drop(columns = "postal_code")

In [32]:
cust_df = customers_df[["customer_id","age"]]
cust_df.head()

Unnamed: 0,customer_id,age
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,40-60
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,20-40
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,20-40
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,40-60
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,40-60


In [33]:
customers_df = pd.get_dummies(customers_df, columns=["club_member_status", "fashion_news_frequency", "age"], drop_first = False)

# Transaction Data

Lets see how the data in transactions dataframe looks like

In [34]:
print(f"Number of rows in transactions is {transactions_train_df.shape[0]}")
print(f"Number of columns in transactions is {transactions_train_df.shape[1]}")
print("\nInformation of transactions:")
transactions_train_df.info()

Number of rows in transactions is 31788324
Number of columns in transactions is 5

Information of transactions:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31788324 entries, 0 to 31788323
Data columns (total 5 columns):
 #   Column            Dtype  
---  ------            -----  
 0   t_dat             object 
 1   customer_id       object 
 2   article_id        int64  
 3   price             float64
 4   sales_channel_id  int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 1.2+ GB


In [35]:
transactions_train_df.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


In [36]:
#converting t_dat from "object" type to "datetime" type
transactions_train_df["t_dat"] = pd.to_datetime(transactions_train_df['t_dat'])

# Articles Data

Lets see how the data in Articles dataframe looks like

In [37]:
print(f"Number of rows in articles is {articles_df.shape[0]}")
print(f"Number of columns in articles is {articles_df.shape[1]}")
print("\nInformation of articles:")
articles_df.info()

Number of rows in articles is 105542
Number of columns in articles is 25

Information of articles:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105542 entries, 0 to 105541
Data columns (total 25 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   article_id                    105542 non-null  int64 
 1   product_code                  105542 non-null  int64 
 2   prod_name                     105542 non-null  object
 3   product_type_no               105542 non-null  int64 
 4   product_type_name             105542 non-null  object
 5   product_group_name            105542 non-null  object
 6   graphical_appearance_no       105542 non-null  int64 
 7   graphical_appearance_name     105542 non-null  object
 8   colour_group_code             105542 non-null  int64 
 9   colour_group_name             105542 non-null  object
 10  perceived_colour_value_id     105542 non-null  int64 
 11  perceived_colour_v

In [38]:
#lets see first 5 rows of articles dataframe
articles_df.head(5)

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


# Train Test Split

In [40]:
#lets split customer data(only customer_id and age) into train and test by stratifying age_groups
train_cust , test_cust = train_test_split(cust_df, test_size = 0.3, stratify = cust_df["age"], random_state = 42)
train_cust.shape, test_cust.shape

((960386, 2), (411594, 2))

In [42]:
#Now lets split transaction data based on customer id from train_cust, test_cust
train_trans = transactions_train_df[transactions_train_df["customer_id"].isin(train_cust["customer_id"])]
test_trans = transactions_train_df[transactions_train_df["customer_id"].isin(test_cust["customer_id"])]
train_trans.shape, test_trans.shape

((22233668, 5), (9554656, 5))

# Baseline Model

# Training:

In [43]:
merged_cust_trans_train = train_cust.merge(train_trans, on = "customer_id", how = "inner")
merged_cust_trans_train.head()

Unnamed: 0,customer_id,age,t_dat,article_id,price,sales_channel_id
0,e7fc58634d788927f7c9fe9e47ead2d32e3226241f2a83...,40-60,2020-03-03,779781005,0.042356,2
1,e7fc58634d788927f7c9fe9e47ead2d32e3226241f2a83...,40-60,2020-03-03,779781005,0.042356,2
2,e7fc58634d788927f7c9fe9e47ead2d32e3226241f2a83...,40-60,2020-03-03,568601030,0.050831,2
3,e7fc58634d788927f7c9fe9e47ead2d32e3226241f2a83...,40-60,2020-03-03,779781006,0.042356,2
4,e7fc58634d788927f7c9fe9e47ead2d32e3226241f2a83...,40-60,2020-03-03,779781006,0.042356,2


In [44]:
merged_cust_trans_train.shape

(22233668, 6)

In [46]:
agg_merged = merged_cust_trans_train.groupby(["age","article_id"]).agg(article_count = ("article_id","count")).sort_values(by = ["age", "article_count"], ascending = False).reset_index()
agg_merged.head()

  agg_merged = merged_cust_trans_train.groupby(["age","article_id"]).agg(article_count = ("article_id","count")).sort_values(by = ["age", "article_count"], ascending = False).reset_index()


Unnamed: 0,age,article_id,article_count
0,80-100,399256005,14
1,80-100,610776001,13
2,80-100,744291001,12
3,80-100,591334019,11
4,80-100,664074001,11


In [47]:
agg_merged['rank'] = agg_merged.groupby('age')['article_count'].rank(ascending = False, method = "first")
agg_merged = agg_merged[agg_merged['rank']<=12]
agg_merged

  agg_merged['rank'] = agg_merged.groupby('age')['article_count'].rank(ascending = False, method = "first")


Unnamed: 0,age,article_id,article_count,rank
0,80-100,399256005,14,1.0
1,80-100,610776001,13,2.0
2,80-100,744291001,12,3.0
3,80-100,591334019,11,4.0
4,80-100,664074001,11,5.0
5,80-100,695632002,11,6.0
6,80-100,751471001,11,7.0
7,80-100,751551001,11,8.0
8,80-100,562245025,10,9.0
9,80-100,569984001,10,10.0


In [48]:
#now we are combining customer_id with agg_merged based on their age to recommend them top 12 articles
results_df_train  = train_cust.merge(agg_merged, on = ["age"], how = 'left')
results_df_train

Unnamed: 0,customer_id,age,article_id,article_count,rank
0,e7fc58634d788927f7c9fe9e47ead2d32e3226241f2a83...,40-60,706016001,9072,1.0
1,e7fc58634d788927f7c9fe9e47ead2d32e3226241f2a83...,40-60,706016002,6930,2.0
2,e7fc58634d788927f7c9fe9e47ead2d32e3226241f2a83...,40-60,372860001,6535,3.0
3,e7fc58634d788927f7c9fe9e47ead2d32e3226241f2a83...,40-60,610776002,6116,4.0
4,e7fc58634d788927f7c9fe9e47ead2d32e3226241f2a83...,40-60,372860002,5958,5.0
...,...,...,...,...,...
11524627,6289d7a464ca79448bfd5a0e513bb369b72a451c16f274...,20-40,399223001,10289,8.0
11524628,6289d7a464ca79448bfd5a0e513bb369b72a451c16f274...,20-40,610776001,10192,9.0
11524629,6289d7a464ca79448bfd5a0e513bb369b72a451c16f274...,20-40,372860002,10030,10.0
11524630,6289d7a464ca79448bfd5a0e513bb369b72a451c16f274...,20-40,720125001,9999,11.0


In [50]:
results_df_train["customer_id"].nunique()

960386

In [53]:
results_df_train = results_df_train[["customer_id", "article_id"]]
results_df_train

Unnamed: 0,customer_id,article_id
0,e7fc58634d788927f7c9fe9e47ead2d32e3226241f2a83...,706016001
1,e7fc58634d788927f7c9fe9e47ead2d32e3226241f2a83...,706016002
2,e7fc58634d788927f7c9fe9e47ead2d32e3226241f2a83...,372860001
3,e7fc58634d788927f7c9fe9e47ead2d32e3226241f2a83...,610776002
4,e7fc58634d788927f7c9fe9e47ead2d32e3226241f2a83...,372860002
...,...,...
11524627,6289d7a464ca79448bfd5a0e513bb369b72a451c16f274...,399223001
11524628,6289d7a464ca79448bfd5a0e513bb369b72a451c16f274...,610776001
11524629,6289d7a464ca79448bfd5a0e513bb369b72a451c16f274...,372860002
11524630,6289d7a464ca79448bfd5a0e513bb369b72a451c16f274...,720125001


# Testing

In [55]:
test_cust.head()

Unnamed: 0,customer_id,age
802912,95e4543e38d0233823148b9dcc5e8a45351054bfa5b21c...,20-40
367048,44a7146358f3cebc2f04879257c20aa6f1da33b8394707...,20-40
868643,a21effe0cdc8918d2ded0e2fecb215eb6db44ac9264d3a...,40-60
650917,797c21e294446a9e97bb67adbc01ccd902c99c4ae88923...,20-40
1003975,bb573b9313c4defa961a8e408aa04888413d055eec35ea...,20-40


In [58]:
test_cust.iloc[0]["customer_id"]

'95e4543e38d0233823148b9dcc5e8a45351054bfa5b21c43689617670440d488'

In [61]:
results_df_train[results_df_train["customer_id"]=='95e4543e38d0233823148b9dcc5e8a45351054bfa5b21c43689617670440d488']

Unnamed: 0,customer_id,article_id


we dont see any result because customer_id belongs to test data. So we need to predict values for test_data now.

In [62]:
results_df_test  = test_cust.merge(agg_merged, on = ["age"], how = 'left')
results_df_test

Unnamed: 0,customer_id,age,article_id,article_count,rank
0,95e4543e38d0233823148b9dcc5e8a45351054bfa5b21c...,20-40,706016001,23613,1.0
1,95e4543e38d0233823148b9dcc5e8a45351054bfa5b21c...,20-40,706016002,15606,2.0
2,95e4543e38d0233823148b9dcc5e8a45351054bfa5b21c...,20-40,372860001,14586,3.0
3,95e4543e38d0233823148b9dcc5e8a45351054bfa5b21c...,20-40,610776002,13648,4.0
4,95e4543e38d0233823148b9dcc5e8a45351054bfa5b21c...,20-40,464297007,13148,5.0
...,...,...,...,...,...
4939123,2630e6c9e5c9a00275fb0b5df4a96e217def6363b32cf8...,20-40,399223001,10289,8.0
4939124,2630e6c9e5c9a00275fb0b5df4a96e217def6363b32cf8...,20-40,610776001,10192,9.0
4939125,2630e6c9e5c9a00275fb0b5df4a96e217def6363b32cf8...,20-40,372860002,10030,10.0
4939126,2630e6c9e5c9a00275fb0b5df4a96e217def6363b32cf8...,20-40,720125001,9999,11.0


In [64]:
results_df_test = results_df_test[["customer_id", "article_id"]]
results_df_test

Unnamed: 0,customer_id,article_id
0,95e4543e38d0233823148b9dcc5e8a45351054bfa5b21c...,706016001
1,95e4543e38d0233823148b9dcc5e8a45351054bfa5b21c...,706016002
2,95e4543e38d0233823148b9dcc5e8a45351054bfa5b21c...,372860001
3,95e4543e38d0233823148b9dcc5e8a45351054bfa5b21c...,610776002
4,95e4543e38d0233823148b9dcc5e8a45351054bfa5b21c...,464297007
...,...,...
4939123,2630e6c9e5c9a00275fb0b5df4a96e217def6363b32cf8...,399223001
4939124,2630e6c9e5c9a00275fb0b5df4a96e217def6363b32cf8...,610776001
4939125,2630e6c9e5c9a00275fb0b5df4a96e217def6363b32cf8...,372860002
4939126,2630e6c9e5c9a00275fb0b5df4a96e217def6363b32cf8...,720125001


In [65]:
results_df_test["customer_id"].nunique()

411594

We can get ground truth values from test_trans data

In [67]:
test_trans_filtered = test_trans[test_trans["customer_id"].isin(test_cust["customer_id"])]
test_trans_filtered = test_trans_filtered[["customer_id","article_id"]]

In [68]:
test_trans_filtered

Unnamed: 0,customer_id,article_id
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001
1,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023
12,0008968c0d451dbc5a9968da03196fe20051965edde741...,531310002
13,0008968c0d451dbc5a9968da03196fe20051965edde741...,529841001
44,001127bffdda108579e6cb16080440e89bf1250a776c6e...,397068015
...,...,...
31788292,ff94f31e864d9b655643ac4d2adab3611c7241adb5d34c...,901666001
31788293,ff94f31e864d9b655643ac4d2adab3611c7241adb5d34c...,884319003
31788300,ffc2e7c210e3ea602e6d229116773cc0588c929f8cc70b...,858856002
31788301,ffc92c3f7b0b302f393c2968b290f6e5c5b5510d1cf1df...,895730002


In [69]:
test_trans_filtered["customer_id"].nunique()

408617

In [70]:
y_true = test_trans_filtered["article_id"]
y_pred = results_df_test["article_id"]

In [71]:
def precision_at_12(y_true, y_pred):
       relevant_items = set(y_true)
       top_k_items = set(y_pred)
       return len(top_k_items & relevant_items)/len(relevant_items)

In [72]:
precision = precision_at_12(y_true, y_pred)
precision

0.00041197614658111296

lets randomly pick a customer_id from test data and check its precision

In [77]:
random_customer_id = np.random.choice(test_cust["customer_id"].unique())
random_customer_id

'1a671df7b5979f20cb785b889de17691d6b6c073f22fbe7639ed37903fdb2a13'

In [78]:
test_trans[test_trans["customer_id"] == random_customer_id]

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
1814648,2018-10-28,1a671df7b5979f20cb785b889de17691d6b6c073f22fbe...,666327001,0.016932,1
4525852,2019-01-04,1a671df7b5979f20cb785b889de17691d6b6c073f22fbe...,687411004,0.011847,1
9670322,2019-05-02,1a671df7b5979f20cb785b889de17691d6b6c073f22fbe...,723529004,0.025407,1
9670323,2019-05-02,1a671df7b5979f20cb785b889de17691d6b6c073f22fbe...,687176001,0.010153,1
9670324,2019-05-02,1a671df7b5979f20cb785b889de17691d6b6c073f22fbe...,684981002,0.003373,1
9670325,2019-05-02,1a671df7b5979f20cb785b889de17691d6b6c073f22fbe...,464297007,0.016932,1
9670326,2019-05-02,1a671df7b5979f20cb785b889de17691d6b6c073f22fbe...,590928019,0.030492,1
9670327,2019-05-02,1a671df7b5979f20cb785b889de17691d6b6c073f22fbe...,720137001,0.003373,1
9670328,2019-05-02,1a671df7b5979f20cb785b889de17691d6b6c073f22fbe...,620573006,0.016932,1
15475281,2019-08-15,1a671df7b5979f20cb785b889de17691d6b6c073f22fbe...,739826019,0.016932,1


In [79]:
test_trans_filtered[test_trans_filtered["customer_id"] == random_customer_id]

Unnamed: 0,customer_id,article_id
1814648,1a671df7b5979f20cb785b889de17691d6b6c073f22fbe...,666327001
4525852,1a671df7b5979f20cb785b889de17691d6b6c073f22fbe...,687411004
9670322,1a671df7b5979f20cb785b889de17691d6b6c073f22fbe...,723529004
9670323,1a671df7b5979f20cb785b889de17691d6b6c073f22fbe...,687176001
9670324,1a671df7b5979f20cb785b889de17691d6b6c073f22fbe...,684981002
9670325,1a671df7b5979f20cb785b889de17691d6b6c073f22fbe...,464297007
9670326,1a671df7b5979f20cb785b889de17691d6b6c073f22fbe...,590928019
9670327,1a671df7b5979f20cb785b889de17691d6b6c073f22fbe...,720137001
9670328,1a671df7b5979f20cb785b889de17691d6b6c073f22fbe...,620573006
15475281,1a671df7b5979f20cb785b889de17691d6b6c073f22fbe...,739826019


In [80]:
results_df_test[results_df_test["customer_id"] == random_customer_id]

Unnamed: 0,customer_id,article_id
2941956,1a671df7b5979f20cb785b889de17691d6b6c073f22fbe...,706016001
2941957,1a671df7b5979f20cb785b889de17691d6b6c073f22fbe...,706016002
2941958,1a671df7b5979f20cb785b889de17691d6b6c073f22fbe...,372860001
2941959,1a671df7b5979f20cb785b889de17691d6b6c073f22fbe...,610776002
2941960,1a671df7b5979f20cb785b889de17691d6b6c073f22fbe...,372860002
2941961,1a671df7b5979f20cb785b889de17691d6b6c073f22fbe...,568601006
2941962,1a671df7b5979f20cb785b889de17691d6b6c073f22fbe...,399256001
2941963,1a671df7b5979f20cb785b889de17691d6b6c073f22fbe...,673677002
2941964,1a671df7b5979f20cb785b889de17691d6b6c073f22fbe...,751471001
2941965,1a671df7b5979f20cb785b889de17691d6b6c073f22fbe...,579541001


In [81]:
y_pred_for_random_cust = (results_df_test[results_df_test["customer_id"] == random_customer_id])["article_id"]
y_true_for_random_cust = (test_trans_filtered[test_trans_filtered["customer_id"] == random_customer_id])["article_id"]

In [82]:
precision = precision_at_12(y_true_for_random_cust, y_pred_for_random_cust)
precision

0.0

In [84]:
print(y_true_for_random_cust, y_pred_for_random_cust)

1814648     666327001
4525852     687411004
9670322     723529004
9670323     687176001
9670324     684981002
9670325     464297007
9670326     590928019
9670327     720137001
9670328     620573006
15475281    739826019
16927979    749384002
16927980    738133006
16927981    740519002
16927982    740498001
16927983    678696015
16927984    680263013
16927985    739346005
30879334    708138013
Name: article_id, dtype: int64 2941956    706016001
2941957    706016002
2941958    372860001
2941959    610776002
2941960    372860002
2941961    568601006
2941962    399256001
2941963    673677002
2941964    751471001
2941965    579541001
2941966    562245001
2941967    562245046
Name: article_id, dtype: int64
