# Model recommendation with lighfm

### Import libraries

In [1]:
import numpy as np
import pandas as pd
import json
import dask.dataframe as dd

from utils.custom_treat_data_funcs import transform_text_to_list, explode_df_columns

### Loading Path Infos from Config (JSON) File

In [2]:
with open('config.json', 'r') as f:
    config = json.load(f)

### Loading Custom Functions and Artifactos for Model

In [3]:
import pickle
from utils.custom_data_structs import UserItemData
from utils.model_funcs import recommend_by_model_scores, recommend_by_most_popular, recommend_by_weighted_random
from utils.model_funcs import count_histories_by_popularity, get_df_most_unpopular_histories
from utils.model_funcs import list_intersection

loaded_model = pickle.load(open('artifacts/lightfm_model.pkl', 'rb'))
loaded_user_item_data:UserItemData = pickle.load(open('artifacts/user_item_data.pkl', 'rb'))

### Defining some parameters and dtypes

In [4]:
dtype_df_valid = {
"userId" : 'string',
"userType" : 'category',
"history" : 'string',
"timestampHistory" : 'string'
}

dtype_df_train_score = {
"userId" : 'string',
"userType" : 'category',
"history" : 'string',
"score" : 'Float32'
}

LIMIT_N_ROWS = 1000
N_PARTS_DASK = 3

## READ DATA

### Retrieve Validation Data (df_valid)

In [5]:
df_valid = pd.read_csv(config["VALID_DF"],dtype=dtype_df_valid)
# df_valid = pd.read_csv(config["VALID_DF"],dtype=dtype_df_valid,nrows=LIMIT_N_ROWS)
df_valid.dropna()

Unnamed: 0,userId,userType,history,timestampHistory
0,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,Logged,['be89a7da-d9fa-49d4-9fdc-388c27a15bc8'  '01c5...,[1660533136590 1660672113513]
1,d0afad7ea843d86597d822f0df1d39d31a3fea7c39fdee...,Logged,['77901133-aee7-4f7b-afc0-652231d76fe9'],[1660556860253]
2,755062dd39a48809880cf363b04268c3af2c003088cde0...,Logged,['857aa90f-a7ec-410d-ba82-dfa4f85d4e71'],[1660561649242]
3,ec1639851d99586c7f4da928deb49187303aec6e3b8d66...,Logged,['b7b90e18-7613-4ca0-a8fc-fd69addfcd85'  '835f...,[1660533830245 1660540831707 1660542659111 166...
4,a120515626fe5d12b22b7d5a7c5008912cc69284aa26cc...,Logged,['9c764c3a-f9f8-4fb2-b2c4-6331eaeb3dd6'  'b8eb...,[1660548813953 1660572329731 1660594848200]
...,...,...,...,...
112179,aacb28d7d2a4ea745e12ceba1f9ffa0c7b92aae9304ce5...,Non-Logged,['ecb1f348-cd55-47f7-99f9-bb2c84e93f96'],[1660546612592]
112180,91b3a960562e34504022dbe552c0e8947af20fd14ce861...,Non-Logged,['ecb1f348-cd55-47f7-99f9-bb2c84e93f96'],[1660597026440]
112181,9de95a8cd681cd21cc9a7e830859e468b83c051c11f0c4...,Non-Logged,['3d52cd6b-706e-49f0-9215-0340010a9845'],[1660678862844]
112182,001455437b3ca991b144afd8cb785f7727eb0c154faa54...,Non-Logged,['d730c4a6-e8f6-4fde-b73a-afbe148479cd'],[1660584228926]


### Retrieve "Items" data (df_news)
Contains all metadata related to the history itself
Important:
* "page" column is equal to "history" column on df_valid

In [6]:
df_news = pd.read_csv(config["DF_ITEMS_FEATURE"])
df_news.drop(columns=["Unnamed: 0"],inplace=True)
df_news.head(3)

Unnamed: 0,page,age_exp_normalized,ageCategories
0,7371a9b5-5824-4c57-8704-00a74feebe79,0.151439,very-old
1,7a5ea08f-4583-49e2-ba52-a71999443f7b,0.140788,very-old
2,6afc8bbb-4f36-43d5-8a44-a2917df5621a,0.12261,very-old


## DATA TREATMENTS

### Explode validation data on "History" (and "timestampHistory") - for further Popularity
Creates an "exploded_df" that will be used to check the "popularity" of a given history/item/page

In [7]:
cols_to_explode = ["history", "timestampHistory"]
cols_and_id = cols_to_explode.copy()
cols_and_id.insert(0,"userId")
cols_and_id.append("userType")
cols_and_id = tuple(cols_and_id)
exploded_df = explode_df_columns(df_valid.loc[:,cols_and_id], cols_to_explode)

In [8]:
df_valid

Unnamed: 0,userId,userType,history,timestampHistory
0,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,Logged,['be89a7da-d9fa-49d4-9fdc-388c27a15bc8'  '01c5...,[1660533136590 1660672113513]
1,d0afad7ea843d86597d822f0df1d39d31a3fea7c39fdee...,Logged,['77901133-aee7-4f7b-afc0-652231d76fe9'],[1660556860253]
2,755062dd39a48809880cf363b04268c3af2c003088cde0...,Logged,['857aa90f-a7ec-410d-ba82-dfa4f85d4e71'],[1660561649242]
3,ec1639851d99586c7f4da928deb49187303aec6e3b8d66...,Logged,['b7b90e18-7613-4ca0-a8fc-fd69addfcd85'  '835f...,[1660533830245 1660540831707 1660542659111 166...
4,a120515626fe5d12b22b7d5a7c5008912cc69284aa26cc...,Logged,['9c764c3a-f9f8-4fb2-b2c4-6331eaeb3dd6'  'b8eb...,[1660548813953 1660572329731 1660594848200]
...,...,...,...,...
112179,aacb28d7d2a4ea745e12ceba1f9ffa0c7b92aae9304ce5...,Non-Logged,['ecb1f348-cd55-47f7-99f9-bb2c84e93f96'],[1660546612592]
112180,91b3a960562e34504022dbe552c0e8947af20fd14ce861...,Non-Logged,['ecb1f348-cd55-47f7-99f9-bb2c84e93f96'],[1660597026440]
112181,9de95a8cd681cd21cc9a7e830859e468b83c051c11f0c4...,Non-Logged,['3d52cd6b-706e-49f0-9215-0340010a9845'],[1660678862844]
112182,001455437b3ca991b144afd8cb785f7727eb0c154faa54...,Non-Logged,['d730c4a6-e8f6-4fde-b73a-afbe148479cd'],[1660584228926]


In [9]:
exploded_df

Unnamed: 0,userId,history,timestampHistory,userType
0,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,be89a7da-d9fa-49d4-9fdc-388c27a15bc8,1660533136590,Logged
1,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,01c59ff6-fb82-4258-918f-2910cb2d4c52,1660672113513,Logged
2,d0afad7ea843d86597d822f0df1d39d31a3fea7c39fdee...,77901133-aee7-4f7b-afc0-652231d76fe9,1660556860253,Logged
3,755062dd39a48809880cf363b04268c3af2c003088cde0...,857aa90f-a7ec-410d-ba82-dfa4f85d4e71,1660561649242,Logged
4,ec1639851d99586c7f4da928deb49187303aec6e3b8d66...,b7b90e18-7613-4ca0-a8fc-fd69addfcd85,1660533830245,Logged
...,...,...,...,...
178863,aacb28d7d2a4ea745e12ceba1f9ffa0c7b92aae9304ce5...,ecb1f348-cd55-47f7-99f9-bb2c84e93f96,1660546612592,Non-Logged
178864,91b3a960562e34504022dbe552c0e8947af20fd14ce861...,ecb1f348-cd55-47f7-99f9-bb2c84e93f96,1660597026440,Non-Logged
178865,9de95a8cd681cd21cc9a7e830859e468b83c051c11f0c4...,3d52cd6b-706e-49f0-9215-0340010a9845,1660678862844,Non-Logged
178866,001455437b3ca991b144afd8cb785f7727eb0c154faa54...,d730c4a6-e8f6-4fde-b73a-afbe148479cd,1660584228926,Non-Logged


### Checking for non-compliant "history" values
* history should have a hash format, with 8d-4d-4d-4d-12d = 32d (column "check_size" does this check)
* every character should be a letter in [a-f] or a number in [0-9] (column "check_chars" does this check)
* This results in 4 hifens and 5 groups of chars (column "check_hifens" does this check)
* example of "good" hash: a1b2c3d4-a1b2-c1d2-e1f2-a111b222c333d000
* example of "bad" data: an url not hashed, such as https://globo.com.... (those will be removed)
* finally, column "check_history" verifies all those aforementioned points

In [10]:
exploded_df["check_hifens"] = exploded_df["history"].str.split('-').apply(lambda x : len(x) == 5).astype("bool")
exploded_df["check_size"] = exploded_df["history"].str.replace(r'-', '', regex=True).apply(lambda x : len(x) == 32).astype("bool")
exploded_df["check_chars"] = exploded_df["history"].str.replace(r'-', '', regex=True).str.replace(r'[a-f0-9]', '', regex=True).apply(lambda x : x == '').astype("bool")
exploded_df["check_history"] = (exploded_df["check_hifens"] & exploded_df["check_size"] & exploded_df["check_chars"]).astype("bool")
exploded_df["check_history"].value_counts()

check_history
True     172442
False      6426
Name: count, dtype: int64

### Remove trash and unused data
Removing "bad" data in "history" from above checks, and also removing unused columns for this "exploded_df_valid" purpose

In [11]:
exploded_df_valid = exploded_df[~exploded_df["check_history"]==False][["userId","userType","history"]]
exploded_df_valid

Unnamed: 0,userId,userType,history
0,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,Logged,be89a7da-d9fa-49d4-9fdc-388c27a15bc8
1,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,Logged,01c59ff6-fb82-4258-918f-2910cb2d4c52
2,d0afad7ea843d86597d822f0df1d39d31a3fea7c39fdee...,Logged,77901133-aee7-4f7b-afc0-652231d76fe9
3,755062dd39a48809880cf363b04268c3af2c003088cde0...,Logged,857aa90f-a7ec-410d-ba82-dfa4f85d4e71
4,ec1639851d99586c7f4da928deb49187303aec6e3b8d66...,Logged,b7b90e18-7613-4ca0-a8fc-fd69addfcd85
...,...,...,...
178863,aacb28d7d2a4ea745e12ceba1f9ffa0c7b92aae9304ce5...,Non-Logged,ecb1f348-cd55-47f7-99f9-bb2c84e93f96
178864,91b3a960562e34504022dbe552c0e8947af20fd14ce861...,Non-Logged,ecb1f348-cd55-47f7-99f9-bb2c84e93f96
178865,9de95a8cd681cd21cc9a7e830859e468b83c051c11f0c4...,Non-Logged,3d52cd6b-706e-49f0-9215-0340010a9845
178866,001455437b3ca991b144afd8cb785f7727eb0c154faa54...,Non-Logged,d730c4a6-e8f6-4fde-b73a-afbe148479cd


### Retrieve a Compact representation of df_valid, without "trash" data on "history"

In [12]:
df_valid = exploded_df_valid.groupby(["userId","userType"]).history.unique().to_frame().reset_index()
df_valid.dropna(inplace=True)

  df_valid = exploded_df_valid.groupby(["userId","userType"]).history.unique().to_frame().reset_index()


## START RECOMMENDATIONS

## Get most popular items from the "exploded_df_valid"

In [13]:
top_k_popular = recommend_by_most_popular(exploded_df_valid)
print(top_k_popular)

['eb23272d-8e6c-479d-b972-eabeb5f6f3dd', '9c764c3a-f9f8-4fb2-b2c4-6331eaeb3dd6', 'be89a7da-d9fa-49d4-9fdc-388c27a15bc8', 'd730c4a6-e8f6-4fde-b73a-afbe148479cd', 'aeab0e46-f1e4-41e9-821b-571255c41f69', '4c3d47a1-6f4b-424f-8944-6c227e686c5c']


## Exemple on how to use "recommend_by_weighted_random" function
This is used to add a "random" factor to the recommendation, thus, adding an "exploration phase" to the algorithm.
* This "guided random" recommendation shows potential unusual items to the user
* `df_count_histories`: store the df once, so it can be reused for each user later
* `df_unpopular_histories`: composed by all "history" values not on `top_k_popular`
* `unpopular_weights`: most popular histories have more chance to be selected.

In [14]:
df_count_histories = count_histories_by_popularity(exploded_df_valid)
df_unpopular_histories = get_df_most_unpopular_histories(df_count_histories)
unpopular_weights = tuple(df_unpopular_histories.values.tolist())

top_k_randomw = recommend_by_weighted_random(df_unpopular_histories, unpopular_weights)
print(top_k_randomw)

['1615f2f4-f1e7-4606-8897-2e356f4d0376', 'eaedb43b-f62e-4d66-833b-9f69c7157443', '5c074c79-81d7-4e94-97dd-69dba3f7e8d1']


## Get Recommendation from LightFM model to all Users

In [15]:
ddf_valid = dd.from_pandas(df_valid, npartitions=N_PARTS_DASK)

df_valid["recommendedByModel"] = ddf_valid.apply(lambda x :
    recommend_by_model_scores(
        x.userId,
        x.userType,
        loaded_user_item_data,
        loaded_model),
        axis=1,
        meta=('string','string')).compute()


## Get Recommendation from "recommend_by_weighted_random" to all Users

In [16]:
ddf_valid = dd.from_pandas(df_valid, npartitions=N_PARTS_DASK)

df_valid["recommendedByRandomW"] = ddf_valid.apply(lambda x :
    recommend_by_weighted_random(df_unpopular_histories, unpopular_weights),
    axis=1, meta=('object','object')).compute()


## Get Most Popular Recommendations to all Users
* This one does not depends on the user
* Can be obtained once (stored in `top_k_popular`) and just replicated to all dataset
* Firstly, transformed into string to write to all cells
* Secondly, transformed back to list

In [17]:
df_valid["recommendedByPopular"] = str(top_k_popular)
df_valid["recommendedByPopular"] = df_valid["recommendedByPopular"].apply(transform_text_to_list)


## Full Recommendation List
Composed in parts by
1. Items recommended by the LightFM model (good for known user/items - warm start)
2. Items recommended based on popularity (good for unknown users - cold start)
3. Items recommended randomly (good for unknown users - cold start)
* `N_RECOMMENDS = K_LIGHTFM_ITEMS + K_POPULAR_ITEMS + K_SAMPLED_ITEMS`

In [18]:
df_valid["recommendedMerged"] = df_valid["recommendedByRandomW"] + df_valid["recommendedByPopular"] + df_valid["recommendedByModel"]

In [19]:
df_valid.head(2)

Unnamed: 0,userId,userType,history,recommendedByModel,recommendedByRandomW,recommendedByPopular,recommendedMerged
1,00025217b55f104ede326550e7b46c3969a5a5333fcd29...,Non-Logged,[d88f5d3a-8657-4955-88fb-71702d92909a],"[a203c57c-8693-45c8-a4bb-31f73bae4a8d, 1b31e22...","[82e7b237-e7ea-4cf5-b44c-55385e2cdbb0, f229981...","[eb23272d-8e6c-479d-b972-eabeb5f6f3dd, 9c764c3...","[82e7b237-e7ea-4cf5-b44c-55385e2cdbb0, f229981..."
2,0003f534fe8124d98eef78b01baa3b69e3ffcda6e5a657...,Logged,[6813846e-9f28-4238-8ea5-4e3436064657],"[a203c57c-8693-45c8-a4bb-31f73bae4a8d, 1b31e22...","[4d58e8c1-6a6b-418b-b764-822b8f71ff02, bc62531...","[eb23272d-8e6c-479d-b972-eabeb5f6f3dd, 9c764c3...","[4d58e8c1-6a6b-418b-b764-822b8f71ff02, bc62531..."


### Removing unused columns

In [20]:
df_valid.drop(columns=["recommendedByModel","recommendedByRandomW","recommendedByPopular"],inplace=True)
df_valid_recommends = df_valid.drop(columns=["userType","history"])
df_valid_recommends

Unnamed: 0,userId,recommendedMerged
1,00025217b55f104ede326550e7b46c3969a5a5333fcd29...,"[82e7b237-e7ea-4cf5-b44c-55385e2cdbb0, f229981..."
2,0003f534fe8124d98eef78b01baa3b69e3ffcda6e5a657...,"[4d58e8c1-6a6b-418b-b764-822b8f71ff02, bc62531..."
4,00043e84a871699b666933c3ca7989598879d47b325901...,"[dec36c02-2274-4327-858d-4ef057196055, 77e806e..."
7,0004749749d85d78d13c6c9113130278a18f069f71ccca...,"[4d1189df-e0ba-425d-a5ef-3e3e36fbe955, 7238401..."
8,0004e1ddec9a5d67faa56bb734d733628a7841c10c7255...,"[34825663-ce69-4da2-81c7-999d6c046711, 79839cb..."
...,...,...
217477,fffcb5799256bd50b519b5688f54a6021ca3ef1af66b8b...,"[b7b90e18-7613-4ca0-a8fc-fd69addfcd85, 0087664..."
217478,fffd676b5b33c111de923c2ce93f767f206bf7218d43a6...,"[61e1181d-a231-4fd0-b585-ea6d51a97c0f, 878ac7b..."
217480,fffdf305577419378c3be19c54c443fd2744e0f2dc65dc...,"[68bc8994-ebef-4e48-8478-e7fe1619ae58, b19d801..."
217482,fffe0be2e6d6f38ae63123c674646bc8e759578305ae97...,"[dd30900e-d023-4431-a86b-f3aab6bfc7d9, e34b667..."


### Exploding the "df_valid_recommends" for further merge with "freshness" info (age)

In [21]:
cols_to_explode = ["recommendedMerged"]
cols_and_id = ["userId"]
cols_and_id = tuple(cols_and_id+cols_to_explode)
print(cols_and_id)
df_valid_recommends_exploded = explode_df_columns(df_valid_recommends.loc[:,cols_and_id], cols_to_explode)
df_valid_recommends_exploded

('userId', 'recommendedMerged')


Unnamed: 0,userId,recommendedMerged
0,00025217b55f104ede326550e7b46c3969a5a5333fcd29...,82e7b237-e7ea-4cf5-b44c-55385e2cdbb0
1,00025217b55f104ede326550e7b46c3969a5a5333fcd29...,f2299812-e631-4f1a-952d-ddeb0fcfb1e6
2,00025217b55f104ede326550e7b46c3969a5a5333fcd29...,9cfa3b4e-5c38-465a-8244-5fddf26538a4
3,00025217b55f104ede326550e7b46c3969a5a5333fcd29...,eb23272d-8e6c-479d-b972-eabeb5f6f3dd
4,00025217b55f104ede326550e7b46c3969a5a5333fcd29...,9c764c3a-f9f8-4fb2-b2c4-6331eaeb3dd6
...,...,...
1631140,fffef1890d6f8c974caa19bb8fb46b3b4cb5dc9dc006bc...,24b07fbc-bdf4-41ad-b50c-a64d9ee7a8cc
1631141,fffef1890d6f8c974caa19bb8fb46b3b4cb5dc9dc006bc...,5d15c856-43ed-474f-b2e2-6d2cc625ce7f
1631142,fffef1890d6f8c974caa19bb8fb46b3b4cb5dc9dc006bc...,c022aa46-74d5-4d9f-a880-6a885459d692
1631143,fffef1890d6f8c974caa19bb8fb46b3b4cb5dc9dc006bc...,a1ab0627-63dc-4e54-bfb1-1546ba084f2a


## Merging Full Recommendation Info with History Age/Freshness
And double checking for NAs

In [22]:
df_valid_exploded_rec_age = pd.merge(df_valid_recommends_exploded, df_news, left_on='recommendedMerged', right_on='page', how='left')
df_valid_exploded_rec_age.drop(columns=["page","ageCategories"],inplace=True) # 'page' is the same as 'history', 'ageCategories' will not be used, only numeric
df_valid_exploded_rec_age[~df_valid_exploded_rec_age["age_exp_normalized"].notna()]

Unnamed: 0,userId,recommendedMerged,age_exp_normalized


## Using Freshness/Age Info
* Among all recommendations, keep just the most top N recent ones.
* Assumes `N_RECENT < N_RECOMMENDS`

In [23]:
N_RECENT = 5
df_valid_exploded_rec_age_top_recent = df_valid_exploded_rec_age.sort_values("age_exp_normalized",ascending=False).groupby("userId").head(N_RECENT).sort_index()
df_valid_exploded_rec_age_top_recent

Unnamed: 0,userId,recommendedMerged,age_exp_normalized
0,00025217b55f104ede326550e7b46c3969a5a5333fcd29...,82e7b237-e7ea-4cf5-b44c-55385e2cdbb0,1.0
4,00025217b55f104ede326550e7b46c3969a5a5333fcd29...,9c764c3a-f9f8-4fb2-b2c4-6331eaeb3dd6,1.0
5,00025217b55f104ede326550e7b46c3969a5a5333fcd29...,be89a7da-d9fa-49d4-9fdc-388c27a15bc8,1.0
6,00025217b55f104ede326550e7b46c3969a5a5333fcd29...,d730c4a6-e8f6-4fde-b73a-afbe148479cd,1.0
8,00025217b55f104ede326550e7b46c3969a5a5333fcd29...,4c3d47a1-6f4b-424f-8944-6c227e686c5c,1.0
...,...,...,...
1631133,fffef1890d6f8c974caa19bb8fb46b3b4cb5dc9dc006bc...,eb23272d-8e6c-479d-b972-eabeb5f6f3dd,1.0
1631134,fffef1890d6f8c974caa19bb8fb46b3b4cb5dc9dc006bc...,9c764c3a-f9f8-4fb2-b2c4-6331eaeb3dd6,1.0
1631135,fffef1890d6f8c974caa19bb8fb46b3b4cb5dc9dc006bc...,be89a7da-d9fa-49d4-9fdc-388c27a15bc8,1.0
1631136,fffef1890d6f8c974caa19bb8fb46b3b4cb5dc9dc006bc...,d730c4a6-e8f6-4fde-b73a-afbe148479cd,1.0


### Removing unnecessary columns again

In [24]:
df_final_recommends_exploded_top_recent = df_valid_exploded_rec_age_top_recent.drop(columns="age_exp_normalized")
df_final_recommends_exploded_top_recent.head(2)

Unnamed: 0,userId,recommendedMerged
0,00025217b55f104ede326550e7b46c3969a5a5333fcd29...,82e7b237-e7ea-4cf5-b44c-55385e2cdbb0
4,00025217b55f104ede326550e7b46c3969a5a5333fcd29...,9c764c3a-f9f8-4fb2-b2c4-6331eaeb3dd6


### Retrieve a Compact representation of "df_final_recommends_exploded_top_recent"
* Further Validation histories & Recommended histories back to list format

In [25]:
df_final_recommends = df_final_recommends_exploded_top_recent.groupby("userId").recommendedMerged.unique()

## Merging Recommendations with Validation Histories

In [26]:
df_final_recommends = pd.merge(df_final_recommends, df_valid[["userId","history"]], left_on='userId', right_on='userId', how='left')
df_final_recommends

Unnamed: 0,userId,recommendedMerged,history
0,00025217b55f104ede326550e7b46c3969a5a5333fcd29...,"[82e7b237-e7ea-4cf5-b44c-55385e2cdbb0, 9c764c3...",[d88f5d3a-8657-4955-88fb-71702d92909a]
1,0003f534fe8124d98eef78b01baa3b69e3ffcda6e5a657...,"[eb23272d-8e6c-479d-b972-eabeb5f6f3dd, 9c764c3...",[6813846e-9f28-4238-8ea5-4e3436064657]
2,00043e84a871699b666933c3ca7989598879d47b325901...,"[eb23272d-8e6c-479d-b972-eabeb5f6f3dd, 9c764c3...",[aeab0e46-f1e4-41e9-821b-571255c41f69]
3,0004749749d85d78d13c6c9113130278a18f069f71ccca...,"[72384013-5d8a-489f-ae0e-53cc739e4f36, eb23272...",[487330a8-b48e-4183-80f7-a1baf7c4c7e0]
4,0004e1ddec9a5d67faa56bb734d733628a7841c10c7255...,"[dbb1f75b-afd3-4041-939a-5b4a0fd17376, eb23272...",[be89a7da-d9fa-49d4-9fdc-388c27a15bc8]
...,...,...,...
108738,fffcb5799256bd50b519b5688f54a6021ca3ef1af66b8b...,"[eb23272d-8e6c-479d-b972-eabeb5f6f3dd, 9c764c3...",[ec0717a6-a3fc-4e38-93f5-481d61967fdf]
108739,fffd676b5b33c111de923c2ce93f767f206bf7218d43a6...,"[eb23272d-8e6c-479d-b972-eabeb5f6f3dd, 9c764c3...",[444db2d3-2856-4307-8a36-09b15651fb8b]
108740,fffdf305577419378c3be19c54c443fd2744e0f2dc65dc...,"[eb23272d-8e6c-479d-b972-eabeb5f6f3dd, 9c764c3...",[082ca223-fb94-4eac-a1b4-5be083fa4bf6]
108741,fffe0be2e6d6f38ae63123c674646bc8e759578305ae97...,"[eb23272d-8e6c-479d-b972-eabeb5f6f3dd, 9c764c3...",[f1721193-a006-419c-848e-85f5269e42bd]


## CHECK PERFORMANCE FOR THE RECOMMENDATION SYSTEM

### Verify intersection between Recommended and Validation Histories
* Results in a list with all histories' intersection

In [27]:
df_final_recommends["matched_recommendations"] = df_final_recommends.apply(lambda x :
    list_intersection(x.history, x.recommendedMerged),
    axis=1
)

## Count the size of the intersection set
* How Many recommended histories are within the validation set, for each user?
Answer below!

In [28]:
df_final_recommends["count_matches"] = df_final_recommends["matched_recommendations"].apply(lambda x : len(x))

In [29]:
df_final_recommends[df_final_recommends["count_matches"]>0].head(3)

Unnamed: 0,userId,recommendedMerged,history,matched_recommendations,count_matches
4,0004e1ddec9a5d67faa56bb734d733628a7841c10c7255...,"[dbb1f75b-afd3-4041-939a-5b4a0fd17376, eb23272...",[be89a7da-d9fa-49d4-9fdc-388c27a15bc8],[be89a7da-d9fa-49d4-9fdc-388c27a15bc8],1
5,000543f8504df3e639b1b0421db9d5173b40c91c44c46f...,"[b7b90e18-7613-4ca0-a8fc-fd69addfcd85, eb23272...","[eb23272d-8e6c-479d-b972-eabeb5f6f3dd, 4c3d47a...","[b7b90e18-7613-4ca0-a8fc-fd69addfcd85, eb23272...",3
9,000a2a60d1b7655d40beb4042e0610cc8cd4c2e8a51300...,"[eb23272d-8e6c-479d-b972-eabeb5f6f3dd, 9c764c3...",[eb23272d-8e6c-479d-b972-eabeb5f6f3dd],[eb23272d-8e6c-479d-b972-eabeb5f6f3dd],1


## Number of Matches
* How many did we get correct?

In [30]:
number_of_matches = df_final_recommends["count_matches"].sum()
validation_set_total_size = df_valid["userId"].count()
percentage_of_matches = 100*number_of_matches/validation_set_total_size
print(f"# of matches = {number_of_matches}")
print(f"valid size = {validation_set_total_size}")
print(f"% of matches = {percentage_of_matches}")

# of matches = 22054
valid size = 108743
% of matches = 20.280845663628924
