# Model recommendation with LightFM

### Import libraries

In [1]:
import numpy as np
import pandas as pd
import json
import dask.dataframe as dd

from utils.custom_treat_data_funcs import transform_text_to_list, explode_df_columns

### Loading Path Infos from Config (JSON) File

In [2]:
# Load the JSON configuration; later cells read the CSV locations from it
# (keys "VALID_DF" and "DF_ITEMS_FEATURE").
with open('config.json', 'r') as config_file:
    config = json.load(config_file)

### Loading Custom Functions and Artifacts for Model

In [3]:
import pickle
from utils.custom_data_structs import UserItemData
from utils.model_funcs import recommend_by_model_scores, recommend_by_most_popular, recommend_by_weighted_random
from utils.model_funcs import count_histories_by_popularity, get_df_most_unpopular_histories
from utils.model_funcs import list_intersection

# NOTE: pickle.load can execute arbitrary code while deserialising; only load
# artifacts that were produced by a trusted source.
# The files are opened with context managers so the handles are closed as soon
# as the objects are loaded.
with open('artifacts/lightfm_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

with open('artifacts/user_item_data.pkl', 'rb') as user_item_file:
    loaded_user_item_data: UserItemData = pickle.load(user_item_file)

### Defining some parameters and dtypes

In [4]:
# Column dtypes applied when reading the validation CSV.
dtype_df_valid = dict(
    userId='string',
    userType='category',
    history='string',
    timestampHistory='string',
)

# Column dtypes for a train/score CSV (not read in the cells shown here).
dtype_df_train_score = dict(
    userId='string',
    userType='category',
    history='string',
    score='Float32',
)

# Maximum number of validation rows to read (keeps the notebook fast).
LIMIT_N_ROWS = 1_000
# Number of partitions used when converting pandas frames to dask.
N_PARTS_DASK = 3

## READ DATA

### Retrieve Validation Data (df_valid)

In [5]:
# Read at most LIMIT_N_ROWS rows of the validation set
# (remove `nrows` to load the complete file).
df_valid = pd.read_csv(config["VALID_DF"], dtype=dtype_df_valid, nrows=LIMIT_N_ROWS)
# dropna() returns a new frame, so the result must be assigned back; otherwise
# rows with missing values stay in df_valid and reach the later cells.
df_valid = df_valid.dropna()
df_valid

Unnamed: 0,userId,userType,history,timestampHistory
0,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,Logged,['be89a7da-d9fa-49d4-9fdc-388c27a15bc8'  '01c5...,[1660533136590 1660672113513]
1,d0afad7ea843d86597d822f0df1d39d31a3fea7c39fdee...,Logged,['77901133-aee7-4f7b-afc0-652231d76fe9'],[1660556860253]
2,755062dd39a48809880cf363b04268c3af2c003088cde0...,Logged,['857aa90f-a7ec-410d-ba82-dfa4f85d4e71'],[1660561649242]
3,ec1639851d99586c7f4da928deb49187303aec6e3b8d66...,Logged,['b7b90e18-7613-4ca0-a8fc-fd69addfcd85'  '835f...,[1660533830245 1660540831707 1660542659111 166...
4,a120515626fe5d12b22b7d5a7c5008912cc69284aa26cc...,Logged,['9c764c3a-f9f8-4fb2-b2c4-6331eaeb3dd6'  'b8eb...,[1660548813953 1660572329731 1660594848200]
...,...,...,...,...
995,0bca64a686750abbb1ba077318fb7d3b094f50c55374a5...,Logged,['a89983c3-e220-4a5f-ab23-5edd8b2b914c'],[1660646489438]
996,71925c3e31da91c811d2b445d169cdf737889ccd02e7e3...,Logged,['cf7c4c6c-38e5-43bb-ad54-c33ad7253ac2'],[1660591462479]
997,0fee67e553a97ea13828b0ae5f3a9d6450418f19c05c47...,Logged,['6f1a3d83-b2aa-4eae-98de-3b07ab0fdec3'],[1660533586069]
998,9cad524b959bbbcd00995bc25fbea95dedcce71dd7bee9...,Logged,['6f1a3d83-b2aa-4eae-98de-3b07ab0fdec3'  'a41b...,[1660558252865 1660702436161]


### Retrieve "Items" data (df_news)
Contains all metadata related to the history itself
Important:
* "page" column is equal to "history" column on df_valid

In [6]:
# Item metadata; the "Unnamed: 0" column stored in the CSV is discarded right
# after reading.
df_news = pd.read_csv(config["DF_ITEMS_FEATURE"]).drop(columns=["Unnamed: 0"])
df_news.head(3)

Unnamed: 0,page,age_exp_normalized,ageCategories
0,7371a9b5-5824-4c57-8704-00a74feebe79,0.151439,very-old
1,7a5ea08f-4583-49e2-ba52-a71999443f7b,0.140788,very-old
2,6afc8bbb-4f36-43d5-8a44-a2917df5621a,0.12261,very-old


## DATA TREATMENTS

### Explode validation data on "History" (and "timestampHistory") - for further Popularity
Creates an "exploded_df" that will be used to check the "popularity" of a given history/item/page

In [7]:
# Columns that hold one list per user and must be exploded together.
cols_to_explode = ["history", "timestampHistory"]
# Selection passed to the helper: the id first, then the list columns, then
# the user type.
cols_and_id = ("userId", *cols_to_explode, "userType")
exploded_df = explode_df_columns(df_valid.loc[:, cols_and_id], cols_to_explode)

In [8]:
# Re-display the validation frame for comparison with exploded_df in the next cell.
df_valid

Unnamed: 0,userId,userType,history,timestampHistory
0,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,Logged,['be89a7da-d9fa-49d4-9fdc-388c27a15bc8'  '01c5...,[1660533136590 1660672113513]
1,d0afad7ea843d86597d822f0df1d39d31a3fea7c39fdee...,Logged,['77901133-aee7-4f7b-afc0-652231d76fe9'],[1660556860253]
2,755062dd39a48809880cf363b04268c3af2c003088cde0...,Logged,['857aa90f-a7ec-410d-ba82-dfa4f85d4e71'],[1660561649242]
3,ec1639851d99586c7f4da928deb49187303aec6e3b8d66...,Logged,['b7b90e18-7613-4ca0-a8fc-fd69addfcd85'  '835f...,[1660533830245 1660540831707 1660542659111 166...
4,a120515626fe5d12b22b7d5a7c5008912cc69284aa26cc...,Logged,['9c764c3a-f9f8-4fb2-b2c4-6331eaeb3dd6'  'b8eb...,[1660548813953 1660572329731 1660594848200]
...,...,...,...,...
995,0bca64a686750abbb1ba077318fb7d3b094f50c55374a5...,Logged,['a89983c3-e220-4a5f-ab23-5edd8b2b914c'],[1660646489438]
996,71925c3e31da91c811d2b445d169cdf737889ccd02e7e3...,Logged,['cf7c4c6c-38e5-43bb-ad54-c33ad7253ac2'],[1660591462479]
997,0fee67e553a97ea13828b0ae5f3a9d6450418f19c05c47...,Logged,['6f1a3d83-b2aa-4eae-98de-3b07ab0fdec3'],[1660533586069]
998,9cad524b959bbbcd00995bc25fbea95dedcce71dd7bee9...,Logged,['6f1a3d83-b2aa-4eae-98de-3b07ab0fdec3'  'a41b...,[1660558252865 1660702436161]


In [9]:
# Display the frame returned by explode_df_columns for df_valid.
exploded_df

Unnamed: 0,userId,history,timestampHistory,userType
0,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,be89a7da-d9fa-49d4-9fdc-388c27a15bc8,1660533136590,Logged
1,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,01c59ff6-fb82-4258-918f-2910cb2d4c52,1660672113513,Logged
2,d0afad7ea843d86597d822f0df1d39d31a3fea7c39fdee...,77901133-aee7-4f7b-afc0-652231d76fe9,1660556860253,Logged
3,755062dd39a48809880cf363b04268c3af2c003088cde0...,857aa90f-a7ec-410d-ba82-dfa4f85d4e71,1660561649242,Logged
4,ec1639851d99586c7f4da928deb49187303aec6e3b8d66...,b7b90e18-7613-4ca0-a8fc-fd69addfcd85,1660533830245,Logged
...,...,...,...,...
2287,71925c3e31da91c811d2b445d169cdf737889ccd02e7e3...,cf7c4c6c-38e5-43bb-ad54-c33ad7253ac2,1660591462479,Logged
2288,0fee67e553a97ea13828b0ae5f3a9d6450418f19c05c47...,6f1a3d83-b2aa-4eae-98de-3b07ab0fdec3,1660533586069,Logged
2289,9cad524b959bbbcd00995bc25fbea95dedcce71dd7bee9...,6f1a3d83-b2aa-4eae-98de-3b07ab0fdec3,1660558252865,Logged
2290,9cad524b959bbbcd00995bc25fbea95dedcce71dd7bee9...,a41bed7d-5376-479f-b503-a8b046a4c7e3,1660702436161,Logged


### Checking for non-compliant "history" values
* history should have a hash format, with 8-4-4-4-12 characters = 32 characters once the hyphens are removed (column "check_size" checks the total of 32)
* every character should be a letter in [a-f] or a digit in [0-9] (column "check_chars" does this check)
* This results in 4 hyphens and 5 groups of chars (column "check_hifens" does this check)
* example of "good" hash: a1b2c3d4-a1b2-c1d2-e1f2-a111b222c333
* example of "bad" data: a URL that was not hashed, such as https://globo.com.... (those will be removed)
* finally, column "check_history" verifies all those aforementioned points

In [10]:
# Flag "history" values that do not look like a hyphenated hex hash.
# The hyphen-free form is computed once and shared by two checks, and every
# check uses vectorised .str methods so a missing value yields False instead
# of raising inside a Python-level len().
history_ids = exploded_df["history"]
history_no_hyphens = history_ids.str.replace("-", "", regex=False)

# 5 hyphen-separated groups <=> exactly 4 hyphens.
exploded_df["check_hifens"] = history_ids.str.count("-").eq(4).fillna(False).astype("bool")
# 32 characters once the hyphens are removed.
exploded_df["check_size"] = history_no_hyphens.str.len().eq(32).fillna(False).astype("bool")
# Only lowercase hex characters remain once the hyphens are removed.
exploded_df["check_chars"] = history_no_hyphens.str.fullmatch(r"[a-f0-9]*", na=False).astype("bool")
# A value is compliant only when all three checks pass.
exploded_df["check_history"] = (
    exploded_df["check_hifens"] & exploded_df["check_size"] & exploded_df["check_chars"]
).astype("bool")
exploded_df["check_history"].value_counts()

check_history
True     2207
False      85
Name: count, dtype: int64

### Remove trash and unused data
Removing "bad" data in "history" from above checks, and also removing unused columns for this "exploded_df_valid" purpose

In [11]:
# Keep only rows whose "history" passed every check, and only the columns
# needed downstream. The boolean column is used directly as the mask; the
# previous `~mask == False` form selected the same rows but relied on operator
# precedence and on the column being strictly boolean.
exploded_df_valid = exploded_df.loc[exploded_df["check_history"], ["userId", "userType", "history"]]
exploded_df_valid

Unnamed: 0,userId,userType,history
0,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,Logged,be89a7da-d9fa-49d4-9fdc-388c27a15bc8
1,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,Logged,01c59ff6-fb82-4258-918f-2910cb2d4c52
2,d0afad7ea843d86597d822f0df1d39d31a3fea7c39fdee...,Logged,77901133-aee7-4f7b-afc0-652231d76fe9
3,755062dd39a48809880cf363b04268c3af2c003088cde0...,Logged,857aa90f-a7ec-410d-ba82-dfa4f85d4e71
4,ec1639851d99586c7f4da928deb49187303aec6e3b8d66...,Logged,b7b90e18-7613-4ca0-a8fc-fd69addfcd85
...,...,...,...
2287,71925c3e31da91c811d2b445d169cdf737889ccd02e7e3...,Logged,cf7c4c6c-38e5-43bb-ad54-c33ad7253ac2
2288,0fee67e553a97ea13828b0ae5f3a9d6450418f19c05c47...,Logged,6f1a3d83-b2aa-4eae-98de-3b07ab0fdec3
2289,9cad524b959bbbcd00995bc25fbea95dedcce71dd7bee9...,Logged,6f1a3d83-b2aa-4eae-98de-3b07ab0fdec3
2290,9cad524b959bbbcd00995bc25fbea95dedcce71dd7bee9...,Logged,a41bed7d-5376-479f-b503-a8b046a4c7e3


### Retrieve a Compact representation of df_valid, without "trash" data on "history"

In [12]:
# Collapse the cleaned interactions back to one row per (userId, userType),
# holding the distinct "history" values of that user.
# observed=True: "userType" is categorical, so only the combinations that
# actually occur are grouped; this also avoids the pandas warning about the
# changing default of `observed`.
df_valid = (
    exploded_df_valid
    .groupby(["userId", "userType"], observed=True)["history"]
    .unique()
    .to_frame()
    .reset_index()
)
df_valid

  df_valid = exploded_df_valid.groupby(["userId","userType"]).history.unique().to_frame().reset_index()


Unnamed: 0,userId,userType,history
0,0067175c1d42a35553b857d4d67f6d950598455f0f43de...,Logged,"[d730c4a6-e8f6-4fde-b73a-afbe148479cd, be89a7d..."
1,00aee36362731c3a7c6891e82680e98400f0feea68e9b9...,Logged,"[d730c4a6-e8f6-4fde-b73a-afbe148479cd, 109adb0..."
2,00bcf7e66456386d571b01ce096bd50379d16c7e3c1e01...,Logged,[72384013-5d8a-489f-ae0e-53cc739e4f36]
3,00c35ef7c5a6c85ab17202687cd0e1c49b852d944ad11b...,Logged,[4c3d47a1-6f4b-424f-8944-6c227e686c5c]
4,00c4a6454163650d3424a18bdeb824df669d009e68b8e0...,Logged,[5bf8b3a3-3916-40ca-8672-a9f3df311825]
...,...,...,...
974,fdc023689a8f9eb312e7b03ea1c18bae5d319e3de74d73...,Logged,[433dbb85-cc21-4624-a74b-3db9d34e0e4f]
975,fdd2d7a7d9f8a10833467662863b5becd9db2447f64e84...,Logged,[6d5452ec-5eec-4a57-b9bf-a00530176d31]
976,fe020b3c69c19db98ecd74b1c04cdc0e12993ca280a855...,Logged,"[a60b3e2f-c009-4a8b-9bcd-068e7d3c927a, 1b4dd5f..."
977,ffb3c1cb04ed5a54f49ef35dc5cd1e482d689d2020408b...,Logged,[44b83851-8a5a-49fa-9b6d-66ca16952cde]


## START RECOMMENDATIONS

## Get most popular items from the "exploded_df_valid"

In [13]:
# Popularity-based recommendation computed from the cleaned, exploded
# interactions. The call takes no user argument, so the result is computed
# once here and reused for every user further down.
top_k_popular = recommend_by_most_popular(exploded_df_valid)
print(top_k_popular)

['aeab0e46-f1e4-41e9-821b-571255c41f69', '9c764c3a-f9f8-4fb2-b2c4-6331eaeb3dd6', 'be89a7da-d9fa-49d4-9fdc-388c27a15bc8', 'eb23272d-8e6c-479d-b972-eabeb5f6f3dd', '82e7b237-e7ea-4cf5-b44c-55385e2cdbb0', '4c3d47a1-6f4b-424f-8944-6c227e686c5c']


## Example of how to use the "recommend_by_weighted_random" function
This is used to add a "random" factor to the recommendation, thus, adding an "exploration phase" to the algorithm.
* This "guided random" recommendation shows potential unusual items to the user
* `df_count_histories`: store the df once, so it can be reused for each user later
* `df_unpopular_histories`: composed by all "history" values not on `top_k_popular`
* `unpopular_weights`: most popular histories have more chance to be selected.

In [14]:
# Count how often each history appears; computed once and reused for all users.
df_count_histories = count_histories_by_popularity(exploded_df_valid)
# Candidate pool for the exploration step, derived from the counts above.
df_unpopular_histories = get_df_most_unpopular_histories(df_count_histories)
# Sampling weights passed to the helper, frozen into a tuple.
# NOTE(review): presumably one weight (count) per candidate history - confirm
# against utils.model_funcs.
unpopular_weights = tuple(df_unpopular_histories.values.tolist())

# One sample draw as a usage example.
# NOTE(review): no random seed is set in the cells shown, so this output is
# not expected to be reproducible between runs - confirm inside the helper.
top_k_randomw = recommend_by_weighted_random(df_unpopular_histories, unpopular_weights)
print(top_k_randomw)

['27d9047e-e42a-46e1-862b-6089b1707d6f', '72384013-5d8a-489f-ae0e-53cc739e4f36', 'b1e41505-8fd5-40ea-9363-b45ee48aa556']


## Get Recommendation from LightFM model to all Users

In [15]:
# Score every user with the LightFM model, one call per row, with the frame
# split across N_PARTS_DASK dask partitions.
ddf_valid = dd.from_pandas(df_valid, npartitions=N_PARTS_DASK)

# meta is (series name, dtype). Each cell holds a list of item ids, so the
# dtype is "object", matching the weighted-random cell below, rather than
# "string".
df_valid["recommendedByModel"] = ddf_valid.apply(
    lambda row: recommend_by_model_scores(
        row.userId,
        row.userType,
        loaded_user_item_data,
        loaded_model,
    ),
    axis=1,
    meta=("recommendedByModel", "object"),
).compute()


## Get Recommendation from "recommend_by_weighted_random" to all Users

In [16]:
# Rebuild the dask frame from the current df_valid and draw one independent
# weighted-random sample per row; the row itself is not used by the sampler.
ddf_valid = dd.from_pandas(df_valid, npartitions=N_PARTS_DASK)

random_recs = ddf_valid.apply(
    lambda _row: recommend_by_weighted_random(df_unpopular_histories, unpopular_weights),
    axis=1,
    meta=('object', 'object'),
)
df_valid["recommendedByRandomW"] = random_recs.compute()


## Get Most Popular Recommendations to all Users
* This one does not depend on the user
* Can be obtained once (stored in `top_k_popular`) and just replicated to all dataset
* Firstly, transformed into string to write to all cells
* Secondly, transformed back to list

In [17]:
# Broadcast the popular list to every row: first as its text form (one string
# per row), then parsed back into a list by the project helper.
df_valid["recommendedByPopular"] = pd.Series(
    str(top_k_popular), index=df_valid.index
).apply(transform_text_to_list)


## Full Recommendation List
Composed in parts by
1. Items recommended by the LightFM model (good for known user/items - warm start)
2. Items recommended based on popularity (good for unknown users - cold start)
3. Items recommended randomly (good for unknown users - cold start)
* `N_RECOMMENDS = K_LIGHTFM_ITEMS + K_POPULAR_ITEMS + K_SAMPLED_ITEMS`

In [18]:
# Row-wise `+` over the three recommendation columns; with list cells this
# concatenates them in the order random -> popular -> model.
df_valid["recommendedMerged"] = df_valid["recommendedByRandomW"] + df_valid["recommendedByPopular"] + df_valid["recommendedByModel"]

In [19]:
# Peek at the first two users with all recommendation columns side by side.
df_valid.head(2)

Unnamed: 0,userId,userType,history,recommendedByModel,recommendedByRandomW,recommendedByPopular,recommendedMerged
0,0067175c1d42a35553b857d4d67f6d950598455f0f43de...,Logged,"[d730c4a6-e8f6-4fde-b73a-afbe148479cd, be89a7d...","[78fe8ccc-99cf-4da8-88d1-080a13ab6ca4, 870c8a9...","[f5b89aea-8dea-4489-83ea-02a07ecbd21a, d205dbf...","[aeab0e46-f1e4-41e9-821b-571255c41f69, 9c764c3...","[f5b89aea-8dea-4489-83ea-02a07ecbd21a, d205dbf..."
1,00aee36362731c3a7c6891e82680e98400f0feea68e9b9...,Logged,"[d730c4a6-e8f6-4fde-b73a-afbe148479cd, 109adb0...","[78fe8ccc-99cf-4da8-88d1-080a13ab6ca4, 870c8a9...","[f2f45310-e13d-4deb-acb9-170ffc330335, a07ab9d...","[aeab0e46-f1e4-41e9-821b-571255c41f69, 9c764c3...","[f2f45310-e13d-4deb-acb9-170ffc330335, a07ab9d..."


### Removing unused columns

In [20]:
# Build the (userId, recommendedMerged) frame without mutating df_valid.
# The previous in-place drop made this cell fail with KeyError when re-run,
# because the columns were already gone. Later cells only read "userId" and
# "history" from df_valid, so keeping the extra columns there is harmless.
df_valid_recommends = df_valid.drop(
    columns=[
        "recommendedByModel",
        "recommendedByRandomW",
        "recommendedByPopular",
        "userType",
        "history",
    ]
)
df_valid_recommends

Unnamed: 0,userId,recommendedMerged
0,0067175c1d42a35553b857d4d67f6d950598455f0f43de...,"[f5b89aea-8dea-4489-83ea-02a07ecbd21a, d205dbf..."
1,00aee36362731c3a7c6891e82680e98400f0feea68e9b9...,"[f2f45310-e13d-4deb-acb9-170ffc330335, a07ab9d..."
2,00bcf7e66456386d571b01ce096bd50379d16c7e3c1e01...,"[9ce62af9-27b5-40ce-acc3-c88fb373f3ff, 3039fc3..."
3,00c35ef7c5a6c85ab17202687cd0e1c49b852d944ad11b...,"[55d189a4-2f28-49c0-a7f4-ba9b4f232874, 08945c8..."
4,00c4a6454163650d3424a18bdeb824df669d009e68b8e0...,"[31e64f8d-ca41-4831-8673-98c707014658, 9c61f59..."
...,...,...
974,fdc023689a8f9eb312e7b03ea1c18bae5d319e3de74d73...,"[55d189a4-2f28-49c0-a7f4-ba9b4f232874, d2e8d47..."
975,fdd2d7a7d9f8a10833467662863b5becd9db2447f64e84...,"[ed82a14e-ae58-477a-b12f-0e16c114a095, ac820b4..."
976,fe020b3c69c19db98ecd74b1c04cdc0e12993ca280a855...,"[31e64f8d-ca41-4831-8673-98c707014658, 9c61f59..."
977,ffb3c1cb04ed5a54f49ef35dc5cd1e482d689d2020408b...,"[7b364fed-eb04-4424-a208-380b05b12946, 496fd6e..."


### Exploding the "df_valid_recommends" for further merge with "freshness" info (age)

In [21]:
# Explode the merged recommendation list so every (user, item) pair gets its
# own row; the id column is carried along unchanged.
cols_to_explode = ["recommendedMerged"]
cols_and_id = ("userId", *cols_to_explode)
print(cols_and_id)
df_valid_recommends_exploded = explode_df_columns(
    df_valid_recommends.loc[:, cols_and_id],
    cols_to_explode,
)
df_valid_recommends_exploded

('userId', 'recommendedMerged')


Unnamed: 0,userId,recommendedMerged
0,0067175c1d42a35553b857d4d67f6d950598455f0f43de...,f5b89aea-8dea-4489-83ea-02a07ecbd21a
1,0067175c1d42a35553b857d4d67f6d950598455f0f43de...,d205dbff-5b3d-48aa-ad19-bfd65d196583
2,0067175c1d42a35553b857d4d67f6d950598455f0f43de...,d031af1b-f939-47c1-a589-e6d691b66d91
3,0067175c1d42a35553b857d4d67f6d950598455f0f43de...,aeab0e46-f1e4-41e9-821b-571255c41f69
4,0067175c1d42a35553b857d4d67f6d950598455f0f43de...,9c764c3a-f9f8-4fb2-b2c4-6331eaeb3dd6
...,...,...
14680,fff46e72c87ef6d8e149b0a60f3346a84256b2d30c04bc...,78fe8ccc-99cf-4da8-88d1-080a13ab6ca4
14681,fff46e72c87ef6d8e149b0a60f3346a84256b2d30c04bc...,50392ead-f267-4b08-ab03-7f7c4440a8b5
14682,fff46e72c87ef6d8e149b0a60f3346a84256b2d30c04bc...,baba0fcf-80b5-4c39-90c1-bde6bd56b72f
14683,fff46e72c87ef6d8e149b0a60f3346a84256b2d30c04bc...,9ae52dae-de91-4969-b4ea-c85a1e6f7491


## Merging Full Recommendation Info with History Age/Freshness
And double checking for NAs

In [22]:
# Attach the numeric freshness score to every recommended item. The left join
# keeps recommendations that have no metadata, and those show up below.
# 'page' duplicates 'recommendedMerged' after the join and 'ageCategories' is
# not used (only the numeric score is), so both are dropped.
df_valid_exploded_rec_age = pd.merge(
    df_valid_recommends_exploded,
    df_news,
    left_on='recommendedMerged',
    right_on='page',
    how='left',
).drop(columns=["page", "ageCategories"])

# Rows with no freshness score (expected to be empty).
df_valid_exploded_rec_age[df_valid_exploded_rec_age["age_exp_normalized"].isna()]

Unnamed: 0,userId,recommendedMerged,age_exp_normalized


## Using Freshness/Age Info
* Among all recommendations, keep only the top N most recent ones.
* Assumes `N_RECENT < N_RECOMMENDS`

In [23]:
# Number of recommendations kept per user after ranking by freshness.
N_RECENT = 5

# Keep, for each user, the N_RECENT rows with the highest freshness score.
# kind="stable": many items share the same score, and the default sort gives
# no guarantee about the order of ties, which made the retained items
# arbitrary. A stable sort breaks ties by the existing row order, so the
# selection is well-defined and reproducible.
df_valid_exploded_rec_age_top_recent = (
    df_valid_exploded_rec_age
    .sort_values("age_exp_normalized", ascending=False, kind="stable")
    .groupby("userId")
    .head(N_RECENT)
    .sort_index()  # restore the original row order for display
)
df_valid_exploded_rec_age_top_recent

Unnamed: 0,userId,recommendedMerged,age_exp_normalized
4,0067175c1d42a35553b857d4d67f6d950598455f0f43de...,9c764c3a-f9f8-4fb2-b2c4-6331eaeb3dd6,1.0
5,0067175c1d42a35553b857d4d67f6d950598455f0f43de...,be89a7da-d9fa-49d4-9fdc-388c27a15bc8,1.0
6,0067175c1d42a35553b857d4d67f6d950598455f0f43de...,eb23272d-8e6c-479d-b972-eabeb5f6f3dd,1.0
7,0067175c1d42a35553b857d4d67f6d950598455f0f43de...,82e7b237-e7ea-4cf5-b44c-55385e2cdbb0,1.0
8,0067175c1d42a35553b857d4d67f6d950598455f0f43de...,4c3d47a1-6f4b-424f-8944-6c227e686c5c,1.0
...,...,...,...
14674,fff46e72c87ef6d8e149b0a60f3346a84256b2d30c04bc...,9c764c3a-f9f8-4fb2-b2c4-6331eaeb3dd6,1.0
14675,fff46e72c87ef6d8e149b0a60f3346a84256b2d30c04bc...,be89a7da-d9fa-49d4-9fdc-388c27a15bc8,1.0
14676,fff46e72c87ef6d8e149b0a60f3346a84256b2d30c04bc...,eb23272d-8e6c-479d-b972-eabeb5f6f3dd,1.0
14677,fff46e72c87ef6d8e149b0a60f3346a84256b2d30c04bc...,82e7b237-e7ea-4cf5-b44c-55385e2cdbb0,1.0


### Removing unnecessary columns again

In [24]:
# The freshness score has served its purpose (ranking); drop it before
# regrouping the recommendations per user.
df_final_recommends_exploded_top_recent = (
    df_valid_exploded_rec_age_top_recent.drop(columns=["age_exp_normalized"])
)
df_final_recommends_exploded_top_recent.head(2)

Unnamed: 0,userId,recommendedMerged
4,0067175c1d42a35553b857d4d67f6d950598455f0f43de...,9c764c3a-f9f8-4fb2-b2c4-6331eaeb3dd6
5,0067175c1d42a35553b857d4d67f6d950598455f0f43de...,be89a7da-d9fa-49d4-9fdc-388c27a15bc8


### Retrieve a Compact representation of "df_final_recommends_exploded_top_recent"
* Further Validation histories & Recommended histories back to list format

In [25]:
# Collapse back to one entry per user: the result is a Series indexed by
# userId whose values are the distinct recommended items of that user.
df_final_recommends = df_final_recommends_exploded_top_recent.groupby("userId").recommendedMerged.unique()

## Merging Recommendations with Validation Histories

In [26]:
# Put each user's validation history next to the recommendations.
# NOTE: this cell overwrites its own input, so it is meant to be run once
# after the groupby cell above.
df_final_recommends = pd.merge(
    df_final_recommends,
    df_valid[["userId", "history"]],
    on="userId",
    how="left",
)
df_final_recommends

Unnamed: 0,userId,recommendedMerged,history
0,0067175c1d42a35553b857d4d67f6d950598455f0f43de...,"[9c764c3a-f9f8-4fb2-b2c4-6331eaeb3dd6, be89a7d...","[d730c4a6-e8f6-4fde-b73a-afbe148479cd, be89a7d..."
1,00aee36362731c3a7c6891e82680e98400f0feea68e9b9...,"[27112b95-15dc-4363-9d7a-0baf0d23b206, 9c764c3...","[d730c4a6-e8f6-4fde-b73a-afbe148479cd, 109adb0..."
2,00bcf7e66456386d571b01ce096bd50379d16c7e3c1e01...,"[9c764c3a-f9f8-4fb2-b2c4-6331eaeb3dd6, be89a7d...",[72384013-5d8a-489f-ae0e-53cc739e4f36]
3,00c35ef7c5a6c85ab17202687cd0e1c49b852d944ad11b...,"[9c764c3a-f9f8-4fb2-b2c4-6331eaeb3dd6, be89a7d...",[4c3d47a1-6f4b-424f-8944-6c227e686c5c]
4,00c4a6454163650d3424a18bdeb824df669d009e68b8e0...,"[31e64f8d-ca41-4831-8673-98c707014658, 9c61f59...",[5bf8b3a3-3916-40ca-8672-a9f3df311825]
...,...,...,...
974,fdc023689a8f9eb312e7b03ea1c18bae5d319e3de74d73...,"[9c764c3a-f9f8-4fb2-b2c4-6331eaeb3dd6, be89a7d...",[433dbb85-cc21-4624-a74b-3db9d34e0e4f]
975,fdd2d7a7d9f8a10833467662863b5becd9db2447f64e84...,"[9c764c3a-f9f8-4fb2-b2c4-6331eaeb3dd6, be89a7d...",[6d5452ec-5eec-4a57-b9bf-a00530176d31]
976,fe020b3c69c19db98ecd74b1c04cdc0e12993ca280a855...,"[31e64f8d-ca41-4831-8673-98c707014658, 9c61f59...","[a60b3e2f-c009-4a8b-9bcd-068e7d3c927a, 1b4dd5f..."
977,ffb3c1cb04ed5a54f49ef35dc5cd1e482d689d2020408b...,"[9c764c3a-f9f8-4fb2-b2c4-6331eaeb3dd6, be89a7d...",[44b83851-8a5a-49fa-9b6d-66ca16952cde]


## CHECK PERFORMANCE FOR THE RECOMMENDATION SYSTEM

### Verify intersection between Recommended and Validation Histories
* Results in a list with all histories' intersection

In [27]:
# Per user, the items present both in the validation history and in the
# recommended list.
df_final_recommends["matched_recommendations"] = df_final_recommends.apply(
    lambda row: list_intersection(row.history, row.recommendedMerged),
    axis=1,
)

## Count the size of the intersection set
* How Many recommended histories are within the validation set, for each user?
Answer below!

In [28]:
# Size of each user's intersection list.
df_final_recommends["count_matches"] = df_final_recommends["matched_recommendations"].apply(len)

In [29]:
# First three users with at least one recommended item found in their history.
df_final_recommends.loc[df_final_recommends["count_matches"].gt(0)].head(3)

Unnamed: 0,userId,recommendedMerged,history,matched_recommendations,count_matches
0,0067175c1d42a35553b857d4d67f6d950598455f0f43de...,"[9c764c3a-f9f8-4fb2-b2c4-6331eaeb3dd6, be89a7d...","[d730c4a6-e8f6-4fde-b73a-afbe148479cd, be89a7d...",[be89a7da-d9fa-49d4-9fdc-388c27a15bc8],1
1,00aee36362731c3a7c6891e82680e98400f0feea68e9b9...,"[27112b95-15dc-4363-9d7a-0baf0d23b206, 9c764c3...","[d730c4a6-e8f6-4fde-b73a-afbe148479cd, 109adb0...",[be89a7da-d9fa-49d4-9fdc-388c27a15bc8],1
3,00c35ef7c5a6c85ab17202687cd0e1c49b852d944ad11b...,"[9c764c3a-f9f8-4fb2-b2c4-6331eaeb3dd6, be89a7d...",[4c3d47a1-6f4b-424f-8944-6c227e686c5c],[4c3d47a1-6f4b-424f-8944-6c227e686c5c],1


## Number of Matches
* How many did we get correct?

In [None]:
# Total number of matched items, summed over all users.
number_of_matches = df_final_recommends["count_matches"].sum()
# Number of non-null userId rows in the compact validation frame.
validation_set_total_size = df_valid["userId"].count()
# NOTE(review): this is 100 * (total matched items) / (number of users), i.e.
# average matches per user scaled by 100. A user can contribute more than one
# match, so it is not the share of users with a hit - confirm the intended metric.
percentage_of_matches = 100*number_of_matches/validation_set_total_size
print(f"# of matches = {number_of_matches}")
print(f"valid size = {validation_set_total_size}")
print(f"% of matches = {percentage_of_matches}")

# of matches = 222
valid size = 979
% of matches = 22.67620020429009
