### Overview

![4_predict](../docs/nbs/Model_Training-training_4.jpg)

# Model recommendation with lighfm

### Import libraries

In [1]:
import numpy as np
import pandas as pd
import json
import dask.dataframe as dd

from utils.custom_treat_data_funcs import transform_text_to_list, explode_df_columns

### Loading Path Infos from Config (JSON) File

In [2]:
with open('config.json', 'r') as f:
    config = json.load(f)

### Loading Custom Functions and Artifactos for Model

In [3]:
import pickle
from utils.custom_data_structs import UserItemData

# from utils.model_funcs import get_user_item_data
from utils.model_funcs import recommend_by_model_scores
from utils.model_funcs import read_popular_dict_into_list, read_random_dict_into_list
from utils.model_funcs import list_intersection

loaded_user_item_data:UserItemData = pickle.load(open('artifacts/user_item_data.pkl', 'rb'))

loaded_model = pickle.load(open('artifacts/lightfm_model.pkl', 'rb'))
loaded_dict_popular_histories = pickle.load(open('artifacts/dict_popular_histories.pkl', 'rb'))
loaded_dict_random_histories = pickle.load(open('artifacts/dict_random_histories.pkl', 'rb'))

### Defining some parameters and dtypes

In [4]:
dtype_df_valid = {
"userId" : 'string',
"userType" : 'category',
"history" : 'string',
"timestampHistory" : 'string'
}

dtype_df_train_score = {
"userId" : 'string',
"userType" : 'category',
"history" : 'string',
"score" : 'Float32',
"historyFreshnessNormalized" : 'Float32'
}

LIMIT_N_ROWS = 10000
N_PARTS_DASK = 3

## READ DATA

### Retrieve Validation Data (df_valid)

In [5]:
df_valid = pd.read_csv(config["VALID_DF"],dtype=dtype_df_valid)
# df_valid = pd.read_csv(config["VALID_DF"],dtype=dtype_df_valid,nrows=LIMIT_N_ROWS)
df_valid.dropna()

Unnamed: 0,userId,userType,history,timestampHistory
0,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,Logged,['be89a7da-d9fa-49d4-9fdc-388c27a15bc8'  '01c5...,[1660533136590 1660672113513]
1,d0afad7ea843d86597d822f0df1d39d31a3fea7c39fdee...,Logged,['77901133-aee7-4f7b-afc0-652231d76fe9'],[1660556860253]
2,755062dd39a48809880cf363b04268c3af2c003088cde0...,Logged,['857aa90f-a7ec-410d-ba82-dfa4f85d4e71'],[1660561649242]
3,ec1639851d99586c7f4da928deb49187303aec6e3b8d66...,Logged,['b7b90e18-7613-4ca0-a8fc-fd69addfcd85'  '835f...,[1660533830245 1660540831707 1660542659111 166...
4,a120515626fe5d12b22b7d5a7c5008912cc69284aa26cc...,Logged,['9c764c3a-f9f8-4fb2-b2c4-6331eaeb3dd6'  'b8eb...,[1660548813953 1660572329731 1660594848200]
...,...,...,...,...
112179,aacb28d7d2a4ea745e12ceba1f9ffa0c7b92aae9304ce5...,Non-Logged,['ecb1f348-cd55-47f7-99f9-bb2c84e93f96'],[1660546612592]
112180,91b3a960562e34504022dbe552c0e8947af20fd14ce861...,Non-Logged,['ecb1f348-cd55-47f7-99f9-bb2c84e93f96'],[1660597026440]
112181,9de95a8cd681cd21cc9a7e830859e468b83c051c11f0c4...,Non-Logged,['3d52cd6b-706e-49f0-9215-0340010a9845'],[1660678862844]
112182,001455437b3ca991b144afd8cb785f7727eb0c154faa54...,Non-Logged,['d730c4a6-e8f6-4fde-b73a-afbe148479cd'],[1660584228926]


### Retrieve "Items" data (df_news)
Contains all metadata related to the history itself
Important:
* "page" column is equal to "history" column on df_valid

In [6]:
df_news = pd.read_csv(config["DF_ITEMS_FEATURE"])
df_news.drop(columns=["Unnamed: 0"],inplace=True)
df_news.head(3)

Unnamed: 0,page,age_exp_normalized,ageCategories
0,7371a9b5-5824-4c57-8704-00a74feebe79,0.151439,very-old
1,7a5ea08f-4583-49e2-ba52-a71999443f7b,0.140788,very-old
2,6afc8bbb-4f36-43d5-8a44-a2917df5621a,0.12261,very-old


## DATA TREATMENTS

### Explode validation data on "History" (and "timestampHistory") - for further Popularity
Creates an "exploded_df" that will be used to check the "popularity" of a given history/item/page

In [7]:
cols_to_explode = ["history", "timestampHistory"]
cols_and_id = cols_to_explode.copy()
cols_and_id.insert(0,"userId")
cols_and_id.append("userType")
cols_and_id = tuple(cols_and_id)
exploded_df = explode_df_columns(df_valid.loc[:,cols_and_id], cols_to_explode)

In [8]:
df_valid

Unnamed: 0,userId,userType,history,timestampHistory
0,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,Logged,['be89a7da-d9fa-49d4-9fdc-388c27a15bc8'  '01c5...,[1660533136590 1660672113513]
1,d0afad7ea843d86597d822f0df1d39d31a3fea7c39fdee...,Logged,['77901133-aee7-4f7b-afc0-652231d76fe9'],[1660556860253]
2,755062dd39a48809880cf363b04268c3af2c003088cde0...,Logged,['857aa90f-a7ec-410d-ba82-dfa4f85d4e71'],[1660561649242]
3,ec1639851d99586c7f4da928deb49187303aec6e3b8d66...,Logged,['b7b90e18-7613-4ca0-a8fc-fd69addfcd85'  '835f...,[1660533830245 1660540831707 1660542659111 166...
4,a120515626fe5d12b22b7d5a7c5008912cc69284aa26cc...,Logged,['9c764c3a-f9f8-4fb2-b2c4-6331eaeb3dd6'  'b8eb...,[1660548813953 1660572329731 1660594848200]
...,...,...,...,...
112179,aacb28d7d2a4ea745e12ceba1f9ffa0c7b92aae9304ce5...,Non-Logged,['ecb1f348-cd55-47f7-99f9-bb2c84e93f96'],[1660546612592]
112180,91b3a960562e34504022dbe552c0e8947af20fd14ce861...,Non-Logged,['ecb1f348-cd55-47f7-99f9-bb2c84e93f96'],[1660597026440]
112181,9de95a8cd681cd21cc9a7e830859e468b83c051c11f0c4...,Non-Logged,['3d52cd6b-706e-49f0-9215-0340010a9845'],[1660678862844]
112182,001455437b3ca991b144afd8cb785f7727eb0c154faa54...,Non-Logged,['d730c4a6-e8f6-4fde-b73a-afbe148479cd'],[1660584228926]


In [9]:
exploded_df

Unnamed: 0,userId,history,timestampHistory,userType
0,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,be89a7da-d9fa-49d4-9fdc-388c27a15bc8,1660533136590,Logged
1,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,01c59ff6-fb82-4258-918f-2910cb2d4c52,1660672113513,Logged
2,d0afad7ea843d86597d822f0df1d39d31a3fea7c39fdee...,77901133-aee7-4f7b-afc0-652231d76fe9,1660556860253,Logged
3,755062dd39a48809880cf363b04268c3af2c003088cde0...,857aa90f-a7ec-410d-ba82-dfa4f85d4e71,1660561649242,Logged
4,ec1639851d99586c7f4da928deb49187303aec6e3b8d66...,b7b90e18-7613-4ca0-a8fc-fd69addfcd85,1660533830245,Logged
...,...,...,...,...
178863,aacb28d7d2a4ea745e12ceba1f9ffa0c7b92aae9304ce5...,ecb1f348-cd55-47f7-99f9-bb2c84e93f96,1660546612592,Non-Logged
178864,91b3a960562e34504022dbe552c0e8947af20fd14ce861...,ecb1f348-cd55-47f7-99f9-bb2c84e93f96,1660597026440,Non-Logged
178865,9de95a8cd681cd21cc9a7e830859e468b83c051c11f0c4...,3d52cd6b-706e-49f0-9215-0340010a9845,1660678862844,Non-Logged
178866,001455437b3ca991b144afd8cb785f7727eb0c154faa54...,d730c4a6-e8f6-4fde-b73a-afbe148479cd,1660584228926,Non-Logged


### Checking for non-compliant "history" values
* history should have a hash format, with 8d-4d-4d-4d-12d = 32d (column "check_size" does this check)
* every character should be a letter in [a-f] or a number in [0-9] (column "check_chars" does this check)
* This results in 4 hifens and 5 groups of chars (column "check_hifens" does this check)
* example of "good" hash: a1b2c3d4-a1b2-c1d2-e1f2-a111b222c333d000
* example of "bad" data: an url not hashed, such as https://globo.com.... (those will be removed)
* finally, column "check_history" verifies all those aforementioned points

### Remove trash and unused data
Removing "bad" data in "history" from above checks, and also removing unused columns for this "exploded_df_valid" purpose

In [10]:
from utils.custom_treat_data_funcs import history_check_hash_format

exploded_df["check_history"] = exploded_df["history"].apply(history_check_hash_format)
print(exploded_df["check_history"].value_counts())
exploded_df_valid = exploded_df[~exploded_df["check_history"]==False][["userId","userType","history"]]
exploded_df_valid

check_history
True     172442
False      6426
Name: count, dtype: int64


Unnamed: 0,userId,userType,history
0,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,Logged,be89a7da-d9fa-49d4-9fdc-388c27a15bc8
1,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,Logged,01c59ff6-fb82-4258-918f-2910cb2d4c52
2,d0afad7ea843d86597d822f0df1d39d31a3fea7c39fdee...,Logged,77901133-aee7-4f7b-afc0-652231d76fe9
3,755062dd39a48809880cf363b04268c3af2c003088cde0...,Logged,857aa90f-a7ec-410d-ba82-dfa4f85d4e71
4,ec1639851d99586c7f4da928deb49187303aec6e3b8d66...,Logged,b7b90e18-7613-4ca0-a8fc-fd69addfcd85
...,...,...,...
178863,aacb28d7d2a4ea745e12ceba1f9ffa0c7b92aae9304ce5...,Non-Logged,ecb1f348-cd55-47f7-99f9-bb2c84e93f96
178864,91b3a960562e34504022dbe552c0e8947af20fd14ce861...,Non-Logged,ecb1f348-cd55-47f7-99f9-bb2c84e93f96
178865,9de95a8cd681cd21cc9a7e830859e468b83c051c11f0c4...,Non-Logged,3d52cd6b-706e-49f0-9215-0340010a9845
178866,001455437b3ca991b144afd8cb785f7727eb0c154faa54...,Non-Logged,d730c4a6-e8f6-4fde-b73a-afbe148479cd


### Retrieve a Compact representation of df_valid, without "trash" data on "history"

In [11]:
df_valid = exploded_df_valid.groupby(["userId","userType"]).history.unique().to_frame().reset_index()
df_valid.dropna(inplace=True)

  df_valid = exploded_df_valid.groupby(["userId","userType"]).history.unique().to_frame().reset_index()


## START RECOMMENDATIONS

## Get most popular items from the "exploded_df_valid"

In [12]:
top_k_popular = read_popular_dict_into_list(loaded_dict_popular_histories["history"])
print(top_k_popular)

['d2593c3d-2347-40d9-948c-b6065e8459a9', 'f6b5d170-48b9-4f8e-88d4-c84b6668f3bd', '1f32787b-de2b-49be-8c20-ddaeae34cc22', '6a83890a-d9e9-4f6b-a6c6-90d031785bbf', 'f0a78e58-ec7e-494c-9462-fbd6446a9a89', '4c63d7cd-4902-4ffb-9b94-578b1b2151f0', '855d20b7-53f2-4678-a10f-55402d085018']


## Exemple on how to use "recommend_by_weighted_random" function
This is used to add a "random" factor to the recommendation, thus, adding an "exploration phase" to the algorithm.
* This "guided random" recommendation shows potential unusual items to the user
* `df_count_histories`: store the df once, so it can be reused for each user later
* `df_unpopular_histories`: composed by all "history" values not on `top_k_popular`
* `unpopular_weights`: most popular histories have more chance to be selected.

In [13]:
random_k_histories = read_random_dict_into_list(loaded_dict_random_histories["history"])
print(random_k_histories)

['6b9af32b-d168-4acd-a6e3-99665e6ee0c2', 'f82ac580-aa0c-4165-a357-27e12cb4f3cf', 'a9d50956-b9dd-475c-a3d5-20b141c3fcd8', 'd755b265-ef3a-4977-80a4-b75ab44944a0', '9410dadd-0d3b-43de-9186-e934a3da6be3', '1c663751-20d8-4764-bcee-25bbe1a5b287']


## Get Recommendation from LightFM model to all Users

In [14]:
ddf_valid = dd.from_pandas(df_valid, npartitions=N_PARTS_DASK)

result_temp = ddf_valid.apply(lambda x :
    recommend_by_model_scores(
        x.userId,
        loaded_user_item_data,
        loaded_model),
        axis=1,
        meta=('string','string')).compute()

df_valid["recommendedByModel"] = result_temp

try
239557
[42.002617   0.8389786 -8.834183  ... -3.3290431 -4.1762676  2.484836 ]


## Get Recommendation from "recommend_by_weighted_random" to all Users

In [15]:
df_valid["recommendedByRandomW"] = df_valid["history"].apply(lambda x :
            read_random_dict_into_list(loaded_dict_random_histories["history"]))

## Get Most Popular Recommendations to all Users
* This one does not depends on the user
* Can be obtained once (stored in `top_k_popular`) and just replicated to all dataset
* Firstly, transformed into string to write to all cells
* Secondly, transformed back to list

In [16]:
df_valid["recommendedByPopular"] = str(top_k_popular)
df_valid["recommendedByPopular"] = df_valid["recommendedByPopular"].apply(transform_text_to_list)


## Full Recommendation List
Composed in parts by
1. Items recommended by the LightFM model (good for known user/items - warm start)
2. Items recommended based on popularity (good for unknown users - cold start)
3. Items recommended randomly (good for unknown users - cold start)
* `N_RECOMMENDS = K_LIGHTFM_ITEMS + K_POPULAR_ITEMS + K_SAMPLED_ITEMS`

In [17]:
df_valid["recommendedMerged"] = df_valid["recommendedByRandomW"] + df_valid["recommendedByPopular"] + df_valid["recommendedByModel"]

In [18]:
df_valid

Unnamed: 0,userId,userType,history,recommendedByModel,recommendedByRandomW,recommendedByPopular,recommendedMerged
1,00025217b55f104ede326550e7b46c3969a5a5333fcd29...,Non-Logged,[d88f5d3a-8657-4955-88fb-71702d92909a],"[a203c57c-8693-45c8-a4bb-31f73bae4a8d, 1b31e22...","[1e1b3876-eac3-4aab-aac9-6e9eea85d449, bad3ccf...","[d2593c3d-2347-40d9-948c-b6065e8459a9, f6b5d17...","[1e1b3876-eac3-4aab-aac9-6e9eea85d449, bad3ccf..."
2,0003f534fe8124d98eef78b01baa3b69e3ffcda6e5a657...,Logged,[6813846e-9f28-4238-8ea5-4e3436064657],"[a203c57c-8693-45c8-a4bb-31f73bae4a8d, 1b31e22...","[ae306b4a-0acb-4b3b-b3ce-eaf8bcd5800c, d0f1198...","[d2593c3d-2347-40d9-948c-b6065e8459a9, f6b5d17...","[ae306b4a-0acb-4b3b-b3ce-eaf8bcd5800c, d0f1198..."
4,00043e84a871699b666933c3ca7989598879d47b325901...,Logged,[aeab0e46-f1e4-41e9-821b-571255c41f69],"[78fe8ccc-99cf-4da8-88d1-080a13ab6ca4, 870c8a9...","[38e587ad-1ccd-41e7-9925-c863da80e7f4, a7aaddb...","[d2593c3d-2347-40d9-948c-b6065e8459a9, f6b5d17...","[38e587ad-1ccd-41e7-9925-c863da80e7f4, a7aaddb..."
7,0004749749d85d78d13c6c9113130278a18f069f71ccca...,Non-Logged,[487330a8-b48e-4183-80f7-a1baf7c4c7e0],"[78fe8ccc-99cf-4da8-88d1-080a13ab6ca4, bf445aa...","[401ae1cd-ad1d-4929-b815-3e7fa98f55e7, 1b606d6...","[d2593c3d-2347-40d9-948c-b6065e8459a9, f6b5d17...","[401ae1cd-ad1d-4929-b815-3e7fa98f55e7, 1b606d6..."
8,0004e1ddec9a5d67faa56bb734d733628a7841c10c7255...,Logged,[be89a7da-d9fa-49d4-9fdc-388c27a15bc8],"[78fe8ccc-99cf-4da8-88d1-080a13ab6ca4, 870c8a9...","[d0f11985-f883-452e-b817-226d924e5557, ad4b88a...","[d2593c3d-2347-40d9-948c-b6065e8459a9, f6b5d17...","[d0f11985-f883-452e-b817-226d924e5557, ad4b88a..."
...,...,...,...,...,...,...,...
217477,fffcb5799256bd50b519b5688f54a6021ca3ef1af66b8b...,Non-Logged,[ec0717a6-a3fc-4e38-93f5-481d61967fdf],"[d2593c3d-2347-40d9-948c-b6065e8459a9, f6b5d17...","[964850d4-5df8-4d62-bee9-beb0e677cc5f, 5fb5aa2...","[d2593c3d-2347-40d9-948c-b6065e8459a9, f6b5d17...","[964850d4-5df8-4d62-bee9-beb0e677cc5f, 5fb5aa2..."
217478,fffd676b5b33c111de923c2ce93f767f206bf7218d43a6...,Logged,[444db2d3-2856-4307-8a36-09b15651fb8b],"[bf445aa0-38de-49ca-9c5f-8bb9af9c9849, 0155832...","[bdf7dde7-1e7c-49b9-8dec-5fbdbd0c859a, a3be043...","[d2593c3d-2347-40d9-948c-b6065e8459a9, f6b5d17...","[bdf7dde7-1e7c-49b9-8dec-5fbdbd0c859a, a3be043..."
217480,fffdf305577419378c3be19c54c443fd2744e0f2dc65dc...,Logged,[082ca223-fb94-4eac-a1b4-5be083fa4bf6],"[78fe8ccc-99cf-4da8-88d1-080a13ab6ca4, 870c8a9...","[167c7437-0115-4cab-98c4-e2f6066b9933, 237df8c...","[d2593c3d-2347-40d9-948c-b6065e8459a9, f6b5d17...","[167c7437-0115-4cab-98c4-e2f6066b9933, 237df8c..."
217482,fffe0be2e6d6f38ae63123c674646bc8e759578305ae97...,Logged,[f1721193-a006-419c-848e-85f5269e42bd],"[78fe8ccc-99cf-4da8-88d1-080a13ab6ca4, 870c8a9...","[a382d43f-1e7d-43ff-bf43-8c76bb079c89, 74ec4cc...","[d2593c3d-2347-40d9-948c-b6065e8459a9, f6b5d17...","[a382d43f-1e7d-43ff-bf43-8c76bb079c89, 74ec4cc..."


### Removing unused columns

In [19]:
df_valid.drop(columns=["recommendedByModel","recommendedByRandomW","recommendedByPopular"],inplace=True)
df_valid_recommends = df_valid.drop(columns=["userType"])
df_valid_recommends

Unnamed: 0,userId,history,recommendedMerged
1,00025217b55f104ede326550e7b46c3969a5a5333fcd29...,[d88f5d3a-8657-4955-88fb-71702d92909a],"[1e1b3876-eac3-4aab-aac9-6e9eea85d449, bad3ccf..."
2,0003f534fe8124d98eef78b01baa3b69e3ffcda6e5a657...,[6813846e-9f28-4238-8ea5-4e3436064657],"[ae306b4a-0acb-4b3b-b3ce-eaf8bcd5800c, d0f1198..."
4,00043e84a871699b666933c3ca7989598879d47b325901...,[aeab0e46-f1e4-41e9-821b-571255c41f69],"[38e587ad-1ccd-41e7-9925-c863da80e7f4, a7aaddb..."
7,0004749749d85d78d13c6c9113130278a18f069f71ccca...,[487330a8-b48e-4183-80f7-a1baf7c4c7e0],"[401ae1cd-ad1d-4929-b815-3e7fa98f55e7, 1b606d6..."
8,0004e1ddec9a5d67faa56bb734d733628a7841c10c7255...,[be89a7da-d9fa-49d4-9fdc-388c27a15bc8],"[d0f11985-f883-452e-b817-226d924e5557, ad4b88a..."
...,...,...,...
217477,fffcb5799256bd50b519b5688f54a6021ca3ef1af66b8b...,[ec0717a6-a3fc-4e38-93f5-481d61967fdf],"[964850d4-5df8-4d62-bee9-beb0e677cc5f, 5fb5aa2..."
217478,fffd676b5b33c111de923c2ce93f767f206bf7218d43a6...,[444db2d3-2856-4307-8a36-09b15651fb8b],"[bdf7dde7-1e7c-49b9-8dec-5fbdbd0c859a, a3be043..."
217480,fffdf305577419378c3be19c54c443fd2744e0f2dc65dc...,[082ca223-fb94-4eac-a1b4-5be083fa4bf6],"[167c7437-0115-4cab-98c4-e2f6066b9933, 237df8c..."
217482,fffe0be2e6d6f38ae63123c674646bc8e759578305ae97...,[f1721193-a006-419c-848e-85f5269e42bd],"[a382d43f-1e7d-43ff-bf43-8c76bb079c89, 74ec4cc..."


### Retrieve a Compact representation of "df_final_recommends_exploded_top_recent"
* Further Validation histories & Recommended histories back to list format

## Merging Recommendations with Validation Histories

In [20]:
df_final_recommends = df_valid_recommends
df_final_recommends

Unnamed: 0,userId,history,recommendedMerged
1,00025217b55f104ede326550e7b46c3969a5a5333fcd29...,[d88f5d3a-8657-4955-88fb-71702d92909a],"[1e1b3876-eac3-4aab-aac9-6e9eea85d449, bad3ccf..."
2,0003f534fe8124d98eef78b01baa3b69e3ffcda6e5a657...,[6813846e-9f28-4238-8ea5-4e3436064657],"[ae306b4a-0acb-4b3b-b3ce-eaf8bcd5800c, d0f1198..."
4,00043e84a871699b666933c3ca7989598879d47b325901...,[aeab0e46-f1e4-41e9-821b-571255c41f69],"[38e587ad-1ccd-41e7-9925-c863da80e7f4, a7aaddb..."
7,0004749749d85d78d13c6c9113130278a18f069f71ccca...,[487330a8-b48e-4183-80f7-a1baf7c4c7e0],"[401ae1cd-ad1d-4929-b815-3e7fa98f55e7, 1b606d6..."
8,0004e1ddec9a5d67faa56bb734d733628a7841c10c7255...,[be89a7da-d9fa-49d4-9fdc-388c27a15bc8],"[d0f11985-f883-452e-b817-226d924e5557, ad4b88a..."
...,...,...,...
217477,fffcb5799256bd50b519b5688f54a6021ca3ef1af66b8b...,[ec0717a6-a3fc-4e38-93f5-481d61967fdf],"[964850d4-5df8-4d62-bee9-beb0e677cc5f, 5fb5aa2..."
217478,fffd676b5b33c111de923c2ce93f767f206bf7218d43a6...,[444db2d3-2856-4307-8a36-09b15651fb8b],"[bdf7dde7-1e7c-49b9-8dec-5fbdbd0c859a, a3be043..."
217480,fffdf305577419378c3be19c54c443fd2744e0f2dc65dc...,[082ca223-fb94-4eac-a1b4-5be083fa4bf6],"[167c7437-0115-4cab-98c4-e2f6066b9933, 237df8c..."
217482,fffe0be2e6d6f38ae63123c674646bc8e759578305ae97...,[f1721193-a006-419c-848e-85f5269e42bd],"[a382d43f-1e7d-43ff-bf43-8c76bb079c89, 74ec4cc..."


## CHECK PERFORMANCE FOR THE RECOMMENDATION SYSTEM

### Verify intersection between Recommended and Validation Histories
* Results in a list with all histories' intersection

In [21]:
df_final_recommends["matched_recommendations"] = df_final_recommends.apply(lambda x :
    list_intersection(x.history, x.recommendedMerged),
    axis=1
)

## Count the size of the intersection set
* How Many recommended histories are within the validation set, for each user?
Answer below!

In [22]:
df_final_recommends["count_matches"] = df_final_recommends["matched_recommendations"].apply(lambda x : len(x))

In [23]:
df_final_recommends[df_final_recommends["count_matches"]>0]#.head(3)

Unnamed: 0,userId,history,recommendedMerged,matched_recommendations,count_matches
54,0011f6794e4e340826a13d1db435696d5129b22cb944f0...,[1f32787b-de2b-49be-8c20-ddaeae34cc22],"[6b391b65-9988-4d88-8240-029d7e2382d5, f4350c8...",[1f32787b-de2b-49be-8c20-ddaeae34cc22],1
185,0035978ac33f75671cbfd6a7eaf5832f2723b4fc1a56a4...,[1f32787b-de2b-49be-8c20-ddaeae34cc22],"[0b69b7d3-3b11-4cd8-9306-23572be9a053, 807d5f5...",[1f32787b-de2b-49be-8c20-ddaeae34cc22],1
256,004cb0601a44726eebba476fa91f124361e8bd654d1c99...,"[1f32787b-de2b-49be-8c20-ddaeae34cc22, 4c3d47a...","[237df8c5-7599-481b-a62a-a3c4c23e0498, bb8c52f...",[1f32787b-de2b-49be-8c20-ddaeae34cc22],1
1265,0171c172bf55a2a584f9dc429512f4f32a9f0f09800352...,[1f32787b-de2b-49be-8c20-ddaeae34cc22],"[01bc9330-2841-46e0-a13b-a1fad9a66148, bfcbb60...",[1f32787b-de2b-49be-8c20-ddaeae34cc22],1
1399,019914b354b5a0ce7cdd61eb9e0f5d142b1d0b9f2add97...,[1f32787b-de2b-49be-8c20-ddaeae34cc22],"[c204edf7-5800-49c1-a191-eb29c5b6274c, 4487047...",[1f32787b-de2b-49be-8c20-ddaeae34cc22],1
...,...,...,...,...,...
214072,fc09950d098eb15feaa57a0425c64e51abf7659cb8e6f3...,"[eb23272d-8e6c-479d-b972-eabeb5f6f3dd, ad2c51f...","[b7e17c35-a165-4afc-9057-eab54a9036b0, a91da75...",[1f32787b-de2b-49be-8c20-ddaeae34cc22],1
214368,fc6762abf868a84763adfe829033dd6ecbe3f11e38b0ad...,"[eb23272d-8e6c-479d-b972-eabeb5f6f3dd, 1f32787...","[a7f59860-f675-4397-bd7f-8c8027bb836b, 33eab9d...",[1f32787b-de2b-49be-8c20-ddaeae34cc22],1
214385,fc6bac54d5019ec79b0a942daf7ce0e1b025cbcf9af967...,[f0a78e58-ec7e-494c-9462-fbd6446a9a89],"[a206a392-7d88-4a07-a8df-eab6231fcd09, 4c0672d...",[f0a78e58-ec7e-494c-9462-fbd6446a9a89],1
214719,fccccd6601581316cebb59a46e1a7e6aa0b15b3c073f57...,[f6b5d170-48b9-4f8e-88d4-c84b6668f3bd],"[96458fe6-0c7a-48b5-9742-dde25a32bac3, ef5b734...",[f6b5d170-48b9-4f8e-88d4-c84b6668f3bd],1


## Number of Matches
* How many did we get correct?

In [24]:
number_of_matches = df_final_recommends["count_matches"].sum()
validation_set_total_size = df_valid["userId"].count()
percentage_of_matches = 100*number_of_matches/validation_set_total_size
print(f"# of matches = {number_of_matches}")
print(f"valid size = {validation_set_total_size}")
print(f"% of matches = {percentage_of_matches}")

# of matches = 476
valid size = 108743
% of matches = 0.43772932510598384
