# Model recommendation with LightFM

### Import libraries

In [1]:
import pandas as pd
import json

### Loading Path Information from the Config (JSON) File

In [2]:
# Parse config.json into a dict; later cells look up data file paths in it
# (e.g. config["DF_TRAIN_SCORES"]).
with open('config.json', 'r') as config_file:
    config = json.load(config_file)

### Loading Custom Functions and Artifacts for Model

In [3]:
# import pickle
from utils.custom_data_structs import UserItemData
from utils.model_funcs import count_histories_by_popularity
from utils.model_funcs import get_df_random_histories
from utils.model_funcs import get_dict_most_popular_histories, get_dict_random_histories

### Defining some parameters and dtypes

In [4]:
# Column -> dtype map, presumably for the validation CSV (not referenced by
# the visible cells -- TODO confirm it is still needed).
dtype_df_valid = {
    "userId": "string",
    "userType": "category",
    "history": "string",
    "timestampHistory": "string",
}

# Column -> dtype map handed to pd.read_csv when the train-score CSV is loaded.
# "Float32" (capital F) is the pandas nullable float dtype.
dtype_df_train_score = {
    "userId": "string",
    "userType": "category",
    "history": "string",
    "score": "Float32",
    "historyFreshnessNormalized": "Float32",
}

# Tunable constants; neither is referenced by the visible cells.
LIMIT_N_ROWS = 500_000
N_PARTS_DASK = 3

## READ DATA

### Retrieve Training Score Data (df_merged)

In [5]:
# Load the train-score CSV with explicit dtypes and discard the leftover
# "Unnamed: 0" column in a single chained expression.
df_merged = (
    pd.read_csv(config["DF_TRAIN_SCORES"], dtype=dtype_df_train_score)
    .drop(columns=["Unnamed: 0"])
)
df_merged

Unnamed: 0,userId,history,userType,historyFreshnessNormalized,score
0,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,80aa7bb2-adce-4a55-9711-912c407927a1,Non-Logged,0.980416,0.704604
1,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,d9e5f15d-b441-4d8b-bee4-462b106d3916,Non-Logged,0.613061,0.721303
2,17f1083e6079b0f28f7820a6803583d1c1b405c0718b11...,e273dba4-136c-45fb-bdd6-0cc57b13aaf0,Non-Logged,0.880859,0.637834
3,528a8d7a2af73101da8d6709c1ec875b449a5a58749a99...,a0562805-c7d1-4ffd-b622-87c50ae006f4,Non-Logged,0.945895,0.622225
4,2dd18b58a634a4e77181a202cf152df6169dfb3e4230ef...,233f8238-2ce0-470f-a9d5-0e0ac530382a,Non-Logged,0.13293,0.68147
...,...,...,...,...,...
6335310,5889d6ebbf62e6c115e0a280063dc8189cca490cbfea56...,7a349b09-badc-40a9-a194-83d959aeb50c,Non-Logged,0.966442,0.663881
6335311,5889d6ebbf62e6c115e0a280063dc8189cca490cbfea56...,6f344c45-e731-41b4-8c65-9967ebc03096,Non-Logged,0.937478,0.843734
6335312,5889d6ebbf62e6c115e0a280063dc8189cca490cbfea56...,4c586bb4-f71d-4b39-9df8-e38ac3f632a0,Non-Logged,0.939154,0.473613
6335313,5889d6ebbf62e6c115e0a280063dc8189cca490cbfea56...,855d20b7-53f2-4678-a10f-55402d085018,Non-Logged,0.929145,0.669966


## START RECOMMENDATIONS

## Get most popular items from "df_merged"

In [6]:
# Build the per-history popularity table once; it is reused by later cells.
# NOTE(review): presumably sorted by descending "counts" -- confirm in
# utils.model_funcs.
df_count_histories = count_histories_by_popularity(df_merged)

# Dict form of the most popular histories (pickled as an artifact later on).
dict_popular_histories = get_dict_most_popular_histories(df_count_histories)

# Summary statistics over the first ten rows of the popularity table.
df_count_histories.head(10).describe()

Unnamed: 0,historyFreshnessNormalized,counts
count,10.0,10.0
mean,0.963946,14598.3
std,0.022796,2365.938905
min,0.929145,12159.0
25%,0.944652,13081.25
50%,0.966443,13706.5
75%,0.983063,15187.0
max,0.992829,19639.0


## Example of how to use the "recommend_by_weighted_random" function
This adds a "random" factor to the recommendation, thereby adding an "exploration phase" to the algorithm.
* This "guided random" recommendation surfaces potentially unusual items to the user
* `df_count_histories`: stores the DataFrame once, so it can be reused for each user later
* `df_unpopular_histories`: composed of all "history" values not in `top_k_popular`
* `unpopular_weights`: more popular histories have a higher chance of being selected.

In [7]:
# Derive the "random" candidate histories from the popularity table.
# NOTE(review): the sampling logic (weights, sample size, seed) lives in
# utils.model_funcs and is not visible here -- confirm the draw is seeded if
# this cell must reproduce the same output on every run.
df_random = get_df_random_histories(df_count_histories)
# Dict form of the sampled histories (pickled as an artifact later on).
dict_random_histories = get_dict_random_histories(df_random)
# Summary statistics of the sampled histories.
df_random.describe()

Unnamed: 0,historyFreshnessNormalized,counts
count,1000.0,1000.0
mean,0.95986,420.003
std,0.022984,377.697109
min,0.901428,6.0
25%,0.940834,67.0
50%,0.959538,312.5
75%,0.980416,697.5
max,1.0,1254.0


In [8]:
import pickle

# Persist both lookup dicts as pickle artifacts.
# Each file is opened in a `with` block so its handle is flushed and closed as
# soon as the dump finishes, instead of being left to the garbage collector.
with open('artifacts/dict_popular_histories.pkl', 'wb') as popular_file:
    pickle.dump(dict_popular_histories, popular_file)

with open('artifacts/dict_random_histories.pkl', 'wb') as random_file:
    pickle.dump(dict_random_histories, random_file)

In [9]:
import pickle
from utils.model_funcs import read_popular_dict_into_list
from utils.model_funcs import read_random_dict_into_list

In [10]:
# Reload the two pickled dict artifacts from disk.
# Each file is opened in a `with` block so its handle is closed
# deterministically once the load finishes.
# SECURITY: pickle.load can execute arbitrary code; only load artifacts that
# this project produced itself, never files from an untrusted source.
with open('artifacts/dict_popular_histories.pkl', 'rb') as popular_file:
    loaded_dict_popular_histories = pickle.load(popular_file)

with open('artifacts/dict_random_histories.pkl', 'rb') as random_file:
    loaded_dict_random_histories = pickle.load(random_file)

In [11]:
# Display the popular-histories dict that was reloaded from its pickle artifact.
loaded_dict_popular_histories

{'history': {0: 'd2593c3d-2347-40d9-948c-b6065e8459a9',
  1: 'f6b5d170-48b9-4f8e-88d4-c84b6668f3bd',
  2: '1f32787b-de2b-49be-8c20-ddaeae34cc22',
  3: '6a83890a-d9e9-4f6b-a6c6-90d031785bbf',
  4: 'f0a78e58-ec7e-494c-9462-fbd6446a9a89',
  5: '4c63d7cd-4902-4ffb-9b94-578b1b2151f0'},
 'historyFreshnessNormalized': {0: 0.9408344626426697,
  1: 0.9561064839363098,
  2: 0.9928287267684937,
  3: 0.9681763052940369,
  4: 0.9804162383079529,
  5: 0.935804545879364},
 'counts': {0: 19639, 1: 17664, 2: 15435, 3: 14443, 4: 13778, 5: 13635}}

In [12]:
# Display the random-histories dict that was reloaded from its pickle artifact.
# NOTE(review): this prints the whole dict, which appears to hold many entries;
# consider previewing only a few items to keep the notebook output readable.
loaded_dict_random_histories

{'history': {2118: '7a4f734e-2c4b-43c4-a127-f5bf9cb43d32',
  29815: '1e35a6d6-24c5-4037-a678-76904e59e1ce',
  6924: '8879a9fd-9056-4221-92fd-06afd6390fc7',
  3765: 'f7591a85-ca9a-49de-8482-fba5a85c6c3c',
  1353: '2b02dd1f-8c2a-47ea-b963-c0db5aeb4c1f',
  1118: '7c9b8846-a95c-40ff-b7e7-732169cc1f0d',
  16072: 'a46942a9-7cf2-47c5-9b8f-04eec5e09ea1',
  3798: 'cfa9fe95-0ecd-4d09-a672-09429b2c154f',
  6070: '162ccdd2-ae54-4f82-8713-51a17fa16c64',
  1040: '15571275-5d8b-4799-ac24-8ad272c4115d',
  36009: '5ce3c88d-b22c-4d45-bf87-e816e769f706',
  12857: '9f616a76-624b-4818-8727-51f509890061',
  1511: 'b82e867b-99b2-4629-b40e-b3d69ec033a9',
  1422: '550cef95-cf4c-4fbf-a5db-6fcbc64eb92b',
  1427: 'cfd2a7e6-c4a9-40b8-af6a-5cc8f6555724',
  1820: '4bb25c74-f3bc-489b-903b-6cef6d722c90',
  3020: '1205451b-91ea-4424-b36f-9e3a88f1f73e',
  2406: '68da6da0-2bc3-4bbc-9d0e-4b50e760baf9',
  1769: '31a25404-d57a-437c-a187-7c6967a4c06c',
  3946: '877c70f8-4c4d-45a3-81f1-28e8e41df0db',
  1309: '661064de-acc8-48

In [13]:
# Turn the {index: history id} mapping of popular histories into a list.
top_k_popular = read_popular_dict_into_list(loaded_dict_popular_histories["history"])
print(top_k_popular)

['d2593c3d-2347-40d9-948c-b6065e8459a9', 'f6b5d170-48b9-4f8e-88d4-c84b6668f3bd', '1f32787b-de2b-49be-8c20-ddaeae34cc22', '6a83890a-d9e9-4f6b-a6c6-90d031785bbf', 'f0a78e58-ec7e-494c-9462-fbd6446a9a89', '4c63d7cd-4902-4ffb-9b94-578b1b2151f0']


In [14]:
# Turn the {index: history id} mapping of random histories into a list.
# NOTE(review): the recorded output holds only 3 ids, so the helper appears to
# pick a subset -- confirm in utils.model_funcs; if it samples without a fixed
# seed, the printed ids will differ between runs.
random_k_histories = read_random_dict_into_list(loaded_dict_random_histories["history"])
print(random_k_histories)

['0310e81e-f678-4bce-a1cd-59baa9052ad9', '50f7c289-4574-4ada-a910-75ca16e9bc11', '9682baf5-9bae-42c2-9ec9-04eec06ec26f']
