# Feature pipeline: Computing the Features

In [23]:
import pandas as pd
from recsys.config import Settings
from recsys.utils.utils import IDConverter
from recsys.features import users, events, interactions, labels, BettingDataset
import pickle

In [24]:
settings = Settings()
SOURCE_DIR = settings.SOURCE_DATA_DIR
processed_dir = settings.PROCESSED_DATA_DIR

In [25]:
# Prepare data
user_df = pd.read_csv(SOURCE_DIR / 'users.csv')
event_df = pd.read_csv(SOURCE_DIR / 'events.csv')
interactions_df = pd.read_csv(SOURCE_DIR / 'bets.csv')

In [26]:
bet_events = set(interactions_df.event_id.to_list())
evnts = set(event_df.event_id.to_list())
list_of_events = evnts.union(bet_events)
event_id_converter = IDConverter()
for id in list_of_events:
    event_id_converter.convert(id)

### IDConverter class
The IDConverter class maintains bidirectional mappings between hex IDs and integers, allowing us to:

- Convert input IDs to integers for model training
- Convert back to original IDs when making recommendations
- Maintain data consistency throughout the pipeline

In [27]:
player_id_converter = IDConverter()
user_df['player_id'] = user_df['player_id'].apply(player_id_converter.convert)
brand_id_converter = IDConverter()
user_df['brand_id'] = user_df['brand_id'].apply(brand_id_converter.convert)
# bet_id, outcome_id, market_id
bet_id_converter = IDConverter()
interactions_df['bet_id'] = interactions_df['bet_id'].apply(bet_id_converter.convert)
outcome_id_converter = IDConverter()
interactions_df['outcome_id'] = interactions_df['outcome_id'].apply(outcome_id_converter.convert)
market_id_converter = IDConverter()
interactions_df['market_id'] = interactions_df['market_id'].apply(market_id_converter.convert)

event_df['event_id'] = event_df['event_id'].map(event_id_converter.id_to_int)
sport_id_converter = IDConverter()
event_df['sport_id'] = event_df['sport_id'].apply(sport_id_converter.convert)
league_id_converter = IDConverter()
event_df['league_id'] = event_df['league_id'].apply(league_id_converter.convert)

In [28]:
interactions_df.player_id = interactions_df.player_id.map(player_id_converter.id_to_int)
interactions_df.brand_id = interactions_df.brand_id.map(brand_id_converter.id_to_int)
interactions_df.event_id = interactions_df.event_id.map(event_id_converter.id_to_int)

In [29]:
user_df = user_df.drop_duplicates().copy(deep=True)
event_df = event_df.drop_duplicates().copy(deep=True)
interactions_df = interactions_df.drop_duplicates().copy(deep= True)
user_df.player_reg_date = user_df.player_reg_date.str[:19]

In [30]:
with open(processed_dir / 'user_df.pickle', 'wb') as handle:
    pickle.dump(user_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

user_df.head()


Unnamed: 0,player_id,brand_id,player_reg_date,language
0,1,1,2024-10-11 14:58:35,por
1,2,2,2024-09-10 17:48:46,tur
2,3,1,2024-10-31 09:24:57,por
3,4,1,2024-07-23 08:12:53,por
4,5,2,2023-05-02 07:56:13,tur


In [31]:
with open(processed_dir / 'events_df.pickle', 'wb') as handle:
    pickle.dump(event_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

event_df.head()

Unnamed: 0,event_id,start_time,sport_id,league_id,home_team,away_team
0,97282,2024-10-13 17:00:00.000,1,1,Agafonov Mikhail,Sheikin Ivan
1,35626,2024-10-13 08:51:00.000,2,2,Uruguay,South Africa
2,11704,2024-10-15 08:30:00.000,3,3,Phnom Pehn Crown,Kampong Speu
3,50312,2024-10-16 04:01:00.000,2,2,Greece,France
4,5906,2024-10-13 06:21:00.000,2,4,SOU,AST


In [32]:
interactions_df.head()
with open(processed_dir / 'bets_df.pickle', 'wb') as handle:
    pickle.dump(interactions_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [33]:
user_features, user_array = users.process_user_features(users_df=user_df, bets_df=interactions_df)

In [34]:
user_features

Unnamed: 0_level_0,avg_stake,bet_count,avg_odds,win_rate,days_since_reg,language_encoded,brand_id_encoded
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,2.569646,1357.0,5014.414133,0.0,111,2,0
2,249.732941,68.0,9.172113,0.0,142,3,1
3,15.198170,306.0,2.966623,0.0,91,2,0
4,6.128571,7.0,2.142857,0.0,191,2,0
5,76.553763,744.0,80.578826,0.0,639,3,1
...,...,...,...,...,...,...,...
1491,634.081676,716.0,3116.674177,0.0,1001,3,1
22078,0.000000,0.0,0.000000,0.0,264,2,0
22101,20.000000,1.0,2.940000,0.0,84,2,0
5340,2.000000,59.0,2285.891695,0.0,359,2,0


In [35]:
event_features, event_array = events.process_event_features(events_df=event_df)

In [36]:
event_features

Unnamed: 0_level_0,time_to_event,sport_id_encoded,league_id_encoded
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
97282,-2623.830000,0,0
35626,-2631.980000,11,813
11704,-2584.330000,22,924
50312,-2564.813333,11,813
5906,-2634.480000,11,1035
...,...,...,...
54496,-2608.780000,11,1146
42372,-2634.996667,0,312
61990,-2604.830000,0,1479
33402,-2628.830000,44,851


In [37]:
valid_user_ids = users.get_valid_users(users_df=user_df)
valid_event_ids = events.get_valid_events(events_df=event_df)

In [38]:
map_interactions = interactions.process_interactions(bets_df=interactions_df, valid_user_ids=valid_user_ids, valid_event_ids=valid_event_ids)

In [39]:
map_interactions

{(569, 71195),
 (7211, 86212),
 (2884, 1444),
 (8034, 69283),
 (1902, 65857),
 (16405, 54079),
 (194, 46408),
 (282, 77554),
 (3056, 58302),
 (626, 22653),
 (251, 58993),
 (95, 45648),
 (751, 50648),
 (233, 90844),
 (559, 73109),
 (3867, 83592),
 (290, 78546),
 (3013, 16726),
 (3199, 44997),
 (1334, 37642),
 (2239, 22967),
 (5584, 15612),
 (1168, 64707),
 (1551, 75025),
 (3013, 84619),
 (1638, 48606),
 (300, 72067),
 (480, 28843),
 (397, 68967),
 (1914, 82432),
 (18708, 2761),
 (915, 50648),
 (498, 32832),
 (3886, 97955),
 (1164, 57768),
 (727, 51351),
 (10648, 6715),
 (1969, 50059),
 (1689, 70421),
 (447, 94344),
 (1084, 7265),
 (127, 73428),
 (3611, 26982),
 (745, 10428),
 (2764, 57096),
 (212, 61643),
 (773, 11695),
 (3787, 21731),
 (765, 57096),
 (368, 78371),
 (6112, 20588),
 (3721, 39578),
 (6360, 7858),
 (14901, 79267),
 (691, 35455),
 (167, 35455),
 (6841, 72641),
 (3998, 52129),
 (3956, 29272),
 (1630, 10942),
 (264, 42935),
 (2293, 67660),
 (5480, 21250),
 (137, 24666),
 (181

- Added proper interaction matrix creation
- Created explicit user-event pairs for training

In [40]:
labelled_data = labels.create_labels(interactions=map_interactions, valid_user_ids=valid_user_ids, valid_event_ids=valid_event_ids)

In [41]:
user_array.shape[1]

7

In [42]:
dataset = BettingDataset(user_df=user_df, event_df=event_df, bets_df=interactions_df)

Dataset created with 180040 samples
Valid users: 22102
Valid events: 59387


In [43]:

with open(processed_dir / 'features_dataset.pickle', 'wb') as handle:
    pickle.dump(dataset, handle, protocol=pickle.HIGHEST_PROTOCOL)