In [None]:
#| default_exp datastructure.data_extractor

In [None]:
#| hide

from IPython.core.debugger import set_trace

%load_ext autoreload
%autoreload 2

# Data Extractor
> Extract games and their features from multiple DB collections.

In [None]:
#| export

import datetime
import json
from typing import Tuple

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from fastbet.config.mongo import mongo_init
from fastbet.datastructure.game_features import *
from fastbet.datastructure.odds import *
from fastbet.datastructure.team_lineup import *

## Aggregate Data


We provide a function that retrieves the list of games recorded in our `gameFeatures` MongoDB collection and aggregates each game with its additional features: `Lineups` information (lineups collection) and pre-game 1x2, Asian Handicap and Total (Over/Under) `odds` (odds collection).

In [None]:
from fastbet.config.localconfig import CONFIG, DB_HOSTS

In [None]:
# | export


def data_aggregator(
    limit: int = None,  # Number of rows to extract.
) -> pd.DataFrame:  # Mapped games.
    """Returns and aggregates games information from multiple Db collections.

    Games are read through `GameFeatures.get_all_games`, reduced to their
    identifier and target columns, and left-joined on `gameId` with one row of
    pre-game odds (1x2, Asian handicap, total) and home/away lineup features.
    When no game is returned, an empty frame holding only the game columns is
    returned instead of raising.
    """

    # Columns kept from the raw game documents.
    game_columns = [
        "gameId",
        "game_optaId",
        "gameDate",
        "homeTeamId",
        "homeTeam_optaId",
        "awayTeamId",
        "awayTeam_optaId",
        "tgt_gd",
        "tgt_outcome",
        "tgt_homeTeamGoals",
        "tgt_awayTeamGoals",
    ]

    def _get_odds_columns(
        odds: pd.DataFrame,  # Rows of one market, a single market row, or None.
    ) -> Tuple:  # (odds1, odds2, oddsX, line_id)
        "Returns Odds for a given market."
        if odds is None:
            return None, None, None, None

        if isinstance(odds, pd.DataFrame):
            # Only the first row of the market is used.
            odds = odds.head(1).squeeze()

        return odds.odds1, odds.odds2, odds.oddsX, odds.line_id

    def _keep_even_line(
        game_odds: pd.DataFrame,  # All lines of one market for a given game.
    ) -> pd.Series:  # Line closest to even odds, or None when unavailable.
        "Returns the line whose `odds1`/`odds2` are the closest to even odds (2.0)."

        if game_odds is None:
            return None
        if not game_odds.shape[0]:
            return None

        # Distance of both sides from even odds; kept as a standalone Series so
        # the input frame is neither copied nor altered.
        delta = abs(game_odds["odds1"] - 2.0) + abs(game_odds["odds2"] - 2.0)
        if delta.isna().all():
            # No usable odds on any line.
            return None

        # Select by position rather than by label: a duplicated index label
        # would otherwise return several rows instead of the best line.
        best_position = delta.reset_index(drop=True).idxmin()
        return game_odds.iloc[best_position]

    def _odds(
        game_id: str,  # Real-analytics game identifier.
        game_date: datetime.datetime,  # Find the latest data document prior to `date`.
    ) -> pd.DataFrame:  # One-row frame of pre-game odds.
        "Returns game Odds: 1x2, Asian Handicap and Total (Over/Under)."

        # Extract all odds (1x2, Asian handicap and Total).
        all_game_odds = MarketOdds.get_all_odds(ra_game_id=game_id, date=game_date)

        def _market(market_type: str) -> pd.DataFrame:
            "Returns the rows of `all_game_odds` for one market type."
            return all_game_odds[all_game_odds.market_type == market_type]

        # 1X2 odds. Missing odds stay missing values in the output row; they
        # do not interrupt the extraction.
        odds_1_1x2, odds_2_1x2, odds_x_1x2, _ = _get_odds_columns(_market("1x2"))

        # Asian Handicap: keep the most balanced line only.
        odds_1_ah, odds_2_ah, _, line_ah = _get_odds_columns(
            _keep_even_line(_market("asian"))
        )

        # Total (Over/Under) odds: keep the most balanced line only.
        odds_1_total, odds_2_total, _, line_total = _get_odds_columns(
            _keep_even_line(_market("total"))
        )

        return pd.DataFrame(
            {
                "preGameOdds1": odds_1_1x2,
                "preGameOddsX": odds_x_1x2,
                "preGameOdds2": odds_2_1x2,
                "preGameAhHome": odds_1_ah,
                "preGameAhAway": odds_2_ah,
                "preGameAhLineId": line_ah,
                "preGameOver": odds_1_total,
                "preGameUnder": odds_2_total,
                "preGameTotalLineId": line_total,
            },
            index=[0],
        )

    def _team_features(
        team_id: str,  # Real-analytics team identifier.
        game_date: datetime.datetime,  # Find the latest data document prior to `date`.
        side: str,  # Column prefix, "home" or "away".
    ) -> pd.DataFrame:  # One-row frame of lineup features.
        "Returns lineup features of a given team, named after its `side`."

        # Lineup features.
        team_sheet = TeamSheet.get_latest(ra_team_id=team_id, date=game_date)
        starters = team_sheet.starting

        return pd.DataFrame(
            {
                # Team name.
                f"{side}TeamName": team_sheet.name,
                # Players and positions, serialized as a JSON object.
                f"{side}TeamLineup": [
                    json.dumps({player.name: player.position for player in starters})
                ],
                # Players ids.
                f"{side}TeamLineupIds": [[player.opta_id for player in starters]],
                # Players slots.
                f"{side}TeamLineupSlots": [[player.slot for player in starters]],
                # Formation name, read from the first starting player.
                f"{side}TeamFormation": starters.first().formation,
                # Lineup timestamp.
                f"{side}_team_lineup_received_at": team_sheet.received_at,
            },
            index=[0],
        )

    def _one_game(
        row: pd.Series,  # One game record.
    ) -> pd.DataFrame:  # One-row frame of odds and lineup features.
        "Returns the odds and both lineups of a single game, keyed by `gameId`."
        features = pd.concat(
            [
                _odds(game_id=row["gameId"], game_date=row["gameDate"]),
                _team_features(
                    team_id=row["homeTeamId"], game_date=row["gameDate"], side="home"
                ),
                _team_features(
                    team_id=row["awayTeamId"], game_date=row["gameDate"], side="away"
                ),
            ],
            axis=1,
        )
        features["gameId"] = row["gameId"]

        return features

    # Extract games.
    games = pd.DataFrame(GameFeatures.get_all_games(limit=limit).as_pymongo())
    if games.empty:
        # Nothing to aggregate: the column filter and `pd.concat` below would fail.
        return pd.DataFrame(columns=game_columns)

    # Filter Data. The explicit copy keeps the assignment below from writing
    # into a view of the raw frame.
    games = games.loc[:, game_columns].copy()

    # Map results {homewin -> 0 , draw -> 1, awaywin -> 2}.
    games["tgt_outcome"] = games["tgt_outcome"].map({1.0: 0.0, 0.0: 2.0, 0.5: 1.0})

    # Compute the other features, one game at a time.
    per_game_features = [
        _one_game(row) for _, row in tqdm(games.iterrows(), total=games.shape[0])
    ]

    return games.merge(
        pd.concat(per_game_features).reset_index(drop=True),
        on="gameId",
        how="left",
    )

In [None]:
# Connect to the database host before querying any collection.
mongo_init(db_host="public_atlas")

# Aggregate a handful of games as a quick preview.
x2 = data_aggregator(limit=6)

# Show only the identifier, the pre-game odds and the targets.
preview_columns = [
    "gameId",
    "preGameOdds1",
    "preGameOddsX",
    "preGameOdds2",
    "preGameAhHome",
    "preGameAhAway",
    "preGameAhLineId",
    "preGameOver",
    "preGameUnder",
    "preGameTotalLineId",
    "tgt_gd",
    "tgt_outcome",
    "tgt_homeTeamGoals",
    "tgt_awayTeamGoals",
]
x2[preview_columns]

  0%|          | 0/6 [00:00<?, ?it/s]

Unnamed: 0,gameId,preGameOdds1,preGameOddsX,preGameOdds2,preGameAhHome,preGameAhAway,preGameAhLineId,preGameOver,preGameUnder,preGameTotalLineId,tgt_gd,tgt_outcome,tgt_homeTeamGoals,tgt_awayTeamGoals
0,174dba7291174b4dbbfa9ea12dd944bb45bdd8ed905524...,2.62,3.53,2.77,1.49,2.75,0.5,,,,0,1.0,2,2
1,219ef70c0e8a803ec1efdb793443edfaa32398690c7829...,1.98,3.36,4.51,2.99,1.42,-1.0,,,,0,1.0,2,2
2,0655e244d8d596b5572e86426e2a7ca6178044efa59437...,2.62,3.6,2.77,4.4,1.21,-1.25,,,,-3,2.0,0,3
3,019c223b4a03917c2f1685beab4d5d278f7bff3913f239...,1.917,3.48,4.62,1.63,2.41,-0.25,,,,2,0.0,2,0
4,0f9ad12eec9f24277ab491f5f26f610eaa918903a34147...,2.04,3.41,4.12,1.75,2.21,-0.25,,,,0,1.0,2,2
5,0af3418e3c2c42119592875b44a1b454faca47a6758765...,1.85,3.3,4.35,1.64,2.4,-0.25,,,,1,0.0,1,0


In [None]:
#| hide

# Export the `#| export` cells of the notebooks to their Python modules.
from nbdev import nbdev_export

nbdev_export()