In [None]:
#| default_exp datastructure.data_extractor

In [None]:
#| hide

from IPython.core.debugger import set_trace

%load_ext autoreload
%autoreload 2

# Data Extractor
> Extract games and their features from multiple DB collections.

In [None]:
#| export

import datetime
import json
from typing import Tuple

import pandas as pd
import numpy as np

from fastbet.config.mongo import mongo_init
from fastbet.datastructure.game_features import *
from fastbet.datastructure.odds import *
from fastbet.datastructure.team_lineup import * 

## Aggregate Data


We provide a function that retrieves the list of games recorded in our `gameFeatures` MongoDB collection and aggregates each game with additional features: `Lineups` information (lineups collection) and 1x2 and Asian Handicap `odds` (odds collection).

In [None]:
# | export


def data_aggregator(
    db_hosts: dict,  # All DB hosts.
    config: dict,  # Database config.
    db_host: str = "prod_atlas",  # Database host name.
    limit: int = None,  # Number of rows to extract; forwarded to `GameFeatures.get_all_games`.
) -> pd.DataFrame:  # Mapped games, one row per game.
    """Returns and aggregates games information from multiple Db collections.

    Steps: initialise the Mongo connection, load the games, keep the core game
    columns, then add for every game its pre-game 1x2 odds, its Asian Handicap
    odds and the home/away lineup features. Finally `tgt_outcome` is re-encoded.

    When no game is returned, an empty frame carrying the full output schema is
    returned instead of failing on the column selection.

    Raises `KeyError` if the loaded games lack one of the expected columns.
    """

    # Columns kept from the raw game documents.
    game_columns = [
        "gameId",
        "game_optaId",
        "gameDate",
        "homeTeamId",
        "homeTeam_optaId",
        "awayTeamId",
        "awayTeam_optaId",
        "tgt_gd",
        "tgt_outcome",
    ]
    # Output columns for both odds markets, in the order `_odds` returns them.
    odds_1x2_columns = ["preGameOdds1", "preGameOddsX", "preGameOdds2"]
    odds_ah_columns = ["preGameAhHome", "preGameAhAway", "LineId"]

    def _lineup_columns(
        side: str,  # Column prefix: "home" or "away".
    ) -> list:  # Column names, in the order `_team_features` returns its values.
        "Returns the lineup output column names for one side."

        return [
            f"{side}TeamName",
            f"{side}TeamLineup",
            f"{side}TeamLineupIds",
            f"{side}TeamLineupSlots",
            f"{side}TeamFormation",
            f"{side}_team_lineup_received_at",
        ]

    def _odds(
        game_id: str,  # Real-analytics game identifier.
        game_date: datetime.datetime,  # Find the latest data document prior to `date`.
        market_type: str,  # Type of market required; "1x2", anything else is treated as Asian Handicap.
    ) -> np.ndarray:  # Odds values of the first returned row.
        "Returns game Odds. It can be 1x2 or Asian Handicap."

        if market_type == "1x2":
            return MarketOdds.get_odds_features(
                ra_game_id=game_id, market=market_type, date=game_date
            )[["odds1", "oddsX", "odds2"]].values[0]
        return MarketOdds.get_latest(
            ra_game_id=game_id, market=market_type, date=game_date
        )[["odds1", "odds2", "line_id"]].values[0]

    def _team_features(
        team_id: str,  # Real-analytics team identifier.
        game_date: datetime.datetime,  # Find the latest data document prior to `date`.
    ) -> Tuple:  # Lineup values (team name, name:position JSON, ids, slots, formation name, timestamp).
        "Returns lineup features of a given team."

        # Lineup features.
        team_features = TeamSheet.get_latest(ra_team_id=team_id, date=game_date)

        # Team name.
        team_name = team_features.name
        # Players and positions, serialised as a JSON object.
        team_lineup_names = json.dumps(
            {player.name: player.position for player in team_features.starting}
        )
        # Players ids.
        team_lineup_ids = [player.opta_id for player in team_features.starting]
        # Players slots.
        team_lineup_slots = [player.slot for player in team_features.starting]
        # Formation name, read from the first starting player.
        formation_name = team_features.starting.first().formation
        # Lineup timestamp.
        lineup_time_stamp = team_features.received_at

        return (
            team_name,
            team_lineup_names,
            team_lineup_ids,
            team_lineup_slots,
            formation_name,
            lineup_time_stamp,
        )

    # Connect to database.
    mongo_init(db_hosts=db_hosts, config=config, db_host=db_host)

    # Extract games.
    games = pd.DataFrame(GameFeatures.get_all_games(limit=limit).as_pymongo())

    # Nothing to aggregate: return an empty frame with the full output schema.
    if games.empty:
        return pd.DataFrame(
            columns=game_columns
            + odds_1x2_columns
            + odds_ah_columns
            + _lineup_columns("home")
            + _lineup_columns("away")
        )

    # Filter Data. Copy so the column assignments below write to an independent
    # frame and do not trigger pandas' SettingWithCopyWarning.
    games = games[game_columns].copy()

    # Add 1X2 odds (one `_odds` call per game).
    games[odds_1x2_columns] = games.apply(
        lambda row: _odds(
            game_id=row["gameId"],
            game_date=row["gameDate"],
            market_type="1x2",
        ),
        axis="columns",
        result_type="expand",
    )

    # Add Asian handicap odds (one `_odds` call per game).
    games[odds_ah_columns] = games.apply(
        lambda row: _odds(
            game_id=row["gameId"],
            game_date=row["gameDate"],
            market_type="asian",
        ),
        axis="columns",
        result_type="expand",
    )

    # Add home, then away, team lineup features.
    for side in ("home", "away"):
        team_id_column = f"{side}TeamId"
        games[_lineup_columns(side)] = games.apply(
            lambda row: _team_features(
                team_id=row[team_id_column], game_date=row["gameDate"]
            ),
            axis="columns",
            result_type="expand",
        )

    # Map results {homewin -> 0 , draw -> 1, awaywin -> 2}.
    # Values outside the mapping become NaN.
    games["tgt_outcome"] = games["tgt_outcome"].map({1.0: 0.0, 0.0: 2.0, 0.5: 1.0})

    return games

In [None]:
from fastbet.config.localconfig import CONFIG, DB_HOSTS

In [None]:
# Demo run against the "public_atlas" host, limited to 10 games.
data_aggregator(db_hosts=DB_HOSTS, config=CONFIG, db_host="public_atlas", limit=10)

--- connection_time: 0.0016415119171142578 seconds ---
--- games_time: 0.08488011360168457 seconds ---
--- 1x2_time: 0.8590593338012695 seconds ---
--- ah_time: 0.9043731689453125 seconds ---
--- lup_time: 0.8735835552215576 seconds ---
--- lup_feats: 0.8477203845977783 seconds ---


Unnamed: 0,gameId,game_optaId,gameDate,homeTeamId,homeTeam_optaId,awayTeamId,awayTeam_optaId,tgt_gd,tgt_outcome,preGameOdds1,...,homeTeamLineupIds,homeTeamLineupSlots,homeTeamFormation,home_team_lineup_received_at,awayTeamName,awayTeamLineup,awayTeamLineupIds,awayTeamLineupSlots,awayTeamFormation,away_team_lineup_received_at
0,174dba7291174b4dbbfa9ea12dd944bb45bdd8ed905524...,990997,2018-08-22 18:45:00,126905d14981e6b97912ad4fec354035ccef26cb8ec4e1...,7,419088133137a53bfdb1b7e2e682d223d33a6fa075bbfe...,94,0,1.0,2.62,...,"[12150, 59115, 122806, 54764, 49773, 37339, 43...","[4, 6, 8, 9, 11, 7, 5, 2, 3, 1, 10]",4-4-1-1,2018-08-22 18:15:00.000,Brentford,"{""Daniel Bentley"": ""GK"", ""Ezri Konsa"": ""DCR"", ...","[79602, 199798, 115382, 114275, 176442, 223911...","[1, 5, 9, 8, 3, 6, 7, 10, 4, 11, 2]",4-2-3-1,2018-08-22 18:15:00.000
1,219ef70c0e8a803ec1efdb793443edfaa32398690c7829...,991003,2018-08-22 18:45:00,aeb2f56fcedbcf4cd5c780179766996c7bf0b308064541...,5,f8daf96ad35eebf1c0a5886c72734ba7dec366d6637052...,108,0,1.0,1.98,...,"[156685, 15398, 154561, 165183, 61602, 167802,...","[11, 9, 1, 3, 4, 5, 6, 2, 10, 8, 7]",4-2-3-1,2018-08-22 18:15:00.000,Reading,"{""Leandro Bacuna"": ""DMR"", ""Andy Yiadom"": ""DR"",...","[74297, 87408, 154133, 86857, 17506, 112133, 2...","[4, 2, 8, 6, 5, 3, 1, 7, 10, 11, 9]",4-4-2,2018-08-22 18:15:00.000
2,0655e244d8d596b5572e86426e2a7ca6178044efa59437...,991013,2018-08-25 14:00:00,9ee012a80cade2df55b71580bf5e238bcd6be6f696fdc1...,45,38ca605bcd29a5a37697ca66e533ae817ced71b6bf275c...,2,-3,2.0,2.62,...,"[52980, 84384, 108799, 83428, 57127, 87396, 20...","[2, 6, 8, 5, 7, 10, 1, 11, 3, 9, 4]",4-2-3-1,2018-08-25 13:30:00.000,Leeds United,"{""Mateusz Klich"": ""MCR"", ""Luke Ayling"": ""DR"", ...","[72222, 66588, 98760, 155405, 220037, 61810, 8...","[8, 2, 9, 4, 1, 5, 10, 7, 3, 11, 6]",4-1-4-1,2018-08-25 13:30:00.000
3,019c223b4a03917c2f1685beab4d5d278f7bff3913f239...,991018,2018-08-25 14:00:00,eb89c068ca204a72408360450847a990c97c5b5ff0ec9f...,110,bbb63e4ea54b0d60b48a1f8440254d7e656dfbfcbef825...,88,2,0.0,1.917,...,"[85352, 3773, 105666, 91972, 40555, 61858, 394...","[6, 9, 1, 10, 4, 7, 3, 11, 2, 5, 8]",4-4-1-1,2018-08-25 13:30:00.000,Hull City,"{""Eric Lichaj"": ""DR"", ""Jordy De Wijs"": ""DCL"", ...","[45139, 173549, 15144, 82771, 240499, 28541, 5...","[2, 6, 1, 4, 8, 9, 10, 3, 7, 11, 5]",4-4-1-1,2018-08-25 13:30:00.000
4,0f9ad12eec9f24277ab491f5f26f610eaa918903a34147...,991014,2018-08-25 16:30:00,04c71986b6503ba5b09a7098ceb79954d20049f21ba45b...,17,95d3bddc19a15d34a7876dcffc1a3e9bc63d809b69308a...,41,0,1.0,2.04,...,"[199796, 56827, 130593, 113564, 16045, 83427, ...","[7, 1, 5, 2, 8, 3, 4, 10, 9, 6, 11]",4-2-3-1,2018-08-25 16:00:00.000,Birmingham City,"{""Kristian Pedersen"": ""DL"", ""Maxime Colin"": ""D...","[229009, 86132, 77800, 114054, 69842, 85365, 8...","[3, 2, 4, 9, 11, 8, 7, 1, 5, 6, 10]",4-4-2,2018-08-25 16:00:00.000
5,0af3418e3c2c42119592875b44a1b454faca47a6758765...,1001261,2018-08-31 20:15:00,39ee7c99b58235a4a19c4ee4d15d2730309d812c0c55d0...,371,c42d39676623aaa1608de4208d738718ff45072232783f...,6685,1,0.0,1.85,...,"[52570, 210200, 68166, 87215, 186628, 158046, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",4-3-3,2018-08-31 19:03:15.972,Tondela,"{""Claudio Ramos"": ""GK"", ""David Bruno"": ""DR"", ""...","[79404, 209129, 104294, 75928, 222751, 16738, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",4-2-3-1,2018-08-31 19:05:24.983
6,0606b76acc1cb72173e5da4820ca9c2063276ea7de7d3e...,991020,2018-09-01 14:00:00,95d3bddc19a15d34a7876dcffc1a3e9bc63d809b69308a...,41,e2bfbb5453a7853e049b9434db74d4d06b8c5560ff7cf9...,52,0,1.0,2.05,...,"[59552, 86132, 15904, 89274, 77800, 49284, 853...","[6, 2, 1, 11, 4, 5, 8, 9, 7, 10, 3]",4-4-2,2018-09-01 13:30:00.000,Queens Park Rangers,"{""Joe Lumley"": ""GK"", ""Eberechi Eze"": ""MR"", ""Ja...","[167888, 232413, 80178, 91913, 106606, 18795, ...","[1, 7, 3, 4, 8, 6, 10, 9, 2, 11, 5]",4-4-2,2018-09-01 13:30:00.000
7,08eafc6acd2e4d985a50ac118ad9fd8b4313bd7f9e3035...,991031,2018-09-01 14:00:00,0db353094ccf93e0005cf378ea862b56e77cacc57b7c5e...,111,58301066042bbdf19de8fe7d41afc53626b5aa79034712...,72,1,0.0,1.61,...,"[225796, 169528, 175832, 113862, 117233, 82205...","[2, 3, 7, 8, 5, 10, 11, 4, 9, 1, 6]",4-2-3-1,2018-09-01 13:30:00.000,Rotherham United,"{""Marek Rodak"": ""GK"", ""Zak Vyner"": ""DR"", ""Sean...","[155529, 193576, 91915, 204863, 112211, 146426...","[1, 2, 5, 8, 4, 10, 11, 6, 7, 3, 9]",4-5-1,2018-09-01 13:30:00.000
8,18bd52725658801316f23d4287a8091d9fd176a3281cf1...,1001259,2018-09-01 15:30:00,8e4523cfc530651d74ec43907d2a958c957455fda4aaa9...,2030,2d946ab426d41fd25bc9f4caac4fbbb35f16590f40b24d...,240,0,1.0,2.33,...,"[91989, 72186, 434399, 39989, 84375, 73954, 78...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",4-2-3-1,2018-09-01 14:23:10.812,Vitoria de Setubal,"{""Joel Pereira"": ""GK"", ""Mano"": ""DR"", ""Andre So...","[168196, 40775, 246997, 39358, 240489, 52827, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",4-2-3-1,2018-09-01 14:27:53.902
9,1fca369b46d3696000dc263366e4e9255cf6003284a323...,985194,2018-09-01 18:00:00,6e535e19e2ee3f50ca59e9e866f03eb5411afc29f5bcbe...,1423,8cec0aa0624add20aa3a849fb96ad62c39b5510f9f45a4...,147,-1,2.0,3.15,...,"[106263, 217597, 229230, 244642, 58875, 32367,...","[6, 7, 10, 3, 5, 4, 2, 1, 9, 11, 8]",4-3-3,2018-09-01 17:06:36.273,Montpellier,"{""Ambroise Oyongo"": ""ML"", ""Paul Lasne"": ""DML"",...","[180562, 72177, 86873, 195430, 73965, 116627, ...","[3, 8, 1, 7, 9, 11, 5, 6, 10, 2, 4]",3-4-2-1,2018-09-01 17:06:36.312


In [None]:
#| hide

# Run nbdev's export step (writes the `#| export` cells to the library module).
import nbdev

nbdev.nbdev_export()