# Real example using components and different K factors

## Import statements

In [1]:
import os
import numpy as np
import pandas as pd

import dfg_rating.viz.jupyter_widgets as DFGWidgets
import dfg_rating.viz.tables as DFGTables
from dfg_rating.model.network.base_network import WhiteNetwork
from dfg_rating.model.rating.elo_rating import ELORating, SplitELORating
from dfg_rating.model.forecast.true_forecast import LogFunctionForecast
from dfg_rating.model.evaluators.accuracy import RankProbabilityScore, Likelihood

## Loading real data file

In [2]:
# Load the raw match table from the notebook's working directory.
# A comma is already read_csv's default separator, so it is not passed.
real_data = pd.read_csv('real_data.csv')
real_data

Unnamed: 0,_id,id,match_date,tournament_id,tournament_name,season_id,season_name,round_id,round_name,team1_id,...,referee_id,referee_name,stadium_id,stadium_name,duration,match_name,available_events,calc_ts_first,ts,match_iso_date
0,5fd8ef4b693ae9a24dbbae83,1797173,2020-10-20 19:55:00,11,Europe. UEFA Champions League,0,2020-2021,1,Week 1,1,...,$3949.00,Benoit Bastien,$16239.00,Stadion Krestovskyi,$96.83,Zenit - Club Brugge,True,2020-10-20 22:51:04,2021-03-20 04:26:03.96,2020-10-20T19:55:00.000Z
1,5fd8ef4b693ae9a24dbbae6d,1797126,2020-10-21 22:00:00,11,Europe. UEFA Champions League,0,2020-2021,1,Week 1,151,...,$59.00,Bjorn Kuipers,$205.00,Giuseppe Meazza,$98.45,Internazionale - Borussia M'gladbach,True,2020-10-22 00:19:01,2021-03-20 04:26:03.96,2020-10-21T22:00:00.000Z
2,5fd8ef4b693ae9a24dbbae71,1797114,2020-10-21 22:00:00,11,Europe. UEFA Champions League,0,2020-2021,1,Week 1,152,...,$688.00,Michael Oliver,$236.00,Allianz-Arena,$93.29,Bayern Munich - Atletico Madrid,True,2020-10-22 03:59:03,2021-03-20 04:26:03.96,2020-10-21T22:00:00.000Z
3,5fd8ef4b693ae9a24dbbae67,1797149,2020-10-21 22:00:00,11,Europe. UEFA Champions League,0,2020-2021,1,Week 1,157,...,$62.00,Feliks Brych,$103.00,Amsterdam Arena,$97.61,Ajax - Liverpool FC,True,2020-10-21 23:46:03,2021-03-19 13:01:04.668,2020-10-21T22:00:00.000Z
4,5fd8ef4b693ae9a24dbbae77,1797161,2020-10-20 22:00:00,11,Europe. UEFA Champions League,0,2020-2021,1,Week 1,20,...,$4059.00,Davide Massa,$170.00,Stamford Bridge,$95.27,Chelsea - Sevilla,True,2020-10-21 00:18:05,2021-03-19 13:02:00.307,2020-10-20T22:00:00.000Z
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2361,5fd8ef40693ae9a24dbba5c3,1724882,2020-11-21 15:30:00,39,England. Premier League,0,2020-2021,9,Week 9,249,...,$3909.00,Craig Pawson,$3916.00,St. James' Park,$95.28,Newcastle United - Chelsea,True,2020-11-21 18:33:04,2021-03-21 19:40:00.901,2020-11-21T15:30:00.000Z
2362,5fd8ef40693ae9a24dbba5b1,1724885,2020-11-23 23:00:00,39,England. Premier League,0,2020-2021,9,Week 9,255,...,$652.00,Andre Marriner,$180.00,Molineux,$95.37,Wolverhampton Wanderers - Southampton,True,2020-11-24 04:29:03,2021-03-21 19:40:00.901,2020-11-23T23:00:00.000Z
2363,5fd8ef40693ae9a24dbba5bf,1724884,2020-11-21 20:30:00,39,England. Premier League,0,2020-2021,9,Week 9,67,...,$2829.00,Mike Dean,$19384.00,Tottenham Hotspur Stadium,$95.71,Tottenham Hotspur - Manchester City,True,2020-11-22 00:07:01,2021-03-21 19:40:00.901,2020-11-21T20:30:00.000Z
2364,5fd8ef40693ae9a24dbba5c1,1724876,2020-11-21 18:00:00,39,England. Premier League,0,2020-2021,9,Week 9,71,...,$688.00,Michael Oliver,$43.00,Villa Park,$98.35,Aston Villa - Brighton & Hove Albion,True,2020-11-21 21:31:02,2021-03-21 19:40:00.901,2020-11-21T18:00:00.000Z


In [3]:
# Label every match from the final score: team1 scoring more is a "home"
# win, team2 scoring more is an "away" win, anything else (both comparisons
# False) is a "draw".
home_goals = real_data["team1_score"]
away_goals = real_data["team2_score"]
real_data["result"] = np.select(
    [home_goals > away_goals, home_goals < away_goals],
    ["home", "away"],
    default="draw",
)
real_data[["team1_score", "team2_score", "result"]]

Unnamed: 0,team1_score,team2_score,result
0,1,2,away
1,2,2,draw
2,4,0,home
3,0,1,away
4,0,0,draw
...,...,...,...
2361,0,2,away
2362,1,1,draw
2363,2,0,home
2364,1,2,away


In [4]:
# String version of the tournament id.
# NOTE(review): presumably the column SplitELORating uses to look up the K
# factor (its `param_split_k` keys are strings) — confirm against the library.
tournament_labels = real_data["tournament_id"].astype(str)
real_data["split_k_group"] = tournament_labels
real_data["split_k_group"].unique()

array(['11', '17', '20', '24', '31', '37', '39'], dtype=object)

In [5]:
# Column mapping that tells WhiteNetwork how to read `real_data`.
# team1 is the home side (a team1 win is labelled "home" when `result` is
# derived), and it is mapped to node2; team2 is mapped to node1.
outcome_labels = ("home", "draw", "away")
network_mapping = {
    "node1": {"id": "team2_id", "name": "team2_name"},
    "node2": {"id": "team1_id", "name": "team1_name"},
    "day": "match_date",
    "dayIsTimestamp": True,
    "ts_format": "%Y-%m-%d %H:%M:%S",
    "season": "season_id",
    "winner": {
        "result": "result",
        # The result labels are already the ones the network expects.
        "translation": {label: label for label in outcome_labels},
    },
    "round": "day",
}
real_data_network = WhiteNetwork(data=real_data, mapping=network_mapping)

Network loaded correctly


In [6]:
# Number of edges currently stored in the network graph.
real_data_network.data.number_of_edges()

2366

In [7]:
# Count the edges per international tournament.
# Tournament 11 is "Europe. UEFA Champions League" in the loaded data;
# tournament 17 is presumably the Europa League — TODO confirm.
champions = sum(
    1
    for _, _, edge in real_data_network.data.edges(data=True)
    if edge["tournament_id"] == 11
)
uefa = sum(
    1
    for _, _, edge in real_data_network.data.edges(data=True)
    if edge["tournament_id"] == 17
)

In [8]:
# Display the number of edges whose tournament_id is 17.
uefa

362

Remove teams without national league appearances (nodes with fewer than 20 edges in the network)

In [9]:
# Iterate over a snapshot of the nodes because the graph is mutated in the loop.
# NOTE(review): removing a node also lowers the degree of its neighbours, so
# which nodes end up below the threshold can depend on iteration order.
current_nodes = list(real_data_network.data.nodes())
for node in current_nodes:
    if real_data_network.data.degree(node) >= 20:
        continue
    # Log the attributes of the node that is about to be dropped.
    print(real_data_network.data.nodes[node])
    real_data_network.data.remove_node(node)

{'name': 'Linfield'}
{'name': 'SP Tre Fiori'}
{'name': 'Inter Club Escaldes'}
{'name': 'Drita'}
{'name': 'Sileks'}
{'name': 'Qarabag FK'}
{'name': 'Legia Warszawa'}
{'name': 'Zeta'}
{'name': 'U.E. Engordany'}
{'name': 'Astana'}
{'name': 'Dynamo Brest'}
{'name': 'KR'}
{'name': 'Celtic FC'}
{'name': 'Europa'}
{'name': 'Crvena zvezda'}
{'name': 'Omonia Nicosia'}
{'name': 'FC Ararat-Armenia'}
{'name': 'KuPS'}
{'name': 'Molde'}
{'name': 'Suduva'}
{'name': 'Flora'}
{'name': 'Riga'}
{'name': 'Maccabi Tel Aviv'}
{'name': 'Tirana'}
{'name': 'Dinamo Tbilisi'}
{'name': 'Dundalk'}
{'name': 'Celje'}
{'name': 'Djurgarden'}
{'name': 'Ferencvaros'}
{'name': 'Fola Esch'}
{'name': 'Sheriff'}
{'name': 'Ludogorets'}
{'name': 'Buducnost'}
{'name': 'Sarajevo'}
{'name': "Connah's Quay"}
{'name': 'CFR Cluj'}
{'name': 'Floriana'}
{'name': 'B36'}
{'name': "St Joseph's (GIB)"}
{'name': 'Barry Town United'}
{'name': 'NSI'}
{'name': 'Iskra'}
{'name': 'FC Santa Coloma'}
{'name': 'HB'}
{'name': 'Glentoran FC'}
{'nam

In [None]:
# Build the network explorer widget for the filtered network and start it.
# NOTE(review): 'external' presumably serves the app outside the notebook
# output area, and the call may block a "Run All" — confirm against
# dfg_rating.viz.jupyter_widgets.
app = DFGWidgets.NetworkExplorer(network=real_data_network)
app.run('external')

## Adding a trained ELO Rating and storing results

In [11]:
# Inclusive grid of candidate K factors: 20, 22, ..., 100.
minimum_k = 20
maximum_k = 100
k_options = list(range(minimum_k, maximum_k + 1, 2))

In [12]:
# Train one ELO rating per candidate K (the same K for every tournament),
# attach a forecast and two evaluators to it, and collect the per-match
# evaluation rows in `same_k_results`.
same_k_results = []
for k in k_options:
    rating_name = f"elo_rating_{k}"
    forecast_name = f"elo_forecast_{k}"

    single_k_elo = ELORating(
        trained=True,
        rating_name=rating_name,
        param_c=10,
        param_d=400,
        param_k=k,
        param_w=50,
    )
    real_data_network.add_rating(single_k_elo, rating_name=rating_name)

    forecast_model = LogFunctionForecast(
        outcomes=['home', 'draw', 'away'],
        coefficients=[-0.9, 0.3],
        beta_parameter=0.006,
    )
    real_data_network.add_forecast(forecast_model, forecast_name, rating_name)

    rps_evaluator = RankProbabilityScore(
        outcomes=['home', 'draw', 'away'],
        forecast_name=forecast_name,
    )
    likelihood_evaluator = Likelihood(
        outcomes=['home', 'draw', 'away'],
        forecast_name=forecast_name,
    )
    real_data_network.add_evaluation([
        (rps_evaluator, f"{rating_name}_RPS"),
        (likelihood_evaluator, f"{rating_name}_likelihood"),
    ])

    # NOTE(review): `add_true_dimension` shows up as a column of the resulting
    # frame, so it may be treated as an extra attribute rather than an option
    # — confirm against DFGTables.get_evaluation.
    evaluation_rows = DFGTables.get_evaluation(
        real_data_network,
        k,
        evaluators=["RPS", "likelihood"],
        extra_attributes=["tournament_id", "tournament_name"],
        add_true_dimension=False,
    )
    same_k_results.extend(evaluation_rows)

elo_forecast_20
elo_forecast_22
elo_forecast_24
elo_forecast_26
elo_forecast_28
elo_forecast_30
elo_forecast_32
elo_forecast_34
elo_forecast_36
elo_forecast_38
elo_forecast_40
elo_forecast_42
elo_forecast_44
elo_forecast_46
elo_forecast_48
elo_forecast_50
elo_forecast_52
elo_forecast_54
elo_forecast_56
elo_forecast_58
elo_forecast_60
elo_forecast_62
elo_forecast_64
elo_forecast_66
elo_forecast_68
elo_forecast_70
elo_forecast_72
elo_forecast_74
elo_forecast_76
elo_forecast_78
elo_forecast_80
elo_forecast_82
elo_forecast_84
elo_forecast_86
elo_forecast_88
elo_forecast_90
elo_forecast_92
elo_forecast_94
elo_forecast_96
elo_forecast_98
elo_forecast_100


In [13]:
# Grid search over two K factors: one for the international tournaments
# (ids "11" and "17") and one for the remaining tournaments. Every
# combination gets its own rating, forecast and evaluators on the network.
diff_k_results = []
for inleague_k in k_options:
    for outleague_k in k_options:
        run_label = f"in{inleague_k}_out{outleague_k}"
        rating_name = f"elo_rating_in{inleague_k}_out{outleague_k}"
        forecast_name = f"elo_forecast_in{inleague_k}_out{outleague_k}"

        # K factor per tournament id (string keys).
        k_by_tournament = {
            **dict.fromkeys(("11", "17"), outleague_k),
            **dict.fromkeys(("20", "24", "31", "37", "39"), inleague_k),
        }
        split_elo = SplitELORating(
            trained=True,
            param_split_k=k_by_tournament,
            param_c=10,
            param_d=400,
            param_w=50,
        )
        real_data_network.add_rating(rating=split_elo, rating_name=rating_name)

        forecast_model = LogFunctionForecast(
            outcomes=['home', 'draw', 'away'],
            coefficients=[-0.9, 0.3],
            beta_parameter=0.006,
        )
        real_data_network.add_forecast(forecast_model, forecast_name, rating_name)

        rps_evaluator = RankProbabilityScore(
            outcomes=['home', 'draw', 'away'],
            forecast_name=forecast_name,
        )
        likelihood_evaluator = Likelihood(
            outcomes=['home', 'draw', 'away'],
            forecast_name=forecast_name,
        )
        real_data_network.add_evaluation([
            (rps_evaluator, f"{rating_name}_RPS"),
            (likelihood_evaluator, f"{rating_name}_likelihood"),
        ])

        evaluation_rows = DFGTables.get_evaluation(
            real_data_network,
            run_label,
            evaluators=["RPS", "likelihood"],
            extra_attributes=["tournament_id", "tournament_name"],
            add_true_dimension=False,
            elo_rating_in=inleague_k,
            elo_rating_out=outleague_k,
        )
        diff_k_results.extend(evaluation_rows)

elo_forecast_in20_out20
elo_forecast_in20_out22
elo_forecast_in20_out24
elo_forecast_in20_out26
elo_forecast_in20_out28
elo_forecast_in20_out30
elo_forecast_in20_out32
elo_forecast_in20_out34
elo_forecast_in20_out36
elo_forecast_in20_out38
elo_forecast_in20_out40
elo_forecast_in20_out42
elo_forecast_in20_out44
elo_forecast_in20_out46
elo_forecast_in20_out48
elo_forecast_in20_out50
elo_forecast_in20_out52
elo_forecast_in20_out54
elo_forecast_in20_out56
elo_forecast_in20_out58
elo_forecast_in20_out60
elo_forecast_in20_out62
elo_forecast_in20_out64
elo_forecast_in20_out66
elo_forecast_in20_out68
elo_forecast_in20_out70
elo_forecast_in20_out72
elo_forecast_in20_out74
elo_forecast_in20_out76
elo_forecast_in20_out78
elo_forecast_in20_out80
elo_forecast_in20_out82
elo_forecast_in20_out84
elo_forecast_in20_out86
elo_forecast_in20_out88
elo_forecast_in20_out90
elo_forecast_in20_out92
elo_forecast_in20_out94
elo_forecast_in20_out96
elo_forecast_in20_out98
elo_forecast_in20_out100
elo_forecast_in

### K for each league

In [None]:
# Coarser inclusive grid for the per-league search: 30, 35, ..., 60.
minimum_k = 30
maximum_k = 60
k_options = list(range(minimum_k, maximum_k + 1, 5))

In [None]:
import itertools

# One independent K choice for each of the 7 tournaments.
full_list = [k_options] * 7
# itertools.product returns a one-shot iterator: the first loop over it would
# leave nothing for any later loop. Materialise it so that `test_runs` can be
# iterated more than once (len(k_options) ** 7 tuples are kept in memory).
test_runs = list(itertools.product(*full_list))

In [None]:
# Print every K combination, one space-separated line per combination.
# NOTE(review): if `test_runs` is a one-shot iterator, this loop consumes it.
for combination in test_runs:
    print(*combination)

In [None]:
# Grid search with an independent K factor per tournament. For every
# combination in `test_runs` a SplitELORating is trained, a forecast and two
# evaluators are attached, and the evaluation rows are appended to
# `experiment_results`.
experiment_results = []
for k_11, k_17, k_20, k_24, k_31, k_37, k_39 in test_runs:
    test_run_name = f"11_{k_11}__17_{k_17}__20_{k_20}__24_{k_24}__31_{k_31}__37_{k_37}__39_{k_39}__"
    rating_name = f"elo_rating_{test_run_name}"
    forecast_name = f"elo_forecast_{test_run_name}"
    elo = SplitELORating(
        trained=True,
        param_split_k={
            "11": k_11,
            "17": k_17,
            "20": k_20,
            "24": k_24,
            "31": k_31,
            "37": k_37,
            "39": k_39
        },
        # Same c/d/w as the single-K and in/out-league experiments, so that the
        # three experiments differ only in their K factors instead of relying
        # on the library defaults here.
        param_c=10,
        param_d=400,
        param_w=50
    )
    real_data_network.add_rating(rating=elo, rating_name=rating_name)
    real_data_network.add_forecast(
        LogFunctionForecast(
            outcomes=['home', 'draw', 'away'],
            coefficients=[-0.9, 0.3],
            beta_parameter=0.006
        ),
        forecast_name,
        rating_name
    )
    rps = RankProbabilityScore(
        outcomes=['home', 'draw', 'away'],
        forecast_name=forecast_name
    )
    likelihood = Likelihood(
        outcomes=['home', 'draw', 'away'],
        forecast_name=forecast_name
    )
    real_data_network.add_evaluation([
        (rps, f"{rating_name}_RPS"),
        (likelihood, f"{rating_name}_likelihood")
    ])
    experiment_results += DFGTables.get_evaluation(
        real_data_network,
        test_run_name,
        evaluators=["RPS", "likelihood"],
        # Keep the tournament columns, as the other experiments do: the
        # league / international split of the results needs tournament_id.
        extra_attributes=["tournament_id", "tournament_name"],
        add_true_dimension=False,
        k_11=k_11,
        k_17=k_17,
        k_20=k_20,
        k_24=k_24,
        k_31=k_31,
        k_37=k_37,
        k_39=k_39,
    )

## Reading results

In [34]:
import pandas as pd
import numpy as np
import datetime
import os
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from scipy.stats import pearsonr
import statsmodels.api as sm
from tqdm import tqdm

# Use the "plotly_white" template as the default for every figure created below.
pio.templates.default = "plotly_white"

# Render floats with 8 decimals and lift the column/row limits so that
# displayed frames are never truncated.
pd.set_option("display.float_format", '{:.8f}'.format)
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [15]:
# Evaluation records of the single-K experiment as a DataFrame.
same_k_df = pd.DataFrame(same_k_results)

In [21]:
# Evaluation records of the in-league / out-league K experiment as a DataFrame.
diff_k_df = pd.DataFrame(diff_k_results)

In [19]:
# tournament_id values treated as international competitions.
# 11 is "Europe. UEFA Champions League" in the loaded data; 17 is presumably
# the Europa League — TODO confirm.
international_league_ids = [11, 17]

In [28]:
# Print the column names, non-null counts and dtypes of the single-K results.
same_k_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77818 entries, 0 to 77817
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   HomeTeam            77818 non-null  object 
 1   AwayTeam            77818 non-null  object 
 2   Season              77818 non-null  int64  
 3   Round               77818 non-null  int64  
 4   Day                 77818 non-null  int64  
 5   Result              77818 non-null  object 
 6   CalculatedForecast  77818 non-null  object 
 7   ELO_Rating_K        77818 non-null  int64  
 8   tournament_id       77818 non-null  int64  
 9   tournament_name     77818 non-null  object 
 10  add_true_dimension  77818 non-null  bool   
 11  RPS                 77818 non-null  float64
 12  likelihood          77818 non-null  float64
 13  Home_elo_rating     77818 non-null  float64
 14  Away_elo_rating     77818 non-null  float64
dtypes: bool(1), float64(4), int64(5), object(5)
memory us

In [22]:
# Persist both result tables under ./results, stamped with the current time.
results_dir = "results"
# to_csv does not create missing directories, so create the folder up front
# rather than failing on a fresh checkout.
os.makedirs(results_dir, exist_ok=True)
# NOTE(review): the timestamp contains ':' which is not a valid file-name
# character on Windows; the format is kept so that file names stay compatible
# with earlier runs.
today = datetime.datetime.today().strftime("%A, %d. %B %Y %I:%M%p")
same_k_df.to_csv(os.path.join(results_dir, f"SAME {today}.csv"))
diff_k_df.to_csv(os.path.join(results_dir, f"DIFF {today}.csv"))

In [None]:
# Reload a result table written by an earlier run.
# NOTE(review): this name carries no "SAME "/"DIFF " prefix, unlike the files
# written by the save cell — confirm that the file exists under this name.
past_result_file_name = "Tuesday, 10. May 2022 11:50AM.csv"
past_result_path = os.path.join("results", past_result_file_name)
experiment_df = pd.read_csv(past_result_path)

### Split of data
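The first 10 rounds are used for rating initialisation. After that, a third of the rounds is used for the in-sample evaluation. (Note: the filter below currently keeps every round after the first 20% of the rounds, i.e. `Round > 0.2 * number_of_rounds`.)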

In [23]:
# Highest round index plus one.
# NOTE(review): presumably rounds are numbered from 0 — confirm.
number_of_rounds = same_k_df["Round"].max() + 1
same_k_df.columns

Index(['HomeTeam', 'AwayTeam', 'Season', 'Round', 'Day', 'Result',
       'CalculatedForecast', 'ELO_Rating_K', 'tournament_id',
       'tournament_name', 'add_true_dimension', 'RPS', 'likelihood',
       'Home_elo_rating', 'Away_elo_rating'],
      dtype='object')

In [24]:
# Keep only the rounds after the first 20% of the round range. The upper
# bound equals number_of_rounds, which is above the largest round index.
first_cut = number_of_rounds * 0.2
last_cut = number_of_rounds * 1.0
evaluation_mask = (same_k_df["Round"] > first_cut) & (same_k_df["Round"] <= last_cut)
in_sample = same_k_df.loc[evaluation_mask]

In [35]:
# Mean RPS and mean likelihood per K value over the in-sample matches.
in_sample_agg = (
    in_sample
    .groupby('ELO_Rating_K', as_index=False)[['RPS', 'likelihood']]
    .mean()
)

The 5 best K values (lowest mean RPS) when training with a single K for all competitions

In [37]:
# Five K values with the lowest mean RPS (lower is better).
in_sample_agg.sort_values("RPS").head(5)

Unnamed: 0,ELO_Rating_K,RPS,likelihood
11,42,0.21213418,-1.02497322
12,44,0.21215096,-1.02504176
10,40,0.21215173,-1.02501852
13,46,0.21219867,-1.02521415
9,38,0.21220751,-1.02518903


In [29]:
# Select the matches evaluated with the best single K. The K is read from the
# aggregate (lowest mean in-sample RPS) instead of being copied by hand from a
# previous output, so it stays correct when the data or the K grid changes.
best_same_k = in_sample_agg.loc[in_sample_agg["RPS"].idxmin(), "ELO_Rating_K"]
optimal_same_df = in_sample[in_sample.ELO_Rating_K == best_same_k]

Aggregated RPS inside the leagues

In [31]:
# Mean RPS over matches that are NOT in an international tournament.
is_international = optimal_same_df["tournament_id"].isin(international_league_ids)
optimal_same_df.loc[~is_international, "RPS"].mean()

0.2111280175290197

Aggregated RPS outside the leagues

In [33]:
# Mean RPS over matches that ARE in an international tournament.
is_international = optimal_same_df["tournament_id"].isin(international_league_ids)
optimal_same_df.loc[is_international, "RPS"].mean()

0.2349125154533411

In [39]:
# Mean in-sample RPS as a function of K (one K for all tournaments).
fig = go.Figure(
    data=go.Scatter(
        x=in_sample_agg["ELO_Rating_K"],
        y=in_sample_agg["RPS"],
        mode='lines+markers'
    )
)
fig.update_layout(
    # Title and axis labels so the figure can be read on its own.
    title="Mean in-sample RPS by K (single K for all tournaments)",
    xaxis_title="K factor",
    yaxis_title="Mean RPS",
    width=600,
    height=600
)
fig.show()

### Analysis for international matches

In [41]:
# Same round filter as for the single-K analysis, applied to the split-K
# results. Note that this rebinds `in_sample` and `in_sample_agg`.
first_cut = number_of_rounds * 0.2
last_cut = number_of_rounds * 1.0
evaluation_mask = (diff_k_df["Round"] > first_cut) & (diff_k_df["Round"] <= last_cut)
in_sample = diff_k_df.loc[evaluation_mask]

# One row per K combination: mean scores plus the first in/out K value
# found in each group.
in_sample_agg = in_sample.groupby('ELO_Rating_K', as_index=False).agg(
    RPS=('RPS', 'mean'),
    likelihood=('likelihood', 'mean'),
    elo_rating_in=('elo_rating_in', 'first'),
    elo_rating_out=('elo_rating_out', 'first'),
)

The 10 best K combinations (lowest mean RPS) when training with one K for the domestic leagues and another K for the international competitions

In [43]:
# Ten K combinations with the lowest mean RPS (lower is better).
in_sample_agg.sort_values("RPS").head(10)

Unnamed: 0,ELO_Rating_K,RPS,likelihood,elo_rating_in,elo_rating_out
510,in42_out54,0.21210446,-1.02491174,42,54
511,in42_out56,0.21210476,-1.02491737,42,56
509,in42_out52,0.21210563,-1.02491053,42,52
512,in42_out58,0.2121065,-1.02492737,42,58
508,in42_out50,0.21210827,-1.02491382,42,50
513,in42_out60,0.21210965,-1.02494167,42,60
507,in42_out48,0.21211243,-1.02492166,42,48
469,in40_out54,0.2121141,-1.02493619,40,54
514,in42_out62,0.2121142,-1.02496021,42,62
468,in40_out52,0.21211477,-1.0249338,40,52


In [45]:
# Heatmap of the mean in-sample RPS over the (in-league K, out-league K) grid.
fig = go.Figure(
    data=go.Heatmap(
        x=in_sample_agg["elo_rating_in"],
        y=in_sample_agg["elo_rating_out"],
        z=in_sample_agg["RPS"],
        colorscale=px.colors.sequential.gray,
        colorbar=dict(title="Mean RPS")
    )
)
fig.update_layout(
    # Without labels the two K axes cannot be told apart.
    title="Mean in-sample RPS by in-league and out-league K",
    xaxis_title="In-league K (elo_rating_in)",
    yaxis_title="Out-league K (elo_rating_out)",
    width=600,
    height=600
)
fig.show()

In [52]:
# Select the matches evaluated with the best K combination. The label is read
# from the aggregate (lowest mean in-sample RPS) instead of being copied by
# hand from a previous output, so it stays correct on re-runs.
# The variable keeps its name because the following cells read it.
best_split_k = in_sample_agg.loc[in_sample_agg["RPS"].idxmin(), "ELO_Rating_K"]
optimal_same_df = in_sample[in_sample.ELO_Rating_K == best_split_k]

Aggregated RPS inside the leagues

In [53]:
# Mean RPS over matches that are NOT in an international tournament.
is_international = optimal_same_df["tournament_id"].isin(international_league_ids)
optimal_same_df.loc[~is_international, "RPS"].mean()

0.21112963726904088

Aggregated RPS outside the leagues

In [54]:
# Mean RPS over matches that ARE in an international tournament.
is_international = optimal_same_df["tournament_id"].isin(international_league_ids)
optimal_same_df.loc[is_international, "RPS"].mean()

0.23417341994059981