# Impact of the network density

Network density describes the portion of the potential connections in a network that are actual connections. Network density is defined as follows:

$Density = \frac{\text{Actual Connections}}{\text{Potential Connections}} = \frac{\text{Actual Connections}}{\frac{n(n-1)}{2}}$

## Import statements

In [None]:
import time
import networkx as nx

from typing import List
import dfg_rating.viz.jupyter_widgets as DFGWidgets
from dfg_rating.model.network.base_network import BaseNetwork
from dfg_rating.model.network.simple_network import RoundRobinNetwork
from dfg_rating.model.network.random_network import ConfigurationModelNetwork
from dfg_rating.model.rating.controlled_trend_rating import ControlledTrendRating, ControlledRandomFunction
from dfg_rating.model.rating.elo_rating import ELORating


from dfg_rating.model.evaluators.accuracy import RankProbabilityScore, Likelihood
from dfg_rating.model.forecast.true_forecast import LogFunctionForecast

from dfg_rating.viz.tables import get_evaluation

## Data Generation

To generate a full range of networks with different densities, we increase the number of vertices in the network while keeping the in- and out-degree of the nodes constant.

In [None]:
in_degree = 100
out_degree = 100

In [None]:
initial_number_of_nodes = 50
maximum_number_of_nodes = 400
nodes_step = 10

### Example network

In [None]:
example_network = ConfigurationModelNetwork(
    teams=500,
    days_between_rounds=3,
    true_forecast=LogFunctionForecast(
        outcomes=['home', 'draw', 'away'],
        coefficients = [-0.9,0.3],
        beta_parameter=0.006
    ),
    true_rating=ControlledTrendRating(
        starting_point=ControlledRandomFunction(distribution='normal', loc=1000, scale=100),
        delta=ControlledRandomFunction(distribution='normal', loc=0, scale=3),
        trend=ControlledRandomFunction(distribution='normal', loc=0, scale=20/365),
        season_delta=ControlledRandomFunction(distribution='normal', loc=0, scale=10)
    ),
    expected_home_matches=100,
    expected_away_matches=100,
    variance_home_matches=0,
    variance_away_matches=0
)

In [None]:
example_network.density(True)

In [None]:
# Explore the example network built above; no variable named `rr` exists in this notebook.
app = DFGWidgets.NetworkExplorer(network=example_network)

In [None]:
app.run('inline', port=8001)

### Range of networks

In [None]:
networks_list: List[BaseNetwork] = []

In [None]:
# Build one configuration-model network per node count. The expected in/out
# degree stays fixed, so only the number of teams changes between networks.
nodes_range = range(initial_number_of_nodes, maximum_number_of_nodes + 1, nodes_step)
for number_of_nodes in nodes_range:
    start_time = time.time()
    true_rating = ControlledTrendRating(
        starting_point=ControlledRandomFunction(distribution='normal', loc=1000, scale=100),
        delta=ControlledRandomFunction(distribution='normal', loc=0, scale=3),
        trend=ControlledRandomFunction(distribution='normal', loc=0, scale=20/365),
        season_delta=ControlledRandomFunction(distribution='normal', loc=0, scale=10)
    )
    network = ConfigurationModelNetwork(
        teams=number_of_nodes,
        days_between_rounds=3,
        true_rating=true_rating,
        expected_home_matches=in_degree,
        expected_away_matches=out_degree,
        variance_home_matches=0,
        variance_away_matches=0
    )
    networks_list.append(network)
    # Report how long the construction of this network took.
    print(f"Added network with {number_of_nodes} number of nodes in {time.time() -  start_time} seconds.")

In [None]:
[n.density(True) for n in networks_list]

### Evaluation metrics

In [None]:
# Grid of ELO K-factors to evaluate: every 4th value from minimum_k up to and
# including maximum_k.
minimum_k = 15
maximum_k = 55
k_options = list(range(minimum_k, maximum_k + 1, 4))

In [None]:
# For every K-factor: attach an ELO rating, the forecast derived from that
# rating, and the RPS / likelihood evaluators to each network.
for k_parameter in k_options:
    start_time = time.time()
    rating_name = f"elo_rating_{k_parameter}"
    forecast_name = f"elo_forecast_{k_parameter}"
    elo = ELORating(trained=True, param_k=k_parameter)
    rps = RankProbabilityScore(
        outcomes=['home', 'draw', 'away'],
        forecast_name=forecast_name
    )
    likelihood = Likelihood(
        outcomes=['home', 'draw', 'away'],
        forecast_name=forecast_name
    )
    for i, n in enumerate(networks_list):
        print(i, end='\r')  # single-line progress indicator
        n.add_rating(
            rating=elo,
            rating_name=rating_name
        )
        # The forecast is registered under forecast_name and built on the
        # rating registered just above.
        n.add_forecast(
            LogFunctionForecast(
                outcomes=['home', 'draw', 'away'],
                coefficients=[-0.9, 0.3],
                beta_parameter=0.006
            ),
            forecast_name,
            rating_name
        )
        n.add_evaluation(rps, f"{rating_name}_RPS")
        n.add_evaluation(likelihood, f"{rating_name}_likelihood")

    print(f"Added ELO Rating with k = {k_parameter} in {time.time() - start_time} seconds.")

## Results

In [None]:
experiment_results = []

In [None]:
# Collect the RPS evaluation rows of every network for every K-factor and tag
# each row with the network's node count and density.
for k_parameter in k_options:
    # nodes_range and networks_list were built in lockstep, so they pair up 1:1.
    for number_of_nodes, n in zip(nodes_range, networks_list):
        experiment_results.extend(
            get_evaluation(
                n,
                k_parameter,
                evaluators=['RPS'],
                Number_of_nodes=number_of_nodes,
                Density=n.density(True),
            )
        )

# Import for Results

In [None]:
import pandas as pd
import numpy as np
import datetime
import os
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from scipy.stats import pearsonr
import statsmodels.api as sm
from tqdm import tqdm

pio.templates.default = "plotly_white"

pd.options.display.float_format = '{:.4f}'.format

pd.set_option("display.max_columns", None)

### New results

In [None]:
experiment_df = pd.DataFrame(experiment_results)

In [None]:
# Persist the freshly generated results under a timestamped file name.
results_dir = "Density_results"
# DataFrame.to_csv does not create missing directories, so make sure it exists.
os.makedirs(results_dir, exist_ok=True)
# The timestamp avoids ':' (not allowed in Windows file names) and sorts chronologically.
today = datetime.datetime.today().strftime("%Y-%m-%d_%H-%M-%S")
experiment_df.to_csv(os.path.join(results_dir, f"{today}.csv"))

### Read past results

In [None]:
experiment_df = pd.read_csv(os.path.join("..", "..", "scripts","Final_results", "final_density.csv"))

In [None]:
experiment_df[(experiment_df["Number_of_nodes"] == 51) & (experiment_df["ELO_Rating_K"] == 45)][["HomeTeam", "AwayTeam", "Round"]].to_csv("test.csv")

In [None]:
len(experiment_df.Density.unique())

In [None]:
# For each (Number_of_nodes, ELO_Rating_K) group: the first density value and
# the number of rows in the group.
density_columns = experiment_df[["ELO_Rating_K", "Number_of_nodes", "Density"]]
density_groups = density_columns.groupby(["Number_of_nodes", "ELO_Rating_K"])
explore = density_groups.agg({"Density": ["first", "count"]}).reset_index()
# Preview the K = 15 slice.
explore[explore["ELO_Rating_K"] == 15].head()

Bootstrap function

In [None]:
def bootstrap(data, n_iter=9999):
    """Return a 95% percentile-bootstrap interval for the mean of ``data``.

    Draws ``n_iter`` resamples of ``len(data)`` observations with replacement
    (via NumPy's global random state), takes the mean of each resample and
    returns the 2.5% and 97.5% quantiles of those means.

    Args:
        data: Sequence of observations to resample from.
        n_iter: Number of bootstrap resamples.

    Returns:
        A ``(lower, upper)`` tuple with the two quantiles.
    """
    sample_size = len(data)
    # One resampled mean per iteration; tqdm only adds a progress bar.
    resampled_means = np.array(
        [
            np.mean(np.random.choice(data, size=sample_size, replace=True))
            for _ in tqdm(range(n_iter))
        ],
        dtype=float,
    )
    lower, upper = np.quantile(a=resampled_means, q=[0.025, 0.975])
    return lower, upper

### Fig settings

In [None]:
# Options for Plotly's "download plot" mode-bar button.
config = dict(
    toImageButtonOptions=dict(
        format='svg',  # one of png, svg, jpeg, webp
        filename='figure',
        height=400,
        width=1000,
        scale=1,  # Multiply title/legend/axis/canvas sizes by this factor
    )
)


def return_fig_settings(fig_instance, x_title, y_title):
    """Apply the shared figure styling to ``fig_instance`` in place.

    Sets both axis titles, a bordered horizontal legend anchored at
    ``x=1, y=1``, a black 14pt Times New Roman font and a white plot
    background. Despite its name, the function returns nothing.

    Args:
        fig_instance: Figure object exposing ``update_layout``.
        x_title: Title for the x axis.
        y_title: Title for the y axis.
    """
    legend_style = {
        'font': {'family': 'Times New Roman', 'size': 14, 'color': "Black"},
        'orientation': 'h',
        'bordercolor': "Black",
        'borderwidth': 1,
        'yanchor': "bottom",
        'xanchor': 'right',
        'x': 1,
        'y': 1,
    }
    fig_instance.update_layout(
        xaxis_title=x_title,
        yaxis_title=y_title,
        legend=legend_style,
    )
    text_font = {'family': 'Times New Roman', 'size': 14, 'color': 'black'}
    fig_instance.update_layout(
        font=text_font,  # font formatting
        plot_bgcolor='white',  # background color
    )

## Analysis

Raw data from the simulation is at match level. For every match we have each elo_rating K, forecast, RPS and differences

In [None]:
# Bar chart of the row count per network against its density, taken from the
# K = 15 slice of `explore`.
# .copy() yields an independent frame, so the column assignments below do not
# raise pandas' SettingWithCopyWarning.
subexplore = explore[explore["ELO_Rating_K"] == 15].copy()
print(subexplore.columns)
subexplore["mean"] = subexplore[('Density', 'count')].mean()
subexplore["active"] = subexplore.Number_of_nodes * 49
fig = go.Figure()
# plot structure
fig.add_trace(go.Bar(
    x=subexplore[('Density', 'first')],
    y=subexplore[('Density', 'count')],
    marker_color='gray',
    showlegend=False
))
# plot
return_fig_settings(fig, 'Network density', 'Number of edges')
fig.update_layout(showlegend=True)
fig.show(config=config)

Some initial densities overlap due to an odd number of nodes combined with a fixed number of rounds (in which some of the nodes do not participate).

In [None]:
experiment_df[(experiment_df.Number_of_nodes == 2451) & (experiment_df.ELO_Rating_K == 15)]

### Utils

In [None]:
[c for c in experiment_df.columns if c.startswith('Awayelo') ]

In [None]:
# Collapse the per-K rating columns into one home and one away rating column.
# Missing values are treated as 0 before the row-wise sum.
# NOTE(review): presumably each row only carries a value in the column of its
# own K, so the sum recovers that single rating — confirm against the data.
elo_k_values = range(15, 66, 2)  # K values with a per-K column: 15, 17, ..., 65
for side in ("Home", "Away"):
    per_k_columns = [f"{side}elo_rating_{k}" for k in elo_k_values]
    # Written as `<side>_elo_rating` because the later cells read `Home_elo_rating`
    # and `Away_elo_rating`. Both sides are built before the per-K columns are dropped.
    experiment_df[f"{side}_elo_rating"] = experiment_df[per_k_columns].fillna(0).sum(axis=1)

In [None]:
test_series.loc[1111]

In [None]:
# Remove the per-K rating columns (home and away, K = 15, 17, ..., 65) in place.
per_k_rating_columns = [
    f"{side}elo_rating_{k}"
    for k in range(15, 66, 2)
    for side in ("Home", "Away")
]
experiment_df.drop(columns=per_k_rating_columns, inplace=True)

#### TotalDiff and RatingError

We first add TotalDiff as the addition of the absolute errors in home team rating and away team rating

In [None]:
# Home-minus-away rating gap, once from the true ratings and once from the ELO ratings.
experiment_df['TrueDiff'] = experiment_df['HomeRating'] - experiment_df['AwayRating']
experiment_df['ELODiff'] = experiment_df['Home_elo_rating'] - experiment_df['Away_elo_rating']
# Absolute difference between the magnitudes of the two gaps (the sign of each gap is ignored).
experiment_df['RatingError'] = abs(experiment_df['TrueDiff'].abs() - experiment_df['ELODiff'].abs())
# Summary statistics of the new column.
experiment_df['RatingError'].describe()

In [None]:
# Per-team rating error: true rating minus ELO rating, for the home and the away side.
experiment_df['HomeDiff'] = experiment_df['HomeRating'] - experiment_df['Home_elo_rating']
experiment_df['AwayDiff'] = experiment_df['AwayRating'] - experiment_df['Away_elo_rating']
# TotalDiff is the sum of the two absolute team errors. Taking their difference
# instead would let equally large home and away errors cancel to zero.
experiment_df['TotalDiff'] = experiment_df['HomeDiff'].abs() + experiment_df['AwayDiff'].abs()
# Summary statistics of the new column.
experiment_df['TotalDiff'].describe()

In [None]:
experiment_df["TeamDiff"] = (experiment_df["HomeDiff"].abs() + experiment_df["AwayDiff"].abs()) / 2

In [None]:
experiment_df[(experiment_df.Round == 4716) & (experiment_df.Number_of_nodes == 2451) & (experiment_df.ELO_Rating_K == 15)][["HomeTeam", "AwayTeam", "Round", "TrueForecast", "CalculatedForecast", "Result", "RPS", "Home_elo_rating", "Away_elo_rating"]]

In [None]:
test = experiment_df.groupby(["Number_of_nodes", "Density", "Round", "Day"]).agg({"HomeRating": ["mean", "std"]})

In [None]:
test.loc[981]

### Optimal K for density values

#### Split of data

The first 20% of the rounds are used for rating initialisation. The rounds after that, up to half of all rounds, are used for the in-sample evaluation.

In [None]:
# Per density level, derive two round thresholds from the highest round number:
# 20% of it (stored as Init_Rounds) and 50% of it (stored as Split_Rounds).
last_round_by_density = experiment_df.groupby("Density")["Round"].max()
init_dict = {
    density: int(last_round * 0.2)
    for density, last_round in last_round_by_density.items()
}
split_dict = {
    density: int(last_round * 0.5)
    for density, last_round in last_round_by_density.items()
}
# Attach both thresholds to every row through its density value.
experiment_df['Init_Rounds'] = experiment_df['Density'].map(init_dict)
experiment_df['Split_Rounds'] = experiment_df['Density'].map(split_dict)
experiment_df.head()

In [None]:
in_sample = experiment_df[(experiment_df.Round > experiment_df.Init_Rounds) & (experiment_df.Round <=  experiment_df.Split_Rounds)]
in_sample.head()

In_sample and out_sample rounds

In [None]:
in_sample[in_sample.Number_of_nodes == 260].Round.unique()

In [None]:
# Mean scores per network and K-factor, computed on the in-sample rounds only,
# so that the optimal-K selection below does not see the out-of-sample rounds.
in_sample_agg = in_sample.groupby(['Number_of_nodes', 'Density', 'ELO_Rating_K'], as_index=False).agg({
    'RPS': 'mean', 'RatingError': 'mean', 'TotalDiff': 'mean'
})
# Inspect the 50-node network, best (lowest) RPS first.
in_sample_agg[in_sample_agg.Number_of_nodes == 50].sort_values(by=['RPS'], ascending=True)

In [None]:
# Keep, for every network, the row(s) whose mean RPS equals the group's minimum.
# The string alias 'min' replaces the Python builtin, which pandas deprecates
# as a groupby callable.
lowest_rps = in_sample_agg.groupby(['Number_of_nodes', 'Density'])['RPS'].transform('min')
# .copy() makes the selection an independent frame before a column is added to it.
optimal_k_df = in_sample_agg[in_sample_agg['RPS'] == lowest_rps].copy()
# Constant column, used by the scatter plot to draw one black series.
optimal_k_df["color"] = "black"
optimal_k_df.head()

In [None]:
# Options for Plotly's "download plot" mode-bar button.
config = {
    'toImageButtonOptions': dict(
        format='svg',  # one of png, svg, jpeg, webp
        filename='figure',
        height=400,
        width=1000,
        scale=1,  # Multiply title/legend/axis/canvas sizes by this factor
    ),
}


def return_fig_settings(fig_instance, x_title, y_title):
    """Style ``fig_instance`` in place; nothing is returned.

    Sets both axis titles, a bordered (width 2) horizontal legend anchored at
    ``x=1, y=1``, a black 15pt Times New Roman font and a white plot
    background.

    Args:
        fig_instance: Figure object exposing ``update_layout``.
        x_title: Title for the x axis.
        y_title: Title for the y axis.
    """
    font_family = 'Times New Roman'
    font_size = 15
    legend_style = {
        'font': {'family': font_family, 'size': font_size, 'color': "Black"},
        'orientation': 'h',
        'bordercolor': "Black",
        'borderwidth': 2,
        'yanchor': "bottom",
        'xanchor': 'right',
        'x': 1,
        'y': 1,
    }
    fig_instance.update_layout(
        xaxis_title=x_title,
        yaxis_title=y_title,
        legend=legend_style,
    )
    fig_instance.update_layout(
        font={'family': font_family, 'size': font_size, 'color': 'black'},  # font formatting
        plot_bgcolor='white',  # background color
    )

In [None]:
fig = px.scatter(
    optimal_k_df, x='Density', y='ELO_Rating_K', color="color", trendline='ols', trendline_color_override='lightgray',
    color_discrete_map= {"black": "black"},
    labels={
        "ELO_Rating_K": "K"
    }
)
return_fig_settings(fig, "Network density", "Optimal K")
fig.update_yaxes(range=[14, 60])
fig.update_xaxes(tickmode='linear', tick0=0.0, dtick=0.1)
fig.update_layout(showlegend=False)
fig.show(config=config)

In [None]:
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        y=optimal_k_df["ELO_Rating_K"],
        x=optimal_k_df["Density"],
        marker_color="black",
        mode="lines+markers"
    )
)
return_fig_settings(fig, "Network density", "Optimal K")
fig.update_yaxes(range=[10, 60])
fig.update_xaxes(tickmode='linear', tick0=0.0, dtick=0.1)
fig.update_layout(showlegend=False)
fig.show(config=config)

In [None]:
df_optimal = experiment_df.merge(optimal_k_df, on=['Density', 'Number_of_nodes','ELO_Rating_K'], how='inner', suffixes=["", "_IS"])
df_optimal = df_optimal[df_optimal.Round > df_optimal.Split_Rounds]
df_optimal

At very low levels of density and a high number of games (^^ number of nodes with 98 rounds), there are matches in which the true ratings of the home and away team differ strongly, which leads to a clear prediction with a very low RPS.

In [None]:
df_optimal[df_optimal.Number_of_nodes == 2451].RPS.describe()

In [None]:
# Mean of every metric per (Number_of_nodes, Density, ELO_Rating_K) group of df_optimal.
# The 'mean' string alias is used instead of np.mean, which pandas deprecates
# as a groupby aggregation callable.
mean_metrics = ['RPS', 'RatingError', 'ForecastError', 'Forecastability', 'ExpectedRPS', 'TotalDiff', 'TeamDiff']
result_df = df_optimal.groupby(['Number_of_nodes', 'Density', 'ELO_Rating_K'], as_index=False).agg(
    {metric: 'mean' for metric in mean_metrics}
)
result_df

In [None]:
result_df["color"] = "black"

### Scatter plots with correlation line (OLS)

In [None]:
# Options for Plotly's "download plot" mode-bar button (square 1000 x 1000 export).
config = dict(
    toImageButtonOptions={
        'format': 'svg',  # one of png, svg, jpeg, webp
        'filename': 'figure',
        'height': 1000,
        'width': 1000,
        'scale': 1,  # Multiply title/legend/axis/canvas sizes by this factor
    }
)


def return_fig_settings(fig_instance, x_title, y_title):
    """Style ``fig_instance`` in place; nothing is returned.

    Applies the axis titles, a bordered (width 2) horizontal legend anchored
    at ``x=1, y=1``, a black 15pt Times New Roman font and a white plot
    background.

    Args:
        fig_instance: Figure object exposing ``update_layout``.
        x_title: Title for the x axis.
        y_title: Title for the y axis.
    """
    def times_font(color):
        # Same family and size everywhere; only the colour spelling differs.
        return {'family': 'Times New Roman', 'size': 15, 'color': color}

    fig_instance.update_layout(
        xaxis_title=x_title,
        yaxis_title=y_title,
        legend={
            'font': times_font("Black"),
            'orientation': 'h',
            'bordercolor': "Black",
            'borderwidth': 2,
            'yanchor': "bottom",
            'xanchor': 'right',
            'x': 1,
            'y': 1,
        },
    )
    fig_instance.update_layout(
        font=times_font('black'),  # font formatting
        plot_bgcolor='white',  # background color
    )

#### RPS based metrics

In [None]:
result_df["color"] = "black"
fig = go.Figure()
for rps_based_metric in ["RPS", "ForecastError", "ExpectedRPS", "Forecastability"]:
    fig.add_trace(
        go.Scatter(
            y=result_df[rps_based_metric],
            x=result_df["Density"],
            mode="lines+markers",
            name=rps_based_metric
        )
    )
return_fig_settings(fig, "Network density", "RPS-based score")
fig.update_layout(showlegend=True)
fig.show(config=config)

In [None]:
# Options for Plotly's "download plot" mode-bar button.
config = dict(
    toImageButtonOptions=dict(
        format='svg',  # one of png, svg, jpeg, webp
        filename='figure',
        height=600,
        width=1000,
        scale=1,  # Multiply title/legend/axis/canvas sizes by this factor
    )
)


def return_fig_settings(fig_instance, x_title, y_title):
    """Style a figure with three y axes in place; nothing is returned.

    The x title is written to ``xaxis3``. Fixed ranges and titles are set for
    ``yaxis1`` ("Rating error"), ``yaxis2`` ("Forecast error") and ``yaxis3``
    ("RPS"), next to the legend, the 14pt Times New Roman font and the white
    background.

    NOTE: this definition rebinds ``return_fig_settings``, so every cell
    executed afterwards gets these subplot-specific settings as well.

    Args:
        fig_instance: Figure object exposing ``update_layout``.
        x_title: Title written to ``xaxis3``.
        y_title: Value passed as ``yaxis_title``.
    """
    legend_style = {
        'font': {'family': 'Times New Roman', 'size': 14, 'color': "Black"},
        'orientation': 'h',
        'bordercolor': "Black",
        'borderwidth': 2,
        'yanchor': "bottom",
        'xanchor': 'right',
        'x': 1,
        'y': 1,
    }
    # Keyword order is kept exactly as before: yaxis_title comes first and
    # yaxis1_title last.
    layout_settings = {
        'xaxis3_title': x_title,
        'yaxis_title': y_title,
        'legend': legend_style,
        'yaxis3_range': [0.18, 0.22],
        'yaxis2_range': [0.0050, 0.0090],
        'yaxis1_range': [30, 45],
        'yaxis3_title': "RPS",
        'yaxis2_title': "Forecast error",
        'yaxis1_title': "Rating error",
    }
    fig_instance.update_layout(**layout_settings)
    fig_instance.update_layout(
        font={'family': 'Times New Roman', 'size': 14, 'color': 'black'},  # font formatting
        plot_bgcolor='white',  # background color
    )

In [None]:
from plotly.subplots import make_subplots

# Three stacked panels sharing the density axis. Metrics are listed bottom-up:
# the first one is drawn in row 3, the last one in row 1.
panel_metrics = ["RPS", "ForecastError", "TeamDiff"]
fig = make_subplots(rows=3, cols=1, shared_xaxes=True, vertical_spacing=0.1)
# The regressor matrix (constant + density) is identical for every metric.
ols_design = sm.add_constant(result_df['Density'])
for i, t in enumerate(panel_metrics):
    panel_row = 3 - i
    # Observed metric values.
    fig.add_trace(
        go.Scatter(
            y=result_df[t],
            x=result_df["Density"],
            marker_color="black",
            mode="lines+markers",
            showlegend=False
        ),
        row=panel_row,
        col=1
    )
    # Fitted values of an OLS regression of the metric on density; they are
    # stored on result_df as 'bestfit<metric>'.
    result_df['bestfit' + t] = sm.OLS(result_df[t], ols_design).fit().fittedvalues
    fig.add_trace(
        go.Scatter(
            x=result_df["Density"],
            y=result_df['bestfit' + t],
            mode='lines',
            marker_color="gray",
            showlegend=False
        ),
        row=panel_row,
        col=1
    )
return_fig_settings(fig, "Network density", "")
fig.update_xaxes(tickmode='linear', tick0=0.0, dtick=0.1)
fig.show(config=config)

### Match level rating errors

In [None]:
fig = go.Figure()
for team_based_metric in ["RatingError", "TotalDiff", "TeamDiff"]:
    fig.add_trace(
        go.Scatter(
            y=result_df[team_based_metric],
            x=result_df["Density"],
            mode="lines+markers",
            name=team_based_metric
        )
    )
return_fig_settings(fig, "Network density", "Match level rating errors")
fig.update_layout(showlegend=True)
fig.show(config=config)

In [None]:
target = "RatingError"
fig = px.scatter(
    result_df, x='Density', y=target, color="color", trendline='ols', trendline_color_override='lightgray',
    color_discrete_map={"black": "black"},
    labels={
        "RatingError": "Rating difference absolute error",
        "TotalDiff": "Home and away absolute error"
    },
    width=1000, height=400
)
return_fig_settings(fig, "Network density", target)
fig.update_layout(showlegend=False)
#fig.update_yaxes(range=[150, 400])
fig.update_yaxes(range=[0, 100])
fig.show(config=config)

In [None]:
results = px.get_trendline_results(fig)
results.px_fit_results.iloc[0].summary()

### Adding confidence intervals

In [None]:
# Line of `RPS_mean` over density with a filled band between the two bounds
# stored in `RPS_bootstrap` (index 0 is used as the lower bound, index 1 as the upper bound).
# NOTE(review): no visible cell creates `RPS_mean` / `RPS_bootstrap`; `result_df`
# is built with plain column names such as `RPS` — confirm where these columns
# come from before running this cell.
x = result_df['Density']
y_upper = [t[1] for t in result_df["RPS_bootstrap"]]
y_lower = [t[0] for t in result_df[("RPS_bootstrap")]]

fig = go.Figure([
    go.Scatter(
        x=x,
        y=result_df["RPS_mean"],
        line=dict(color='rgb(0,100,80)'),
        mode='lines'
    ),
    go.Scatter(
        x=pd.concat([x,x[::-1]]), # x, then x reversed
        y=y_upper+y_lower[::-1], # upper, then lower reversed
        fill='toself',
        fillcolor='rgba(0,100,80,0.2)',
        line=dict(color='rgba(255,255,255,0)'),
        hoverinfo="skip",
        showlegend=False
    )
])
fig.update_yaxes(range=[0.1, 0.6])
fig.show()

In [None]:
# Line of `RatingError_mean` over density with a filled band between the two bounds
# stored in `RatingError_bootstrap` (index 0 is used as the lower bound, index 1 as the upper bound).
# NOTE(review): no visible cell creates `RatingError_mean` / `RatingError_bootstrap`;
# `result_df` is built with plain column names such as `RatingError` — confirm
# where these columns come from before running this cell.
x = result_df['Density']
y_upper = [t[1] for t in result_df["RatingError_bootstrap"]]
y_lower = [t[0] for t in result_df[("RatingError_bootstrap")]]

fig = go.Figure([
    go.Scatter(
        x=x,
        y=result_df["RatingError_mean"],
        line=dict(color='rgb(0,100,80)'),
        mode='lines'
    ),
    go.Scatter(
        x=pd.concat([x,x[::-1]]), # x, then x reversed
        y=y_upper+y_lower[::-1], # upper, then lower reversed
        fill='toself',
        fillcolor='rgba(0,100,80,0.2)',
        line=dict(color='rgba(255,255,255,0)'),
        hoverinfo="skip",
        showlegend=False
    )
])

fig.update_yaxes(range=[0, 400])
fig.show()

In [None]:
# Line of `TotalDiff_mean` over density with a filled band between the two bounds
# stored in `TotalDiff_bootstrap` (index 0 is used as the lower bound, index 1 as the upper bound).
# NOTE(review): no visible cell creates `TotalDiff_mean` / `TotalDiff_bootstrap`;
# `result_df` is built with plain column names such as `TotalDiff` — confirm
# where these columns come from before running this cell.
x = result_df['Density']
y_upper = [t[1] for t in result_df["TotalDiff_bootstrap"]]
y_lower = [t[0] for t in result_df[("TotalDiff_bootstrap")]]

fig = go.Figure([
    go.Scatter(
        x=x,
        y=result_df["TotalDiff_mean"],
        line=dict(color='rgb(0,100,80)'),
        mode='lines'
    ),
    go.Scatter(
        x=pd.concat([x,x[::-1]]), # x, then x reversed
        y=y_upper+y_lower[::-1], # upper, then lower reversed
        fill='toself',
        fillcolor='rgba(0,100,80,0.2)',
        line=dict(color='rgba(255,255,255,0)'),
        hoverinfo="skip",
        showlegend=False
    )
])
fig.update_yaxes(range=[0, 200])
fig.show()