In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Read the raw 2018-19 season results into a DataFrame.
season_csv = "team_statistics/season-1819.csv"
df = pd.read_csv(season_csv)

In [4]:
# List every column the raw file provides before narrowing it down.
available_columns = df.columns
print(available_columns)

Index(['Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG',
       'HTAG', 'HTR', 'Referee', 'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC',
       'AC', 'HY', 'AY', 'HR', 'AR', 'B365H', 'B365D', 'B365A', 'BWH', 'BWD',
       'BWA', 'IWH', 'IWD', 'IWA', 'PSH', 'PSD', 'PSA', 'WHH', 'WHD', 'WHA',
       'VCH', 'VCD', 'VCA', 'Bb1X2', 'BbMxH', 'BbAvH', 'BbMxD', 'BbAvD',
       'BbMxA', 'BbAvA', 'BbOU', 'BbMx>2.5', 'BbAv>2.5', 'BbMx<2.5',
       'BbAv<2.5', 'BbAH', 'BbAHh', 'BbMxAHH', 'BbAvAHH', 'BbMxAHA', 'BbAvAHA',
       'PSCH', 'PSCD', 'PSCA'],
      dtype='object')


In [5]:
# Narrow the frame to the two team columns and the two goal columns
# (FTHG, FTAG); all other columns are dropped.
wanted_columns = ['HomeTeam', 'AwayTeam', 'FTHG', 'FTAG']
df = df.loc[:, wanted_columns]
df.head(1)

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG
0,Man United,Leicester,2,1


In [6]:
# Per-team mean and standard deviation of the goal columns, computed once
# grouped by the home side and once grouped by the away side.
#
# The goal columns are selected explicitly before aggregating: `df` still
# carries the opposing team's name as a string column, and pandas >= 2.0
# raises a TypeError when asked to average it (older versions silently
# dropped such columns). Selecting the columns gives the same two-column
# result on every pandas version.
goal_cols = ['FTHG', 'FTAG']
goals_by_home = df.groupby('HomeTeam')[goal_cols]
goals_by_away = df.groupby('AwayTeam')[goal_cols]

df_avg_home = goals_by_home.mean()
df_std_home = goals_by_home.std()
df_avg_away = goals_by_away.mean()
df_std_away = goals_by_away.std()

In [7]:
# Sanity check: number of match rows and number of distinct home teams.
match_count = len(df)
team_count = len(df_avg_home)
print(match_count, "MATCHES in total")
print(team_count, "TEAMS in total")

380 MATCHES in total
20 TEAMS in total


In [8]:
# Give the aggregated columns descriptive names. Labels are assigned by
# position, and the columns arrive in the order (FTHG, FTAG).
# For the home-grouped frames the first column is labelled as the team's
# attack; for the away-grouped frames the same first column is labelled as
# the team's defence, so the label order is flipped there.
for frame, labels in (
    (df_avg_home, ['attack_avg_home', 'defence_avg_home']),
    (df_std_home, ['attack_std_home', 'defence_std_home']),
    (df_avg_away, ['defence_avg_away', 'attack_avg_away']),
    (df_std_away, ['defence_std_away', 'attack_std_away']),
):
    frame.columns = labels

In [9]:
# Show the first row of each renamed frame to confirm the new labels.
for frame in (df_avg_home, df_std_home, df_avg_away, df_std_away):
    display(frame.head(1))

Unnamed: 0_level_0,attack_avg_home,defence_avg_home
HomeTeam,Unnamed: 1_level_1,Unnamed: 2_level_1
Arsenal,2.210526,0.842105


Unnamed: 0_level_0,attack_std_home,defence_std_home
HomeTeam,Unnamed: 1_level_1,Unnamed: 2_level_1
Arsenal,1.228321,0.83421


Unnamed: 0_level_0,defence_avg_away,attack_avg_away
AwayTeam,Unnamed: 1_level_1,Unnamed: 2_level_1
Arsenal,1.842105,1.631579


Unnamed: 0_level_0,defence_std_away,attack_std_away
AwayTeam,Unnamed: 1_level_1,Unnamed: 2_level_1
Arsenal,1.213954,1.211543


In [10]:
# Put the home and away statistics side by side, aligned on the team index:
# one frame for the averages and one for the standard deviations.
df_avg = pd.concat(
    [df_avg_home, df_avg_away],
    axis="columns",
)
df_std = pd.concat(
    [df_std_home, df_std_away],
    axis="columns",
)

In [11]:
# Preview the first three teams of each combined frame.
for combined in (df_avg, df_std):
    display(combined.head(3))

Unnamed: 0,attack_avg_home,defence_avg_home,defence_avg_away,attack_avg_away
Arsenal,2.210526,0.842105,1.842105,1.631579
Bournemouth,1.578947,1.315789,2.368421,1.368421
Brighton,1.0,1.473684,1.684211,0.842105


Unnamed: 0,attack_std_home,defence_std_home,defence_std_away,attack_std_away
Arsenal,1.228321,0.83421,1.213954,1.211543
Bournemouth,1.21636,1.204281,1.706541,1.570935
Brighton,0.881917,1.389181,1.00292,0.83421


In [23]:
# Persist both statistics frames; the team index is written as the first column.
for stats_frame, csv_path in (
    (df_avg, "team_statistics/avg_goals.csv"),
    (df_std, "team_statistics/std_goals.csv"),
):
    stats_frame.to_csv(csv_path)

In [12]:
import random
import itertools

# Build the fixture list: every ordered pair of distinct teams, where the
# first element of a pair is used as the home side and the second as the away side.
team_list = df_avg.index.tolist()
team_list_perm = [*itertools.permutations(team_list, 2)]
print(len(team_list_perm), " matches CREATED.")
print("example match=", team_list_perm[0])

380  matches CREATED.
example match= ('Arsenal', 'Bournemouth')


In [15]:
# Simulate a score line for every fixture.
#
# Each side's goal count is drawn from a normal distribution parameterised by
# that team's venue-specific attacking mean and standard deviation (home
# attack for the host, away attack for the visitor), then rounded to the
# nearest integer and clamped at zero, since a negative goal count is
# meaningless.
#
# A dedicated, seeded generator makes the simulated season reproducible across
# kernel restarts without touching the global `random` state. Change
# SIMULATION_SEED to obtain a different simulated season.
SIMULATION_SEED = 42
rng = random.Random(SIMULATION_SEED)

# Pull the four parameter columns out once instead of building a full row
# Series on every lookup inside the loop.
home_attack_avg = df_avg['attack_avg_home']
home_attack_std = df_std['attack_std_home']
away_attack_avg = df_avg['attack_avg_away']
away_attack_std = df_std['attack_std_away']


def _draw_goals(mean, std):
    """Draw one non-negative integer goal count from N(mean, std)."""
    sample = rng.normalvariate(float(mean), float(std))
    return max(0, round(sample))


scores = []
for home, away in team_list_perm:
    # The home draw happens before the away draw so the random stream is
    # consumed in a fixed, documented order.
    score_home = _draw_goals(home_attack_avg.loc[home], home_attack_std.loc[home])
    score_away = _draw_goals(away_attack_avg.loc[away], away_attack_std.loc[away])
    scores.append([home, away, score_home, score_away])

In [20]:
# Peek at the first twenty simulated results.
scores[:20]

[['Arsenal', 'Bournemouth', 3, 2],
 ['Arsenal', 'Brighton', 3, 2],
 ['Arsenal', 'Burnley', 1, 3],
 ['Arsenal', 'Cardiff', 3, 1],
 ['Arsenal', 'Chelsea', 3, 3],
 ['Arsenal', 'Crystal Palace', 4, 1],
 ['Arsenal', 'Everton', 3, 3],
 ['Arsenal', 'Fulham', 2, 0],
 ['Arsenal', 'Huddersfield', 2, 0],
 ['Arsenal', 'Leicester', 1, 0],
 ['Arsenal', 'Liverpool', 2, 0],
 ['Arsenal', 'Man City', 4, 3],
 ['Arsenal', 'Man United', 5, 1],
 ['Arsenal', 'Newcastle', 3, 3],
 ['Arsenal', 'Southampton', 4, 1],
 ['Arsenal', 'Tottenham', 2, 1],
 ['Arsenal', 'Watford', 2, 2],
 ['Arsenal', 'West Ham', 2, 2],
 ['Arsenal', 'Wolves', 2, 0],
 ['Bournemouth', 'Arsenal', 2, 3]]

In [22]:
# Write the simulated results out using the same column layout as the
# narrowed input frame; the row index is omitted from the file.
score_columns = ['HomeTeam', 'AwayTeam', 'FTHG', 'FTAG']
df_scores = pd.DataFrame(data=scores, columns=score_columns)
df_scores.to_csv(
    "team_statistics/example_generated_scores.csv",
    index=False,
)
