In [1]:
#https://www.kaggle.com/code/blessontomjoseph/premier-league-predictions
#https://www.kaggle.com/code/jeongwonwoobit/epl-winner

In [2]:
from time import time
import pprint
import tqdm
import pickle
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from collections import namedtuple,OrderedDict
from functools import partial

from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import make_scorer
from xgboost import XGBClassifier, DMatrix
from skopt.callbacks import DeadlineStopper, DeltaYStopper
from skopt.space import Real, Categorical, Integer
from skopt import BayesSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score,recall_score,precision_score,confusion_matrix

sns.set_style('darkgrid')
warnings.simplefilter('ignore')

ModuleNotFoundError: No module named 'skopt'

In [170]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf

In [171]:
# Notebook-wide pandas configuration.
# Allow up to 200 rows to be rendered before pandas truncates a display.
pd.options.display.max_rows = 200
# Turn off the chained-assignment warning (pandas' default for this option is 'warn').
pd.set_option("mode.chained_assignment", None)

In [172]:
# Relative path to the results CSV that is read into `data` below.
data_csv = '../data/EPL_results/results.csv'

In [173]:
# Load the match results into `data`, decoding the file as windows-1254.
# NOTE(review): windows-1254 is the Turkish code page — confirm this is the file's real encoding.
data = pd.read_csv(data_csv, encoding='windows-1254')
# Print the last five rows as plain text.
print(data.tail())

       Season              DateTime     HomeTeam        AwayTeam  FTHG  FTAG  \
8284  2021-22  2022-04-09T17:30:00Z  Aston Villa       Tottenham     0     4   
8285  2021-22  2022-04-10T14:00:00Z    Brentford        West Ham     2     0   
8286  2021-22  2022-04-10T14:00:00Z    Leicester  Crystal Palace     2     1   
8287  2021-22  2022-04-10T14:00:00Z      Norwich         Burnley     2     0   
8288  2021-22  2022-04-10T16:30:00Z     Man City       Liverpool     2     2   

     FTR  HTHG  HTAG HTR  ... HST  AST  HC  AC  HF  AF  HY  AY  HR  AR  
8284   A     0     1   A  ...   8    5   9   3  12  14   2   3   0   0  
8285   H     0     0   D  ...   7    1   4   6   2   6   0   1   0   0  
8286   H     2     0   H  ...   3    3   3   4  11  12   1   1   0   0  
8287   H     1     0   H  ...   6    4   6   7  12  10   1   1   0   0  
8288   D     2     1   H  ...   5    4   4   1   9  11   1   4   0   0  

[5 rows x 23 columns]


In [182]:
def CalcRollingValues(df, value, home_col, away_col, window=10, func='mean'):
    """Add per-team rolling statistics of ``value`` to ``df`` in place.

    For each team, the rows in which it appears (as ``HomeTeam`` or as
    ``AwayTeam``) are taken in their existing row order and a fixed-size
    rolling window is evaluated with ``closed='left'``: every row only sees the
    ``window`` rows *before* it, never its own value. The result is written to
    ``home_col`` on the rows where the team is the home side and to
    ``away_col`` on the rows where it is not.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``HomeTeam`` and ``AwayTeam`` columns plus the ``value``
        column. It is modified in place.
    value : str
        Name of the column to aggregate.
    home_col, away_col : str
        Names of the output columns. Both are (re)initialised to NaN first.
    window : int, default 10
        Number of previous rows per team that make up one window. With the
        default ``min_periods`` a team's first ``window`` rows stay NaN.
    func : {'mean', 'sum'}, default 'mean'
        Aggregation applied to each window.

    Returns
    -------
    None
        The results are stored in ``df[home_col]`` and ``df[away_col]``.

    Raises
    ------
    ValueError
        If ``func`` is not one of the supported aggregations. The check runs
        before ``df`` is touched, so a bad call leaves the frame unchanged.

    Notes
    -----
    * Rows are not sorted here; the caller is expected to pass them in the
      order the windows should follow (e.g. chronological).
    * Results are written back by index label, so ``df`` should have a
      unique index.
    """
    supported = ('mean', 'sum')
    if func not in supported:
        raise ValueError(
            "func must be one of %r, got %r" % (supported, func)
        )

    df[home_col] = np.nan
    df[away_col] = np.nan

    # Use every team that appears on either side. Looking at HomeTeam alone
    # would skip teams that only occur as the away side in this frame.
    teams = pd.concat([df['HomeTeam'], df['AwayTeam']]).dropna().unique()

    for team in teams:
        is_home = df['HomeTeam'] == team
        team_rows = is_home | (df['AwayTeam'] == team)

        # Only the requested column is rolled; closed='left' keeps the current
        # row out of its own window.
        roller = df.loc[team_rows, value].rolling(window=window, closed='left')
        rolled = getattr(roller, func)()

        # Same index as `rolled`: True where the team is the home side.
        home_mask = is_home[team_rows]
        home_part = rolled[home_mask]
        away_part = rolled[~home_mask]

        df.loc[home_part.index, home_col] = home_part
        df.loc[away_part.index, away_col] = away_part

In [175]:
# Start from an empty frame; the following cells fill it column by column from `data`.
df = pd.DataFrame()

In [176]:
# Copy the match identifiers (timestamp and both team names) from the raw data.
df['DateTime'] = data['DateTime']
df['HomeTeam'] = data['HomeTeam']
df['AwayTeam'] = data['AwayTeam']

In [177]:
# TC = HC + AC per match (presumably home + away corners — confirm against the
# dataset's column documentation). Computed once on `data`, then mirrored to `df`.
data['TC'] = data['HC'] + data['AC']
df['TC'] = data['TC']

In [178]:
# Mean of HC per HomeTeam over all loaded rows.
data.groupby("HomeTeam")['HC'].mean()

HomeTeam
Arsenal             7.322892
Aston Villa         6.151261
Birmingham          5.624060
Blackburn           5.770335
Blackpool           4.947368
Bolton              6.162679
Bournemouth         5.684211
Bradford            5.684211
Brentford           4.312500
Brighton            5.510870
Burnley             4.777027
Cardiff             5.394737
Charlton            5.496241
Chelsea             6.789346
Coventry            6.684211
Crystal Palace      5.491979
Derby               4.701754
Everton             6.214976
Fulham              5.371930
Huddersfield        4.921053
Hull                4.831579
Ipswich             7.052632
Leeds               6.360360
Leicester           5.721951
Liverpool           7.328502
Man City            7.329114
Man United          6.934940
Middlesbrough       5.636842
Newcastle           5.920213
Norwich             5.192308
Portsmouth          6.308271
QPR                 5.000000
Reading             6.701754
Sheffield United    6.052632
South

In [185]:
# Small working subset for experimenting with the rolling features.
# .loc slicing is label-based and inclusive, so this keeps labels 0 through 100.
# .copy() makes the subset independent of `data`: CalcRollingValues adds columns
# to the frame it receives, which must not be an ambiguous view/copy of `data`.
small_df = data.loc[0:100].copy()

In [186]:
# Display the subset (use small_df.head() when only a preview is needed).
small_df

Unnamed: 0,Season,DateTime,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,AST,HC,AC,HF,AF,HY,AY,HR,AR,TC
0,Jan-00,2000-08-19T00:00:00Z,Charlton,Man City,4,0,H,2,0,H,...,4,6,6,13,12,1,2,0,0,12
1,Jan-00,2000-08-19T00:00:00Z,Chelsea,West Ham,4,2,H,1,0,H,...,5,7,7,19,14,1,2,0,0,14
2,Jan-00,2000-08-19T00:00:00Z,Coventry,Middlesbrough,1,3,A,1,1,D,...,9,8,4,15,21,5,3,1,0,12
3,Jan-00,2000-08-19T00:00:00Z,Derby,Southampton,2,2,D,1,2,A,...,6,5,8,11,13,1,1,0,0,13
4,Jan-00,2000-08-19T00:00:00Z,Leeds,Everton,2,0,H,2,0,H,...,6,6,4,21,20,1,3,0,0,10
5,Jan-00,2000-08-19T00:00:00Z,Leicester,Aston Villa,0,0,D,0,0,D,...,3,5,4,12,12,2,3,0,0,9
6,Jan-00,2000-08-19T00:00:00Z,Liverpool,Bradford,1,0,H,0,0,D,...,2,6,1,8,8,1,1,0,0,7
7,Jan-00,2000-08-19T00:00:00Z,Sunderland,Arsenal,1,0,H,0,0,D,...,7,2,9,10,21,3,1,0,1,11
8,Jan-00,2000-08-19T00:00:00Z,Tottenham,Ipswich,3,1,H,2,1,H,...,5,3,4,14,13,0,0,0,0,7
9,Jan-00,2000-08-20T00:00:00Z,Man United,Newcastle,2,0,H,1,0,H,...,6,7,1,7,13,0,1,0,0,8


In [181]:
# Mean TC per HomeTeam within the small_df subset.
small_df.groupby("HomeTeam")['TC'].mean()

HomeTeam
Arsenal          13.833333
Aston Villa      13.000000
Bradford         12.000000
Charlton         14.200000
Chelsea          11.800000
Coventry         11.400000
Derby            12.400000
Everton          10.800000
Ipswich          11.000000
Leeds            11.000000
Leicester        11.600000
Liverpool        13.800000
Man City         12.200000
Man United       10.000000
Middlesbrough    11.600000
Newcastle        10.400000
Southampton      11.000000
Sunderland        9.600000
Tottenham        10.600000
West Ham         11.000000
Name: TC, dtype: float64

In [200]:
# Add HTAC / ATAC to small_df in place: per-team rolling mean (the default func)
# of TC over a window of 4 rows, excluding the current row.
CalcRollingValues(small_df,'TC','HTAC','ATAC',4)

In [201]:
# Display the subset again to inspect the HTAC / ATAC columns written by the call above.
small_df

Unnamed: 0,Season,DateTime,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,AC,HF,AF,HY,AY,HR,AR,TC,HTAC,ATAC
0,Jan-00,2000-08-19T00:00:00Z,Charlton,Man City,4,0,H,2,0,H,...,6,13,12,1,2,0,0,12,,
1,Jan-00,2000-08-19T00:00:00Z,Chelsea,West Ham,4,2,H,1,0,H,...,7,19,14,1,2,0,0,14,,
2,Jan-00,2000-08-19T00:00:00Z,Coventry,Middlesbrough,1,3,A,1,1,D,...,4,15,21,5,3,1,0,12,,
3,Jan-00,2000-08-19T00:00:00Z,Derby,Southampton,2,2,D,1,2,A,...,8,11,13,1,1,0,0,13,,
4,Jan-00,2000-08-19T00:00:00Z,Leeds,Everton,2,0,H,2,0,H,...,4,21,20,1,3,0,0,10,,
5,Jan-00,2000-08-19T00:00:00Z,Leicester,Aston Villa,0,0,D,0,0,D,...,4,12,12,2,3,0,0,9,,
6,Jan-00,2000-08-19T00:00:00Z,Liverpool,Bradford,1,0,H,0,0,D,...,1,8,8,1,1,0,0,7,,
7,Jan-00,2000-08-19T00:00:00Z,Sunderland,Arsenal,1,0,H,0,0,D,...,9,10,21,3,1,0,1,11,,
8,Jan-00,2000-08-19T00:00:00Z,Tottenham,Ipswich,3,1,H,2,1,H,...,4,14,13,0,0,0,0,7,,
9,Jan-00,2000-08-20T00:00:00Z,Man United,Newcastle,2,0,H,1,0,H,...,1,7,13,0,1,0,0,8,,


In [197]:
# Spot check for a single team: 3-row rolling sums (current row excluded) over
# every numeric column of the rows where Man City is the home or the away side.
# "HomeTeam" is passed as `on` so that column is carried through to the result.
(
    small_df
    .loc[small_df['HomeTeam'].eq('Man City') | small_df['AwayTeam'].eq('Man City')]
    .rolling(window=3, on="HomeTeam", closed='left')
    .sum(numeric_only=True)
)

Unnamed: 0,HomeTeam,AC,AF,AR,AS,AST,ATAC,AY,FTAG,FTHG,HC,HF,HR,HS,HST,HTAC,HTAG,HTHG,HY,TC
0,Charlton,,,,,,,,,,,,,,,,,,,
15,Man City,,,,,,,,,,,,,,,,,,,
23,Man City,,,,,,,,,,,,,,,,,,,
29,Leeds,14.0,38.0,0.0,26.0,16.0,,8.0,4.0,9.0,18.0,44.0,0.0,46.0,29.0,,2.0,4.0,6.0,32.0
43,Liverpool,12.0,50.0,0.0,26.0,15.0,,8.0,6.0,6.0,21.0,49.0,0.0,35.0,16.0,,4.0,2.0,7.0,33.0
57,Man City,13.0,53.0,0.0,26.0,18.0,,9.0,6.0,5.0,23.0,42.0,0.0,34.0,14.0,,4.0,1.0,7.0,36.0
67,Tottenham,11.0,52.0,0.0,21.0,12.0,107.0,11.0,5.0,5.0,27.0,50.0,0.0,31.0,13.0,,2.0,1.0,7.0,38.0
73,Man City,11.0,42.0,0.0,20.0,11.0,113.0,11.0,3.0,4.0,23.0,42.0,0.0,43.0,17.0,118.0,0.0,1.0,5.0,34.0
85,Man City,15.0,37.0,0.0,26.0,13.0,114.0,8.0,2.0,1.0,24.0,41.0,0.0,38.0,16.0,108.0,0.0,0.0,2.0,39.0
98,Southampton,18.0,42.0,0.0,32.0,13.0,104.0,5.0,1.0,2.0,20.0,36.0,0.0,40.0,22.0,111.0,0.0,2.0,0.0,38.0
