In [1]:
# necessary libraries
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pickle

# models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

# preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# hyperparameters
from sklearn.model_selection import GridSearchCV

# pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

# data: raw match CSV for 2019-2020, read from a path relative to the working directory
s2019_2020 = pd.read_csv('data/2019-2020.csv')



In [None]:
# Seed NumPy's global RNG; scikit-learn estimators left at random_state=None draw from it.
np.random.seed(43)

In [None]:
# Columns kept from the raw CSV (team names, result, goal / shot / corner counts).
columns = ['HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 
           'HTAG', 'HS', 'AS', 'HST', 'AST', 'HC', 'AC']
# .copy() turns the selection into an independent frame. Later cells add columns to it
# in place, which on a plain slice triggers pandas' SettingWithCopyWarning.
s2019_2020 = s2019_2020[columns].copy()
s2019_2020.head()

In [None]:
def form(data, k=0.33):
    """Build every club's running "form" rating from match results, in row order.

    Each club starts at 1.0. For every row the latest ratings of both clubs are
    read and, depending on ``FTR``, a new rating is appended for each:

    * ``'H'``: the home club gains ``k`` times the away club's rating; the away
      club loses ``k`` times its own rating.
    * ``'A'``: the mirror image of ``'H'``.
    * ``'D'``: each club moves a fraction ``k`` of the way towards the other's rating.

    Rows whose ``FTR`` is none of these leave both histories unchanged.

    Args:
        data: DataFrame with ``HomeTeam``, ``AwayTeam`` and ``FTR`` columns. Rows
            are processed top to bottom, so they are treated as chronological.
        k: update rate applied to the ratings.

    Returns:
        dict mapping club name -> list of ratings. Element 0 is the initial 1.0;
        one more element is appended per H/A/D match the club took part in.
    """
    # Seed from BOTH team columns. Seeding from HomeTeam alone raised KeyError for
    # a club whose only appearances so far are as the away side (partial seasons).
    clubs = pd.concat([data['HomeTeam'], data['AwayTeam']]).unique()
    form_dict = {club: [1.0] for club in clubs}

    for home, away, result in zip(data['HomeTeam'], data['AwayTeam'], data['FTR']):
        home_history = form_dict[home]
        away_history = form_dict[away]
        # Snapshot both ratings before appending so each update uses pre-match values.
        ht_current_form = home_history[-1]
        at_current_form = away_history[-1]

        if result == 'H':
            home_history.append(ht_current_form + (k * at_current_form))
            away_history.append(at_current_form - (k * at_current_form))
        elif result == 'A':
            away_history.append(at_current_form + (k * ht_current_form))
            home_history.append(ht_current_form - (k * ht_current_form))
        elif result == 'D':
            home_history.append(ht_current_form - (k * (ht_current_form - at_current_form)))
            away_history.append(at_current_form - (k * (at_current_form - ht_current_form)))

    return form_dict

In [None]:
def transform_form(data):
    """Add pre-match form columns ``HF`` (home club) and ``AF`` (away club).

    ``form(data)`` returns one rating history per club, with the initial rating
    at index 0. For a club's n-th match (counting from 0, in row order) the value
    stored is element n of its history, i.e. the rating before that match.

    Args:
        data: DataFrame with ``HomeTeam``, ``AwayTeam`` and ``FTR`` columns.
            ``FTR`` must still hold the ``'H'``/``'A'``/``'D'`` codes that
            ``form`` compares against. The frame is modified in place.

    Returns:
        The same DataFrame, with float columns ``HF`` and ``AF`` set.
    """
    # Create the columns first so they exist (as 0.0) while form() reads the frame.
    data['HF'] = 0.0
    data['AF'] = 0.0

    form_data = form(data)

    # One pass with a per-club match counter. The previous version looped over
    # HomeTeam.unique() only, so a club with no home match yet never got its AF
    # filled, and it rescanned the frame once per club.
    matches_seen = {}
    home_form = []
    away_form = []
    for home, away in zip(data['HomeTeam'], data['AwayTeam']):
        home_pos = matches_seen.get(home, 0)
        away_pos = matches_seen.get(away, 0)
        home_form.append(form_data[home][home_pos])
        away_form.append(form_data[away][away_pos])
        matches_seen[home] = home_pos + 1
        matches_seen[away] = away_pos + 1

    data['HF'] = pd.Series(home_form, index=data.index, dtype=float)
    data['AF'] = pd.Series(away_form, index=data.index, dtype=float)
    return data

# Attach the HF/AF form columns. This must run while FTR still holds 'H'/'A'/'D',
# because form() compares FTR against those codes.
s2019_2020 = transform_form(s2019_2020)
s2019_2020.tail(12)

In [None]:
def transform_ftr(row, column_name):
    """Encode a result code as an integer.

    Args:
        row: mapping-like object (e.g. a DataFrame row) indexable by ``column_name``.
        column_name: key of the field holding the result code.

    Returns:
        1 for ``'H'``, -1 for ``'A'``, and 0 for any other value.
    """
    outcome = row[column_name]
    if outcome == 'H':
        return 1
    return -1 if outcome == 'A' else 0

In [None]:
# Recode FTR to the numeric target (H -> 1, A -> -1, anything else -> 0).
# Not idempotent: running this cell twice maps every row to 0.
s2019_2020['FTR'] = s2019_2020.apply(transform_ftr, axis=1, args=('FTR',))

## Team Statistics

In [None]:
def get_team_statistics(X, avg_HS, avg_AS, avg_AC, avg_HC):
    """Aggregate per-club goal totals and normalised strength ratios.

    Totals (summed over the rows of ``X``):
        HTG: ``FTHG`` in the club's home matches.
        ATG: ``FTAG`` in the club's away matches.
        HTC: ``FTAG`` in the club's home matches.
        ATC: ``FTHG`` in the club's away matches.

    Ratios (per-match rate divided by the supplied average):
        HAS = HTG / home matches / avg_HS
        AAS = ATG / away matches / avg_AS
        HDS = ATC / away matches / avg_AC
        ADS = HTC / home matches / avg_HC

    Args:
        X: DataFrame with ``HomeTeam``, ``AwayTeam``, ``FTHG`` and ``FTAG``.
        avg_HS, avg_AS, avg_AC, avg_HC: averages used as the denominators above.

    Returns:
        DataFrame with one row per club (sorted by name) and columns
        ``Club Name, HTG, ATG, HTC, ATC, HAS, AAS, HDS, ADS``. A club with no
        home (or no away) match gets NaN/inf in the ratios that need that count.
    """
    home_team_group = X.groupby('HomeTeam')
    away_team_group = X.groupby('AwayTeam')
    # Every club seen on either side. The old code took the club list from the
    # home groupby and pasted the away sums next to it by position, which fails
    # or misaligns whenever the two club sets differ.
    clubs = sorted(set(X['HomeTeam']).union(X['AwayTeam']))

    def per_club(totals):
        # Align a groupby result to the full club list; missing clubs count as 0.
        return totals.reindex(clubs, fill_value=0).to_numpy()

    team_statistics = pd.DataFrame({
        'Club Name': clubs,
        'HTG': per_club(home_team_group['FTHG'].sum()),
        'ATG': per_club(away_team_group['FTAG'].sum()),
        'HTC': per_club(home_team_group['FTAG'].sum()),
        'ATC': per_club(away_team_group['FTHG'].sum()),
    })

    # Actual per-club match counts. The previous X.shape[0] / 20 was only right
    # for a complete 20-club season.
    home_games = per_club(home_team_group.size())
    away_games = per_club(away_team_group.size())

    team_statistics['HAS'] = (team_statistics['HTG'] / home_games) / avg_HS
    team_statistics['AAS'] = (team_statistics['ATG'] / away_games) / avg_AS
    # NOTE(review): HDS is built from the away-match total (ATC) and ADS from the
    # home-match total (HTC), the opposite of the HAS/AAS naming. Kept as-is
    # because downstream features use these columns; confirm the intent.
    team_statistics['HDS'] = (team_statistics['ATC'] / away_games) / avg_AC
    team_statistics['ADS'] = (team_statistics['HTC'] / home_games) / avg_HC

    return team_statistics

def transform_stat(data):
    """Attach per-club strength ratios and their products to every match row.

    Columns written (in this order):
        HAS, HDS: the home club's ``HAS`` / ``HDS`` from ``get_team_statistics``.
        AAS, ADS: the away club's ``AAS`` / ``ADS``.
        HXG = HAS * ADS * (mean FTHG per row).
        AXG = AAS * HDS * (mean FTAG per row).

    NOTE(review): the averages and club totals are computed over all rows of
    ``data``, so each row's features include that match and any later ones.

    Args:
        data: DataFrame with ``HomeTeam``, ``AwayTeam``, ``FTHG`` and ``FTAG``.
            Modified in place.

    Returns:
        The same DataFrame with the six columns above set.
    """
    n_matches = data.shape[0]
    avg_HS = data.FTHG.sum() / n_matches
    avg_AS = data.FTAG.sum() / n_matches
    # What one side scores on average is what the other side concedes.
    avg_HC = avg_AS
    avg_AC = avg_HS

    team_stat = get_team_statistics(data, avg_HS, avg_AS, avg_AC, avg_HC)
    by_club = team_stat.set_index('Club Name')

    # One vectorised lookup per column replaces a filter of team_stat per row.
    # A club missing from team_stat now yields NaN instead of an IndexError.
    data['HAS'] = data['HomeTeam'].map(by_club['HAS']).to_numpy()
    data['AAS'] = data['AwayTeam'].map(by_club['AAS']).to_numpy()
    data['HDS'] = data['HomeTeam'].map(by_club['HDS']).to_numpy()
    data['ADS'] = data['AwayTeam'].map(by_club['ADS']).to_numpy()

    data['HXG'] = data['HAS'] * data['ADS'] * avg_HS
    data['AXG'] = data['AAS'] * data['HDS'] * avg_AS

    return data

In [None]:
# Add the HAS/AAS/HDS/ADS ratios and the HXG/AXG products; the frame is edited in place.
s2019_2020 = transform_stat(s2019_2020)

## Recent K Performance

In [None]:
def k_perf(data, k=3, past_only=False):
    """Add windowed per-club totals (``Past*`` columns) to every match row.

    For each row, up to ``k`` matches involving the club (as home or away side)
    are selected, and the club's own numbers are summed: home-side columns when
    it was the home club, away-side columns when it was the away club.

        PastFTHG / PastFTAG: FTHG + FTAG totals for the home / away club.
        PastHST  / PastAST : HST + AST totals for the home / away club.
        PastHS   / PastAS  : HS + AS totals for the home / away club.

    Args:
        data: DataFrame with ``HomeTeam``, ``AwayTeam``, ``FTHG``, ``FTAG``,
            ``HST``, ``AST``, ``HS`` and ``AS``. Modified in place.
        k: maximum number of matches per club in the window.
        past_only: window selection.
            ``False`` (default, original behaviour): the current row and the
            rows below it. NOTE(review): ``form`` treats rows as chronological,
            so this window contains the match being predicted plus later ones.
            ``True``: only rows strictly above the current one (the last ``k``).

    Returns:
        The same DataFrame with the six float ``Past*`` columns set.
    """
    outputs = ('PastFTHG', 'PastFTAG', 'PastHST', 'PastAST', 'PastHS', 'PastAS')
    collected = {name: [] for name in outputs}

    def club_totals(window, club):
        # (FTHG/FTAG, HST/AST, HS/AS) totals for `club` over its k selected matches.
        involved = window[(window['HomeTeam'] == club) | (window['AwayTeam'] == club)]
        recent = involved.tail(k) if past_only else involved.head(k)
        as_home = recent[recent['HomeTeam'] == club]
        as_away = recent[recent['AwayTeam'] == club]
        return (
            as_home['FTHG'].sum() + as_away['FTAG'].sum(),
            as_home['HST'].sum() + as_away['AST'].sum(),
            as_home['HS'].sum() + as_away['AS'].sum(),
        )

    home_clubs = data['HomeTeam'].tolist()
    away_clubs = data['AwayTeam'].tolist()

    for pos in range(data.shape[0]):
        # Positional slicing, so the result does not depend on index labels.
        window = data.iloc[:pos] if past_only else data.iloc[pos:]
        home_totals = club_totals(window, home_clubs[pos])
        # Fix: PastAST and PastAS used to add the HOME club's away-side numbers
        # (copy-paste of ht_stats/ht) instead of the away club's own.
        away_totals = club_totals(window, away_clubs[pos])

        collected['PastFTHG'].append(home_totals[0])
        collected['PastFTAG'].append(away_totals[0])
        collected['PastHST'].append(home_totals[1])
        collected['PastAST'].append(away_totals[1])
        collected['PastHS'].append(home_totals[2])
        collected['PastAS'].append(away_totals[2])

    for name in outputs:
        data[name] = pd.Series(collected[name], index=data.index, dtype=float)
    return data

In [None]:
# Add the Past* windowed totals with k_perf's default window (k=3).
s2019_2020 = k_perf(s2019_2020)

## Normal Features

In [None]:
# Baseline feature matrix (strength ratios plus Past* totals) and the encoded result.
X = s2019_2020[['HAS', 'AAS', 'HDS', 'ADS', 'HXG', 'AXG', 'PastFTHG', 'PastFTAG', 'PastHST',
               'PastAST', 'PastHS', 'PastAS']]
y = s2019_2020['FTR']

# 85/15 split, stratified on the target so class proportions match in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=34, stratify=y
)

for label, part in (('X_train', X_train), ('X_test', X_test),
                    ('y_train', y_train), ('y_test', y_test)):
    print('{}.shape: {}'.format(label, part.shape))

In [None]:
# initialize models
# Baseline classifiers on the normal feature set.
# NOTE(review): recent xgboost releases may reject class labels that are not
# 0..n-1 (the target here is -1/0/1) - confirm against the installed version.
knn = KNeighborsClassifier(n_neighbors=10)
rf = RandomForestClassifier(n_estimators=1000)
xgb = XGBClassifier(n_estimators=1000)
tr = DecisionTreeClassifier()
gnb = GaussianNB()
svc = SVC(C=10)

# Fit in the original order so draws from the global RNG are unchanged.
for estimator in (knn, rf, xgb, tr, gnb, svc):
    estimator.fit(X_train, y_train)

In [None]:
# Accuracy of every baseline model on the training and held-out parts.
for label, estimator in (('KNN', knn), ('RF', rf), ('XGB', xgb),
                         ('TR', tr), ('GNB', gnb), ('SVC', svc)):
    train_score = estimator.score(X_train, y_train)
    test_score = estimator.score(X_test, y_test)
    print('{}:\n  train score: {:.3f}, test score: {:.3f}'.format(label, train_score, test_score))

## Differential Features

In [None]:
def add_diff_features(data):
    """Standardise the numeric columns and add standardised home-minus-away features.

    Steps:
        1. Every column except ``HomeTeam``, ``AwayTeam`` and ``FTR`` is replaced
           by its ``StandardScaler`` output.
        2. Six difference columns are computed from the scaled values and then
           standardised again:
           ``AttackDiff`` (HAS-AAS), ``DefenceDiff`` (HDS-ADS),
           ``ExpGoalDiff`` (HXG-AXG), ``PastGoalDiff`` (PastFTHG-PastFTAG),
           ``PastShotsOnTargetDiff`` (PastHST-PastAST), ``PastShotsDiff`` (PastHS-PastAS).

    NOTE(review): the scaler is fit on every row passed in, i.e. before any
    train/test split made afterwards.

    Args:
        data: DataFrame holding the columns named above. Modified in place.

    Returns:
        The same DataFrame.
    """
    scaler = StandardScaler()

    excluded = ('HomeTeam', 'AwayTeam', 'FTR')
    # Keep the frame's own column order. The previous code wrote the scaled array
    # back through list(set(...)), whose order is arbitrary, so scaled values
    # could land in the wrong columns and differ between kernel restarts.
    feature_columns = [name for name in data.columns if name not in excluded]
    data[feature_columns] = scaler.fit_transform(data[feature_columns])

    diff_pairs = (
        ('AttackDiff', 'HAS', 'AAS'),
        ('DefenceDiff', 'HDS', 'ADS'),
        ('ExpGoalDiff', 'HXG', 'AXG'),
        ('PastGoalDiff', 'PastFTHG', 'PastFTAG'),
        ('PastShotsOnTargetDiff', 'PastHST', 'PastAST'),
        ('PastShotsDiff', 'PastHS', 'PastAS'),
    )
    for diff_name, home_column, away_column in diff_pairs:
        difference = (data[home_column] - data[away_column]).to_numpy().reshape(-1, 1)
        data[diff_name] = scaler.fit_transform(difference).ravel()

    return data

In [None]:
# Standardise the numeric columns and add the *Diff features; the frame is edited in place.
s2019_2020 = add_diff_features(s2019_2020)
s2019_2020.head()

In [None]:
# Differential feature matrix and the encoded result.
X2 = s2019_2020[['AttackDiff', 'DefenceDiff', 'ExpGoalDiff', 'PastGoalDiff', 'PastShotsOnTargetDiff',
                'PastShotsDiff']]
y2 = s2019_2020['FTR']

# Same 85/15 stratified split settings as for the normal features.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2, test_size=0.15, random_state=34, stratify=y2
)

for label, part in (('X_train', X2_train), ('X_test', X2_test),
                    ('y_train', y2_train), ('y_test', y2_test)):
    print('{}.shape: {}'.format(label, part.shape))

In [None]:
# initialize models
# Same six classifiers, now trained on the differential features.
knn = KNeighborsClassifier(n_neighbors=10)
rf = RandomForestClassifier(n_estimators=1000)
xgb = XGBClassifier(n_estimators=1000)
tr = DecisionTreeClassifier()
gnb = GaussianNB()
svc = SVC(C=10)

# Fit in the original order so draws from the global RNG are unchanged.
for estimator in (knn, rf, xgb, tr, gnb, svc):
    estimator.fit(X2_train, y2_train)

In [None]:
# Train / held-out accuracy on the differential features.
for label, estimator in (('KNN', knn), ('RF', rf), ('XGB', xgb),
                         ('TR', tr), ('GNB', gnb), ('SVC', svc)):
    train_score = estimator.score(X2_train, y2_train)
    test_score = estimator.score(X2_test, y2_test)
    print('{}:\n  train score: {:.3f}, test score: {:.3f}'.format(label, train_score, test_score))

## Cross Validation

In [None]:
# 10-fold cross-validated accuracy of each model on the full differential set.
forest_scores, tree_scores, knn_scores, xgb_scores, gnb_scores, svc_scores = (
    cross_val_score(estimator, X2, y2, cv=10)
    for estimator in (rf, tr, knn, xgb, gnb, svc)
)

for label, scores in (
    ('Random Forest Classifier', forest_scores),
    ('Tree Classifier', tree_scores),
    ('K-Nearest Neighbor', knn_scores),
    ('XGB Classifier', xgb_scores),
    ('Gaussian Naive Bayes Classifier', gnb_scores),
    ('Support Vector Classifier', svc_scores),
):
    print('{} Accuracy: {:.2f}%'.format(label, scores.mean() * 100))

In [None]:
# Per-class report for KNN on the normal features (X, y).
# This split is unstratified and rebinds X_train/X_test/y_train/y_test from the earlier cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.15, random_state=0)
predictions = knn.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, predictions))

In [None]:
# Per-class report for Gaussian Naive Bayes on the normal features (X, y),
# using the same unstratified split settings (random_state=0, 15% held out).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.15, random_state=0)
predictions = gnb.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, predictions))

## Append Data 

In [None]:
# Columns kept from every season file (same selection as for 2019-2020).
columns = ['HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 
           'HTAG', 'HS', 'AS', 'HST', 'AST', 'HC', 'AC']

# Load the five earlier seasons. .copy() makes each selection an independent frame,
# so the in-place feature engineering below does not raise SettingWithCopyWarning.
s2018_2019 = pd.read_csv('data/2018-2019.csv')[columns].copy()
s2017_2018 = pd.read_csv('data/2017-2018.csv')[columns].copy()
s2016_2017 = pd.read_csv('data/2016-2017.csv')[columns].copy()
s2015_2016 = pd.read_csv('data/2015-2016.csv')[columns].copy()
s2014_2015 = pd.read_csv('data/2014-2015.csv')[columns].copy()

In [None]:
# Recode FTR to 1 / -1 / 0 in each earlier season.
# Not idempotent: a second run maps every row to 0.
for season in (s2018_2019, s2017_2018, s2016_2017, s2015_2016, s2014_2015):
    season['FTR'] = season.apply(transform_ftr, axis=1, args=('FTR',))

In [None]:
# Strength ratios per season: each call computes its averages and club totals
# from that season's rows only.
s2018_2019 = transform_stat(s2018_2019)
s2017_2018 = transform_stat(s2017_2018)
s2016_2017 = transform_stat(s2016_2017)
s2015_2016 = transform_stat(s2015_2016)
s2014_2015 = transform_stat(s2014_2015)

In [None]:
# Past* windowed totals, computed within each season separately (default k=3).
s2014_2015 = k_perf(s2014_2015)
s2015_2016 = k_perf(s2015_2016)
s2016_2017 = k_perf(s2016_2017)
s2017_2018 = k_perf(s2017_2018)
s2018_2019 = k_perf(s2018_2019)

In [None]:
# Standardise each season on its own and add the *Diff features.
s2014_2015 = add_diff_features(s2014_2015)
s2015_2016 = add_diff_features(s2015_2016)
s2016_2017 = add_diff_features(s2016_2017)
s2017_2018 = add_diff_features(s2017_2018)
s2018_2019 = add_diff_features(s2018_2019)

In [None]:
# Stack all six seasons into one frame (newest first) with a fresh 0..n-1 index.
# DataFrame.append was removed in pandas 2.0; pd.concat with the same options
# produces the same result in a single step.
# HF/AF exist only in the 2019-2020 frame, so they are NaN for the older seasons.
# Not idempotent: running this cell again stacks the older seasons a second time.
s2019_2020 = pd.concat(
    [s2019_2020, s2018_2019, s2017_2018, s2016_2017, s2015_2016, s2014_2015],
    sort=False,
    ignore_index=True,
)
s2019_2020.info()

In [None]:
# Differential features and target, now taken from the combined six-season frame.
X2 = s2019_2020[['AttackDiff', 'DefenceDiff', 'ExpGoalDiff', 'PastGoalDiff', 'PastShotsOnTargetDiff',
                'PastShotsDiff']]
y2 = s2019_2020['FTR']

# 85/15 split, stratified on the target.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2, test_size=0.15, random_state=34, stratify=y2
)

for label, part in (('X_train', X2_train), ('X_test', X2_test),
                    ('y_train', y2_train), ('y_test', y2_test)):
    print('{}.shape: {}'.format(label, part.shape))

In [None]:
# initialize models
# Retrain the six classifiers on the combined-season differential features.
knn = KNeighborsClassifier(n_neighbors=10)
rf = RandomForestClassifier(n_estimators=1000)
xgb = XGBClassifier(n_estimators=1000)
tr = DecisionTreeClassifier()
gnb = GaussianNB()
svc = SVC(C=10)

# Fit in the original order so draws from the global RNG are unchanged.
for estimator in (knn, rf, xgb, tr, gnb, svc):
    estimator.fit(X2_train, y2_train)

In [None]:
# Train / held-out accuracy on the combined-season data.
for label, estimator in (('KNN', knn), ('RF', rf), ('XGB', xgb),
                         ('TR', tr), ('GNB', gnb), ('SVC', svc)):
    train_score = estimator.score(X2_train, y2_train)
    test_score = estimator.score(X2_test, y2_test)
    print('{}:\n  train score: {:.3f}, test score: {:.3f}'.format(label, train_score, test_score))

In [None]:
# 10-fold cross-validated accuracy of each model on the combined-season data.
forest_scores, tree_scores, knn_scores, xgb_scores, gnb_scores, svc_scores = (
    cross_val_score(estimator, X2, y2, cv=10)
    for estimator in (rf, tr, knn, xgb, gnb, svc)
)

for label, scores in (
    ('Random Forest Classifier', forest_scores),
    ('Tree Classifier', tree_scores),
    ('K-Nearest Neighbor', knn_scores),
    ('XGB Classifier', xgb_scores),
    ('Gaussian Naive Bayes Classifier', gnb_scores),
    ('Support Vector Classifier', svc_scores),
):
    print('{} Accuracy: {:.2f}%'.format(label, scores.mean() * 100))

### Support Vector Machines Hyperparameter tuning

In [None]:
# RBF-kernel SVC grid: 10 values of C x 6 values of gamma, scored with 10-fold CV.
param_grid = {
    'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 20, 50, 100],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001],
    'kernel': ['rbf']
}
clf = GridSearchCV(svc, param_grid, cv=10, n_jobs=-1)
# Search on the training part only. Fitting on all of X2 let the held-out rows
# influence the chosen parameters, so the X2_test score reported afterwards
# was optimistic.
clf.fit(X2_train, y2_train)

In [None]:
# Parameter combination with the best mean cross-validation score in the SVC search.
clf.best_params_

In [None]:
# Refit an SVC with the selected parameters and report held-out performance.
svc = SVC(**clf.best_params_).fit(X2_train, y2_train)
predictions = svc.predict(X2_test)
print('X_test score: {}'.format(svc.score(X2_test, y2_test)))
print(classification_report(y2_test, predictions))

### KNN Hyperparameter tuning

In [None]:
# KNN grid: neighbourhood size 1..199 and Minkowski power p (1 = Manhattan, 2 = Euclidean).
# leaf_size was dropped from the grid: per the scikit-learn docs it tunes the speed
# and memory of the BallTree/KDTree, not the neighbour search result, and sweeping
# 49 values of it multiplied the search from 398 to 19,502 candidates.
param_grid = {
    'n_neighbors': list(range(1, 200)),
    'p': [1, 2]
}
clf = GridSearchCV(knn, param_grid, cv=10, n_jobs=-1)
# Search on the training part only, so the X2_test rows used for the final
# report do not influence the chosen parameters.
clf.fit(X2_train, y2_train)

In [None]:
# Parameter combination with the best mean cross-validation score in the KNN search.
clf.best_params_

In [None]:
# Refit KNN with the selected parameters and report held-out performance.
knn = KNeighborsClassifier(**clf.best_params_).fit(X2_train, y2_train)
predictions = knn.predict(X2_test)
print('X_test score: {}'.format(knn.score(X2_test, y2_test)))
print(classification_report(y2_test, predictions))

In [None]:
# Gaussian Naive Bayes with default settings, for comparison with the tuned models.
gnb = GaussianNB().fit(X2_train, y2_train)
predictions = gnb.predict(X2_test)
print('X_test score: {}'.format(gnb.score(X2_test, y2_test)))
print(classification_report(y2_test, predictions))

In [None]:
class MakeAttributes(TransformerMixin, BaseEstimator):
    """Stateless pipeline step that engineers the model's input features.

    ``transform`` chains ``transform_stat`` -> ``k_perf`` -> ``add_diff_features``.
    Inheriting ``TransformerMixin`` supplies the standard ``fit_transform``, so
    the step follows the scikit-learn transformer interface.
    """
    
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        """Learn nothing and return ``self``; present for the estimator API."""
        return self
    
    def transform(self, X, y=None):
        """Return a copy of ``X`` with all engineered columns added.

        Args:
            X: DataFrame with the raw match columns the three helpers read.
            y: ignored.

        Returns:
            A new DataFrame; the caller's ``X`` is left untouched.
        """
        # The helpers add and overwrite columns in place. Working on a copy keeps
        # the caller's frame intact and avoids SettingWithCopyWarning when X is
        # itself a slice of another frame.
        X = X.copy()
        X = transform_stat(X)
        X = k_perf(X)
        X = add_diff_features(X)
        return X

In [None]:
# Spot-check the engineered columns of a single season.
s2014_2015.head()

In [None]:
# Spot-check the combined frame (s2019_2020 now holds all six seasons).
s2019_2020.head()

In [None]:
# Final model: RBF-kernel SVC trained on the combined-season training split.
# NOTE(review): C and gamma are hard-coded here rather than read from a grid-search
# result - confirm they match the tuned parameters.
model = SVC(C=10, gamma=0.01, kernel='rbf')
model.fit(X2_train, y2_train)
print('Train score: {}'.format(model.score(X2_train, y2_train)))
print('Test score: {}'.format(model.score(X2_test, y2_test)))

In [None]:
# Persist the fitted model. The context manager flushes and closes the file;
# the bare open() call left the handle open.
with open('model2.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
def transform_back_ftr(row, column_name):
    """Decode a numeric result back to its letter code.

    Args:
        row: mapping-like object (e.g. a DataFrame row) indexable by ``column_name``.
        column_name: key of the field holding the numeric result.

    Returns:
        ``'H'`` for 1, ``'A'`` for -1, and ``'D'`` for any other value.
    """
    encoded = row[column_name]
    if encoded == 1:
        return 'H'
    return 'A' if encoded == -1 else 'D'

## Pipeline

In [None]:
model_pipeline = Pipeline(steps=[('Make Attributes', MakeAttributes())])

# pickle.load can execute arbitrary code; only load model files you created yourself.
# The context manager closes the file handle that the bare open() call left open.
with open('model2.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# .copy() so the pipeline's column additions do not hit a slice of the raw frame.
data = pd.read_csv('data/2020-2021.csv')
data = data[columns].copy()
predict_columns = []  # unused in the visible notebook
# NOTE(review): the features are built from this file's own results (season
# aggregates and Past* windows), so these predictions are not out-of-sample.
data_tr = model_pipeline.fit_transform(data)[['AttackDiff', 'DefenceDiff', 'ExpGoalDiff', 'PastGoalDiff', 'PastShotsOnTargetDiff',
                'PastShotsDiff']]
predictions = model.predict(data_tr)
# Copy before adding a column, so the assignment is not made on a slice.
data = data[['HomeTeam', 'AwayTeam', 'FTR']].copy()
data['Predictions'] = predictions
# Show predictions with the same H/A/D codes as the FTR column.
data.Predictions = data.apply(lambda row: transform_back_ftr(row, 'Predictions'), axis=1)
data