In [1]:
import sys
sys.path.append('/Users/robertcampbell/sqlalchemy-tutorial/')
sys.path
import pytz
from datetime import datetime
from typing import List
from sqlalchemy.sql import text

import pandas as pd
import numpy as np
from numpy.dtypes import DateTime64DType

from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso, Ridge, ElasticNet, LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, accuracy_score, auc, f1_score, make_scorer, r2_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline 
from sklearn.preprocessing import StandardScaler
from category_encoders import OneHotEncoder

from schema.schema import Transfer, Event, Fixture, Player
from main import engine

In [2]:
set_config(transform_output="pandas")
DRAW_THRESHOLD = 0.5
fixtures = Fixture.query.all()
game = fixtures[0]
f"{game.away_team.name}: {game.away_goals} {game.home_team.name}: {game.home_goals}"

'Leicester: 1 Manchester United: 2'

# Create Features

In [3]:
statement = text("SELECT * FROM denorm_stats")
with engine.connect() as conn:
    results = conn.execute(statement)

df = pd.DataFrame(results)
df['start_time'] = pd.to_datetime(df['start_time'])


In [16]:
def get_outcome(score_diff):
    if score_diff > 0:
        return 1
    elif score_diff < 0:
        return 2
    else:
        return 3

In [17]:
col_types = pd.Series(df.dtypes.apply(lambda x: str(x)))
numeric_columns = col_types[col_types.str.contains('float')].index.to_list()
date_columns = ["start_time"]

In [18]:
df.drop(list(range(20)), inplace=True)
home_goals = df['home_goals']
away_goals = df['away_goals']
X = df.drop(['id', 'home_team_id', 'away_team_id', 'home_goals', 'away_goals'], axis=1)
X_train, X_test, y_home_train, y_home_test, y_away_train, y_away_test = train_test_split(X, home_goals, away_goals, test_size=0.2, shuffle=False)

In [19]:
weekdays = {
    0: 'Monday',
    1: 'Tuesday',
    2: 'Wednesday',
    3: 'Thursday',
    4: 'Friday',
    5: 'Saturday',
    6: 'Sunday'
}

In [20]:
# create custom transformer
class DateTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        """ fit """
        return self
    
    def to_weekday(self, date:datetime) -> int:
        """ RETURN weekday as a string """
        return weekdays[date.weekday()].lower()
    
    def is_weekend(self, date:datetime) -> int:
        """ RETURN 1 if its a weekend and 0 otherwise"""
        if date.weekday() >= 5:
            return 1
        return 0
    
    def time_of_day_classification(self, date:datetime) -> int:
        """TODO implement time of day classification: Morning, Midday, Night """
        pass

    def transform(self, X:pd.DataFrame):
        """ transform """
        types = pd.Series(X.dtypes.apply(lambda x: str(x)))
        date_columns = types[types.str.contains('datetime')].index.to_list()

        return_cols = []
        for date_col in date_columns:
            # applying functions
            X[f"{date_col}_weekday"] = X[date_col].apply(self.to_weekday)
            X[f"{date_col}_is_weekend"] = X[date_col].apply(self.is_weekend)

            # add to return cols list
            return_cols.append(f"{date_col}_weekday")
            return_cols.append(f"{date_col}_is_weekend")
            
        return X.loc[:, return_cols]

In [21]:
# Evaluation Function
mae_score = make_scorer(mean_absolute_error)

In [22]:
parameters = {
    'alpha': [0.03, 0.05, 0.1],
    'l1_ratio': [0.1, 0.3, 0.5, 0.7]
}

date_transformer = Pipeline(steps=[
    ('transformer', DateTransformer()),
    ('encoder', OneHotEncoder(use_cat_names=True))])

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

feature_preprocessor = ColumnTransformer(transformers=[
    ('numerical', numeric_transformer, numeric_columns),
    ('datetime', date_transformer, date_columns)
])

home_pipeline = Pipeline(steps=[
    ('preprocessor', feature_preprocessor),
    ('regressor', GridSearchCV(ElasticNet(), param_grid=parameters, refit=True, scoring=mae_score))
])

away_pipeline = Pipeline(steps=[
    ('preprocessor', feature_preprocessor),
    ('regressor', GridSearchCV(ElasticNet(), param_grid=parameters, refit=True, scoring=mae_score))
])

In [24]:
# FIT model
home_pipeline.fit(X_train, y_home_train)
away_pipeline.fit(X_train, y_away_train)

# Eval on test set
y_home_pred = home_pipeline.predict(X_test)
y_away_pred = away_pipeline.predict(X_test)

print(f"HOME TRAIN MAE: {mean_absolute_error(home_pipeline.predict(X_train), y_home_train)}")
print(f"AWAY TRAIN MAE: {mean_absolute_error(home_pipeline.predict(X_train), y_away_train)}")
print(f"HOME TEST MAE: {mean_absolute_error(y_home_pred.round(), y_home_test)}")
print(f"AWAY TEST MAE: {mean_absolute_error(y_away_pred.round(), y_away_test)}")

 'home_defenders_cumulative_rating' 'away_forwards_cumulative_rating'
 'away_midfielders_cumulative_rating' 'away_defenders_cumulative_rating']. At least one non-missing value is needed for imputation with strategy='median'.
 'home_defenders_cumulative_rating' 'away_forwards_cumulative_rating'
 'away_midfielders_cumulative_rating' 'away_defenders_cumulative_rating']. At least one non-missing value is needed for imputation with strategy='median'.


HOME TRAIN MAE: 0.9799489661237317
AWAY TRAIN MAE: 1.0949391444683776
HOME TEST MAE: 0.986764705882353
AWAY TEST MAE: 0.9102941176470588


 'home_defenders_cumulative_rating' 'away_forwards_cumulative_rating'
 'away_midfielders_cumulative_rating' 'away_defenders_cumulative_rating']. At least one non-missing value is needed for imputation with strategy='median'.
 'home_defenders_cumulative_rating' 'away_forwards_cumulative_rating'
 'away_midfielders_cumulative_rating' 'away_defenders_cumulative_rating']. At least one non-missing value is needed for imputation with strategy='median'.
 'home_defenders_cumulative_rating' 'away_forwards_cumulative_rating'
 'away_midfielders_cumulative_rating' 'away_defenders_cumulative_rating']. At least one non-missing value is needed for imputation with strategy='median'.
 'home_defenders_cumulative_rating' 'away_forwards_cumulative_rating'
 'away_midfielders_cumulative_rating' 'away_defenders_cumulative_rating']. At least one non-missing value is needed for imputation with strategy='median'.


In [27]:
# GETTING ACCURACY SCORE FOR CLASSIFICATION --> TODO
def get_results(home_array: List[float], away_array: List[float]) -> List[float]:
    """ get results """
    results_array = []
    for home_pred, away_pred in zip(home_array, away_array):
        diff = home_pred - away_pred
        if abs(diff) <= DRAW_THRESHOLD: # draw
            results_array.append(0)
        elif diff > 0: # home team winds
            results_array.append(1)
        else: # away team wins
            results_array.append(2)
    return results_array

# create win, loss, draw array
pred_results = get_results(home_array=y_home_pred, away_array=y_away_pred)
actual_results = get_results(home_array=y_home_test, away_array=y_away_test)
accuracy_score(pred_results, actual_results)
# f1_score(actual_results, pred_results, average='macro')

0.4147058823529412

In [28]:
# GET FEATURE IMPORTANCES
def get_feature_importances(pipeline, X_test):
    regressor = pipeline['regressor'].best_estimator_
    coefs = list(np.round(regressor.coef_, 3))
    features = pipeline['preprocessor'].transform(X_test).columns.to_list()
    zipped = list(zip(coefs, features))
    feature_importances = sorted(zipped, key=lambda x: abs(x[0]), reverse=True)
    return feature_importances

In [29]:
home_features = get_feature_importances(home_pipeline, X_test)
away_features = get_feature_importances(away_pipeline, X_test)

 'home_defenders_cumulative_rating' 'away_forwards_cumulative_rating'
 'away_midfielders_cumulative_rating' 'away_defenders_cumulative_rating']. At least one non-missing value is needed for imputation with strategy='median'.
 'home_defenders_cumulative_rating' 'away_forwards_cumulative_rating'
 'away_midfielders_cumulative_rating' 'away_defenders_cumulative_rating']. At least one non-missing value is needed for imputation with strategy='median'.


In [30]:
# Top Features
print(f"TOP 10 home_features: {list(map(lambda x: x[1], home_features[:10]))}")
print(f"TOP 10 away_features: {list(map(lambda x: x[1], away_features[:10]))}")

TOP 10 home_features: ['numerical__home_defenders_cumulative_average_total_passes', 'numerical__home_midfielders_cumulative_average_key_passes', 'numerical__away_defenders_cumulative_average_total_passes', 'numerical__away_defenders_cumulative_duels_won_percentage', 'numerical__home_midfielders_cumulative_average_total_passes', 'numerical__home_midfielders_cumulative_average_total_blocks', 'numerical__away_midfielders_cumulative_average_key_passes', 'numerical__home_forwards_cumulative_average_key_passes', 'numerical__away_defenders_cumulative_average_total_blocks', 'numerical__home_forwards_cumulative_average_total_passes']
TOP 10 away_features: ['numerical__away_midfielders_cumulative_average_key_passes', 'numerical__away_defenders_cumulative_average_total_passes', 'numerical__away_defenders_cumulative_average_key_passes', 'numerical__away_forwards_cumulative_average_key_passes', 'numerical__home_midfielders_cumulative_average_key_passes', 'numerical__home_forwards_cumulative_average

In [26]:
# CREATE Trial Folder
# Hyperparameter Table Lasso Regression
# selected parameters, eval metrics, best prediction, 
# alpha value, MAE, MSE, R2, is_best, model_path, Train_data (split into hold out set for k_fold cross_validation), Test_data
home_pipeline['regressor'].best_estimator_

In [27]:
df.to_csv('../data/premier_league_2022.csv')