In [142]:
import pandas as pd
import sys
sys.path[0] = "c:/Users/Robert/Documents/Projekte/dev/bettingTool/"


In [143]:
from sklearn.metrics import (accuracy_score, confusion_matrix, f1_score,
                             precision_score, recall_score, roc_auc_score,
                             roc_curve)
import numpy as np
import shap


def set_dummies_div(df, cat, divs=[]):
    if cat in df.columns:
        if divs == []:
            divs = list(set(df[cat]))
        for d in divs:
            df[d] = [1 if ele == d else 0 for ele in df[cat]]
        df = df.drop(cat, axis=1)
    return df


def evaluate_model(model, y_test, X_test, show=False,threshold=''):
    if threshold == '':
        y_pred = model.predict(X_test)
    else:
       y_pred = model.predict_proba(X_test)[:,1]>threshold

    modellguete = dict()
    modellguete["roc_auc_score"] = np.round(roc_auc_score(y_test, y_pred,), 3)
    modellguete["f1_score"] = np.round(f1_score(y_test, y_pred), 3)
    modellguete["accuracy_score"] = np.round(accuracy_score(y_test, y_pred), 3)
    modellguete["precision_score"] = np.round(
        precision_score(y_test, y_pred), 3)
    modellguete["recall_score"] = np.round(recall_score(y_test, y_pred), 3)

    print('AUC Score : ', modellguete["roc_auc_score"])
    print('F1 Score : ', modellguete["f1_score"])
    print('Accuracy Score : ', modellguete["accuracy_score"])
    print('Precision Score : ', modellguete["precision_score"])
    print('Recall Score : ', modellguete["recall_score"])

    # Model Accuracy, how often is the classifier correct?
    print('Confusion Matrix : \n' +
          str(np.round(confusion_matrix(y_test, y_pred), 3)))
    if True:
        try:
            print('-'*100)
            print('Feature Importance',)
            feature_imp = pd.Series(
                model.feature_importances_, index=X_test.columns).sort_values(ascending=False)[0:15]
            feature_imp.to_json()
            print('Important')
            print(feature_imp)
            feature_not_imp = pd.Series(
                model.feature_importances_, index=X_test.columns).sort_values(ascending=True)[0:15]
            print('Not Important')
            print(feature_not_imp)
        except:
            print('Fehler bei Feature Importance')
            pass

    if False:
        try:
            print('-'*100)
            print('Shap Values',)
            rf_shap_values = shap.TreeExplainer(model).shap_values(
                X_test.sample(100, random_state=42))
            shap_df = pd.DataFrame(rf_shap_values[0], columns=X_test.columns)
            print(np.abs(shap_df).mean().sort_values(ascending=False))
        except:
            pass

        try:
            X_test.corr().round(2)[X_test.corr().round(2) > 0.3].to_csv(
                'C:/Users/Robert/Documents/corr.csv', sep=';', decimal=',', index=True)
        except:
            pass

    print('-'*100)
    return modellguete


In [144]:
import pandas as pd
from src.utils import globals as GLOBALS


In [145]:
data_raw = pd.DataFrame()
for leagues in GLOBALS.EXTRA_LEAGUES:
    df = pd.read_csv(
        f"C:/Users/Robert/Documents/Projekte/dev/bettingTool/data/leagues/{leagues}/season/{leagues}_2223.csv", sep=',', index_col=0, encoding='utf8')
    data_raw = pd.concat([df, data_raw], axis=0)


In [146]:
data_raw.index = data_raw.Date +'_'+ data_raw.Home+"_"+data_raw.Home


In [147]:
df_home = data_raw[["Home", "HG", "AG", "PH", "MaxH", "MaxD", "AvgH", "AvgD", "MaxA", "AvgA"]].rename(
    {"Home": "Team", "HG": "FTG", "AG": "FTOG", "PH": "P", "MaxH": "Max", "AvgH": "Avg", "MaxA": "MaxO", "AvgA": "AvgO"}, axis=1)
df_home["atHome"] = True
df_away = data_raw[["Away", "AG", "HG", "PA", "MaxA", "MaxD", "AvgA", "AvgD", "MaxH", "AvgH"]].rename(
    {"Away": "Team", "AG": "FTG", "HG": "FTOG", "PA": "P", "MaxA": "Max", "AvgA": "Avg", "MaxH": "MaxO", "AvgH": "AvgO"}, axis=1)
df_away["atHome"] = False
data = pd.concat([df_home, df_away], axis=0)

data.index = data.index+"_TEAM_"+data.Team


In [148]:
data

Unnamed: 0,Team,FTG,FTOG,P,Max,MaxD,Avg,AvgD,MaxO,AvgO,atHome
10/03/2012_Colorado Rapids_Colorado Rapids_TEAM_Colorado Rapids,Colorado Rapids,2.0,0.0,2.06,2.13,3.40,2.01,3.19,4.11,3.65,True
10/03/2012_Vancouver Whitecaps_Vancouver Whitecaps_TEAM_Vancouver Whitecaps,Vancouver Whitecaps,2.0,0.0,2.04,2.04,3.44,1.97,3.27,4.11,3.70,True
11/03/2012_DC United_DC United_TEAM_DC United,DC United,0.0,1.0,2.57,2.60,3.60,2.46,3.11,3.04,2.81,True
11/03/2012_Los Angeles Galaxy_Los Angeles Galaxy_TEAM_Los Angeles Galaxy,Los Angeles Galaxy,1.0,3.0,1.88,1.89,3.57,1.81,3.33,5.00,4.31,True
11/03/2012_San Jose Earthquakes_San Jose Earthquakes_TEAM_San Jose Earthquakes,San Jose Earthquakes,1.0,0.0,1.83,1.91,3.68,1.82,3.36,5.00,4.23,True
...,...,...,...,...,...,...,...,...,...,...,...
11/09/2022_Arsenal Sarandi_Arsenal Sarandi_TEAM_Velez Sarsfield,Velez Sarsfield,2.0,2.0,2.43,2.56,3.18,2.44,2.98,3.46,3.16,False
11/09/2022_Newells Old Boys_Newells Old Boys_TEAM_Gimnasia L.P.,Gimnasia L.P.,0.0,2.0,2.96,2.96,3.16,2.85,2.94,2.83,2.70,False
11/09/2022_Boca Juniors_Boca Juniors_TEAM_River Plate,River Plate,0.0,1.0,2.31,2.35,3.18,2.27,3.00,3.75,3.47,False
12/09/2022_Talleres Cordoba_Talleres Cordoba_TEAM_Defensa y Justicia,Defensa y Justicia,0.0,1.0,4.65,4.70,3.40,4.30,3.26,2.03,1.92,False


In [149]:
data["Target"] = data.FTG>data.FTOG
data = data.drop('Team',axis=1)
data = data.drop(["FTG","FTOG"],axis=1)

In [150]:
data = data[data.isna().T.sum()==0]

In [151]:
from sklearn.model_selection import train_test_split

y = data['Target']
X = data.drop(['Target'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, stratify=y)

In [153]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV  
from xgboost import XGBClassifier, XGBRegressor
model = RandomForestClassifier(
     n_estimators=200, random_state=0, verbose=1, min_samples_leaf=5, min_samples_split=10)
model.fit(X=X_train, y=y_train)


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:   20.4s finished


RandomForestClassifier(min_samples_leaf=5, min_samples_split=10,
                       n_estimators=200, random_state=0, verbose=1)

In [156]:
evaluate_model(model, y_test, X_test, show=False,threshold=0.7)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


AUC Score :  0.536
F1 Score :  0.164
Accuracy Score :  0.652
Precision Score :  0.721
Recall Score :  0.093
Confusion Matrix : 
[[13775   295]
 [ 7465   762]]
----------------------------------------------------------------------------------------------------
Feature Importance
Important
P         0.165190
Avg       0.155074
AvgO      0.149333
Max       0.140263
MaxO      0.139020
AvgD      0.118457
MaxD      0.113049
atHome    0.019614
dtype: float64
Not Important
atHome    0.019614
MaxD      0.113049
AvgD      0.118457
MaxO      0.139020
Max       0.140263
AvgO      0.149333
Avg       0.155074
P         0.165190
dtype: float64
----------------------------------------------------------------------------------------------------


[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    1.1s finished


{'roc_auc_score': 0.536,
 'f1_score': 0.164,
 'accuracy_score': 0.652,
 'precision_score': 0.721,
 'recall_score': 0.093}