# Problem
Our problem is a classification problem, our target is whether a shot is goal or not.

# Where Can We Use This Model In Real World ?

We can use this model to analyze player performances, understand player habits on the field, determining which players to change in game or find out in advance whether a new player will be compatible with the team or not during transfer seasons.

# Imports

In [255]:
import pandas as pd
import numpy as np
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
import rpy2.robjects.packages as rpackages
from rpy2.robjects.conversion import localconverter
from imblearn.over_sampling import SMOTE
from collections import Counter
import warnings
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV
import h2o
from h2o.automl import H2OAutoML
import pickle

In [256]:
utils = rpackages.importr('utils')
utils.chooseCRANmirror(ind=1)

<rpy2.rinterface_lib.sexp.NULLType object at 0x13faad3d0> [0]

# Getting Dataset Over R from Python

In [257]:
pandas2ri.activate()
ro.r('''
        library("worldfootballR")
        laliga <- load_understat_league_shots(league = "La liga")
     ''')
laliga = pandas2ri.rpy2py(ro.r['laliga'])
laliga.drop('league', axis=1, inplace=True)
data = laliga[(laliga['date'] > '2022-01-01') & (laliga['date'] < '2024-06-10')]

→ Data last updated 2024-05-30 18:34:46.012307882309 UTC


We are going to use La Liga dataset between 2023-01-01 and 2024-06-10.

# Data Manipulations and Fixes

We have a problem about NaN values and duplicate features, we are going to fix these problems by manipulating the data.

In [258]:
warnings.filterwarnings('ignore')

def fixDataNaN(df):
    with localconverter(ro.default_converter + pandas2ri.converter):
        df = ro.conversion.py2rpy(df)
pairs = [['x','X'],['y','Y'],['x_g','xG'],['h_a','home_away'],['shot_type','shotType'],['last_action','lastAction']]

def camel_case_columns(df):
    def camel_case(column_name):
        parts = column_name.split('_')
        return str(parts[0] + ''.join(x.title() for x in parts[1:]))
    
    new_columns = []
    for column in df.columns:
        if '_' in column:
            new_columns.append(camel_case(column))
        else:
            new_columns.append(str(column))
    
    df.columns = new_columns
    return df

def fixMergeColumns(dataList, pairs):
    for targetData in dataList:
        for pair in pairs:
            if pair[0] in targetData.columns and pair[1] in targetData.columns:
                targetData['{}'.format(pair[1])].fillna(targetData['{}'.format(pair[0])], inplace=True)
                targetData.drop(columns=['{}'.format(pair[0])], inplace=True)
        targetData = camel_case_columns(targetData)
        fixDataNaN(targetData)

fixMergeColumns([data], pairs)

We are going to train our model to predict whether the position ends up to a goal or not so we need to convert our goal and not goal situations to binary tags.

In [259]:
replacement_dict = {
    'Goal': 'Goal',
    'BlockedShot': 'No Goal',
    'MissedShots': 'No Goal',
    'SavedShot': 'No Goal',
    'ShotOnPost': 'No Goal',
    'OwnGoal': 'No Goal'
}

data['result'] = data['result'].map(replacement_dict)

# Exploring Data

In [260]:
data.head(2)

Unnamed: 0,id,minute,result,X,Y,xG,player,playerId,situation,season,shotType,matchId,homeTeam,awayTeam,homeGoals,awayGoals,date,playerAssisted,lastAction,homeAway
66772,451021.0,8.0,Goal,0.907,0.404,0.404479,Enes Ünal,6219.0,OpenPlay,2021.0,RightFoot,17323.0,Getafe,Real Madrid,1.0,0.0,2022-01-02 13:00:00,,Tackle,h
66773,451022.0,13.0,No Goal,0.738,0.457,0.281229,Nemanja Maksimovic,6076.0,OpenPlay,2021.0,LeftFoot,17323.0,Getafe,Real Madrid,1.0,0.0,2022-01-02 13:00:00,,Dispossessed,h


### Target Variable

Our target variable is "result", this feature represents whether a shot is a goal or not.

### Feature Variables

Our feature variables are going to help our model to learn and predict the target variable.

In [261]:
print(data.drop('result',axis=1).columns.tolist())

['id', 'minute', 'X', 'Y', 'xG', 'player', 'playerId', 'situation', 'season', 'shotType', 'matchId', 'homeTeam', 'awayTeam', 'homeGoals', 'awayGoals', 'date', 'playerAssisted', 'lastAction', 'homeAway']


# Imbalancedness

Our data has imbalancedness problem which might cause our model to learn features of majority target variables better than the minority target variables, it might cause inaccurate predictions.

In [262]:
print(data['result'].value_counts())

result
No Goal    20985
Goal        2379
Name: count, dtype: int64


In [263]:
X = data.drop('result', axis=1)
Y = data['result']

X = pd.get_dummies(X)

data = pd.concat([pd.DataFrame(X), pd.DataFrame(Y)], axis=1)

In [264]:
data_shuffled = data.sample(frac=1, random_state=42)
half_length = len(data_shuffled) // 2
df_half1 = data_shuffled.iloc[:half_length]
df_half2 = data_shuffled.iloc[half_length:]

data = df_half1
validation_data = df_half2

We are going to use SMOTE method for oversampling to fix the gap between minority and majority target variables. This method uses clustering methods to create new observations based on original ones.

In [265]:
Y = data['result']
x = data.drop('result', axis=1)
scaler = MinMaxScaler()
X = scaler.fit_transform(x)

print('Original dataset shape %s' % Counter(Y))
sm = SMOTE(random_state=42,n_jobs=-1)
x_res, y_res = sm.fit_resample(X, Y)
print('Resampled dataset shape %s' % Counter(y_res))

Original dataset shape Counter({'No Goal': 10478, 'Goal': 1204})
Resampled dataset shape Counter({'No Goal': 10478, 'Goal': 10478})


In [266]:
Y_validation = validation_data['result']
x_validation = validation_data.drop('result', axis=1)

X_validation = scaler.fit_transform(x_validation)

# Model

In [267]:
x_train, x_test, y_train, y_test = train_test_split(x_res, y_res, test_size=0.25, random_state=42, stratify=y_res)

We have found the best model to use is Random Forest after comparing Decision Tree, Logistic Regression,XGBoost and Random Forest.

In [268]:
model = RandomForestClassifier()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

5-Fold Cross Validation score:

In [269]:
print(cross_val_score(model, x_train, y_train, cv=5, scoring='accuracy').mean())

0.9611250899652607


### Train - Test Accuracy Comparison to Check Overfitting

Our model seems to have learned train set perfectly, we might have suspected of overfitting much more than the current situation if the test accuracies were bad but accuracy and balanced accuracy scores on train and test sets are very close to each other. Our model is good to go, but we are going to check if hyperparameter tuning or automl could improve our performance.

In [270]:
test_accuracy = accuracy_score(y_test, y_pred)
test_balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
test_confusion_matrix = confusion_matrix(y_test, y_pred)

train_accuracy = accuracy_score(y_train, model.predict(x_train))
train_balanced_accuracy = balanced_accuracy_score(y_train, model.predict(x_train))
train_confusion_matrix = confusion_matrix(y_train, model.predict(x_train))

In [271]:
print('Test Accuracy: {}\nTrain Accuracy: {}\n'.format(test_accuracy, train_accuracy))
print('Test Balanced Accuracy: {}\nTrain Balanced Accuracy: {}\n'.format(test_balanced_accuracy, train_balanced_accuracy))
print('Test Confusion Matrix: \n{}\n\nTrain Confusion Matrix: \n{}'.format(test_confusion_matrix, train_confusion_matrix))

Test Accuracy: 0.9622065279633518
Train Accuracy: 1.0

Test Balanced Accuracy: 0.9622076195972473
Train Balanced Accuracy: 1.0

Test Confusion Matrix: 
[[2506  114]
 [  84 2535]]

Train Confusion Matrix: 
[[7858    0]
 [   0 7859]]


### Hyperparameter Tuning

We are going to use the Halving Random Search method and 5-Fold Cross Validation together to find the best hyperparameters, evaluating our iterations by balanced accuracy scores.

The Halving Random Search method uses an elimination system as its base idea: the best of the two compared hyperparameter sets rises above on the leaderboard, and we get the best hyperparameters after the final round. Another pro of Halving Random Search is that it is much faster than Random Search.

In [272]:
param_grid = {
    'n_estimators': [int(x) for x in np.linspace(start=50, stop=5000, num=10)],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [int(x) for x in np.linspace(2, 50)],
    'min_samples_split': [int(x) for x in np.linspace(2, 5)],
    'min_samples_leaf': [int(x) for x in np.linspace(2, 5)],
    'bootstrap': [True, False],
}

halving = HalvingRandomSearchCV(model, param_grid, factor=3, resource='n_samples', max_resources=1000, random_state=42, verbose=0,scoring='balanced_accuracy', n_jobs=-1)
halving.fit(x_train, y_train)
print("Best Params:{}/nBest Balanced Accuracy:{}".format(halving.best_params_,halving.best_score_))



Best Params:{'n_estimators': 4450, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 32, 'bootstrap': False}/nBest Balanced Accuracy:0.8637369061968074


We did not get what we wanted from hyperparameter tuning, our balanced accuracy decreased and it might not be ideal to consume more time to random searching, so let's see how AutoML is going to work.

In [273]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,4 hours 19 mins
H2O_cluster_timezone:,Europe/Istanbul
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.2
H2O_cluster_version_age:,27 days
H2O_cluster_name:,H2O_from_python_sezaiufukoral_x1b2rc
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,808 Mb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [274]:
hf = h2o.H2OFrame(pd.concat([pd.DataFrame(x_res), pd.DataFrame(y_res)], axis=1))

train_hf, test_hf = hf.split_frame(ratios=[0.75], seed = 1)

### AutoML

We are going to use the H2O library because of its ease of use. We are going to set the maximum runtime to 300 seconds so our process won't consume too much time. AutoML is going to use 5-Fold Cross Validation by default and compare balanced accuracies and more metrics to choose the best model for our data.

In [275]:
aml = H2OAutoML(max_models = 20,
                balance_classes=True,
		seed =1,max_runtime_secs=600,verbosity='none')

aml.train(training_frame = train_hf, y = 'result')

Unnamed: 0,number_of_trees,number_of_nternal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
,165.0,165.0,116235.0,10.0,15.0,14.563637,24.0,71.0,50.854546

Unnamed: 0,Goal,No Goal,Error,Rate
Goal,5401.0,94.0,0.0171,(94.0/5495.0)
No Goal,33.0,5463.0,0.006,(33.0/5496.0)
Total,5434.0,5557.0,0.0116,(127.0/10991.0)

metric,threshold,value,idx
max f1,0.5385782,0.9885099,205.0
max f2,0.4474125,0.9926471,226.0
max f0point5,0.7290621,0.9899091,155.0
max accuracy,0.5385782,0.9884451,205.0
max precision,0.9993068,1.0,0.0
max recall,0.1870207,1.0,291.0
max specificity,0.9993068,1.0,0.0
max absolute_mcc,0.5385782,0.9769503,205.0
max min_per_class_accuracy,0.6074576,0.9872611,188.0
max mean_per_class_accuracy,0.5385782,0.9884446,205.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0100082,0.9991044,1.999818,1.999818,1.0,0.9993637,1.0,0.9993637,0.0200146,0.0200146,99.9818049,99.9818049,0.0200146
2,0.0200164,0.9986783,1.999818,1.999818,1.0,0.9988894,1.0,0.9991265,0.0200146,0.0400291,99.9818049,99.9818049,0.0400291
3,0.0300246,0.9983322,1.999818,1.999818,1.0,0.9985194,1.0,0.9989241,0.0200146,0.0600437,99.9818049,99.9818049,0.0600437
4,0.0400328,0.9980561,1.999818,1.999818,1.0,0.9981982,1.0,0.9987427,0.0200146,0.0800582,99.9818049,99.9818049,0.0800582
5,0.0500409,0.9976626,1.999818,1.999818,1.0,0.9978596,1.0,0.9985661,0.0200146,0.1000728,99.9818049,99.9818049,0.1000728
6,0.1000819,0.9956482,1.999818,1.999818,1.0,0.9966898,1.0,0.9976279,0.1000728,0.2001456,99.9818049,99.9818049,0.2001456
7,0.1500318,0.9930942,1.999818,1.999818,1.0,0.9944488,1.0,0.9965695,0.0998908,0.3000364,99.9818049,99.9818049,0.3000364
8,0.2001638,0.9890695,1.999818,1.999818,1.0,0.9912729,1.0,0.9952429,0.1002547,0.4002911,99.9818049,99.9818049,0.4002911
9,0.3000637,0.9737378,1.999818,1.999818,1.0,0.9823921,1.0,0.9909645,0.1997817,0.6000728,99.9818049,99.9818049,0.6000728
10,0.4000546,0.9297783,1.999818,1.999818,1.0,0.9553362,1.0,0.9820595,0.1999636,0.8000364,99.9818049,99.9818049,0.8000364

Unnamed: 0,Goal,No Goal,Error,Rate
Goal,778.0,70.0,0.0825,(70.0/848.0)
No Goal,21.0,800.0,0.0256,(21.0/821.0)
Total,799.0,870.0,0.0545,(91.0/1669.0)

metric,threshold,value,idx
max f1,0.3256764,0.9461857,246.0
max f2,0.1826618,0.9674352,283.0
max f0point5,0.6268161,0.9391176,199.0
max accuracy,0.3611765,0.9454763,242.0
max precision,0.9995102,1.0,0.0
max recall,0.052349,1.0,332.0
max specificity,0.9995102,1.0,0.0
max absolute_mcc,0.3256764,0.8925656,246.0
max min_per_class_accuracy,0.5675515,0.9386792,209.0
max mean_per_class_accuracy,0.3256764,0.9459371,246.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0101857,0.9991207,2.0328867,2.0328867,1.0,0.9993892,1.0,0.9993892,0.0207065,0.0207065,103.2886724,103.2886724,0.0207065
2,0.0203715,0.9987495,2.0328867,2.0328867,1.0,0.9989067,1.0,0.9991479,0.0207065,0.0414129,103.2886724,103.2886724,0.0414129
3,0.0305572,0.9982666,2.0328867,2.0328867,1.0,0.9985087,1.0,0.9989349,0.0207065,0.0621194,103.2886724,103.2886724,0.0621194
4,0.0401438,0.9977805,2.0328867,2.0328867,1.0,0.9980698,1.0,0.9987283,0.0194884,0.0816078,103.2886724,103.2886724,0.0816078
5,0.0503295,0.9973622,2.0328867,2.0328867,1.0,0.9975367,1.0,0.9984871,0.0207065,0.1023143,103.2886724,103.2886724,0.1023143
6,0.1000599,0.9951849,2.0328867,2.0328867,1.0,0.9962254,1.0,0.9973631,0.1010962,0.2034105,103.2886724,103.2886724,0.2034105
7,0.1503895,0.9920697,2.0086857,2.0247876,0.9880952,0.9938658,0.9960159,0.9961927,0.1010962,0.3045067,100.8685691,102.4787573,0.3033275
8,0.2001198,0.9863547,1.9839015,2.0146273,0.9759036,0.9891642,0.991018,0.9944461,0.0986602,0.4031669,98.3901501,101.4627262,0.3996291
9,0.3001797,0.9646297,1.9598489,1.9963678,0.9640719,0.9773753,0.9820359,0.9887558,0.1961023,0.5992692,95.9848877,99.63678,0.588656
10,0.4002397,0.8963864,1.8624651,1.9628921,0.9161677,0.9369344,0.9655689,0.9758005,0.1863581,0.7856273,86.2465082,96.2892121,0.7585046

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_pr_auc,training_lift,training_classification_error,validation_rmse,validation_logloss,validation_auc,validation_pr_auc,validation_lift,validation_classification_error
,2024-06-09 18:54:55,0.033 sec,0.0,0.5000124,0.6931721,0.5,0.5000455,1.0,0.4999545,0.4999558,0.6930588,0.5,0.4919113,1.0,0.5080887
,2024-06-09 18:55:02,7.795 sec,5.0,0.3892859,0.4888256,0.9569196,0.9573418,1.9998180,0.1178237,0.3938071,0.4961438,0.9434537,0.9354641,2.0328867,0.1354104
,2024-06-09 18:55:10,15.189 sec,10.0,0.3317045,0.3866943,0.9624754,0.9636461,1.9998180,0.1086343,0.3403842,0.3994133,0.9496286,0.9445517,2.0328867,0.1228280
,2024-06-09 18:55:18,23.985 sec,15.0,0.2939101,0.3186825,0.9710239,0.9716393,1.9998180,0.0927122,0.3078272,0.3383523,0.9577770,0.9513462,2.0328867,0.1066507
,2024-06-09 18:55:28,33.160 sec,20.0,0.2702736,0.2738644,0.9751424,0.9755947,1.9998180,0.0832499,0.2885910,0.2995175,0.9615244,0.9547522,2.0328867,0.0976633
,2024-06-09 18:55:36,41.294 sec,25.0,0.2555598,0.2449354,0.9780574,0.9785437,1.9998180,0.0785188,0.2785075,0.2767677,0.9634728,0.9572951,2.0328867,0.0940683
,2024-06-09 18:55:44,49.474 sec,30.0,0.2430325,0.2212981,0.9811507,0.9815229,1.9998180,0.0735147,0.2703970,0.2588564,0.9658694,0.9601479,2.0328867,0.0898742
,2024-06-09 18:55:55,1 min 0.589 sec,35.0,0.2311348,0.2005266,0.9840281,0.9841905,1.9998180,0.0652352,0.2610124,0.2404017,0.9696097,0.9646248,2.0328867,0.0856800
,2024-06-09 18:56:02,1 min 7.713 sec,40.0,0.2196822,0.1822469,0.9867470,0.9867598,1.9998180,0.0589573,0.2536207,0.2270068,0.9718820,0.9674490,2.0328867,0.0826842
,2024-06-09 18:56:09,1 min 14.581 sec,45.0,0.2095898,0.1668853,0.9888125,0.9886716,1.9998180,0.0544991,0.2471492,0.2155119,0.9738261,0.9691363,2.0328867,0.0742960

variable,relative_mportance,scaled_mportance,percentage
4,5614.7119141,1.0,0.4906837
8,1526.2762451,0.2718352,0.1333851
9,1222.4212646,0.2177175,0.1068305
2,681.6923218,0.1214118,0.0595748
3,351.6031189,0.0626218,0.0307275
2508,276.6093140,0.0492651,0.0241736
1,190.1030579,0.0338580,0.0166136
2509,170.2273865,0.0303181,0.0148766
5,167.3738861,0.0298099,0.0146272
6,122.4053116,0.0218008,0.0106973


AutoML found the best model to be Gradient Boosting Machines. This algorithm is based on the idea of creating an ensemble of weak learners, typically decision trees, in a sequential manner. Each new model attempts to correct the errors of the previous models.

Our new model's performance is almost the same as vanilla Random Forest. However, AutoML checked more metrics to validate that this is the right model for our data. Both models are black box models, so it does not affect us which one we choose for the sake of interpretability. We can trust more in the one that AutoML found for its generalizability because it checked more metrics when building the model. So, we are good to go now.

In [276]:
lb = aml.leaderboard
lb.head(rows=lb.nrows)

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
GBM_1_AutoML_6_20240609_185301,0.982868,0.157277,0.979261,0.0540629,0.212556,0.0451799
XGBoost_1_AutoML_6_20240609_185301,0.980755,0.165348,0.976219,0.0552033,0.216288,0.0467807
XGBoost_2_AutoML_6_20240609_185301,0.979706,0.164984,0.971498,0.0564989,0.213872,0.0457413
GBM_2_AutoML_6_20240609_185301,0.979438,0.184273,0.974896,0.0649669,0.22866,0.0522856
DRF_1_AutoML_6_20240609_185301,0.970245,0.371753,0.969912,0.0815848,0.323399,0.104587
GLM_1_AutoML_6_20240609_185301,0.936036,0.316161,0.949772,0.116551,0.303505,0.092115


In [277]:
warnings.filterwarnings('ignore')
preds = aml.leader.predict(test_hf)

y_test_gbm = test_hf['result'].as_data_frame().values.flatten()
y_pred_gbm = preds['predict'].as_data_frame().values.flatten()

balanced_acc = balanced_accuracy_score(y_test_gbm, y_pred_gbm)
print("Balanced Accuracy Score: ", balanced_acc)

Balanced Accuracy Score:  0.9476668326715598


# Validation of the Model

We are going to test our model with a new, unseen data.

In [280]:
warnings.filterwarnings('ignore')

validation_x_hf = h2o.H2OFrame(pd.DataFrame(X_validation))
validation_y_hf = h2o.H2OFrame(pd.DataFrame(Y_validation))

                               
preds_validation = aml.leader.predict(validation_x_hf)

y_validation_test_gbm = validation_y_hf['result'].as_data_frame().values.flatten()
y_validation_pred_gbm = preds_validation['predict'].as_data_frame().values.flatten()

balanced_acc = balanced_accuracy_score(y_validation_test_gbm, y_validation_pred_gbm)
print("Balanced Accuracy Score: ", balanced_acc)

print(confusion_matrix(y_validation_test_gbm, y_validation_pred_gbm))

Balanced Accuracy Score:  0.6376103874013069
[[  350   825]
 [  238 10269]]


In [279]:
y_validation_pred = model.predict(X_validation)

print(confusion_matrix(Y_validation, y_validation_pred))

print(balanced_accuracy_score(Y_validation, y_validation_pred))

[[  385   790]
 [  330 10177]]
0.6481259707307591
