In [183]:
import pandas as pd
import numpy as np
import rpy2.robjects as ro
import rpy2.robjects.packages as rpackages
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter

In [184]:
utils = rpackages.importr('utils')

utils.chooseCRANmirror(ind=1)

worldfootballR = rpackages.importr('worldfootballR')

In [185]:
pandas2ri.activate()
ro.r('''
        library("worldfootballR")
        laliga <- load_understat_league_shots(league = "La liga")
     ''')
laliga = pandas2ri.rpy2py(ro.r['laliga'])
laliga.drop('league', axis=1, inplace=True)

→ Data last updated 2024-05-23 18:32:07.0585150718689 UTC


In [186]:
def fixDataNaN(df):
    with localconverter(ro.default_converter + pandas2ri.converter):
        df = ro.conversion.py2rpy(df)
pairs = [['x','X'],['y','Y'],['x_g','xG'],['h_a','home_away'],['shot_type','shotType'],['last_action','lastAction']]

def camel_case_columns(df):
    def camel_case(column_name):
        parts = column_name.split('_')
        return str(parts[0] + ''.join(x.title() for x in parts[1:]))
    
    new_columns = []
    for column in df.columns:
        if '_' in column:
            new_columns.append(camel_case(column))
        else:
            new_columns.append(str(column))
    
    df.columns = new_columns
    return df

def fixMergeColumns(dataList, pairs):
    for targetData in dataList:
        for pair in pairs:
            if pair[0] in targetData.columns and pair[1] in targetData.columns:
                targetData['{}'.format(pair[1])].fillna(targetData['{}'.format(pair[0])], inplace=True)
                targetData.drop(columns=['{}'.format(pair[0])], inplace=True)
        targetData = camel_case_columns(targetData)
        fixDataNaN(targetData)

fixMergeColumns([laliga], pairs)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  targetData['{}'.format(pair[1])].fillna(targetData['{}'.format(pair[0])], inplace=True)


In [187]:
#  NA_ string contained count

total_count = laliga.isna().sum()

print(total_count)

id                    0
minute                0
result                0
X                     0
Y                     0
xG                    0
player                0
playerId              0
situation             0
season                0
shotType              0
matchId               0
homeTeam              0
awayTeam              0
homeGoals             0
awayGoals             0
date                  0
playerAssisted    24434
lastAction            0
homeAway              0
dtype: int64


In [188]:
laliga['result'].value_counts()

result
MissedShots    35495
SavedShot      21289
BlockedShot    21281
Goal            9681
ShotOnPost      1862
OwnGoal          270
Name: count, dtype: int64

# Model

In [189]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier
from sklearn.model_selection import KFold,LeaveOneOut,LeavePOut,cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import MinMaxScaler

In [190]:
print(laliga.columns)

Index(['id', 'minute', 'result', 'X', 'Y', 'xG', 'player', 'playerId',
       'situation', 'season', 'shotType', 'matchId', 'homeTeam', 'awayTeam',
       'homeGoals', 'awayGoals', 'date', 'playerAssisted', 'lastAction',
       'homeAway'],
      dtype='object')


In [191]:
from sklearn.utils import resample

minority_class_size = len(laliga[laliga['result'] == 'Goal'])

# Çoğunluk sınıfındaki örnekler
majority_class_samples = laliga[laliga['result'] != 'Goal']

# Azınlık sınıfındaki örnekler
minority_class_samples = laliga[laliga['result'] == 'Goal']

# Çoğunluk sınıfındaki örneklerin sayısını azaltarak azınlık sınıfıyla aynı sayıda örneği rastgele seçme
majority_class_samples_downsampled = resample(majority_class_samples, replace=False, n_samples=minority_class_size, random_state=42)

# Azınlık sınıfındaki örneklerle birlikte yeniden örnekleme yapılmış çoğunluk sınıfındaki örnekler
undersampled_data = pd.concat([majority_class_samples_downsampled, minority_class_samples])

replacement_dict = {
    'BlockedShot': 'No Goal',
    'MissedShots': 'No Goal',
    'SavedShot': 'No Goal',
    'ShotOnPost': 'No Goal',
    'Goal': 'Goal',
    'OwnGoal': 'No Goal'
}

# 'result' sütununu değiştirme
undersampled_data['result'] = undersampled_data['result'].map(replacement_dict)

# 'result' sütununu alarak Y'yi tanımlayın
Y = undersampled_data['result']

# 'result' sütununu içermeyen X veri çerçevesini tanımlayın
X = undersampled_data.drop(columns=['result'])

Y = Y.replace(replacement_dict)

In [192]:
X = pd.get_dummies(X)
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [193]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42, stratify=Y)

# model = RandomForestClassifier()
# model.fit(x_train, y_train)

# y_pred = model.predict(x_test)

In [194]:
#confusion matrix
# from sklearn.metrics import confusion_matrix


# kfold = KFold(n_splits=5, random_state=42, shuffle=True)
# results = cross_val_score(model, x_train, y_train, cv=kfold, scoring='balanced_accuracy')
# print("Balanced Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))


# print(confusion_matrix(y_test, y_pred))


# Hyperparameter Tuning

In [195]:
# # #hyperparameter tuning for RandomForestClassifier

# n_estimators = [int(x) for x in np.linspace(start=50, stop=1000)]
# max_features = ['sqrt', 'log2', None]
# max_depth = [int(x) for x in np.linspace(2, 8)]
# max_depth.append(None)
# min_samples_split = [2, 6, 10]
# min_samples_leaf = [1, 4, 4]
# bootstrap = [True, False]

# #Halving search
# from sklearn.experimental import enable_halving_search_cv
# from sklearn.model_selection import HalvingRandomSearchCV
# from sklearn.ensemble import RandomForestClassifier

# params = {
#     'max_features': None,
#     'max_depth': 7,
#     'min_samples_split': 2,
#     'min_samples_leaf': 4,
#     'bootstrap': True,
# }

# model = RandomForestClassifier(n_estimators=1000, random_state=42, n_jobs=-1, verbose=3, **params)
# halving = HalvingRandomSearchCV(model, params, factor=3, resource='n_estimators', max_resources=1000, random_state=42, verbose=3,scoring='balanced_accuracy', n_jobs=-1)
# halving.fit(x_train, y_train)
# print("Best Params:{}/nBest Balanced Accuracy:{}".format(halving.best_params_,halving.best_score_))



In [196]:
# import pickle

# filename = 'model.sav'
# pickle.dump(model, open(filename, 'wb'))


In [197]:
import pickle
from sklearn.metrics import confusion_matrix

model = pickle.load(open('model.sav', 'rb'))
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print(confusion_matrix(y_test, y_pred))

[[1798  623]
 [ 398 2022]]


In [198]:
print(balanced_accuracy_score(y_test, y_pred))

0.789102754479571


In [199]:
params = model.get_params()

# 'class_weight' parametresini sil
print(params)

# model = RandomForestClassifier(**params)
# model.fit(x_train, y_train)
# y_pred = model.predict(x_test)
# print(confusion_matrix(y_test, y_pred))

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [200]:
# def draw_soccer_field(X,Y,xG,homeTeam,awayTeam,home_away):
    

In [204]:
random_position_index = 120
real_result = y_test.iloc[random_position_index]
position_features = x_test[random_position_index].reshape(1, -1)
predicted_result = model.predict(position_features)[0]

print("Gerçek Sonuç:", real_result)
print("Tahmin Edilen Sonuç:", predicted_result)

laliga.iloc[random_position_index]

Gerçek Sonuç: Goal
Tahmin Edilen Sonuç: Goal


id                            32490.0
minute                           76.0
result                    MissedShots
X                               0.895
Y                               0.588
xG                           0.054564
player                    Carlos Vela
playerId                       2416.0
situation                    OpenPlay
season                         2014.0
shotType                         Head
matchId                        5830.0
homeTeam                        Eibar
awayTeam                Real Sociedad
homeGoals                         1.0
awayGoals                         0.0
date              2014-08-24 18:00:00
playerAssisted                 Zaldúa
lastAction                      Cross
homeAway                            a
Name: 121, dtype: object

In [215]:
import pandas as pd

# Özel pozisyonu tahmin etmek için veri oluşturma
customPositionToPredict = {
    'minute': [90],
    'X': [0.805],
    'Y': [0.568],
    'xG': [0.05],
    'player': ['Arda Güler'],
    'playerId': [12190.0],
    'situation': ['OpenPlay'],
    'season': [2024],
    'shotType': ['LeftFoot'],
    'matchId': [9999],
    'homeTeam': ['Barcelona'],
    'awayTeam': ['Real Madrid'],
    'homeGoals': [1],
    'awayGoals': [0],
    'date': ['2024-06-30 19:00:00'],
    # 'player_assisted': ,
    'lastAction': ['Pass'],
    'home_away': ['h']
}

# Özel pozisyonu içeren DataFrame oluşturma
customPosDf = pd.DataFrame(customPositionToPredict)

customPosDf = pd.get_dummies(customPosDf)

customPosDf = pd.get_dummies(customPosDf)
scaler = MinMaxScaler()
customPosDf = scaler.fit_transform(customPosDf)

print(customPosDf)


[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [206]:
laliga[laliga['player'] == 'Arda Güler']

Unnamed: 0,id,minute,result,X,Y,xG,player,playerId,situation,season,shotType,matchId,homeTeam,awayTeam,homeGoals,awayGoals,date,playerAssisted,lastAction,homeAway
87613,568863.0,93.0,Goal,0.961,0.375,0.549858,Arda Güler,12190.0,OpenPlay,2023.0,RightFoot,22963.0,Real Madrid,Celta Vigo,4.0,0.0,2024-03-10 17:30:00,Dani Ceballos,TakeOn,h
87757,569505.0,93.0,ShotOnPost,0.527,0.291,0.011,Arda Güler,12190.0,OpenPlay,2023.0,LeftFoot,22967.0,Osasuna,Real Madrid,2.0,4.0,2024-03-16 15:15:00,Lucas Vázquez,Pass,a
88618,576543.0,28.0,Goal,0.907,0.606,0.291466,Arda Güler,12190.0,OpenPlay,2023.0,LeftFoot,23014.0,Real Sociedad,Real Madrid,0.0,1.0,2024-04-26 19:00:00,Daniel Carvajal,Cross,a
88620,576547.0,46.0,BlockedShot,0.758,0.497,0.020034,Arda Güler,12190.0,FromCorner,2023.0,LeftFoot,23014.0,Real Sociedad,Real Madrid,0.0,1.0,2024-04-26 19:00:00,,,a
88621,576548.0,47.0,BlockedShot,0.726,0.427,0.008564,Arda Güler,12190.0,FromCorner,2023.0,LeftFoot,23014.0,Real Sociedad,Real Madrid,0.0,1.0,2024-04-26 19:00:00,Luka Modric,Chipped,a
88900,578502.0,34.0,MissedShots,0.726,0.354,0.015573,Arda Güler,12190.0,OpenPlay,2023.0,LeftFoot,23021.0,Real Madrid,Cadiz,3.0,0.0,2024-05-04 14:15:00,Luka Modric,Pass,h
88903,578506.0,47.0,MissedShots,0.794,0.352,0.089795,Arda Güler,12190.0,DirectFreekick,2023.0,LeftFoot,23021.0,Real Madrid,Cadiz,3.0,0.0,2024-05-04 14:15:00,,Standard,h
89190,580075.0,46.0,Goal,0.889,0.595,0.114215,Arda Güler,12190.0,OpenPlay,2023.0,LeftFoot,23029.0,Granada,Real Madrid,0.0,4.0,2024-05-11 16:30:00,Fran García,Pass,a
89197,580086.0,73.0,BlockedShot,0.794,0.389,0.02449,Arda Güler,12190.0,OpenPlay,2023.0,LeftFoot,23029.0,Granada,Real Madrid,0.0,4.0,2024-05-11 16:30:00,,Rebound,a
89394,581003.0,80.0,Goal,0.865,0.404,0.076612,Arda Güler,12190.0,OpenPlay,2023.0,LeftFoot,23044.0,Real Madrid,Alaves,5.0,0.0,2024-05-14 19:30:00,,Rebound,h
