In [63]:
import pandas as pd
import numpy as np
import rpy2.robjects as ro
import rpy2.robjects.packages as rpackages
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
from imblearn.over_sampling import SMOTE
from collections import Counter
import warnings
from sklearn.preprocessing import MinMaxScaler

In [64]:
# Load R's `utils` package through rpy2 and select a CRAN mirror by index
# (ind=1 is the first entry of R's mirror list) — presumably so packages can
# be installed from the embedded R session without an interactive prompt.
utils = rpackages.importr('utils')
utils.chooseCRANmirror(ind=1)

<rpy2.rinterface_lib.sexp.NULLType object at 0x1262979d0> [0]

In [65]:
# Turn on rpy2's pandas conversion for the whole session.
pandas2ri.activate()

# Run the download inside the embedded R session: worldfootballR fetches the
# Understat shot records for La Liga and stores them in the R variable `laliga`.
ro.r('''
        library("worldfootballR")
        library("dplyr")
        laliga <- load_understat_league_shots(league = "La liga")
     ''')

# Bring the R object over to pandas and discard the `league` column, which
# holds no information once a single league has been loaded.
laliga = pandas2ri.rpy2py(ro.r['laliga']).drop(columns='league')

→ Data last updated 2024-05-30 18:34:46.012307882309 UTC


In [66]:
# Keep only shots whose `date` lies strictly between 2020-01-01 and 2024-06-10.
# The trailing .copy() makes the result an independent frame: later cells edit
# `laliga` in place, and editing a boolean-indexed slice is what makes pandas
# emit SettingWithCopy warnings.
laliga = laliga[(laliga['date'] > '2020-01-01') & (laliga['date'] < '2024-06-10')].copy()

In [67]:
# Silence every Python warning from this point on.
# NOTE(review): a blanket 'ignore' also hides anything pandas reports about the
# DataFrame edits further down — consider filtering specific categories instead.
warnings.filterwarnings('ignore')

def fixDataNaN(df):
    """Convert a pandas DataFrame to its R counterpart through rpy2.

    The conversion runs inside a local converter that combines rpy2's default
    rules with the pandas rules.

    NOTE(review): despite the name, no NaN-specific handling is performed in
    this function; confirm whether further clean-up was intended.

    Args:
        df: pandas DataFrame to convert. The caller's variable is never
            rebound by this function.

    Returns:
        The object produced by ``ro.conversion.py2rpy(df)``.
    """
    with localconverter(ro.default_converter + pandas2ri.converter):
        # Hand the converted object back to the caller; binding it to a local
        # name only would discard it as soon as the function returns.
        return ro.conversion.py2rpy(df)
# Column pairs consumed by fixMergeColumns: [column to fold in, column to keep].
pairs = [
    ['x', 'X'],
    ['y', 'Y'],
    ['x_g', 'xG'],
    ['h_a', 'home_away'],
    ['shot_type', 'shotType'],
    ['last_action', 'lastAction'],
]

def camel_case_columns(df):
    """Rename the columns of ``df`` from snake_case to camelCase, in place.

    Every label is first converted to ``str``. A label containing underscores
    is split on ``'_'``; the first piece is kept verbatim and each following
    piece is passed through ``str.title()`` before the pieces are joined
    (``'player_id'`` -> ``'playerId'``). Labels without an underscore are only
    stringified.

    Args:
        df: pandas DataFrame whose ``columns`` attribute is overwritten.

    Returns:
        The same DataFrame object that was passed in.
    """
    def camel_case(column_name):
        head, *tail = column_name.split('_')
        return head + ''.join(piece.title() for piece in tail)

    # Stringify before looking for '_' so that non-string labels (e.g. integer
    # column names) are handled instead of raising TypeError on the `in` test.
    df.columns = [camel_case(str(column)) for column in df.columns]
    return df

def fixMergeColumns(dataList, pairs):
    """Coalesce duplicated columns and normalise column names, in place.

    For every frame in ``dataList`` and every ``[source, target]`` entry of
    ``pairs``: when both columns are present, missing values in ``target``
    are filled from ``source`` and ``source`` is dropped. Afterwards the
    frame's columns are renamed by ``camel_case_columns`` and the frame is
    passed to ``fixDataNaN``.

    Args:
        dataList: Iterable of pandas DataFrames; each one is modified in place.
        pairs: Iterable of sequences whose first two items are the source and
            target column names.

    Returns:
        None. All changes are made on the frames passed in.
    """
    for targetData in dataList:
        for pair in pairs:
            source, target = pair[0], pair[1]
            if source in targetData.columns and target in targetData.columns:
                # Assign the filled column back to the frame. Calling
                # fillna(inplace=True) on the selected column is a chained
                # in-place update, which pandas warns about and which does not
                # reach the frame when Copy-on-Write is enabled.
                targetData[target] = targetData[target].fillna(targetData[source])
                # Deliberately in place: the caller keeps its own reference to
                # this frame and nothing is returned.
                targetData.drop(columns=[source], inplace=True)
        targetData = camel_case_columns(targetData)
        # Nothing coming back from fixDataNaN is used here.
        fixDataNaN(targetData)

# Apply the column clean-up to the La Liga shots; `laliga` is modified in place.
fixMergeColumns([laliga], pairs)

In [68]:
# Count the missing (NaN) values in each column.

total_count = laliga.isna().sum()

print(total_count)

id                    0
minute                0
result                0
X                     0
Y                     0
xG                    0
player                0
playerId              0
situation             0
season                0
shotType              0
matchId               0
homeTeam              0
awayTeam              0
homeGoals             0
awayGoals             0
date                  0
playerAssisted    10653
lastAction            0
homeAway              0
dtype: int64


In [69]:
# Collapse the shot outcomes into a binary target: only 'Goal' counts as a
# goal; every other listed outcome (own goals included) becomes 'No Goal'.
replacement_dict = {
    'Goal': 'Goal',
    'BlockedShot': 'No Goal',
    'MissedShots': 'No Goal',
    'SavedShot': 'No Goal',
    'ShotOnPost': 'No Goal',
    'OwnGoal': 'No Goal'
}

# Recode the 'result' column. Labels that are not keys of the mapping are
# passed through unchanged instead of becoming NaN: this keeps the cell safe
# to re-run (an already recoded 'No Goal' stays 'No Goal') and leaves any
# unexpected outcome visible in value_counts() rather than silently dropped.
laliga['result'] = laliga['result'].map(lambda label: replacement_dict.get(label, label))

# Class imbalance

In [70]:
# Show how many shots fall into each class of the target column.
print(laliga['result'].value_counts())

result
No Goal    35980
Goal        4211
Name: count, dtype: int64


In [72]:
# Separate the target labels from the predictor columns.
Y = laliga['result']
x = laliga.drop(columns='result')

# One-hot encode the predictors with get_dummies, then rescale the encoded
# matrix with MinMaxScaler. `X` ends up as the scaler's array output.
# NOTE(review): every remaining column is encoded, identifiers and match
# scores included — confirm this is the intended feature set.
scaler = MinMaxScaler()
X = scaler.fit_transform(pd.get_dummies(x))

# Oversample with SMOTE (fixed seed for reproducibility) and report the class
# counts before and after.
# NOTE(review): scaling and resampling happen on the full data set here; if a
# train/test split follows, fit both on the training part only — confirm.
print('Original dataset shape %s' % Counter(Y))
sm = SMOTE(random_state=42)
x_res, y_res = sm.fit_resample(X, Y)
print('Resampled dataset shape %s' % Counter(y_res))

Original dataset shape Counter({'No Goal': 35980, 'Goal': 4211})
Resampled dataset shape Counter({'Goal': 35980, 'No Goal': 35980})


# Model