In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
import json, warnings
warnings.filterwarnings('ignore')

In [31]:
# Load the raw player table from a path relative to the notebook's working directory.
# The 'utf-8-sig' codec decodes UTF-8 and drops a leading byte-order mark if the file has one.
df = pd.read_csv('../data/raw/playersData.csv', encoding='utf-8-sig')
# Show the first rows as the cell output for a quick look at the columns.
df.head()

Unnamed: 0,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,...,Att (GK),Thr,Launch%,AvgLen,Opp,Stp,Stp%,#OPA,#OPA/90,AvgDist
0,1,Max Aarons,eng ENG,DF,Bournemouth,eng Premier League,24.0,2000.0,3,1,...,,,,,,,,,,
1,2,Max Aarons,eng ENG,"DF,MF",Valencia,es La Liga,24.0,2000.0,4,1,...,,,,,,,,,,
2,3,Rodrigo Abajas,es ESP,DF,Valencia,es La Liga,21.0,2003.0,1,1,...,,,,,,,,,,
3,4,James Abankwah,ie IRL,"DF,MF",Udinese,it Serie A,20.0,2004.0,6,0,...,,,,,,,,,,
4,5,Keyliane Abdallah,fr FRA,FW,Marseille,fr Ligue 1,18.0,2006.0,1,0,...,,,,,,,,,,


In [32]:
# Keep only players whose '90s' value is at least 3 (the original note: to avoid null data).
# Values that cannot be parsed as numbers are coerced to NaN, fail the comparison and are dropped.
df = (
    df
    .loc[pd.to_numeric(df['90s'], errors='coerce').ge(3)]
    .copy()  # independent frame, so later column assignments do not touch a slice of the raw data
)

In [33]:
# Check the number of unique teams covered.
# nunique() counts distinct values directly: no throwaway sorted list is built, and a
# missing Squad value (NaN) is ignored instead of making sorted() raise on str/float comparison.
print(df['Squad'].nunique())

96


In [34]:
# Columns important for deriving the positional classification data.
# Each name is a raw counting stat that is later turned into a '<name>_p90' rate;
# the order here determines the order in which those columns are created.
# NOTE(review): the recorded output of the per-90 listing contains no
# 'Crs_stats_passing_types_p90', so that column appears to be absent from the CSV — confirm.
counting_cols = [
    *('Gls', 'Ast', 'xG', 'xAG', 'PrgC', 'PrgP'),
    *('Tkl', 'TklW', 'Int', 'Clr', 'Blocks_stats_defense'),
    *('KP', 'SCA', 'GCA'),
    *('Sh', 'SoT'),
    *('Def 3rd', 'Att 3rd'),
    *('Def 3rd_stats_possession', 'Att 3rd_stats_possession', 'Att Pen'),
    *('PrgC_stats_possession', 'Crs_stats_passing_types', 'Won'),
]

In [35]:
# Express every counting stat as a per-90-minutes rate by dividing it by the player's '90s' value.
# The '90s' column is parsed once here instead of once per stat column (it does not change in the loop).
nineties = pd.to_numeric(df['90s'], errors='coerce')

# Report configured columns that are not in the frame instead of skipping them silently,
# so a typo or a renamed CSV column does not quietly remove a feature.
# print() is used rather than warnings.warn() because this notebook filters all warnings.
missing_cols = [col for col in counting_cols if col not in df.columns]
if missing_cols:
    print(f"Skipped per-90 conversion, columns not found: {missing_cols}")

for col in counting_cols:
    if col in df.columns:
        # Non-numeric entries are coerced to NaN, so the resulting rate is NaN for those rows.
        df[col + '_p90'] = pd.to_numeric(df[col], errors='coerce') / nineties

In [36]:
# Make the completion percentage numeric; entries that cannot be parsed become NaN.
df['Cmp%'] = df['Cmp%'].pipe(pd.to_numeric, errors='coerce')
# Additional per-90 rate needed for players: the 'Crs' column divided by '90s',
# stored as 'Crs_p90' alongside the other per-90 columns.
df['Crs_p90'] = (
    pd.to_numeric(df['Crs'], errors='coerce')
    .div(pd.to_numeric(df['90s'], errors='coerce'))
)

In [37]:
# Display the first rows again so the newly added '_p90' columns can be inspected.
df.head()

Unnamed: 0,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,...,Sh_p90,SoT_p90,Def 3rd_p90,Att 3rd_p90,Def 3rd_stats_possession_p90,Att 3rd_stats_possession_p90,Att Pen_p90,PrgC_stats_possession_p90,Won_p90,Crs_p90
5,6,Yunis Abdelhamid,ma MAR,DF,Saint-Étienne,fr Ligue 1,36.0,1987.0,16,11,...,0.173913,0.086957,1.304348,0.0,36.0,1.217391,0.608696,0.347826,1.652174,0.0
6,7,Himad Abdelli,dz ALG,"MF,FW",Angers,fr Ligue 1,24.0,1999.0,32,32,...,1.107595,0.253165,0.411392,0.474684,9.272152,18.892405,1.550633,3.386076,0.411392,1.392405
7,8,Mohamed Abdelmoneim,eg EGY,DF,Nice,fr Ligue 1,25.0,1999.0,12,10,...,0.0,0.0,1.263158,0.315789,27.368421,4.842105,0.315789,0.631579,3.263158,0.210526
8,9,Ali Abdi,tn TUN,"DF,MF",Nice,fr Ligue 1,30.0,1993.0,25,17,...,1.870968,0.709677,1.16129,0.709677,14.83871,20.322581,3.225806,2.258065,1.419355,3.290323
10,11,Abel,es ESP,DF,Osasuna,es La Liga,23.0,2000.0,35,20,...,0.521739,0.173913,1.217391,0.347826,16.086957,15.521739,0.826087,2.173913,0.869565,3.695652


In [38]:
# Gather every column whose name ends in '_p90', keeping the frame's column order.
p90_cols = list(filter(lambda name: name.endswith('_p90'), df.columns))
# Print the per-90 column names, then (on the next line) how many there are.
print(p90_cols, len(p90_cols), sep='\n')

['Gls_p90', 'Ast_p90', 'xG_p90', 'xAG_p90', 'PrgC_p90', 'PrgP_p90', 'Tkl_p90', 'TklW_p90', 'Int_p90', 'Clr_p90', 'Blocks_stats_defense_p90', 'KP_p90', 'SCA_p90', 'GCA_p90', 'Sh_p90', 'SoT_p90', 'Def 3rd_p90', 'Att 3rd_p90', 'Def 3rd_stats_possession_p90', 'Att 3rd_stats_possession_p90', 'Att Pen_p90', 'PrgC_stats_possession_p90', 'Won_p90', 'Crs_p90']
24


In [39]:
#functional rules to define a player role manually before training
def assign_role(row):
    """Label one player record with a role using hand-written scoring rules.

    Args:
        row: Mapping-like record (for example a DataFrame row) that supports
            ``row['Pos']`` and ``row.get(name, default)`` for the per-90 stats.

    Returns:
        str: One of 'Goalkeeper', 'Centre Back', 'Fullback', 'Defensive Mid',
        'Central Mid', 'Winger' or 'Striker'. Position strings that match none
        of the rules fall back to 'Central Mid'.
    """
    pos = str(row['Pos'])

    if 'GK' in pos:
        return 'Goalkeeper'

    def avoid_nan(value):
        """Return the stat named ``value`` as a float; missing, NaN or unparsable gives 0.0."""
        try:
            number = float(row.get(value, 0))
        except (TypeError, ValueError):
            # None, pd.NA, empty strings, non-numeric text, ...
            return 0.0
        # NaN is truthy, so `x or 0` never replaced it, and a NaN score makes every
        # comparison below evaluate False. NaN is the only float not equal to itself.
        return 0.0 if number != number else number

    clr    = avoid_nan('Clr_p90')
    tkl    = avoid_nan('Tkl_p90')
    int_   = avoid_nan('Int_p90')
    kp     = avoid_nan('KP_p90')
    sca    = avoid_nan('SCA_p90')
    prgc   = avoid_nan('PrgC_stats_possession_p90')
    att3   = avoid_nan('Att 3rd_stats_possession_p90')
    attpen = avoid_nan('Att Pen_p90')
    sh     = avoid_nan('Sh_p90')
    crs    = avoid_nan('Crs_p90')
    won    = avoid_nan('Won_p90')
    prgp   = avoid_nan('PrgP_p90')

    # Defender labelling
    # CB if there is a high number of aerial duels won and a high number of clearances
    # DM if tackles and interceptions are higher (hybrid DF/MF listings only)
    # FB if there is a high level of progressive carries and attacking-third involvement
    if pos in ('DF', 'DF,MF', 'MF,DF'):
        cb_score  = clr * 2 + won
        fb_score  = prgc * 2 + att3
        dm_score  = tkl + int_

        # The defensive-mid score must strictly beat both defender scores
        if pos in ('DF,MF', 'MF,DF') and dm_score > cb_score and dm_score > fb_score:
            return 'Defensive Mid'

        # Ties go to Centre Back
        return 'Centre Back' if cb_score >= fb_score else 'Fullback'

    # Pure midfielders; the hybrid DF/MF listings have already returned above
    if pos == 'MF':
        dm_score = tkl + int_
        cm_score = kp + sca * 0.5 + prgc

        return 'Defensive Mid' if dm_score >= cm_score else 'Central Mid'

    # Attacking midfielders / wide forwards
    if pos in ('MF,FW', 'FW,MF'):
        winger_score = prgc * 2 + crs
        cm_score = kp * 2 + sca * 0.5 + prgp

        return 'Winger' if winger_score >= cm_score else 'Central Mid'

    # Forwards, including DF/FW hybrids
    if pos in ('FW', 'DF,FW', 'FW,DF'):
        st_score = attpen * 3 + sh
        wing_score = prgc * 3 + crs + (att3 - attpen)

        return 'Striker' if st_score >= wing_score else 'Winger'

    return 'Central Mid'  # fallback for anything unexpected


In [40]:
# Label every player row with assign_role and store the result in a new 'role' column.
role_labels = df.apply(assign_role, axis='columns')
df['role'] = role_labels
# Distribution of roles in the current player dataset, most common first.
role_counts = df['role'].value_counts()
print(role_counts)


role
Fullback         461
Central Mid      458
Centre Back      394
Winger           384
Defensive Mid    203
Goalkeeper       170
Striker          144
Name: count, dtype: int64


In [41]:
# Train a RandomForest classifier to predict the rule-based role produced by assign_role.
# NOTE(review): the labels are computed from the same per-90 columns that are used as features,
# so the report below measures how well the forest reproduces the rules, not agreement with
# independently labelled roles.

# Candidate features: every per-90 column plus the completion percentage.
feature_cols = [name for name in df.columns if name.endswith('_p90')]
feature_cols.append('Cmp%')
# Drop features that have 100 or fewer non-null values.
feature_cols = [name for name in feature_cols if df[name].count() > 100]

X = df.loc[:, feature_cols].fillna(0)  # remaining gaps are filled with zero
y = df['role']

# Hold out 20% of the rows, keeping the role proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training can be chained.
rf = RandomForestClassifier(
    class_weight='balanced',
    n_estimators=200,
    random_state=42,
).fit(X_train, y_train)

# Per-role precision / recall / F1 on the held-out rows.
y_pred = rf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

               precision    recall  f1-score   support

  Central Mid       0.75      0.84      0.79        92
  Centre Back       0.99      0.96      0.97        79
Defensive Mid       0.80      0.82      0.81        40
     Fullback       0.91      0.77      0.84        92
   Goalkeeper       1.00      1.00      1.00        34
      Striker       0.80      0.83      0.81        29
       Winger       0.70      0.74      0.72        77

     accuracy                           0.84       443
    macro avg       0.85      0.85      0.85       443
 weighted avg       0.85      0.84      0.84       443



In [44]:
# Sanity check: look up the role assigned to a handful of world-class players
# whose usual positions are commonly agreed on.
known = [
    'Mohamed Salah',
    'Virgil van Dijk',
    'Trent Alexander-Arnold',
    'Rodrygo',
    'Erling Haaland',
    'Kevin De Bruyne',
    'Bukayo Saka',
    'Kylian Mbappé',
    'Harry Kane',
]

# Select the matching rows and the identifying columns in a single .loc lookup.
check = df.loc[df['Player'].isin(known), ['Player', 'Squad', 'Pos', 'role']]
print(check.to_string())


                      Player            Squad    Pos         role
83    Trent Alexander-Arnold        Liverpool     DF     Fullback
678          Kevin De Bruyne  Manchester City  MF,FW  Central Mid
1109          Erling Haaland  Manchester City     FW      Striker
1317              Harry Kane    Bayern Munich     FW      Striker
1691           Kylian Mbappé      Real Madrid     FW       Winger
2242                 Rodrygo      Real Madrid  FW,MF       Winger
2299             Bukayo Saka          Arsenal  FW,MF       Winger
2304           Mohamed Salah        Liverpool     FW       Winger
2670         Virgil van Dijk        Liverpool     DF  Centre Back
