In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import Lasso, Ridge
from sklearn.decomposition import PCA

dfn = pd.read_csv('Football_Scouts_Database_Raw_Stats_Normalised_Floats.csv')

dfn = dfn.fillna(0) #replacing na with 0

Centre_forward_qualities=['Goals', 'Goals/90']
Winger_qualities=['Progressive Carries','Successful Take Ons','Touches in Attacking 3rd','Successful Take Ons/90','Touches in Attacking 3rd/90','Progressive Carries/90']
Attacking_mid_qualities=['Key Passes','Key Passes/90']
Central_mid_qualities=['Progressive Passes','Passes Completed','Progressive Passes/90','Passes Completed/90'] #through balls, passes into final third to be considered. Currently getting lot of cb
Defensive_mid_qualities=['Tackles Won','Interceptions','Ball Recoveries','Tackles Won/90','Interceptions/90','Ball Recoveries/90']
Wingback_qualities=['Tackles Won','Crosses into penalty area','Tackles Won/90','Crosses into penalty area/90']
Ballplaying_def_qualities=['% of Aerial Duels won','Shots Blocked','Clearances','Passes Completed','Shots Blocked/90','Clearances/90','Passes Completed/90']
Defensive_cb_qualities=['Shots Blocked','Clearances','% of Aerial Duels won','Shots Blocked/90','Clearances/90']

y = dfn[Defensive_cb_qualities] # target variable (needs to be maximised)
X = dfn.drop(['Player','Position'], axis=1) # removes non-integer values from dataframe as we cant put non-integer values into a regression model
X = X.drop(columns=y)

pca = PCA(n_components=30)
pca.fit(X)

# split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# create models
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(),
    'XGBoost': XGBRegressor(),
    'Lasso': Lasso(),
    'Ridge': Ridge()
}

r2_values=[]

# fitting the models
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred) #evaluating the models
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"{name}: MSE={mse:.2f}, RMSE={rmse:.2f}, MAE={mae:.2f}, R2={r2:.2f}")
    r2_values.append(r2)


Linear Regression: MSE=0.01, RMSE=0.06, MAE=0.04, R2=0.71
Random Forest: MSE=0.01, RMSE=0.06, MAE=0.04, R2=0.73
XGBoost: MSE=0.01, RMSE=0.05, MAE=0.03, R2=0.81
Lasso: MSE=0.02, RMSE=0.11, MAE=0.08, R2=-0.00
Ridge: MSE=0.01, RMSE=0.07, MAE=0.05, R2=0.64


In [None]:
# Linear regression model looks to be working the best with our dataset (for strikers). So we use it for the implementation
#Step 2: Finding the players who have the best stats leading to Goals/90

# find index of model with highest r2 value
best_model_index = r2_values.index(max(r2_values))

# fit best model on all data
if best_model_index == 0:
    lr = LinearRegression()
    lr.fit(X, y) # use transformed data
    coefficients = lr.coef_
    weighted_sum = np.dot(X, coefficients.T).sum(axis=1) # use transformed data
elif best_model_index == 1:
    rfr = RandomForestRegressor()
    rfr.fit(X, y) # use transformed data
    weighted_sum = rfr.predict(X) # use transformed data
    weighted_sum = np.sum(weighted_sum, axis=1)
elif best_model_index == 2:
    gbr = XGBRegressor()
    gbr.fit(X, y) # use transformed data
    weighted_sum = gbr.predict(X) # use transformed data
    weighted_sum = np.sum(weighted_sum, axis=1)
elif best_model_index == 3:
    lasso = Lasso()
    lasso.fit(X, y) # use transformed data
    coefficients = lasso.coef_
    weighted_sum = np.dot(X, coefficients.T).sum(axis=1) # use transformed data
else:
    ridge = Ridge()
    ridge.fit(X, y) # use transformed data
    coefficients = ridge.coef_
    weighted_sum = np.dot(X, coefficients.T).sum(axis=1) # use transformed data

df = pd.read_csv('Football_Scouts_Database_Raw_Stats.csv')
dfn['weighted_sum'] = weighted_sum
dfn_combined = dfn.groupby('Player').agg({'weighted_sum': 'mean'})
df['weighted_sum'] = df['Player'].map(dfn_combined['weighted_sum'])

if set(y.columns) == set(Centre_forward_qualities) or set(y.columns) == set(Winger_qualities):
  df_filtered=df[df['Position'].str.contains("FW")]

elif set(y.columns) == set(Attacking_mid_qualities) or set(y.columns) == set(Central_mid_qualities) or set(y.columns) == set(Defensive_mid_qualities):
  df_filtered=df[df['Position'].str.contains("MF")]

elif set(y.columns) == set(Wingback_qualities) or set(y.columns) == set(Ballplaying_def_qualities) or set(y.columns) == set(Defensive_cb_qualities):
  df_filtered=df[df['Position'].str.contains("DF")]

df.drop_duplicates(subset=['Player'])

top_20_players= df_filtered.nlargest(20, "weighted_sum")
top_20_players

Unnamed: 0.1,Unnamed: 0,Player,Position,Age,Matches Played,Starts,90 mins played,Progressive Carries,Progressive Passes,Progressive Passes Recvd,...,Shots Blocked/90,Passes Blocked/90,Interceptions/90,Clearances/90,Touches/90,Touches in Attacking 3rd/90,Touches in Penalty Box/90,Penalty Kicks won/90,Ball Recoveries/90,weighted_sum
2345,2389,James Tarkowski,DF,29.0,38,38,38.0,13,106,11,...,2.052632,0.552632,1.342105,5.289474,55.763158,2.973684,2.131579,0.0,4.552632,3.022521
904,923,Ethan Pinnock,DF,29.0,30,30,30.0,2,52,16,...,1.033333,0.333333,1.166667,7.166667,51.8,4.466667,1.833333,0.0,4.1,2.366945
1900,1938,Rodrigo Ely,DF,28.0,36,36,35.1,6,52,4,...,0.940171,0.37037,0.883191,5.783476,52.905983,1.538462,0.854701,0.0,4.045584,2.28905
1523,1557,Rasmus Nicolaisen,DF,25.0,34,34,33.3,21,114,4,...,1.351351,0.39039,1.501502,4.564565,63.963964,1.651652,0.990991,0.0,5.645646,2.258907
1928,1966,Max Kilman,DF,25.0,37,37,36.7,30,127,5,...,1.144414,0.762943,0.517711,4.468665,67.302452,2.043597,0.708447,0.0,5.967302,2.22324
2374,2418,Jubal,DF,28.0,37,35,35.7,7,73,1,...,0.952381,0.644258,1.484594,5.406162,49.691877,1.596639,0.868347,0.028011,4.985994,2.221932
1064,1089,Montassar Talbi,DF,24.0,38,38,38.0,13,61,0,...,0.894737,0.552632,1.052632,5.0,62.236842,0.736842,0.421053,0.0,4.315789,2.1974
1749,1784,Sebastiano Luperto,DF,26.0,36,36,34.8,13,72,3,...,1.206897,0.344828,1.235632,4.683908,59.109195,1.494253,0.66092,0.0,4.885057,2.140435
2288,2332,Federico Baschirotto,DF,26.0,37,37,37.0,12,107,12,...,0.783784,0.432432,0.972973,5.189189,49.27027,2.594595,0.972973,0.0,5.513514,2.132341
1380,1409,Antonio Raillo,DF,30.0,31,31,30.7,8,50,5,...,0.944625,0.29316,1.009772,5.14658,46.026059,1.726384,1.107492,0.032573,4.560261,2.039406


In [None]:
xg_df = pd.read_csv('Football_Scouts_Database_xG.csv')

if set(y.columns) == set(Centre_forward_qualities) or set(y.columns) == set(Winger_qualities):
  xg_df_filtered=xg_df[xg_df['Position'].str.contains("FW")]

elif set(y.columns) == set(Attacking_mid_qualities) or set(y.columns) == set(Central_mid_qualities) or set(y.columns) == set(Defensive_mid_qualities):
  xg_df_filtered=xg_df[xg_df['Position'].str.contains("MF")]

elif set(y.columns) == set(Wingback_qualities) or set(y.columns) == set(Ballplaying_def_qualities) or set(y.columns) == set(Defensive_cb_qualities):
  xg_df_filtered=xg_df[xg_df['Position'].str.contains("DF")]

#mapping weighted sum values from df_sorted to xg_df_filtered
xg_df_filtered['weighted_sum']= xg_df_filtered['Unnamed: 0'].map(df_filtered.set_index('Unnamed: 0')['weighted_sum'])
xg_df_sorted = xg_df_filtered.sort_values(by='weighted_sum', ascending=False)
top_10_players= xg_df_sorted.head(10)
top_10_players

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  xg_df_filtered['weighted_sum']= xg_df_filtered['Unnamed: 0'].map(df_filtered.set_index('Unnamed: 0')['weighted_sum'])


Unnamed: 0.1,Unnamed: 0,Player,Position,Age,90 mins played,Non Penalty xG,Non-penalty goals,Non-Penalty(xG-Goals),Expected Assists,Assists,xAG-Assists,npxGi-npGi,Non Penalty xG + Assisted Goals,xG/90,Expected Assisted Goals/90,Expected Goals+Assisted Goals/90,Non Penalty xG/90,Non Penalty xG+xAG/90,weighted_sum
2345,2389,James Tarkowski,DF,29.0,38.0,3.1,1,2.1,2.9,1,1.9,4.0,6.0,0.08,0.08,0.16,0.08,0.16,3.022521
904,923,Ethan Pinnock,DF,29.0,30.0,3.0,3,0.0,1.0,0,1.0,1.0,4.0,0.1,0.03,0.13,0.1,0.13,2.366945
1900,1938,Rodrigo Ely,DF,28.0,35.1,1.0,0,1.0,0.8,1,-0.2,0.8,1.8,0.03,0.02,0.05,0.03,0.05,2.28905
1523,1557,Rasmus Nicolaisen,DF,25.0,33.3,1.2,0,1.2,0.8,2,-1.2,0.0,2.0,0.04,0.02,0.06,0.04,0.06,2.258907
1928,1966,Max Kilman,DF,25.0,36.7,1.3,0,1.3,0.2,0,0.2,1.5,1.6,0.04,0.01,0.04,0.04,0.04,2.22324
2374,2418,Jubal,DF,28.0,35.7,2.3,2,0.3,0.4,1,-0.6,-0.3,2.7,0.09,0.01,0.1,0.06,0.08,2.221932
1064,1089,Montassar Talbi,DF,24.0,38.0,0.7,1,-0.3,0.5,2,-1.5,-1.8,1.2,0.02,0.01,0.03,0.02,0.03,2.1974
1749,1784,Sebastiano Luperto,DF,26.0,34.8,1.2,2,-0.8,0.6,0,0.6,-0.2,1.9,0.04,0.02,0.05,0.04,0.05,2.140435
2288,2332,Federico Baschirotto,DF,26.0,37.0,1.9,3,-1.1,0.7,0,0.7,-0.4,2.6,0.05,0.02,0.07,0.05,0.07,2.132341
1380,1409,Antonio Raillo,DF,30.0,30.7,0.6,2,-1.4,1.1,3,-1.9,-3.3,1.7,0.02,0.03,0.05,0.02,0.05,2.039406


In [None]:
from scipy.spatial.distance import cdist

if set(y.columns) == set(Centre_forward_qualities):
  target_variable='Non-Penalty(xG-Goals)'
elif set(y.columns) == set(Winger_qualities) or set(y.columns) == set(Attacking_mid_qualities):
  target_variable='npxGi-npGi'
elif set(y.columns) == set(Central_mid_qualities) or set(y.columns) == set(Wingback_qualities):
  target_variable='xAG-Assists'
else:
  print('End')

xg_df_sorted_numeric = xg_df_sorted.select_dtypes(include='number')
top_10_avg = xg_df_sorted_numeric.head(10).mean()

# Calculate the Euclidean and Manhattan distances for each player to the average
euclidean_distances = cdist(xg_df_sorted_numeric, top_10_avg.to_frame().T, metric='minkowski', p=2)
manhattan_distances = cdist(xg_df_sorted_numeric, top_10_avg.to_frame().T, metric='minkowski', p=1)
xg_df_sorted['Euclidean Distance'] = euclidean_distances[:,0]
xg_df_sorted['Manhattan Distance'] = manhattan_distances[:,0]

# Normalize the Euclidean and Manhattan distances and the 'xAG- Assists' values
xg_df_sorted['Euclidean Distance (Normalized)'] = 1- (xg_df_sorted['Euclidean Distance'] - xg_df_sorted['Euclidean Distance'].min()) / (xg_df_sorted['Euclidean Distance'].max() - xg_df_sorted['Euclidean Distance'].min())
xg_df_sorted['Manhattan Distance (Normalized)'] = 1- (xg_df_sorted['Manhattan Distance'] - xg_df_sorted['Manhattan Distance'].min()) / (xg_df_sorted['Manhattan Distance'].max() - xg_df_sorted['Manhattan Distance'].min())
xg_df_sorted['Target Variable (Normalized)'] = (xg_df_sorted[target_variable] - xg_df_sorted[target_variable].min()) / (xg_df_sorted[target_variable].max() - xg_df_sorted[target_variable].min())

# Calculate the weighted average of the normalized values
weights = [0.25, 0.25, 0.5]
xg_df_sorted['Combined Score'] = xg_df_sorted[['Euclidean Distance (Normalized)', 'Manhattan Distance (Normalized)', 'Target Variable (Normalized)']].dot(weights)

# Sort rows by combined score in ascending order
sorted_df = xg_df_sorted.sort_values(by='Combined Score', ascending=False)

print('Players sorted by least combined score:')
sorted_df.head(20)

End
Players sorted by least combined score:


Unnamed: 0.1,Unnamed: 0,Player,Position,Age,90 mins played,Non Penalty xG,Non-penalty goals,Non-Penalty(xG-Goals),Expected Assists,Assists,...,Expected Goals+Assisted Goals/90,Non Penalty xG/90,Non Penalty xG+xAG/90,weighted_sum,Euclidean Distance,Manhattan Distance,Euclidean Distance (Normalized),Manhattan Distance (Normalized),Target Variable (Normalized),Combined Score
1551,1585,Jonas Hector,DF,32.0,31.4,1.7,0,1.7,3.9,2,...,0.18,0.05,0.18,1.133361,195.700148,220.957857,0.952692,0.948397,0.709924,0.830234
1683,1718,Lewis Hall,DF,17.0,7.3,0.9,0,0.9,2.3,0,...,0.43,0.12,0.43,0.664536,69.239019,114.114681,0.98392,0.974518,0.679389,0.829304
1729,1764,Hugo Mallo,DF,31.0,24.0,1.4,0,1.4,2.6,1,...,0.16,0.06,0.16,1.074083,20.698677,43.345135,0.995907,0.99182,0.664122,0.828993
346,354,Cengiz Ünder,"MF,DF",25.0,27.7,7.5,4,3.5,7.9,4,...,0.59,0.27,0.56,0.475986,1426.642549,1485.243232,0.648719,0.639298,1.0,0.822004
1332,1359,Mitchell van Bergen,"FW,DF",22.0,10.1,2.2,0,2.2,2.7,1,...,0.48,0.22,0.48,0.457546,422.302616,468.111672,0.896734,0.887971,0.732824,0.812588
1781,1818,José Luis Gayà,DF,27.0,30.1,1.1,1,0.1,5.5,3,...,0.24,0.04,0.22,0.875772,38.545189,60.909446,0.9915,0.987526,0.633588,0.81155
1470,1501,Iván Balliu,DF,30.0,35.9,0.3,0,0.3,2.8,0,...,0.09,0.01,0.09,0.955782,279.56726,297.091436,0.931981,0.929783,0.671756,0.801319
1438,1468,Jesús Navas,"DF,FW",36.0,22.0,0.3,0,0.3,2.8,0,...,0.14,0.01,0.14,0.804506,312.941751,348.352711,0.92374,0.917251,0.671756,0.796125
1737,1772,Renato Tapia,"MF,DF",27.0,16.1,1.0,0,1.0,1.1,0,...,0.13,0.06,0.13,0.978656,20.932234,36.540562,0.995849,0.993484,0.59542,0.795043
2345,2389,James Tarkowski,DF,29.0,38.0,3.1,1,2.1,2.9,1,...,0.16,0.08,0.16,3.022521,608.544928,629.653303,0.850742,0.848477,0.740458,0.795034
