In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score


pd.set_option('display.max_rows', 500)
df = pd.read_excel('final_dataset_20-21.xlsx')
df.rename(columns=lambda x: x.replace(" (20/21)", ""), inplace=True)

df['Contract Years Left'] = pd.to_numeric(df['Contract Years Left'], errors='coerce')
df['Value'] = df['Value'] / 1000000

df_numeric = df.select_dtypes(include=[np.number])
df_numeric = df_numeric.dropna()
correlation_matrix = df_numeric.corr()
value_correlations = correlation_matrix['Value'].drop('Value', axis=0).sort_values()


print(value_correlations)

Age                                                                  -0.223886
Goal Saving Blocks                                                   -0.054364
Avg Shot Distance (yds)                                              -0.051458
% Aerial Duels Won                                                   -0.047926
% of Dribblers Tackled                                               -0.045539
Total Clearances                                                     -0.027663
Penalties Conceded                                                   -0.021307
2nd Yellow Cards                                                     -0.004800
Red Cards                                                            -0.001868
Mistakes leading to Opponent Shots                                    0.006684
Tackles in Defensive 3rd                                              0.010761
Own Goals                                                             0.013262
Shots on Target%                                    

In [22]:

X = df_numeric.drop('Value', axis=1)
y = df_numeric['Value']

sorted_features = value_correlations.index.tolist()
r2_scores = []

for i in range(1, len(sorted_features) + 1):
    selected_features = sorted_features[:i]
    X_selected = X[selected_features]
    
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=50)
    
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)


for i, score in enumerate(r2_scores, 1):
    print(f'Liczba atrybutów: {i}, R^2: {score:.4f}')

Liczba atrybutów: 1, R^2: -0.0415
Liczba atrybutów: 2, R^2: -0.0415
Liczba atrybutów: 3, R^2: -0.0357
Liczba atrybutów: 4, R^2: -0.0345
Liczba atrybutów: 5, R^2: -0.0303
Liczba atrybutów: 6, R^2: -0.0288
Liczba atrybutów: 7, R^2: -0.0301
Liczba atrybutów: 8, R^2: -0.0303
Liczba atrybutów: 9, R^2: -0.0299
Liczba atrybutów: 10, R^2: -0.0298
Liczba atrybutów: 11, R^2: -0.0375
Liczba atrybutów: 12, R^2: -0.0372
Liczba atrybutów: 13, R^2: -0.0381
Liczba atrybutów: 14, R^2: -0.0307
Liczba atrybutów: 15, R^2: -0.0157
Liczba atrybutów: 16, R^2: -0.0187
Liczba atrybutów: 17, R^2: -0.0160
Liczba atrybutów: 18, R^2: -0.0197
Liczba atrybutów: 19, R^2: 0.0064
Liczba atrybutów: 20, R^2: 0.0249
Liczba atrybutów: 21, R^2: 0.0632
Liczba atrybutów: 22, R^2: 0.0633
Liczba atrybutów: 23, R^2: 0.0630
Liczba atrybutów: 24, R^2: 0.0658
Liczba atrybutów: 25, R^2: 0.0739
Liczba atrybutów: 26, R^2: 0.1035
Liczba atrybutów: 27, R^2: 0.1239
Liczba atrybutów: 28, R^2: 0.1300
Liczba atrybutów: 29, R^2: 0.1379
Liczb