## IMPORTS

In [162]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from skopt import BayesSearchCV
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from skopt.space import Integer, Categorical, Real


# NO GOALKEEPERS

In [163]:
df = pd.read_csv('train.csv')
df.dropna(subset=['value_eur'], inplace=True)
df = df[~df["player_positions"].str.contains("GK", na=False)]

In [164]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13610 entries, 0 to 15390
Data columns (total 76 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Unnamed: 0                   13610 non-null  int64  
 1   id                           13610 non-null  int64  
 2   short_name                   13610 non-null  object 
 3   long_name                    13610 non-null  object 
 4   player_positions             13610 non-null  object 
 5   overall                      13610 non-null  int64  
 6   potential                    13610 non-null  int64  
 7   value_eur                    13610 non-null  float64
 8   wage_eur                     13610 non-null  float64
 9   age                          13610 non-null  int64  
 10  dob                          13610 non-null  object 
 11  height_cm                    13610 non-null  int64  
 12  weight_kg                    13610 non-null  int64  
 13  club_team_id         

## DATA PREPROCESSING

In [165]:
# drop irrelevant columns
cols_to_drop = ['Unnamed: 0', 'id', 'short_name', "long_name", "dob", "club_jersey_number", "nation_jersey_number", "club_id", "club_loaned_from", "nation_position", "player_traits", "player_tags", "nationality_name", "club_team_id", "nationality_id"]

df.drop(columns=cols_to_drop, inplace=True, errors='ignore')

In [166]:
# contract related variables manipulation
start_date = float(2021) # year when dataset was recorded 
df["years_left_contract"] = df["club_contract_valid_until"] - start_date

df['club_joined'] = pd.to_datetime(df['club_joined'], format='%Y-%m-%d')
df["years_in_club"] = start_date - df["club_joined"].dt.year

df.drop(columns=["club_contract_valid_until", "club_joined"], inplace=True, errors='ignore')

In [167]:
# work_rate split 
df[["work_attack", "work_defense"]] = df["work_rate"].str.split("/", expand=True)
maps = {"Low": 1, "Medium": 2, "High": 3}
df["work_attack"] = df["work_attack"].map(maps)
df["work_defense"] = df["work_defense"].map(maps)

In [168]:
# target based ordinal encoding for positions
df[["primary_position", "secondary_position", "tertiary_position"]] = df["player_positions"].str.split(",", expand=True)

club_position_map = df.groupby('club_position')['value_eur'].median().sort_values().rank().to_dict()
club_position_map[None] = 0
df["club_position"] = df["club_position"].map(club_position_map)
df["primary_position"] = df["primary_position"].map(club_position_map)
df["secondary_position"] = df["secondary_position"].map(club_position_map)
df["tertiary_position"] = df["tertiary_position"].map(club_position_map)

df["is_multiposition"] = df["player_positions"].apply(lambda x: len(x.split(",")) > 1).astype(int)

In [169]:
# Starter Bool
df["is_starter"] = (~df["club_position"].isin(["SUB", "RES"])).astype(int)

In [170]:
cat_cols = df.select_dtypes(include=['object']).columns

for col in cat_cols:
    df[col] = df[col].astype('category')

In [171]:
# data engineering
df["overall_potential"] = df["overall"] * df["potential"]

## TRAIN

In [172]:
# not goalkeeper

In [173]:
# goalkeeper

In [174]:
# Split first
X = df.drop(columns=["value_eur"], errors='ignore')
y = df["value_eur"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Merge temporarily for groupby operations
train_df = X_train.copy()
train_df["value_eur"] = y_train

# Compute mappings from training set only
club_map = train_df.groupby('club_name', observed=True)['value_eur'].median().sort_values().rank().to_dict()
league_map = train_df.groupby('league_name', observed=True)['value_eur'].median().sort_values().rank().to_dict()
work_rate_map = train_df.groupby('work_rate', observed=True)['value_eur'].median().sort_values().rank().to_dict()

# Apply mappings
X_train["club_name"] = X_train["club_name"].map(club_map)
X_test["club_name"] = X_test["club_name"].map(club_map)

X_train["league_name"] = X_train["league_name"].map(league_map)
X_test["league_name"] = X_test["league_name"].map(league_map)

X_train["work_rate"] = X_train["work_rate"].map(work_rate_map)
X_test["work_rate"] = X_test["work_rate"].map(work_rate_map)

In [175]:
# List of columns to keep
cols_to_keep = [
    "release_clause_eur", "overall_potential", 
    "wage_eur", "age", "league_name", 
    "club_name", "work_rate", "is_starter",
    "secondary_position", "tertiary_position",
]

# Filter the DataFrame to keep only the specified columns
X_train = X_train[cols_to_keep]
X_test = X_test[cols_to_keep]

In [176]:
rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=100,
    random_state=42,
)

rf.fit(X_train, y_train)
train_preds = rf.predict(X_train)
test_preds = rf.predict(X_test)

# Evaluate the model
train_rmse = root_mean_squared_error(y_train, train_preds)
test_rmse = root_mean_squared_error(y_test, test_preds)
print(f"Train RMSE: {train_rmse}")
print(f"Test RMSE: {test_rmse}")
print(f"Train R^2: {r2_score(y_train, train_preds)}")
print(f"Test R^2: {r2_score(y_test, test_preds)}")

Train RMSE: 297414.99340681836
Test RMSE: 558652.1658887943
Train R^2: 0.9984807926229545
Test R^2: 0.9943935966162789


In [177]:
model = RandomForestRegressor(random_state=42)
param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [10, 20, 30, 50, 100],
    'min_samples_split': [2, 5, 10, 15, 20],
    'min_samples_leaf': [2, 3, 4, 6, 10], 
    'max_features': ['sqrt', 0.3, 0.5, 0.8]
}
search = RandomizedSearchCV(model, param_grid, n_iter=30 ,cv=5, scoring='neg_root_mean_squared_error', verbose=3, n_jobs=-1)
search.fit(X_train, y_train)
best_model = search.best_estimator_
train_preds = best_model.predict(X_train)
test_preds = best_model.predict(X_test)
# Evaluate the model
train_rmse = root_mean_squared_error(y_train, train_preds)
test_rmse = root_mean_squared_error(y_test, test_preds)
print(f"Train RMSE: {train_rmse}")
print(f"Test RMSE: {test_rmse}")

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 1/5] END max_depth=30, max_features=0.3, min_samples_leaf=10, min_samples_split=2, n_estimators=100;, score=-1907118.982 total time=   1.8s
[CV 2/5] END max_depth=30, max_features=0.3, min_samples_leaf=10, min_samples_split=2, n_estimators=100;, score=-2425917.374 total time=   1.7s
[CV 3/5] END max_depth=30, max_features=0.3, min_samples_leaf=10, min_samples_split=2, n_estimators=100;, score=-795645.555 total time=   1.8s
[CV 4/5] END max_depth=30, max_features=0.3, min_samples_leaf=10, min_samples_split=2, n_estimators=100;, score=-1076125.202 total time=   1.1s
[CV 5/5] END max_depth=30, max_features=0.3, min_samples_leaf=10, min_samples_split=2, n_estimators=100;, score=-1231114.711 total time=   1.1s
[CV 3/5] END max_depth=10, max_features=0.3, min_samples_leaf=4, min_samples_split=2, n_estimators=300;, score=-715427.671 total time=   3.9s
[CV 1/5] END max_depth=10, max_features=0.3, min_samples_leaf=4, min_samples_

In [178]:
model2 = RandomForestRegressor(
    n_estimators=200,
    max_depth=100,
    random_state=42
)

model2.fit(X_train, y_train)
train_preds = model2.predict(X_train)
test_preds = model2.predict(X_test)
# Evaluate the model
train_rmse = root_mean_squared_error(y_train, train_preds)
test_rmse = root_mean_squared_error(y_test, test_preds)
print(f"Train RMSE: {train_rmse}")
print(f"Test RMSE: {test_rmse}")
print(f"Train R^2: {r2_score(y_train, train_preds)}")
print(f"Test R^2: {r2_score(y_test, test_preds)}")

Train RMSE: 278381.82945714536
Test RMSE: 556714.232157933
Train R^2: 0.9986690151484207
Test R^2: 0.9944324257630158


# MODEL WITH GOALKEEPERS

In [181]:
df = pd.read_csv("train.csv")
df = df[df["player_positions"].str.contains("GK", na=False)]
df.dropna(subset=["value_eur"], inplace=True)

In [182]:
df["overall_potential"] = df["overall"] * df["potential"]

In [183]:
X = df.drop(columns=["value_eur"], errors='ignore')
y = df["value_eur"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Merge temporarily for groupby operations
train_df = X_train.copy()
train_df["value_eur"] = y_train

# Compute mappings from training set only
club_map = train_df.groupby('club_name', observed=True)['value_eur'].median().sort_values().rank().to_dict()
league_map = train_df.groupby('league_name', observed=True)['value_eur'].median().sort_values().rank().to_dict()
work_rate_map = train_df.groupby('work_rate', observed=True)['value_eur'].median().sort_values().rank().to_dict()

# Apply mappings
X_train["club_name"] = X_train["club_name"].map(club_map)
X_test["club_name"] = X_test["club_name"].map(club_map)

X_train["league_name"] = X_train["league_name"].map(league_map)
X_test["league_name"] = X_test["league_name"].map(league_map)

X_train["work_rate"] = X_train["work_rate"].map(work_rate_map)
X_test["work_rate"] = X_test["work_rate"].map(work_rate_map)

In [184]:
# List of columns to keep based on the top correlations with 'value_eur'
cols_to_keep = [
    "release_clause_eur", "wage_eur", 
    "international_reputation", "overall_potential", 
    "goalkeeping_handling", "goalkeeping_diving", 
    "goalkeeping_reflexes", "goalkeeping_positioning", 
    "goalkeeping_kicking"
]

# Filter the DataFrame to keep only the specified columns
X_train = X_train[cols_to_keep]
X_test = X_test[cols_to_keep]

In [185]:
rf = RandomForestRegressor(
    n_estimators=400,
    max_depth=50,
    random_state=42,
)

rf.fit(X_train, y_train)
train_preds = rf.predict(X_train)
test_preds = rf.predict(X_test)

# Evaluate the model
train_rmse = root_mean_squared_error(y_train, train_preds)
test_rmse = root_mean_squared_error(y_test, test_preds)
print(f"Train RMSE: {train_rmse}")
print(f"Test RMSE: {test_rmse}")
print(f"Train R^2: {r2_score(y_train, train_preds)}")
print(f"Test R^2: {r2_score(y_test, test_preds)}")

Train RMSE: 760557.3338021102
Test RMSE: 531822.9997200699
Train R^2: 0.988112492918213
Test R^2: 0.9933713425594822


- need to add baseline model -> linear regression with top 10 most correlated features (so need correlation matrix for it)