In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import warnings
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv('Data/player_stats_cleaned.csv')
data.drop(columns=['id'], inplace=True)
#Filter with market value above 10000000
# data = data[data['MarketValue'] > 5000000]
data['IsValuable'] = (data['MarketValue'] >= 5000000).astype(int)
data['Outfitter'] = data['Outfitter'].fillna('None')
data['ContractOption'] = data['ContractOption'].fillna('None')
data


Unnamed: 0,name,dateOfBirth,Age,Height,Foot,Position,OtherPosition,National,MarketValue,Outfitter,...,25MP,25AP,Ranking,2020AvgMV,2021AvgMV,2022AvgMV,2023AvgMV,2024AvgMV,TotalCups,IsValuable
0,Aaron Hickey,"Jun 10, 2002",22,1.85,both,Right-Back,"['Left-Back', 'Left Midfield']",Scotland,22000000.0,,...,0.00,0,432.000,1.845000e+06,8.666667e+06,1.700000e+07,2.833333e+07,2.500000e+07,0,1
1,Aaron Connolly,"Jan 28, 2000",25,1.74,right,Centre-Forward,['Second Striker'],Ireland,2500000.0,,...,505.00,14,3.644,5.500000e+06,7.000000e+06,5.500000e+06,3.500000e+06,2.500000e+06,2,0
2,Aaron Bastiaans,"Apr 4, 2002",22,1.84,right,Left Winger,"['Right Winger', 'Centre-Forward']",Netherlands,0.0,,...,0.00,0,0.000,1.500000e+05,1.250000e+05,1.750000e+05,0.000000e+00,0.000000e+00,0,0
3,AJ Marcucci,"Jul 31, 1999",25,1.91,hand,Goalkeeper,[],United States,250000.0,,...,0.00,0,0.000,0.000000e+00,1.000000e+05,1.250000e+05,1.500000e+05,2.166667e+05,0,0
4,Aarón Herrera,"Jun 6, 1997",27,1.80,right,Right-Back,"['Left-Back', 'Right Midfield']",United States,2000000.0,,...,0.00,0,4.274,9.666667e+05,1.600000e+06,2.000000e+06,2.000000e+06,2.000000e+06,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12947,Simon Stefanec,"Sep 5, 1998",26,1.78,right,Attacking Midfield,"['Left Winger', 'Right Winger']",Slovakia,0.0,,...,0.00,0,0.000,1.000000e+05,1.250000e+05,1.125000e+05,0.000000e+00,0.000000e+00,0,0
12948,Žan Kolmanič,"Mar 3, 2000",24,1.78,left,Left-Back,"['Left Midfield', 'Right-Back']",Slovenia,1000000.0,,...,0.00,0,6.910,4.250000e+05,7.000000e+05,1.000000e+06,1.000000e+06,1.000000e+06,1,0
12949,Zan Celar,"Mar 14, 1999",25,1.86,right,Centre-Forward,['Left Winger'],Slovenia,4500000.0,,...,1134.00,22,2.226,7.625000e+05,7.250000e+05,1.116667e+06,3.750000e+06,4.500000e+06,4,0
12950,Zan Majer,"Jul 25, 1992",32,1.80,right,Central Midfield,"['Attacking Midfield', 'Defensive Midfield']",Slovenia,400000.0,,...,682.00,16,0.000,1.733333e+06,1.600000e+06,1.700000e+06,1.150000e+06,5.666667e+05,2,0


## Dataset Creation

In [3]:
def remove_outliers_iqr(df, numeric_columns):
    Q1 = df[numeric_columns].quantile(0.25)
    Q3 = df[numeric_columns].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[~((df[numeric_columns] < lower_bound) | (df[numeric_columns] > upper_bound)).any(axis=1)]

In [4]:
# Define categorical and numerical columns 
categorical_columns = ['Foot', 'Position', 'OtherPosition', 'National', 'Club_name', 'ContractOption', 'Outfitter']
numeric_columns = ['Age', 'Height', '20YC', '20YC2', '20RC', '20G', '20A', '20MP', '20AP', '21YC', '21YC2', '21RC', '21G', '21A', '21MP', '21AP', '22YC', '22YC2', '22RC', '22G', '22A', '22MP', '22AP', '23YC', '23YC2', '23RC', '23G', '23A', '23MP', '23AP', '24YC', '24YC2', '24RC', '24G', '24A', '24MP', '24AP', '25YC', '25YC2', '25RC', '25G', '25A', '25MP', '25AP', 'TotalCups']

# Keep only categorical columns and target variable
data = data[categorical_columns + numeric_columns+ ['IsValuable']]

# Apply Label Encoding to categorical features
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].astype(str))  # Convert to string to handle missing values
    label_encoders[col] = le  # Store encoders for future use

# Split into features and target variable
X = data.drop(columns=['IsValuable'])
y = data['IsValuable']

train_size = 0.8
# Separate classes
X_0, X_1 = X[y == 0], X[y == 1]
y_0, y_1 = y[y == 0], y[y == 1]

# Find the minimum class size to balance train-test split
min_class_size = min(len(y_0), len(y_1))

# Sample equal number of 0s and 1s
X_0_sample = X_0.sample(n=min_class_size, random_state=1911)
X_1_sample = X_1.sample(n=min_class_size, random_state=1911)
y_0_sample = y_0.sample(n=min_class_size, random_state=1911)
y_1_sample = y_1.sample(n=min_class_size, random_state=1911)

# Combine the balanced samples
X_balanced = pd.concat([X_0_sample, X_1_sample], axis=0)
y_balanced = pd.concat([y_0_sample, y_1_sample], axis=0)

# # Perform a random train-test split
# X_train, X_test, y_train, y_test = train_test_split(
#     X_balanced, y_balanced, train_size=train_size, random_state=1911, shuffle=True
# )

# scaler = StandardScaler()
# # Fit on training data and transform both training and test sets
# X_train[numeric_columns] = scaler.fit_transform(X_train[numeric_columns])
# X_test[numeric_columns] = scaler.transform(X_test[numeric_columns])

# X_train.shape, X_test.shape, y_train.shape, y_test.shape

X_balanced_no_outliers = remove_outliers_iqr(X_balanced, numeric_columns)
y_balanced_no_outliers = y_balanced.loc[X_balanced_no_outliers.index]

# Perform train-test split again after removing outliers
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced_no_outliers, y_balanced_no_outliers, train_size=train_size, random_state=1911, shuffle=True
)

# Standardize numerical features
scaler = StandardScaler()
X_train[numeric_columns] = scaler.fit_transform(X_train[numeric_columns])
X_test[numeric_columns] = scaler.transform(X_test[numeric_columns])

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((829, 52), (208, 52), (829,), (208,))

## Naieve Bayes Classifier

In [5]:
from sklearn.naive_bayes import GaussianNB

# Train Naïve Bayes classifier
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

# Predictions
y_pred_nb = nb_model.predict(X_test)

# Evaluation
print("Naïve Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_nb))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_nb))

Naïve Bayes Accuracy: 0.6682692307692307

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.68      0.72       133
           1       0.53      0.65      0.59        75

    accuracy                           0.67       208
   macro avg       0.65      0.67      0.65       208
weighted avg       0.69      0.67      0.67       208


Confusion Matrix:
[[90 43]
 [26 49]]


# Decision Tree Classifier

In [6]:
from sklearn.tree import DecisionTreeClassifier

# Create and train the model
dt_model = DecisionTreeClassifier(random_state=19)
dt_model.fit(X_train, y_train)

# Make predictions
y_pred_dt = dt_model.predict(X_test)

# Evaluate the model
# Evaluation
print("Naïve Bayes Accuracy:", accuracy_score(y_test, y_pred_dt))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_dt))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_dt))

Naïve Bayes Accuracy: 0.6778846153846154

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.71      0.74       133
           1       0.55      0.61      0.58        75

    accuracy                           0.68       208
   macro avg       0.66      0.66      0.66       208
weighted avg       0.69      0.68      0.68       208


Confusion Matrix:
[[95 38]
 [29 46]]


## Logistic Regression

In [7]:
from sklearn.linear_model import LogisticRegression

# Create and train the model
lr_model = LogisticRegression(random_state=19, max_iter=1000)
lr_model.fit(X_train, y_train)
# Make predictions
y_pred_lr = lr_model.predict(X_test)

# Evaluation
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_lr))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_lr))

Logistic Regression Accuracy: 0.7884615384615384

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.78      0.83       133
           1       0.67      0.80      0.73        75

    accuracy                           0.79       208
   macro avg       0.77      0.79      0.78       208
weighted avg       0.80      0.79      0.79       208


Confusion Matrix:
[[104  29]
 [ 15  60]]


## Ensemble Methods

In [8]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

# Initialize models
rf_model = RandomForestClassifier(n_estimators=100, random_state=1911)
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=1911)
xgb_model = XGBClassifier(n_estimators=100, random_state=1911, use_label_encoder=False, eval_metric="logloss")

# Train models
rf_model.fit(X_train, y_train)
gb_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)

# Make predictions
y_pred_rf = rf_model.predict(X_test)
y_pred_gb = gb_model.predict(X_test)
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate models
models = {'Random Forest': (y_pred_rf, rf_model),
          'Gradient Boosting': (y_pred_gb, gb_model),
          'XGBoost': (y_pred_xgb, xgb_model)}

for name, (y_pred, model) in models.items():
    print(f"\n{name} Model Evaluation:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print("Classification Report:\n", classification_report(y_test, y_pred))



Random Forest Model Evaluation:
Accuracy: 0.7933
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.84      0.84       133
           1       0.72      0.71      0.71        75

    accuracy                           0.79       208
   macro avg       0.78      0.77      0.78       208
weighted avg       0.79      0.79      0.79       208


Gradient Boosting Model Evaluation:
Accuracy: 0.8173
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.87      0.86       133
           1       0.76      0.72      0.74        75

    accuracy                           0.82       208
   macro avg       0.80      0.80      0.80       208
weighted avg       0.82      0.82      0.82       208


XGBoost Model Evaluation:
Accuracy: 0.8221
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.85      0.86       133
           1     

## Support Vector Machines

In [9]:
from sklearn.svm import SVC

svm_model = SVC(kernel='rbf', gamma='auto', random_state=1911)

# Train the model
svm_model.fit(X_train, y_train)

# Make predictions
y_pred_svm = svm_model.predict(X_test)

# Evaluate the model
print(f"SVM Accuracy: {accuracy_score(y_test, y_pred_svm):.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred_svm))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_svm))


SVM Accuracy: 0.6731
Classification Report:
               precision    recall  f1-score   support

           0       0.66      1.00      0.80       133
           1       1.00      0.09      0.17        75

    accuracy                           0.67       208
   macro avg       0.83      0.55      0.48       208
weighted avg       0.78      0.67      0.57       208


Confusion Matrix:
[[133   0]
 [ 68   7]]
