In [1]:
#Import all relevant libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore

# To ignore warnings
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier

from sklearn import metrics

%matplotlib inline

In [16]:
df = pd.read_csv('./final_data.csv')

In [4]:
df.shape

(10754, 22)

In [17]:
#remove unrelevent columns
df.drop(['player'],axis=1,inplace=True)
df.drop(['minutes played'],axis=1,inplace=True)
df.drop(['position_encoded'],axis=1,inplace=True)
df.drop(['days_injured'],axis=1,inplace=True)
df.drop(['winger'],axis=1,inplace=True)


In [7]:
df.shape

(10754, 17)

In [18]:
goalkeepers = df[df['position'] == 'Goalkeeper'] #extract goal keepers


In [19]:
df.drop(df[df['position'] == 'Goalkeeper'].index, inplace=True)

In [20]:
df.drop(['goals conceded'],axis=1,inplace=True)


In [21]:
df.drop(['clean sheets'],axis=1,inplace=True)

In [22]:
df = df[
    (df['highest_value'] < 5.000000e+07) & (df['highest_value'] > 4.900000e+05)
]

In [23]:
df  = df[
    (df['current_value'] <= 3.000000e+07) & (df['current_value'] > 5.00000e+05)
]

In [24]:
df = df[
    (df['age'] <40) & (df['age'] > 17)
]

In [25]:
df = df[
    (df['appearance'] <= 100) & (df['appearance'] > 1)
]

In [26]:
df = df[
    (df['award'] <= 20)
]

In [27]:
df.shape

(5500, 15)

In [28]:
categorical_features = ['team', 'name', 'position']
numeric_features = ['height', 'age', 'appearance','goals','assists','yellow cards','second yellow cards','red cards'
                    ,'games_injured','award','current_value','highest_value']

### Feature Enginering

In [29]:
# one hot coding
df = pd.get_dummies(df, columns=categorical_features)

In [30]:
df.dtypes

height                                 float64
age                                    float64
appearance                               int64
goals                                  float64
assists                                float64
                                        ...   
position_midfield-AttackingMidfield       bool
position_midfield-CentralMidfield         bool
position_midfield-DefensiveMidfield       bool
position_midfield-LeftMidfield            bool
position_midfield-RightMidfield           bool
Length: 5835, dtype: object

In [31]:
# Calculate the 35th and 75th percentiles of the price
p35 = df['current_value'].quantile(0.35)
p75 = df['current_value'].quantile(0.75)

# Function to categorize current value
def categorize_price(current_value):
    if current_value < p35:
        return 'Low_Price'
    elif current_value < p75:
        return 'Mid_Price'
    else:
        return 'High_Price'

# Apply the function to create a new column
df['current_value_category'] = df['current_value'].apply(categorize_price)

df.drop('current_value', axis=1, inplace=True)

# Verify the distribution of the new categories
print(df['current_value_category'].value_counts())

current_value_category
Mid_Price     2195
Low_Price     1895
High_Price    1410
Name: count, dtype: int64


In [33]:
encoder = LabelEncoder()
df['current_value_category'] = encoder.fit_transform(df['current_value_category'])  

In [None]:
df.head()


Unnamed: 0,height,age,appearance,goals,assists,yellow cards,second yellow cards,red cards,games_injured,award,...,position_Defender,position_Defender Centre-Back,position_Defender Left-Back,position_Defender Right-Back,position_midfield-AttackingMidfield,position_midfield-CentralMidfield,position_midfield-DefensiveMidfield,position_midfield-LeftMidfield,position_midfield-RightMidfield,current_value_category
10692,179.0,30.0,58,0.166256,0.193966,0.083128,0.0,0.027709,33,0,...,False,False,False,False,False,False,False,False,False,2
10718,183.0,31.0,55,0.449902,0.332536,0.136927,0.0,0.0,1,3,...,False,False,False,False,False,False,False,False,False,2
10729,191.0,36.0,47,0.091347,0.022837,0.251205,0.0,0.045674,2,5,...,False,True,False,False,False,False,False,False,False,1
10736,186.0,33.0,36,0.107399,0.107399,0.268496,0.0,0.053699,42,5,...,False,False,False,False,False,False,True,False,False,2
10745,178.0,27.0,45,0.359337,0.138206,0.248771,0.0,0.0,61,1,...,False,False,False,False,False,False,False,False,False,1


In [38]:
numeric_data = df.select_dtypes(include=['number'])
correlation = numeric_data.corr()
print(correlation['current_value_category'].sort_values(ascending=False))

current_value_category    1.000000
age                       0.097579
yellow cards              0.044013
red cards                 0.021993
second yellow cards       0.003951
height                   -0.032051
games_injured            -0.035557
assists                  -0.070363
goals                    -0.072278
award                    -0.079508
appearance               -0.209701
highest_value            -0.448863
Name: current_value_category, dtype: float64


In [44]:
# Set the correlation threshold
threshold = 0.05 # You can change this value based on your requirement ,i want the + and - relation 

# Filter the correlations
# We use `abs()` for absolute value to consider both strong positive and negative correlations
selected_features = correlation[abs(correlation['current_value_category']) > threshold]['current_value_category'].index
selected_features

Index(['age', 'appearance', 'goals', 'assists', 'award', 'highest_value',
       'current_value_category'],
      dtype='object')

In [45]:
selected_features = ['age', 'appearance', 'goals', 'assists', 'award', 'highest_value',
       'current_value_category']

In [46]:
df = df[selected_features]

In [47]:
# Prepare data
X = df.drop(['current_value_category'], axis=1)
y = df['current_value_category']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=42)
# sacle the data
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [48]:
X.shape

(5500, 6)

#### Model Building 

In [49]:
model = KNeighborsClassifier()


In [50]:
param_grid = {
    'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10]
}
grid_search = GridSearchCV(estimator=model,
                           param_grid=param_grid,
                           cv=5,
                           scoring='f1_macro', 
                           verbose=1)

In [51]:
# Fit the model on the training data
grid_search.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [52]:
model = grid_search.best_estimator_

### Test Model

In [53]:
# Predict and evaluate the model
y_pred = model.predict(X_test_scaled)

### Evaluating the Model 

In [56]:
# our benchmark model
base_model = round(df['current_value_category'].value_counts()[1]/df.shape[0]*100, 2)
base_model

np.float64(34.45)