In [1]:
import pandas as pd

# Load the CSV file into a pandas DataFrame with a different encoding
df = pd.read_csv('football_players.csv', encoding='ISO-8859-1')

# Display the first few rows of the DataFrame to understand its structure
print(df.head())

                Name  Age Nationality  Overall  Acceleration  Aggression  \
0  Cristiano Ronaldo   32    Portugal     94.0          89.0        63.0   
1           L. Messi   30   Argentina     93.0          92.0        48.0   
2             Neymar   25      Brazil     92.0          94.0        56.0   
3          L. Suárez   30     Uruguay     92.0          88.0        78.0   
4           M. Neuer   31     Germany     92.0          58.0        29.0   

   Agility  Balance  Ball control  Composure  ...  Shot power  Sliding tackle  \
0     89.0     63.0          93.0       95.0  ...        94.0            23.0   
1     90.0     95.0          95.0       96.0  ...        85.0            26.0   
2     96.0     82.0          95.0       92.0  ...        80.0            33.0   
3     86.0     60.0          91.0       83.0  ...        87.0            38.0   
4     52.0     35.0          48.0       70.0  ...        25.0            11.0   

   Sprint speed  Stamina  Standing tackle  Strength  Vis

In [2]:
# Determine the most common 'Overall' score
most_common_overall = df['Overall'].mode()[0]
print(f'The most common Overall score is: {most_common_overall}')

The most common Overall score is: 66.0


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

# Step 1: Create a subset of players who can play in central defence (CB)
cb_players = df[df['Preferred Positions'].str.contains('CB', na=False)]

# Step 2: Classify these players into three categories based on their overall score
def classify_player(overall):
    if overall >= 80:
        return 'World Class'
    elif 70 <= overall < 80:
        return 'Good'
    else:
        return 'Mediocre'

cb_players['Class'] = cb_players['Overall'].apply(classify_player)

# Step 3: Prepare the data for the Random Forest classifier
X = cb_players.drop(columns=['Name', 'Nationality', 'Preferred Positions', 'Preferred Positions Type', 'Class'])
y = cb_players['Class']

# Convert categorical features to numeric
X = X.apply(pd.to_numeric, errors='coerce')
X.fillna(0, inplace=True)

# Encode the target labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Step 4: Train the Random Forest classifier
rf = RandomForestClassifier(n_estimators=500, random_state=1971)
rf.fit(X, y)

# Step 5: Identify the most important features
feature_importances = rf.feature_importances_
features = X.columns
important_features = pd.Series(feature_importances, index=features).sort_values(ascending=False)

# Print the top 5 most important features
print("Top 5 most important features:")
print(important_features)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cb_players['Class'] = cb_players['Overall'].apply(classify_player)


Top 5 most important features:
Overall               0.383625
Standing tackle       0.130194
Marking               0.096120
Interceptions         0.090903
Sliding tackle        0.068036
Reactions             0.049961
Heading accuracy      0.037788
Composure             0.033894
Aggression            0.018604
Long passing          0.013325
Short passing         0.012387
Ball control          0.009920
Strength              0.006696
Shot power            0.005011
Age                   0.003243
Dribbling             0.003016
Crossing              0.002877
Jumping               0.002653
Vision                0.002651
Long shots            0.002469
Stamina               0.002375
Curve                 0.002320
Free kick accuracy    0.002041
Penalties             0.002017
Sprint speed          0.001914
Finishing             0.001761
Volleys               0.001748
Positioning           0.001740
Balance               0.001625
Acceleration          0.001582
Agility               0.001542
GK handl

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, f1_score

# Step 1: Create a subset of players who can play in central defence (CB)
cb_players = df[df['Preferred Positions'].str.contains('CB', na=False)]

# Step 2: Classify these players into three categories based on their overall score
def classify_player(overall):
    if overall >= 80:
        return 'World Class'
    elif 70 <= overall < 80:
        return 'Good'
    else:
        return 'Mediocre'

cb_players['Class'] = cb_players['Overall'].apply(classify_player)

# Step 3: Prepare the data for the KNN classifier
X = cb_players.drop(columns=['Name', 'Nationality', 'Preferred Positions', 'Preferred Positions Type', 'Class'])
y = cb_players['Class']

# Convert categorical features to numeric
X = X.apply(pd.to_numeric, errors='coerce')
X.fillna(0, inplace=True)

# Encode the target labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Step 4: Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=911)

# Step 5: Train and evaluate KNN models with k varying from 1 to 5
best_f1_scores = {'World Class': 0, 'Good': 0}
best_k = {'World Class': 0, 'Good': 0}

for k in range(1, 6):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    
    report = classification_report(y_test, y_pred, target_names=label_encoder.classes_, output_dict=True)
    
    world_class_f1 = report['World Class']['f1-score']
    good_f1 = report['Good']['f1-score']
    
    if world_class_f1 > best_f1_scores['World Class']:
        best_f1_scores['World Class'] = world_class_f1
        best_k['World Class'] = k
    
    if good_f1 > best_f1_scores['Good']:
        best_f1_scores['Good'] = good_f1
        best_k['Good'] = k

print(f"Best k for World Class: {best_k['World Class']} with F1 score: {best_f1_scores['World Class']}")
print(f"Best k for Good: {best_k['Good']} with F1 score: {best_f1_scores['Good']}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cb_players['Class'] = cb_players['Overall'].apply(classify_player)


Best k for World Class: 1 with F1 score: 0.6363636363636365
Best k for Good: 4 with F1 score: 0.8218085106382979
