In [66]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN



In [67]:
df = pd.read_csv('ml_ready_players.csv')

# Convert all bool columns to int automatically
bool_cols = df.select_dtypes(include=['bool']).columns
df[bool_cols] = df[bool_cols].astype(int)


In [68]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
# Initialize cluster column
df['cluster'] = -1  # Default value

# Get unique positions excluding Manager
positions = df['position_name'].unique()

# Loop through each position group
for pos in positions:
    print(f"Processing position: {pos}")

    # Filter data for current position
    df_pos = df[df['position_name'] == pos].copy()

    df_features = df_pos.drop(columns=['position_name'])

    # Scale the data
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df_features)


    kmeans = KMeans(n_clusters=20, random_state=42, n_init='auto')
    clusters = kmeans.fit_predict(X_scaled)
        
   
    n_clusters = len(set(clusters)) - (1 if -1 in clusters else 0)
    print(f'Number of clusters: {n_clusters}')


    # Assign clusters back to main DataFrame
    df.loc[df['position_name'] == pos, 'cluster'] = clusters
    
# Done
print("✅ Clustering complete. Sample of updated DataFrame:")
print(df[['position_name', 'cluster']].head())


Processing position: Midfielder
Number of clusters: 20
Processing position: Forward
Number of clusters: 20
Processing position: Defender
Number of clusters: 20
Processing position: Goalkeeper
Number of clusters: 20
Processing position: Manager
Number of clusters: 20
✅ Clustering complete. Sample of updated DataFrame:
  position_name  cluster
0    Midfielder       11
1       Forward       18
2      Defender        9
3       Forward        3
4    Goalkeeper        9




In [69]:
# Assuming your dataframe is named df and cluster column is 'clusters'
unique_clusters = df['cluster'].nunique()
print(f"Number of unique clusters: {unique_clusters}")

cluster_counts = df['cluster'].value_counts()
print("Count of each cluster label:")
print(cluster_counts)


Number of unique clusters: 20
Count of each cluster label:
cluster
4     89
2     75
11    66
9     59
1     51
5     43
13    40
6     39
0     37
12    36
17    34
8     32
15    31
19    31
10    29
16    28
14    26
18    25
3     23
7     10
Name: count, dtype: int64


In [70]:
df2 = pd.read_csv('full_data.csv')
df2['cluster'] = df['cluster']
df2.to_csv('clustered_data.csv', index=False)
