In [None]:
""" 
ALGORITHM:
1.choose the number of clusters required-usually k=3 or 5|low value of k=>more noisy,less accurate
2.initialize the centroid of the k clusters randomly-the centroid is the middle point of each cluster
3.take a data point and compute euclidean distance to each centroid-closest centroid determines the cluster the point belongs to.
4.update the cluster coordinates using the geometric mean of each cluster.
5.repeat step 3 and step 4 until clusters don't change.
"""

In [6]:
import numpy as np
import pandas as pd
players=pd.read_csv(r'C:\Users\MSI\Downloads\players_22.csv',low_memory=False)
players.head()

Unnamed: 0,sofifa_id,player_url,short_name,long_name,player_positions,overall,potential,value_eur,wage_eur,age,...,lcb,cb,rcb,rb,gk,player_face_url,club_logo_url,club_flag_url,nation_logo_url,nation_flag_url
0,158023,https://sofifa.com/player/158023/lionel-messi/...,L. Messi,Lionel Andrés Messi Cuccittini,"RW, ST, CF",93,93,78000000.0,320000.0,34,...,50+3,50+3,50+3,61+3,19+3,https://cdn.sofifa.net/players/158/023/22_120.png,https://cdn.sofifa.net/teams/73/60.png,https://cdn.sofifa.net/flags/fr.png,https://cdn.sofifa.net/teams/1369/60.png,https://cdn.sofifa.net/flags/ar.png
1,188545,https://sofifa.com/player/188545/robert-lewand...,R. Lewandowski,Robert Lewandowski,ST,92,92,119500000.0,270000.0,32,...,60+3,60+3,60+3,61+3,19+3,https://cdn.sofifa.net/players/188/545/22_120.png,https://cdn.sofifa.net/teams/21/60.png,https://cdn.sofifa.net/flags/de.png,https://cdn.sofifa.net/teams/1353/60.png,https://cdn.sofifa.net/flags/pl.png
2,20801,https://sofifa.com/player/20801/c-ronaldo-dos-...,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,"ST, LW",91,91,45000000.0,270000.0,36,...,53+3,53+3,53+3,60+3,20+3,https://cdn.sofifa.net/players/020/801/22_120.png,https://cdn.sofifa.net/teams/11/60.png,https://cdn.sofifa.net/flags/gb-eng.png,https://cdn.sofifa.net/teams/1354/60.png,https://cdn.sofifa.net/flags/pt.png
3,190871,https://sofifa.com/player/190871/neymar-da-sil...,Neymar Jr,Neymar da Silva Santos Júnior,"LW, CAM",91,91,129000000.0,270000.0,29,...,50+3,50+3,50+3,62+3,20+3,https://cdn.sofifa.net/players/190/871/22_120.png,https://cdn.sofifa.net/teams/73/60.png,https://cdn.sofifa.net/flags/fr.png,,https://cdn.sofifa.net/flags/br.png
4,192985,https://sofifa.com/player/192985/kevin-de-bruy...,K. De Bruyne,Kevin De Bruyne,"CM, CAM",91,91,125500000.0,350000.0,30,...,69+3,69+3,69+3,75+3,21+3,https://cdn.sofifa.net/players/192/985/22_120.png,https://cdn.sofifa.net/teams/10/60.png,https://cdn.sofifa.net/flags/gb-eng.png,https://cdn.sofifa.net/teams/1325/60.png,https://cdn.sofifa.net/flags/be.png


In [7]:
#selecting the features for clustering
features=["overall","potential","wage_eur","value_eur","age"]

In [13]:
#dropping null values
#subset specifies where to look for the missing values-->optional
players=players.dropna(subset=features)


In [14]:
data=players[features].copy()
data

Unnamed: 0,overall,potential,wage_eur,value_eur,age
0,93,93,320000.0,78000000.0,34
1,92,92,270000.0,119500000.0,32
2,91,91,270000.0,45000000.0,36
3,91,91,270000.0,129000000.0,29
4,91,91,350000.0,125500000.0,30
...,...,...,...,...,...
19234,47,52,1000.0,70000.0,22
19235,47,59,500.0,110000.0,19
19236,47,55,500.0,100000.0,21
19237,47,60,500.0,110000.0,19


In [15]:
#feature scaling using min-max scaler
#steps:
#1.min value in each colmun=0
#2.divide by range--> everything is in range 0,1
#3.multiply by 9--> range=0,9
#4.add 1-->range=1,10
#NOTE:max of the scale can be everything, but min cannot be 0 or negative
#min-max scaler also preserves variance,i.e, the difference in ratings or wages etc is scaled down accurately
data=((data-data.min())/(data.max()-data.min()))*9+1

In [17]:
data.head()

Unnamed: 0,overall,potential,wage_eur,value_eur,age
0,10.0,9.608696,9.227468,4.618307,7.0
1,9.804348,9.413043,7.939914,6.543654,6.333333
2,9.608696,9.217391,7.939914,3.087308,7.666667
3,9.608696,9.217391,7.939914,6.984396,5.333333
4,9.608696,9.217391,10.0,6.822018,5.666667


In [37]:
#initializing centroids randomly
#we are taking a random sample from each column and using that as the centroid value
#using float() because sample function returns a panda series
#alternatively, we can simply take a random value from (1,10) for each columm and use that as the centroid

def random_centroids(data,k):
    centroids=[]
    for i in range(5):
        centroid=data.apply(lambda x:float(x.sample()))  #note that centroids is a series
        centroids.append(centroid)
    return pd.concat(centroids,axis=1)   #returns a dataframe of centroid series objects

In [38]:
centroids=random_centroids(data,5)
centroids

  centroid=data.apply(lambda x:float(x.sample()))  #note that centroids is a series


Unnamed: 0,0,1,2,3,4
overall,3.934783,6.086957,4.130435,3.934783,4.521739
potential,5.695652,3.934783,4.130435,4.521739,5.695652
wage_eur,1.038627,1.888412,1.038627,1.007725,1.193133
value_eur,1.695491,1.02162,1.036698,1.003294,1.045976
age,5.333333,4.333333,4.0,1.333333,1.666667


In [60]:
#finding distances and assigning to clusters based on which one is closest
def get_labels(data,centroids):
    distances=centroids.apply(lambda x:np.sqrt((data-x)**2).sum(axis=1))
    return distances.idxmin(axis=1)



In [61]:
labels=get_labels(data,centroids)
labels.value_counts()

2    7249
4    4872
1    2655
0    2651
3    1738
Name: count, dtype: int64

In [65]:
#finding new centroids using geometric mean
def new_centroids(data,labels,k):
    return data.groupby(labels).apply(lambda x:np.exp(np.log(x).mean())).T
    

In [68]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from IPython.display import clear_output

In [69]:
def plot_clusters(data,labels,centroids,iteration):
    pca=PCA(n_components=2)
    data_2d=pca.fit_transform(data)
    centroids_2d=pca.transform(centroids.T)
    clear_output(wait=True)
    plt.title(f"iteration{iteration}")
    plt.scatter(x=l

SyntaxError: incomplete input (3313108101.py, line 1)