## Clustering Call-of-Duty players using the K-Means algorithm

This is a link to the dataset used: https://www.kaggle.com/aishahakami/call-of-duty-players



In [None]:
%reset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

Seaborn is prepared for plotting data and the dataset is read. The dataset page mentions that the 'name' column is unique, so it is used as the indexing column.

In [None]:
# Global seaborn theme shared by every plot in this notebook.
sns.set(
    context="notebook",
    style="darkgrid",
    palette="Spectral",
    font_scale=1.5,
    color_codes=True,
)

# Load the player statistics; the 'name' column becomes the row index.
dataset = pd.read_csv(
    "../input/call-of-duty-players/cod.csv",
    index_col="name",
)

Inspect the first five rows, the column types and the summary statistics of the dataset, then remove duplicate rows (if any).



In [None]:
# Preview the first rows of the table (head() defaults to five rows).
dataset.head()

In [None]:
# Print the column names, non-null counts and dtypes.
dataset.info()

In [None]:
# Show summary statistics (count, mean, std, min, quartiles, max) per column.
dataset.describe()

In [None]:
# Remove rows whose column values are all identical to an earlier row.
# NOTE(review): drop_duplicates ignores the index, so two different player
# names with exactly the same statistics are treated as duplicates — confirm
# this is the intended behaviour.
dataset.drop_duplicates(inplace=True)

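Based on the column description, rows with 'timePlayed' = 0 are removed, as they do not provide any useful information.
Also, 3 new columns are created: hit_rate, miss_rate and headshot_rate. Each is the corresponding count (hits, misses or headshots) divided by the total number of shots and then by the total time played. Infinite or missing results (e.g. from zero shots) are set to 0, and the raw count and time columns (including averageTime) are then dropped.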


In [None]:
# Players with no recorded play time carry no information; drop them by label.
zero_time_players = dataset.loc[dataset['timePlayed'] == 0].index
dataset.drop(zero_time_players, inplace=True)

# Each rate is a raw count divided by the number of shots, then normalised
# by the total time played.
for count_col, rate_col in (
    ('hits', 'hit_rate'),
    ('misses', 'miss_rate'),
    ('headshots', 'headshot_rate'),
):
    dataset[rate_col] = dataset[count_col] / dataset['shots'] / dataset['timePlayed']

# Divisions involving zero shots yield inf/NaN; map those to 0. The
# replacement applies to the whole frame, not only to the new rate columns.
dataset.replace([np.inf, -np.inf, np.nan], 0, inplace=True)

# Drop the raw columns now that the rates are available.
dataset.drop(
    columns=['hits', 'misses', 'shots', 'headshots', 'timePlayed', 'averageTime'],
    inplace=True,
)

dataset.head(n=10)

In [None]:
# Feature matrix for scikit-learn: every column except the first, as a NumPy array.
# NOTE(review): 'name' is already the index, so position 0 is the first
# feature column and this slice excludes it — confirm that is intended.
data = dataset.iloc[:, 1:].to_numpy()

Two pipelines are created: preprocessor and clusterer. The preprocessor pipeline standardizes the data and uses PCA for dimensionality reduction.
The clusterer pipeline is used to apply the K-Means algorithm.

In [None]:
# Stage 1: standardize the features, then project them onto two principal
# components.
preprocessor = Pipeline(
    steps=[
        ("standardize", StandardScaler()),
        ("pca", PCA(n_components=2, random_state=42)),
    ]
)

# Stage 2: K-Means on the projected data. n_clusters=4 is only the starting
# value; later cells overwrite it.
kmeans_settings = dict(
    n_clusters=4,
    init="k-means++",
    n_init=50,
    max_iter=500,
    random_state=42,
)
clusterer = Pipeline(steps=[("kmeans", KMeans(**kmeans_settings))])

In [None]:
# Chain both stages into the overall pipeline: preprocessing, then clustering.
pipe = Pipeline(
    steps=[("preprocessor", preprocessor), ("clusterer", clusterer)]
)

In [None]:
# Fit the whole pipeline on the feature matrix: the preprocessing steps are
# fitted first and K-Means is then fitted on their output.
pipe.fit(data)

In [None]:
# Pass the data through the fitted preprocessor pipeline only, to obtain the
# PCA-projected coordinates without the clustering step.
preprocessed_data = pipe["preprocessor"].transform(data)

Loop over different numbers of PCA components (2 to 5) and K-Means clusters (2 to 5), computing the Silhouette coefficient for each combination.

In [None]:
# Try every combination of PCA components (n) and cluster count (k) in 2..5
# and record the Silhouette coefficient of the resulting clustering.
silhouette_scores = []
inertia_scores = []  # initialised here but never filled in this cell
for n in range(2, 6):
    for k in range(2, 6):
        pipe.set_params(
            preprocessor__pca__n_components=n,
            clusterer__kmeans__n_clusters=k,
        )
        pipe.fit(data)

        # Score the labels in the same projected space K-Means was fitted on.
        projected = pipe["preprocessor"].transform(data)
        cluster_ids = pipe["clusterer"]["kmeans"].labels_
        silhouette_coef = silhouette_score(projected, cluster_ids)

        silhouette_scores.append([n, k, silhouette_coef])

df = pd.DataFrame(silhouette_scores, columns=['n', 'k', 'score'])

Plotting the calculated Silhouette coefficients for different choices of the parameters.

In [None]:
# Bubble chart of the grid search: one marker per (n, k) pair, with the
# marker area proportional to the Silhouette score.
fig, ax = plt.subplots()
x, y, z = (df[column].values for column in ('n', 'k', 'score'))

ax.scatter(x, y, s=z * 100)
ax.set(
    xlabel='Number of PCA Components',
    ylabel='Number of Clusters',
    title='Silhouette Score',
)

# Annotate every marker with its score rounded to two decimals.
for n_components, n_clusters, score in zip(x, y, z):
    ax.text(n_components, n_clusters, f'S={score.round(2)}', size=15, zorder=1, color='k')

Based on the results, the experiment with the highest Silhouette value is chosen (2 PCA components and 3 clusters).
In order to name the 3 categories, the level column of the dataset is used. First, let's plot a histogram of levels.

In [None]:
# Distribution of player levels in 10 bins; used to choose the level
# thresholds for the labels defined in the next cell.
sns.histplot(dataset.level, bins=10)

In [None]:
# Label every player by level bracket:
#   'Level 0': level < 50
#   'Level 1': 50 <= level < 150
#   'Level 2': level >= 150
conditions = [
    (dataset['level'] < 50),
    (dataset['level'] >= 50) & (dataset['level'] < 150),
    (dataset['level'] >= 150)]
choices = ['Level 0', 'Level 1', 'Level 2']
# The default must be a string like the choices: mixing string choices with an
# integer default has no common dtype and raises TypeError on recent NumPy.
# The three conditions cover every non-NaN level, so the default is only
# reached for missing values.
lbls = np.select(conditions, choices, default='Unknown')

K-Means is run with the optimal number of PCA components and clusters, and the resulting clusters are compared to the players' levels in order to put players into 3 categories: Professional, Intermediate and Beginner.

In [None]:
# Refit the pipeline with the selected configuration: 2 PCA components and
# 3 clusters.
pipe["preprocessor"]["pca"].n_components = 2
pipe["clusterer"]["kmeans"].n_clusters = 3
pipe.fit(data)
output_labels = pipe["clusterer"]["kmeans"].labels_

# Re-project the data with the pipeline that was just refitted, so the plotted
# coordinates always match the model that produced `output_labels` rather than
# relying on a projection computed before the grid search.
preprocessed_data = pipe["preprocessor"].transform(data)

# Colour encodes the K-Means cluster and marker style the level bracket.
# x and y are passed by keyword: recent seaborn releases no longer accept them
# positionally.
sns.scatterplot(
    x=preprocessed_data[:, 0],
    y=preprocessed_data[:, 1],
    hue=output_labels,
    style=lbls,
    s=100,
    palette="Set2",
).set(
    title='Comparing resulted categories with the player level.',
    xlabel='PCA component 1',
    ylabel='PCA component 2',
)

In [None]:
# Translate each K-Means cluster id into a human-readable category name.
# NOTE(review): cluster ids are arbitrary integers assigned by K-Means; this
# mapping was presumably read off the scatter plot for this particular fit —
# re-check it if the data or the random_state changes.
myDict = {
    0: "Intermediate",
    1: "Professional",
    2: "Beginner",
}
dataset['Category'] = list(map(myDict.__getitem__, output_labels))