In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_paths = (
    os.path.join(root, fname)
    for root, _, files in os.walk('/kaggle/input')
    for fname in files
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Import the required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the Call of Duty player dataset and preview the first rows.
DATA_PATH = '../input/call-of-duty-players/cod.csv'
cod = pd.read_csv(DATA_PATH)
cod.head()

In [None]:
# Print a summary of the frame (columns, non-null counts, dtypes) to understand the features.
cod.info()

In [None]:
# Count missing values per feature to see whether any imputation is needed.
cod.isna().sum()

In [None]:
# Remove the 'name' feature since it is not relevant to the analysis.
# errors='ignore' keeps this cell idempotent: re-running it after the column has
# already been dropped no longer raises a KeyError.
cod = cod.drop(columns=['name'], errors='ignore')

## Data Exploration

#### Win Distribution of Players

In [None]:
# Histogram of wins per player with a KDE overlay, drawn on an explicit axes object.
wins_fig, wins_ax = plt.subplots(figsize=(10, 5))
wins_ax.set_title('Win Distribution of Players')
wins_ax.set_xlabel('Wins')
wins_ax.set_ylabel('Frequency')
plot = sns.histplot(x='wins', data=cod, kde=True, ax=wins_ax)

#### Distribution of Kill-Death Ratio of Players

In [None]:
# Histogram of the kill-death ratio with a KDE overlay, drawn on an explicit axes object.
kd_fig, kd_ax = plt.subplots(figsize=(10, 5))
kd_ax.set_title('Distribution of kdRatio of Players')
kd_ax.set_xlabel('KdRatio')
kd_ax.set_ylabel('Frequency')
plot = sns.histplot(x='kdRatio', data=cod, kde=True, ax=kd_ax)

#### Kills vs Assists vs Deaths

In [None]:
# Plot kills, assists and deaths against player level.
# Rows are sorted by level first: DataFrame.plot connects points in row order,
# so unsorted levels would make the lines zig-zag back and forth across the x axis.
level_ax = cod.sort_values('level').plot(
    x='level',
    y=['kills', 'assists', 'deaths'],
    figsize=(10, 5),
    title='Kills vs Assists vs Deaths by Level',
)
level_ax.set_ylabel('Count');

#### Kill Streak vs Kill - Death Ratio

In [None]:
# Bar plot of kdRatio for each kill-streak value, drawn on an explicit axes object.
streak_fig, streak_ax = plt.subplots(figsize=(10, 5))
streak_ax.set_title('Kill Streak vs KD ratio')
streak_ax.set_xlabel('Kill Streak')
streak_ax.set_ylabel('kd ratio')
plot = sns.barplot(x='killstreak', y='kdRatio', data=cod, ax=streak_ax)

#### Kills vs Deaths

In [None]:
# Scatter of kills against deaths, plus a dashed red reference line of slope 1
# passing through the point (10, 5).
scat = sns.relplot(x='kills', y='deaths', data=cod)
kills_line_style = {'color': 'r', 'dashes': (5, 2)}
scat.ax.axline((10, 5), slope=1, **kills_line_style)

#### Shots vs Misses

In [None]:
# Scatter of shots against misses, plus a dashed green reference line of slope 0.8
# passing through the point (10, 5).
scat1 = sns.relplot(x='shots', y='misses', data=cod)
shots_line_style = {'color': 'g', 'dashes': (5, 2)}
scat1.ax.axline((10, 5), slope=0.8, **shots_line_style)

#### Score per Minute vs XP

In [None]:
# Scatter of score per minute against XP, plus a dashed yellow reference line of slope 5
# passing through the point (10, 5).
scat2 = sns.relplot(x='scorePerMinute', y='xp', data=cod)
xp_line_style = {'color': 'y', 'dashes': (5, 2)}
scat2.ax.axline((10, 5), slope=5, **xp_line_style)

#### Correlation between Kill - Death ratio and other features

In [None]:
pip install dython

In [None]:
# Compute the association matrix over all columns and plot only the kdRatio column.
# NOTE(review): newer dython releases appear to have removed `compute_associations`;
# when the import fails, fall back to `associations(..., compute_only=True)`, whose
# returned dict holds the matrix under the 'corr' key. Confirm against the installed version.
try:
    from dython.nominal import compute_associations
except ImportError:
    from dython.nominal import associations
    corr = associations(cod, compute_only=True)['corr']
else:
    corr = compute_associations(cod)

# One-row heatmap: every feature's association with kdRatio, sorted ascending.
heat_fig, heat_ax = plt.subplots(figsize=(10, 5))
kd_corr = corr[['kdRatio']].sort_values(by=['kdRatio']).T
fig = sns.heatmap(kd_corr, ax=heat_ax)  # note: `fig` holds the Axes returned by seaborn
fig.set(title = 'Correlation between Kill Death ratio and other features')

# Data Preparation

In [None]:
from sklearn.preprocessing import StandardScaler

# Select the two clustering features by name: kill streak (column 0 of `x`) and
# kill-death ratio (column 1 of `x`). Name-based selection replaces the positional
# indices [3, 2], which silently pick different features if the column order changes.
# NOTE(review): this assumes positions 3 and 2 were 'killstreak' and 'kdRatio', as the
# axis labels of the cluster plot indicate — confirm against cod.columns.
FEATURE_COLUMNS = ['killstreak', 'kdRatio']
x = cod[FEATURE_COLUMNS].values

# Standardise the features (zero mean, unit variance) so that neither one dominates
# the Euclidean distances used by K-Means.
scaler = StandardScaler()
data_scaled = scaler.fit_transform(x)

In [None]:
# Summary statistics of the standardised feature matrix.
scaled_summary = pd.DataFrame(data_scaled).describe()
scaled_summary

# Modelling

#### In this phase we perform K-Means clustering to group players by their level of expertise in Call of Duty. The Kill-Death ratio is the most important indicator of a player's expertise: the higher the ratio, the higher the expertise of the player. A second feature, Kill Streak, is included because it is highly correlated with the Kill-Death ratio and also reflects the expertise of the player.

In [None]:
from sklearn.cluster import KMeans

# Initial 2-cluster model with k-means++ initialisation.
# random_state fixes the centroid seeding so the fit is reproducible across runs;
# n_init is set explicitly because its default differs between scikit-learn versions.
kmeans = KMeans(n_clusters=2, init='k-means++', n_init=10, random_state=42)

# Fit the K-Means algorithm on the scaled data.
kmeans.fit(data_scaled)

In [None]:
# Inertia of the fitted model: the sum of squared distances of samples to their
# closest cluster centre (lower means tighter clusters).
kmeans.inertia_

In [None]:
# Within-cluster sum of squares (inertia) for k = 1..10, used for the elbow analysis.
# Each model is seeded (random_state) so the curve is reproducible, and n_init is set
# explicitly because its default differs between scikit-learn versions.
# The models are built inline so the `kmeans` object fitted earlier is not overwritten.
wss = [
    KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=42)
    .fit(data_scaled)
    .inertia_
    for k in range(1, 11)
]

In [None]:
# Tabulate inertia per cluster count and draw the elbow curve on an explicit axes object.
frame = pd.DataFrame({'Cluster': range(1, 11), 'WSS': wss})
elbow_fig, elbow_ax = plt.subplots(figsize=(12, 6))
elbow_ax.plot(frame['Cluster'], frame['WSS'], marker='o')
elbow_ax.set_xlabel('Number of clusters')
elbow_ax.set_ylabel('Inertia')

In [None]:
pip install kneed

In [None]:
# Locate the elbow of the convex, decreasing inertia curve with kneed's KneeLocator.
from kneed import KneeLocator
cluster_counts = range(1, 11)
K_value = KneeLocator(cluster_counts, wss, direction='decreasing', curve='convex')
K_value.elbow

In [None]:
# Plot the inertia curve together with the detected knee/elbow, i.e. the suggested
# value of k (number of clusters).
K_value.plot_knee()

#### We select K = 3, since the graph above shows that the curve flattens out after K = 3.

In [None]:
# Final model with k = 3, fitted on the scaled data; y_pred holds each player's cluster label.
# random_state makes the clustering, and in particular the cluster label numbering,
# reproducible, so the colour-to-cluster mapping in the plot below stays stable across runs.
# n_init is set explicitly because its default differs between scikit-learn versions.
kmeans = KMeans(n_clusters=3, init='k-means++', n_init=10, random_state=42)
y_pred = kmeans.fit_predict(data_scaled)

In [None]:
# Plot the K-Means clusters using the original (unscaled) feature values.
plt.figure(figsize=(20, 10))

# One scatter call per cluster label: (label id, colour, legend entry).
cluster_styles = [
    (0, 'blue', 'Cluster1'),
    (1, 'red', 'Cluster2'),
    (2, 'green', 'Cluster3'),
]
for cluster_id, colour, cluster_label in cluster_styles:
    members = y_pred == cluster_id
    plt.scatter(x[members, 0], x[members, 1], s=100, c=colour, label=cluster_label)

plt.title('K-Means clusters (k = 3)')
plt.xlabel('Kill Streaks')
plt.ylabel('Kill-Death ratio')
# The label= arguments only become visible once a legend is drawn.
plt.legend();

#### Cluster 3 (green) is ignored here, since players with higher kill streaks would be expected to have a higher Kill-Death ratio than those in the other clusters. Cluster 1 (blue) contains players with a higher Kill-Death ratio and higher kill streaks, so the players in this cluster are considered Pro players. Cluster 2 (red) contains players with lower kill streaks and a lower Kill-Death ratio, so the players in this cluster fall under the Elite level.

# Model Evaluation 

#### The Silhouette Coefficient is calculated using the mean intra-cluster distance (a) and the mean nearest-cluster distance (b) for each sample. The Silhouette Coefficient for a sample is (b - a) / max(a, b). To clarify, b is the distance between a sample and the nearest cluster that the sample is not a part of.

In [None]:
from sklearn.metrics import silhouette_score

# Evaluate on the standardised features: the model was fitted on data_scaled, so the
# silhouette distances must be measured in that same space. Scoring the unscaled `x`
# would let the feature with the larger numeric range dominate the distances.
silhouette_score(data_scaled, y_pred)

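#### The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters. Negative values generally indicate that a sample has been assigned to the wrong cluster, as a different cluster is more similar. A score of approximately 0.75 therefore indicates dense, well-separated clusters. Note that the silhouette coefficient measures clustering quality (cohesion and separation); it is not a classification accuracy.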