## Assignment 1

In [1]:
## Importing required libraries

import numpy as np
import pandas as pd
from scipy.spatial import distance
import matplotlib.pyplot as plt
from sklearn import preprocessing
from matplotlib.colors import ListedColormap

ModuleNotFoundError: No module named 'scipy'

In [None]:
# Load the player statistics into a DataFrame.
# The source file has a .csv extension but its fields are tab-delimited.

df = pd.read_csv('cricketers.csv', delimiter="\t")

In [None]:
# Tidy the column labels: trim surrounding whitespace, lower-case them,
# and turn inner spaces into underscores.
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_')
)

In [None]:
# Preview the first ten records
df.head(n=10)

In [None]:
# Summary statistics of the data, as reported by DataFrame.describe()
df.describe()

In [None]:
# Rows for players whose matches_played value is 0

df.loc[df['matches_played'].eq(0)]

In [None]:
## Distinct values present in the average_runs column

df.loc[:, 'average_runs'].unique()

In [None]:
# Distinct values present in the bowling_economy column
df.loc[:, 'bowling_economy'].unique()

### Part 1 - Normalization of the data

In [None]:
# Work on an independent (deep) copy so the loaded frame `df` stays untouched

df1 = df.copy(deep=True)

In [None]:
# Normalise the values: the columns are on different scales, so rescale every
# column except the first to the [0, 1] range with min-max scaling.
# (The first column is skipped - presumably the player identifier; confirm against the data.)
#
# The scaled values are assigned by column label rather than through
# `df1.iloc[:, 1:] = ...`: label assignment replaces the columns, so the dtype
# may change to float. Writing floats in place into integer columns via iloc
# is deprecated in recent pandas releases.

feature_cols = list(df1.columns[1:])
minmax_scale = preprocessing.MinMaxScaler().fit(df1[feature_cols])
df1[feature_cols] = minmax_scale.transform(df1[feature_cols])

In [None]:
# Preview the first five rows of the normalised copy
df1.head(n=5)

### Part 2

In [None]:
def kmeans(X, k, max_iterations=100, random_state=None):
    '''
    Cluster the rows of X into k groups with the k-means algorithm.

    X: two-dimensional numeric data (pandas DataFrame or array-like),
       one row per data point
    k: number of clusters
    max_iterations: maximum number of centroid updates before stopping
    random_state: optional seed for choosing the initial centroids; when
       None (default) NumPy's global random state is used, so
       np.random.seed() still controls the result
    Steps:
        1. Convert data to a float numpy array
        2. Pick indices of k random points without replacement as the
           initial centroids
        3. Find the class (P) of each data point using euclidean distance
        4. Recompute each centroid as the mean of its members; a cluster
           with no members keeps its previous centroid
        5. Stop when max_iterations is reached or P doesn't change
    Return:
        (P, centroids): P is a 1-D np.array holding the cluster index of
        each data point, centroids is a (k, n_features) np.array
    Raises:
        ValueError: if k is larger than the number of rows in X (raised by
        the random choice without replacement)
    '''
    if isinstance(X, pd.DataFrame):
        X = X.values
    X = np.asarray(X, dtype=float)

    # Fall back to the module-level generator so existing callers that seed
    # with np.random.seed() keep getting the same initialisation.
    rng = np.random if random_state is None else np.random.default_rng(random_state)
    idx = rng.choice(len(X), k, replace=False)
    centroids = X[idx, :]
    P = np.argmin(distance.cdist(X, centroids, 'euclidean'), axis=1)
    for _ in range(max_iterations):
        updated = centroids.copy()
        for i in range(k):
            members = X[P == i, :]
            # An empty cluster (e.g. duplicate rows picked as seeds) would
            # give a NaN mean; argmin then picks the NaN column for every
            # point and the clustering collapses. Keep the old centroid.
            if len(members):
                updated[i] = members.mean(axis=0)
        centroids = updated
        tmp = np.argmin(distance.cdist(X, centroids, 'euclidean'), axis=1)
        if np.array_equal(P, tmp):
            break
        P = tmp
    return P, centroids

In [None]:
# Keep only the two features used for clustering: average_runs and bowling_economy

df2 = df1.loc[:, ['average_runs','bowling_economy']]
df2.head()

In [None]:
## k = 2: run k-means on the two selected features and plot the two clusters

classes = ['Cluster 1', 'Cluster 2']
colours = ListedColormap(['red','blue'])
P, centroids = kmeans(df2, 2)

plt.figure(figsize=(12,10))
# data points, coloured by their assigned cluster index
ax = plt.scatter(df2.iloc[:,0], df2.iloc[:,1], c=P, cmap=colours)
# one yellow square per centroid
for centre in centroids:
    plt.scatter(centre[0], centre[1], s=200, c='yellow', marker='s')
plt.xlabel("Average Runs")
plt.ylabel("Bowling Economy")
handles = ax.legend_elements()[0]
plt.legend(handles=handles, labels=classes)
plt.show()

### Part 3

In [None]:
## k = 2: run k-means on the two selected features and plot the two clusters

classes = ['Cluster 1', 'Cluster 2']
colours = ListedColormap(['red','blue'])
P, centroids = kmeans(df2, 2)

plt.figure(figsize=(12,10))
# data points, coloured by their assigned cluster index
ax = plt.scatter(df2.iloc[:,0], df2.iloc[:,1], c=P, cmap=colours)
# one yellow square per centroid
for centre in centroids:
    plt.scatter(centre[0], centre[1], s=200, c='yellow', marker='s')
plt.xlabel("Average Runs")
plt.ylabel("Bowling Economy")
handles = ax.legend_elements()[0]
plt.legend(handles=handles, labels=classes)
plt.show()

In [None]:
## k = 3: run k-means on the two selected features and plot the three clusters

classes = ['Cluster 1', 'Cluster 2', 'Cluster 3']
colours = ListedColormap(['red','blue','orange'])
P, centroids = kmeans(df2, 3)

plt.figure(figsize=(12,10))
# data points, coloured by their assigned cluster index
ax = plt.scatter(df2.iloc[:,0], df2.iloc[:,1], c=P, cmap=colours)
# one yellow square per centroid
for centre in centroids:
    plt.scatter(centre[0], centre[1], s=200, c='yellow', marker='s')
plt.xlabel("Average Runs")
plt.ylabel("Bowling Economy")
handles = ax.legend_elements()[0]
plt.legend(handles=handles, labels=classes)
plt.show()

In [None]:
## k = 4: run k-means on the two selected features and plot the four clusters

classes = ['Cluster 1', 'Cluster 2', 'Cluster 3', 'Cluster 4']
colours = ListedColormap(['red','blue','orange','pink'])
P, centroids = kmeans(df2, 4)

plt.figure(figsize=(12,10))
# data points, coloured by their assigned cluster index
ax = plt.scatter(df2.iloc[:,0], df2.iloc[:,1], c=P, cmap=colours)
# one yellow square per centroid
for centre in centroids:
    plt.scatter(centre[0], centre[1], s=200, c='yellow', marker='s')
plt.xlabel("Average Runs")
plt.ylabel("Bowling Economy")
handles = ax.legend_elements()[0]
plt.legend(handles=handles, labels=classes)
plt.show()

In [None]:
## k = 5: run k-means on the two selected features and plot the five clusters

classes = ['Cluster 1', 'Cluster 2', 'Cluster 3', 'Cluster 4', 'Cluster 5']
colours = ListedColormap(['red','blue','orange','pink','green'])
P, centroids = kmeans(df2, 5)

plt.figure(figsize=(12,10))
# data points, coloured by their assigned cluster index
ax = plt.scatter(df2.iloc[:,0], df2.iloc[:,1], c=P, cmap=colours)
# one yellow square per centroid
for centre in centroids:
    plt.scatter(centre[0], centre[1], s=200, c='yellow', marker='s')
plt.xlabel("Average Runs")
plt.ylabel("Bowling Economy")
handles = ax.legend_elements()[0]
plt.legend(handles=handles, labels=classes)
plt.show()

### Assignment - End