In [2]:
from numpy import array
from math import sqrt
import numpy as np

from pyspark.mllib.clustering import KMeans, KMeansModel

# Load and parse the data

parsedData = sc.textFile("hdfs://localhost:54310/project/bowler_stat.csv") \
    .map(lambda line: line.split(",")) \
    .filter(lambda line: len(line)>1 and line[1]!="Name") \
    .map(lambda line: array([float(line[2]),float(line[3]),float(line[4]),float(line[5]),\
                             float(line[6]),float(line[7]),float(line[8])]))

# Build the model (cluster the data)
clusters = KMeans.train(parsedData, 5, maxIterations=100000, initializationMode="random")

# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

# Save and load model
clusters.save(sc, "hdfs://localhost:54310/project/bowl_result")
sameModel = KMeansModel.load(sc, "hdfs://localhost:54310/project/bowl_result")

print("A Mishra class :")
print(sameModel.predict(array([284,14,13,332,23.7142857143,7.014084507,0.9285714286])))

print("Bumrah class :")
print(sameModel.predict(array([322,14,15,396,28.2857142857,7.3788819876,1.0714285714])))

print("Virat Kohli class :")
print(sameModel.predict(array([8,1,0,11,11,8.25,0])))



Within Set Sum of Squared Error = 3158.079092522345
A Mishra class :
4
Bumrah class :
1
Virat Kohli class :
0


In [9]:
names = sc.textFile("hdfs://localhost:54310/project/bowler_stat.csv") \
    .map(lambda line: line.split(",")).filter(lambda line: len(line)>1 and line[1]!="Name").map(lambda line: array([line[1]]))
names = names.map(lambda x:x[0])
#print(names.collect())
#print(len(names.collect()))

player_class = parsedData.map(lambda x: (x,sameModel.predict(x)))
#print(player_class.collect())
#print(len(player_class.collect()))

name_class = names.zip(player_class)
#print(name_class.collect())


def players_of_class(k):
    print("Players of class ",k)
    k_class_players = name_class.map(lambda x:(x[0],x[1][1])).filter(lambda x:x[1]==k)
    a = k_class_players.collect()
    l = [i[0] for i in a]
    print("Number of players under class "+str(k)+" = ",len(a))
    print("\n\n")
    print(l)
    print("\n\n")
    #for i in a:
    #    print(i)

    
for i in range(5):
    players_of_class(i)


Players of class  0
Number of players under class 0 =  38



['A Ashish Reddy', 'AF Milne', 'Ankit Sharma', 'Anureet Singh', 'BCJ Cutting', 'C Munro', 'CH Gayle', 'D Wiese', 'DL Chahar', 'DW Steyn', 'GJ Maxwell', 'Gurkeerat Singh', 'IK Pathan', 'J Suchith', 'JA Morkel', 'JD Unadkat', 'JP Duminy', 'JW Hastings', 'KA Pollard', 'KS Williamson', 'M Vijay', 'MR Marsh', 'N Rana', 'P Negi', 'PJ Sangwan', 'R Dhawan', 'R Sathish', 'R Vinay Kumar', 'S Gopal', 'S Ladda', 'SK Raina', 'SM Boland', 'Sachin Baby', 'Swapnil Singh', 'TA Boult', 'V Kohli', 'YK Pathan', 'Yuvraj Singh']



Players of class  1
Number of players under class 1 =  15



['AR Patel', 'B Kumar', 'BB Sran', 'DJ Bravo', 'DS Kulkarni', 'Harbhajan Singh', 'JJ Bumrah', 'MC Henriques', 'MJ McClenaghan', 'MM Sharma', 'Mustafizur Rahman', 'P Kumar', 'SR Watson', 'Sandeep Sharma', 'YS Chahal']



Players of class  2
Number of players under class 2 =  28



['A Zampa', 'AS Rajpoot', 'Bipul Sharma', 'DJ Hooda', 'DR Smith', 'GB Hogg', 'HH 