In [1]:
%matplotlib inline
from numpy import array
from math import sqrt
import pandas as pd


from pyspark.mllib.linalg import Vectors
from pyspark.mllib.clustering import KMeans, KMeansModel
from pyspark.sql import SparkSession
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.sql import SQLContext
from pyspark.ml.linalg import Vectors, VectorUDT


In [2]:
#preprocessing

df = pd.read_csv('./stats.csv')
df =df.drop(['player_id'], axis=1)
df.isnull().values.any()
print(len(df.index))
null_columns=df.columns[df.isnull().any()]
df[null_columns].isnull().sum()
bsr = []
x= df['bowl_sr'].isnull()
for i in range(len(x)):
    if x[i] == True:
        bsr.append(i)
rc=[]
y= df['runs_conceded'].isnull()
for i in range(len(y)):
    if y[i] == True:
        rc.append(i)
w=[]
z= df['wkts'].isnull()
for i in range(len(z)):
    if z[i] == True:
        w.append(i)
        
ba=[]
a= df['bowl_ave'].isnull()
for i in range(len(a)):
    if [i] == True:
        ba.append(i)
ec=[]
b= df['econ'].isnull()
for i in range(len(b)):
    if b[i] == True:
        ec.append(i)
        
# print(bsr)
# print(rc)
# print(w)
# print(ba)
# print(ec)

print(bsr)

df=df.drop(df.index[bsr])
df.isnull().values.any()
df.to_csv('stats_clean.csv', sep=',', index = False)

606
[]


In [3]:
#feature engineering

df_batsman = pd.read_csv('stats_clean.csv')
df_batsman['hundreds/innings'] = df_batsman['hundreds'] / df_batsman['bat_inns']
df_batsman['fifties/innings'] = df_batsman['fifties'] / df_batsman['bat_inns']
df_batsman['fours_rate'] = df_batsman['fours'] / df_batsman['balls_faced']
df_batsman['six_rate'] = df_batsman['sixes'] / df_batsman['balls_faced']
df_batsman['vulnerability'] = (df_batsman['bat_inns'] - df_batsman['not_outs']) / df_batsman['balls_faced']


desired_attributes = ['player_name','ave_score', 'sr' , 'balls_faced', 'hundreds/innings' ,'fifties/innings', 'fours_rate', 'six_rate','vulnerability']

df_batsman = df_batsman[desired_attributes]

hi = []
x= df_batsman['hundreds/innings'].isnull()
for i in range(len(x)):
    if x[i] == True:
        hi.append(i)
        


df_batsman=df_batsman.drop(df_batsman.index[hi])
df_batsman.isnull().values.any()

False

In [4]:
df_bowler = pd.read_csv('stats_clean.csv')
df_bowler = df_bowler[['player_name','bowl_ave','econ' ,'bowl_sr','balls']]
print(df_bowler.columns.values)
df_bowler.isnull().values.any()

['player_name' 'bowl_ave' 'econ' 'bowl_sr' 'balls']


False

In [5]:
df_batsman.to_csv('stats_clean_batsman.csv', sep=',',index=False)
df_bowler.to_csv('stats_clean_bowler.csv' , sep = ',',index = False)

In [6]:
spark = SparkSession.builder.appName('batsman-clustering').getOrCreate()
data = spark.read.csv('stats_clean_batsman.csv', header=True, inferSchema=True)

In [7]:
data.printSchema()
data.show(5)

root
 |-- player_name: string (nullable = true)
 |-- ave_score: double (nullable = true)
 |-- sr: double (nullable = true)
 |-- balls_faced: integer (nullable = true)
 |-- hundreds/innings: double (nullable = true)
 |-- fifties/innings: double (nullable = true)
 |-- fours_rate: double (nullable = true)
 |-- six_rate: double (nullable = true)
 |-- vulnerability: double (nullable = true)

+-----------------+---------+------+-----------+--------------------+--------------------+-------------------+--------------------+--------------------+
|      player_name|ave_score|    sr|balls_faced|    hundreds/innings|     fifties/innings|         fours_rate|            six_rate|       vulnerability|
+-----------------+---------+------+-----------+--------------------+--------------------+-------------------+--------------------+--------------------+
|      Zaheer Khan|     8.68| 86.42|        221|                 0.0|                 0.0|0.07692307692307693|0.013574660633484163| 0.09954751131221719

In [8]:
#vectorizing data to be fed into the model
cols = list(data.columns)[2:]
assembler = VectorAssembler(inputCols=cols, outputCol='features')
assembled_data = assembler.transform(data)

In [9]:
#feature scaling as it is a requirement of kmeans
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures')
scaler_model = scaler.fit(assembled_data)
scaled_data = scaler_model.transform(assembled_data)

In [10]:
#fitting the model on the vectorized and scaled features
k_means_5 = KMeans(featuresCol='scaledFeatures', k=5).setSeed(1)
model_k5 = k_means_5.fit(scaled_data)

In [11]:
model_k5_data = model_k5.transform(scaled_data)
model_k5_data.groupBy('prediction').count()
model_k5_data.write.mode('overwrite').format('parquet').save('batsman_cluster')
sqlContext=SQLContext(spark.sparkContext)
k=sqlContext.read.format('parquet').load('batsman_cluster')
k.show()

+-------------------+---------+------+-----------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+----------+
|        player_name|ave_score|    sr|balls_faced|    hundreds/innings|     fifties/innings|         fours_rate|            six_rate|       vulnerability|            features|      scaledFeatures|prediction|
+-------------------+---------+------+-----------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+----------+
|        Zaheer Khan|     8.68| 86.42|        221|                 0.0|                 0.0|0.07692307692307693|0.013574660633484163| 0.09954751131221719|[86.42,221.0,0.0,...|[2.98955122466210...|         3|
|     Mayank Agarwal|     23.4|130.79|       1789|0.009433962264150943| 0.14150943396226415|0.13191727221911684|0.045835662381218556| 0.05589714924538849|[130.79,1789.0

In [12]:
k.groupBy('prediction').avg(*(k.columns[1:9])).collect()

[Row(prediction=1, avg(ave_score)=29.028999999999996, avg(sr)=132.33133333333336, avg(balls_faced)=2361.766666666667, avg(hundreds/innings)=0.028819782947306378, avg(fifties/innings)=0.1362667643341448, avg(fours_rate)=0.12896692526707906, avg(six_rate)=0.04915212662398278, avg(vulnerability)=0.04665035396834867),
 Row(prediction=3, avg(ave_score)=8.519383561643833, avg(sr)=77.9672602739726, avg(balls_faced)=130.43835616438355, avg(hundreds/innings)=0.0, avg(fifties/innings)=0.004672225273224407, avg(fours_rate)=0.04973314484438348, avg(six_rate)=0.011503125648160665, avg(vulnerability)=0.14601506512446538),
 Row(prediction=4, avg(ave_score)=5.0, avg(sr)=196.425, avg(balls_faced)=4.5, avg(hundreds/innings)=0.0, avg(fifties/innings)=0.0, avg(fours_rate)=0.39285714285714285, avg(six_rate)=0.0, avg(vulnerability)=1.9642857142857144),
 Row(prediction=2, avg(ave_score)=29.247563025210088, avg(sr)=128.81109243697477, avg(balls_faced)=2911.378151260504, avg(hundreds/innings)=0.004697569393403

In [13]:
k.groupBy('prediction').count().collect()

[Row(prediction=1, count=30),
 Row(prediction=3, count=146),
 Row(prediction=4, count=2),
 Row(prediction=2, count=119),
 Row(prediction=0, count=276)]

In [14]:
pd.DataFrame(k.groupBy('prediction').avg(*(k.columns[1:9])).collect(),columns=['prediction']+k.columns[1:9]).to_csv('batsman_cluster_info.csv')

BOWLER

In [15]:
#process similar to batsman carried out for bowler
spark = SparkSession.builder.appName('bowler-clustering').getOrCreate()
data_bwl = spark.read.csv('stats_clean_bowler.csv', header=True, inferSchema=True)

In [16]:
data_bwl.show()

+-------------------+--------+----+-------+-----+
|        player_name|bowl_ave|econ|bowl_sr|balls|
+-------------------+--------+----+-------+-----+
|        Zaheer Khan|   27.35|7.49|   21.8| 3044|
|     Mayank Agarwal|     0.0| 8.0|    0.0|    6|
|      Khaleel Ahmed|   19.64|7.26|   16.2|  276|
|       Sam Billings|     0.0| 0.0|    0.0|    0|
|  Carlos Brathwaite|   25.94|7.86|   19.7| 2354|
|Nathan Coulter-Nile|   22.63|7.76|   17.4| 1924|
|    Quinton de Kock|     0.0| 0.0|    0.0|    0|
|   Jean-Paul Duminy|   30.23|7.62|   23.7| 1736|
|    Akhil Herwadkar|     0.0| 0.0|    0.0|    0|
|        Imran Tahir|   20.76|7.08|   17.5| 4785|
|       Shreyas Iyer|     0.0| 0.0|    0.0|    0|
|     Mahipal Lomror|   60.75|8.62|   42.2|  169|
|       Chama Milind|   15.33|7.49|   12.2|  589|
|        Amit Mishra|   21.79|7.15|   18.2| 4168|
|     Mohammed Shami|    26.6|8.28|   19.2| 1310|
|       Chris Morris|   22.18|7.64|   17.4| 3168|
|     Shahbaz Nadeem|   29.12|6.77|   25.7| 2296|


In [17]:
cols_bwl = list(data_bwl.columns)[2:]

assembler_bwl = VectorAssembler(inputCols=cols_bwl, outputCol='features')
assembled_data_bwl = assembler_bwl.transform(data_bwl)

In [18]:
scaler_bwl = StandardScaler(inputCol='features', outputCol='scaledFeatures')
scaler_model_bwl = scaler_bwl.fit(assembled_data_bwl)
scaled_data_bwl = scaler_model_bwl.transform(assembled_data_bwl)

In [19]:
k_means = KMeans(featuresCol='scaledFeatures', k=5).setSeed(1)
model = k_means.fit(scaled_data_bwl)

temp = model.transform(scaled_data_bwl)
temp.write.mode('overwrite').format('parquet').save('bowler_cluster')
k=sqlContext.read.format('parquet').load('bowler_cluster')
k.show()

+-------------------+--------+----+-------+-----+------------------+--------------------+----------+
|        player_name|bowl_ave|econ|bowl_sr|balls|          features|      scaledFeatures|prediction|
+-------------------+--------+----+-------+-----+------------------+--------------------+----------+
|        Zaheer Khan|   27.35|7.49|   21.8| 3044|[7.49,21.8,3044.0]|[2.07768540020089...|         4|
|     Mayank Agarwal|     0.0| 8.0|    0.0|    6|     [8.0,0.0,6.0]|[2.21915663572859...|         0|
|      Khaleel Ahmed|   19.64|7.26|   16.2|  276| [7.26,16.2,276.0]|[2.01388464692370...|         3|
|       Sam Billings|     0.0| 0.0|    0.0|    0|         (3,[],[])|           (3,[],[])|         1|
|  Carlos Brathwaite|   25.94|7.86|   19.7| 2354|[7.86,19.7,2354.0]|[2.18032139460334...|         4|
|Nathan Coulter-Nile|   22.63|7.76|   17.4| 1924|[7.76,17.4,1924.0]|[2.15258193665674...|         4|
|    Quinton de Kock|     0.0| 0.0|    0.0|    0|         (3,[],[])|           (3,[],[])|  

In [20]:
k.groupBy('prediction').avg('bowl_ave','econ','bowl_sr','balls').collect()

[Row(prediction=1, avg(bowl_ave)=0.03787878787878788, avg(econ)=0.11363636363636363, avg(bowl_sr)=0.05303030303030303, avg(balls)=0.14393939393939395),
 Row(prediction=3, avg(bowl_ave)=27.86269230769231, avg(econ)=7.669653846153847, avg(bowl_sr)=21.53619230769231, avg(balls)=430.4730769230769),
 Row(prediction=4, avg(bowl_ave)=25.55840336134454, avg(econ)=7.593025210084038, avg(bowl_sr)=20.15714285714286, avg(balls)=2052.8823529411766),
 Row(prediction=2, avg(bowl_ave)=24.552093023255814, avg(econ)=7.4006976744186055, avg(bowl_sr)=19.913953488372094, avg(balls)=4280.627906976744),
 Row(prediction=0, avg(bowl_ave)=4.029999999999999, avg(econ)=10.850384615384616, avg(bowl_sr)=2.5250000000000004, avg(balls)=17.73076923076923)]

In [21]:
k.groupBy('prediction').count().collect()

[Row(prediction=1, count=132),
 Row(prediction=3, count=260),
 Row(prediction=4, count=119),
 Row(prediction=2, count=43),
 Row(prediction=0, count=52)]

In [25]:
pd.DataFrame(k.groupBy('prediction').avg('bowl_ave','econ','bowl_sr','balls').collect(),columns=['prediction','bowl_ave','econ','bowl_sr','balls']).to_csv('bowler_cluster_info.csv')