In [5]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317145 sha256=b9dd35ff1678e4aa8d0acd255efda199281e88884c79816c3c37010664aedcd7
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


In [8]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [7]:
spark = SparkSession.builder.appName("IPL").getOrCreate()

In [10]:
schea = StructType([StructField('season', IntegerType(), True),
  StructField('match_id', IntegerType(), True),
  StructField('match_name', StringType(), True),
  StructField('home_team', StringType(), True),
  StructField('away_team', StringType(), True),
  StructField('venue', StringType(), True),
  StructField('city', StringType(), True),
  StructField('country', StringType(), True),
  StructField('current_innings', StringType(), True),
  StructField('innings_id', IntegerType(), True),
  StructField('name', StringType(), True),
  StructField('fullName', StringType(), True),
  StructField('runs', IntegerType(), True),
  StructField('ballsFaced', IntegerType(), True),
  StructField('minutes', IntegerType(), True),
  StructField('fours', IntegerType(), True),
  StructField('sixes', IntegerType(), True),
  StructField('strikeRate', FloatType(), True),
  StructField('season', StringType(), True),
  StructField('captain', StringType(), True),
  StructField('isNotOut', BooleanType(), True),
  StructField('runningScore', IntegerType(), True),
  StructField('runningOver', IntegerType(), True),
  StructField('shortText', FloatType(), True),
  StructField('commentary', StringType(), True),
  StructField('link', StringType(), True)])

In [9]:
data = spark.read.option("header",True).option("inferSchema",True).csv('all_season_batting_card.csv')

In [11]:
dt = data.groupBy("season","fullName").agg(sum('runs').cast('int').alias('SumRuns'),sum('ballsFaced').cast('int').alias('TotalBalls'),avg('runs').alias('AvgRuns'),count('fullName').alias('TotalMatches'),max('runs').cast('int').alias('HS'),sum('fours').cast('int').alias('Fours'),sum('sixes').cast('int').alias('Sixes'))

In [15]:
dt = dt.withColumn("StrikeRate",(col('SumRuns')/col('TotalMatches'))*10)

In [66]:
dt.show()

+------+------------------+-------+----------+------------------+------------+---+-----+-----+------------------+
|season|          fullName|SumRuns|TotalBalls|           AvgRuns|TotalMatches| HS|Fours|Sixes|        StrikeRate|
+------+------------------+-------+----------+------------------+------------+---+-----+-----+------------------+
|  2020|       Priyam Garg|    133|       111|              13.3|          10| 51|    9|    4|             133.0|
|  2019|    Rahul Tripathi|    141|       118|20.142857142857142|           7| 50|   13|    2|201.42857142857142|
|  2019|       Trent Boult|      7|         4|2.3333333333333335|           3|  6|    0|    1|23.333333333333336|
|  2015|       David Wiese|    122|        86|17.428571428571427|           7| 47|   12|    4|174.28571428571428|
|  2013|     Robin Uthappa|    434|       371|            27.125|          16| 75|   42|   12|            271.25|
|  2013|      Albie Morkel|     46|        37|15.333333333333334|           3| 23|    2|

In [17]:
dz = data.filter(col('runs')>=50)
dz = dz.groupby('season','fullName').agg(count('fullName').alias('Fifty'))

In [18]:
dp = data.filter(col('runs')>=100)
dp = dp.groupby('season','fullName').agg(count('fullName').alias('Hundred'))

In [19]:
dp = dp.withColumnRenamed("season","sosn").withColumnRenamed("fullName","fnames")
dz = dz.withColumnRenamed("season","sosn").withColumnRenamed("fullName","fnames")
dp.show()

+----+----------------+-------+
|sosn|          fnames|Hundred|
+----+----------------+-------+
|2019|  Jonny Bairstow|      1|
|2018|    Rishabh Pant|      1|
|2021|    Sanju Samson|      1|
|2019|    David Warner|      1|
|2019|        KL Rahul|      1|
|2020|        KL Rahul|      1|
|2018|   Ambati Rayudu|      1|
|2015|Brendon McCullum|      1|
|2013|    David Miller|      1|
|2022| Quinton de Kock|      1|
|2016|    Steven Smith|      1|
|2014| Virender Sehwag|      1|
|2017|      Ben Stokes|      1|
|2011|  Adam Gilchrist|      1|
|2020|  Shikhar Dhawan|      2|
|2021| Ruturaj Gaikwad|      1|
|2022|   Rajat Patidar|      1|
|2016| Quinton de Kock|      1|
|2013|    Shane Watson|      1|
|2021|     Jos Buttler|      1|
+----+----------------+-------+
only showing top 20 rows



In [20]:
cond2 = [dt.season==dz.sosn, dt.fullName==dz.fnames]

In [21]:
cond1 = [dt.season==dp.sosn, dt.fullName==dp.fnames]

In [22]:
dff = dt.join(dp,cond1,'left')

In [23]:
df2 = dff.drop('sosn','fnames')

In [24]:
finaldf = df2.join(dz,cond2,'left')

In [25]:
finaldf = finaldf.drop('sosn','fnames')

In [26]:
finaldf.show()

+------+------------------+-------+----------+------------------+------------+---+-----+-----+------------------+-------+-----+
|season|          fullName|SumRuns|TotalBalls|           AvgRuns|TotalMatches| HS|Fours|Sixes|        StrikeRate|Hundred|Fifty|
+------+------------------+-------+----------+------------------+------------+---+-----+-----+------------------+-------+-----+
|  2020|       Priyam Garg|    133|       111|              13.3|          10| 51|    9|    4|             133.0|   null|    1|
|  2019|    Rahul Tripathi|    141|       118|20.142857142857142|           7| 50|   13|    2|201.42857142857142|   null|    1|
|  2019|       Trent Boult|      7|         4|2.3333333333333335|           3|  6|    0|    1|23.333333333333336|   null| null|
|  2015|       David Wiese|    122|        86|17.428571428571427|           7| 47|   12|    4|174.28571428571428|   null| null|
|  2013|     Robin Uthappa|    434|       371|            27.125|          16| 75|   42|   12|          

In [27]:
finaldf = finaldf.na.fill(value=0,subset=['Fifty','Hundred'])

In [28]:
finaldf.filter(finaldf.Hundred > 0).show()

+------+----------------+-------+----------+------------------+------------+---+-----+-----+------------------+-------+-----+
|season|        fullName|SumRuns|TotalBalls|           AvgRuns|TotalMatches| HS|Fours|Sixes|        StrikeRate|Hundred|Fifty|
+------+----------------+-------+----------+------------------+------------+---+-----+-----+------------------+-------+-----+
|  2019|  Jonny Bairstow|    445|       283|              44.5|          10|114|   48|   18|             445.0|      1|    3|
|  2018|    Rishabh Pant|    684|       394|48.857142857142854|          14|128|   68|   37|488.57142857142856|      1|    6|
|  2021|    Sanju Samson|    484|       354| 34.57142857142857|          14|119|   45|   17|345.71428571428567|      1|    3|
|  2019|    David Warner|    692|       481|57.666666666666664|          12|100|   57|   21| 576.6666666666666|      1|    9|
|  2019|        KL Rahul|    593|       438|42.357142857142854|          14|100|   49|   25|423.57142857142856|      1

In [29]:
finaldf.show()

+------+------------------+-------+----------+------------------+------------+---+-----+-----+------------------+-------+-----+
|season|          fullName|SumRuns|TotalBalls|           AvgRuns|TotalMatches| HS|Fours|Sixes|        StrikeRate|Hundred|Fifty|
+------+------------------+-------+----------+------------------+------------+---+-----+-----+------------------+-------+-----+
|  2020|       Priyam Garg|    133|       111|              13.3|          10| 51|    9|    4|             133.0|      0|    1|
|  2019|    Rahul Tripathi|    141|       118|20.142857142857142|           7| 50|   13|    2|201.42857142857142|      0|    1|
|  2019|       Trent Boult|      7|         4|2.3333333333333335|           3|  6|    0|    1|23.333333333333336|      0|    0|
|  2015|       David Wiese|    122|        86|17.428571428571427|           7| 47|   12|    4|174.28571428571428|      0|    0|
|  2013|     Robin Uthappa|    434|       371|            27.125|          16| 75|   42|   12|          

In [30]:
finaldf = finaldf.dropna()

In [31]:
finaldf.write.option("header",True) \
        .mode("overwrite") \
        .csv("orangecapdata")

In [32]:
finaldf.groupby('season').max('SumRuns').show()

+------+------------+
|season|max(SumRuns)|
+------+------------+
|  2016|         973|
|  2020|         670|
|  2012|         733|
|  2019|         692|
|  2017|         641|
|  2014|         660|
|  2013|         733|
|  2018|         735|
|  2009|         572|
|  2011|         608|
|  2022|         863|
|  2008|         616|
|  2021|         635|
|  2015|         562|
|  2010|         618|
+------+------------+



In [33]:
finaldf.createOrReplaceTempView("iplcap")
spark.sql("select fullName,season,max(SumRuns) from iplcap group by fullName, season order by max(SumRuns) desc").show()

+----------------+------+------------+
|        fullName|season|max(SumRuns)|
+----------------+------+------------+
|     Virat Kohli|  2016|         973|
|     Jos Buttler|  2022|         863|
|    David Warner|  2016|         848|
| Kane Williamson|  2018|         735|
|  Michael Hussey|  2013|         733|
|     Chris Gayle|  2012|         733|
|     Chris Gayle|  2013|         708|
|    David Warner|  2019|         692|
|  AB de Villiers|  2016|         687|
|    Rishabh Pant|  2018|         684|
|        KL Rahul|  2020|         670|
|   Robin Uthappa|  2014|         660|
|        KL Rahul|  2018|         659|
|    David Warner|  2017|         641|
| Ruturaj Gaikwad|  2021|         635|
|     Virat Kohli|  2013|         634|
|  Faf du Plessis|  2021|         633|
|        KL Rahul|  2021|         626|
|Sachin Tendulkar|  2010|         618|
|  Shikhar Dhawan|  2020|         618|
+----------------+------+------------+
only showing top 20 rows



In [34]:
df = finaldf.toPandas()

In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [36]:
df.columns

Index(['season', 'fullName', 'SumRuns', 'TotalBalls', 'AvgRuns',
       'TotalMatches', 'HS', 'Fours', 'Sixes', 'StrikeRate', 'Hundred',
       'Fifty'],
      dtype='object')

In [37]:
df = df[df['TotalMatches']>=5]

In [38]:
df = df[['TotalMatches','fullName','SumRuns','HS','AvgRuns','StrikeRate','Hundred','Fifty',]]

In [39]:
df['AvgSR'] = df['AvgRuns']*df['StrikeRate']

In [40]:
df.head(5)

Unnamed: 0,TotalMatches,fullName,SumRuns,HS,AvgRuns,StrikeRate,Hundred,Fifty,AvgSR
0,10,Priyam Garg,133,51,13.3,133.0,0,1,1768.9
1,7,Rahul Tripathi,141,50,20.142857,201.428571,0,1,4057.346939
3,7,David Wiese,122,47,17.428571,174.285714,0,0,3037.55102
4,16,Robin Uthappa,434,75,27.125,271.25,0,2,7357.65625
6,12,Kumar Sangakkara,200,82,16.666667,166.666667,0,1,2777.777778


In [41]:
X = df[['TotalMatches','HS','AvgRuns','StrikeRate','Hundred','Fifty','AvgSR']]
y = df['SumRuns']

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=0)

In [43]:
model = LinearRegression()

In [44]:
model.fit(X_train,y_train)

In [45]:
y_pred = model.predict(X_test)

In [46]:
mse = mean_squared_error(y_test, y_pred)

In [47]:
print("Mean Squared Error", mse)

Mean Squared Error 490.2227480347855


In [48]:
#2023 Data 

In [49]:
sch = StructType([StructField('fullName', StringType(), True),
StructField('Matches', IntegerType(), True),
StructField('TotalMatches', IntegerType(), True),
StructField('NO', IntegerType(), True),
StructField('SumRuns', IntegerType(), True),
StructField('HS', IntegerType(), True),
StructField('AvgRuns', FloatType(), True),
StructField('TotalBalls', IntegerType(), True),
StructField('StrikeRate', FloatType(), True),
StructField('Hundred', IntegerType(), True),
StructField('Fifty', IntegerType(), True),
StructField('fours', IntegerType(), True),
StructField('sixes', IntegerType(), True)])

In [50]:
org2023 = spark.read.option("header",True).schema(sch).csv('2023orangecap.csv')

In [51]:
org2023.printSchema()

root
 |-- fullName: string (nullable = true)
 |-- Matches: integer (nullable = true)
 |-- TotalMatches: integer (nullable = true)
 |-- NO: integer (nullable = true)
 |-- SumRuns: integer (nullable = true)
 |-- HS: integer (nullable = true)
 |-- AvgRuns: float (nullable = true)
 |-- TotalBalls: integer (nullable = true)
 |-- StrikeRate: float (nullable = true)
 |-- Hundred: integer (nullable = true)
 |-- Fifty: integer (nullable = true)
 |-- fours: integer (nullable = true)
 |-- sixes: integer (nullable = true)



In [52]:
org2023 = org2023.withColumn("AvgSR",col("AvgRuns")*col("StrikeRate"))

In [53]:
org2023.show(5)

+----------------+-------+------------+---+-------+---+-------+----------+----------+-------+-----+-----+-----+---------+
|        fullName|Matches|TotalMatches| NO|SumRuns| HS|AvgRuns|TotalBalls|StrikeRate|Hundred|Fifty|fours|sixes|    AvgSR|
+----------------+-------+------------+---+-------+---+-------+----------+----------+-------+-----+-----+-----+---------+
|  Faf Du Plessis|     10|          10|  1|    511| 84|  56.78|       324|    157.71|      0|    5|   40|   29| 8954.774|
|    Devon Conway|     11|          10|  2|    458| 92|  57.25|       329|     139.2|      0|    5|   54|   13|7969.1997|
|Yashasvi Jaiswal|     10|          10|  0|    442|124|   44.2|       279|    158.42|      1|    3|   57|   19| 7002.164|
|     Virat Kohli|     10|          10|  1|    419| 82|  46.56|       310|    135.16|      0|    6|   39|   11|  6293.05|
| Ruturaj Gaikwad|     11|          10|  1|    384| 92|  42.67|       259|    148.26|      0|    2|   27|   21| 6326.254|
+----------------+------

In [54]:
df4 = org2023.toPandas()

In [55]:
df4.head()

Unnamed: 0,fullName,Matches,TotalMatches,NO,SumRuns,HS,AvgRuns,TotalBalls,StrikeRate,Hundred,Fifty,fours,sixes,AvgSR
0,Faf Du Plessis,10,10.0,1,511,84,56.779999,324,157.710007,0,5,40,29,8954.774414
1,Devon Conway,11,10.0,2,458,92,57.25,329,139.199997,0,5,54,13,7969.199707
2,Yashasvi Jaiswal,10,10.0,0,442,124,44.200001,279,158.419998,1,3,57,19,7002.164062
3,Virat Kohli,10,10.0,1,419,82,46.560001,310,135.160004,0,6,39,11,6293.049805
4,Ruturaj Gaikwad,11,10.0,1,384,92,42.669998,259,148.259995,0,2,27,21,6326.253906


In [56]:
df4 = df4[df4['TotalMatches']>7]

In [57]:
current_season_data = df4[['TotalMatches','HS','AvgRuns','StrikeRate','Hundred','Fifty','AvgSR']]

In [58]:
current_season_data = current_season_data.dropna()

In [59]:
predicted_runs = model.predict(current_season_data)

In [60]:
print(predicted_runs)

[116.96520966  88.88746456 120.56419782 141.61292323  87.28776335
 117.34036545 134.78036783  67.04810142 174.80166673 120.42115492
 141.94454152 188.7989209  122.61654501 130.66281994 122.55755259
  52.85716348  59.45143259 110.10493952 192.99408424  73.31534506
 144.43167224 158.80930929 132.66869424  70.7363349  142.96234157
 148.13261273 100.35679566  87.70053471 102.48210482 103.20037435
 101.9492475   83.62338127  74.5365409  130.82087145  14.45714655
 107.21196576  76.14596633 135.10207973  83.63003481 145.61530706
 115.63675227 149.49329705  97.1888302  136.89624809 123.66224519
 109.16338492  94.26604764]


In [61]:
ps = np.round(predicted_runs)

In [67]:
print(ps)

[117.  89. 121. 142.  87. 117. 135.  67. 175. 120. 142. 189. 123. 131.
 123.  53.  59. 110. 193.  73. 144. 159. 133.  71. 143. 148. 100.  88.
 102. 103. 102.  84.  75. 131.  14. 107.  76. 135.  84. 146. 116. 149.
  97. 137. 124. 109.  94.]


In [62]:
orange_cap_winner_index = ps.argmax()

In [63]:
print(orange_cap_winner_index)

18


In [64]:
ocw = current_season_data.iloc[orange_cap_winner_index]

In [65]:
print(df4['fullName'][orange_cap_winner_index])
print(ocw)

Cameron Green
TotalMatches      10.000000
HS                77.000000
AvgRuns           29.110001
StrikeRate       181.940002
Hundred            0.000000
Fifty              3.000000
AvgSR           5296.273438
Name: 19, dtype: float64


In [68]:
bowl_schema = StructType([StructField('season', IntegerType(), True),
  StructField('match_id', IntegerType(), True),
  StructField('match_name', StringType(), True),
  StructField('home_team', StringType(), True),
  StructField('away_team', StringType(), True),
  StructField('bowling_team', StringType(), True),
  StructField('venue', StringType(), True),
  StructField('city', StringType(), True),
  StructField('country', StringType(), True),
  StructField('innings_id', IntegerType(), True),
  StructField('name', StringType(), True),
  StructField('fullName', StringType(), True),
  StructField('overs', FloatType(), True),
  StructField('maidens', IntegerType(), True),
  StructField('conceded', IntegerType(), True),
  StructField('wickets', IntegerType(), True),
  StructField('economyRate', FloatType(), True),
  StructField('dots', IntegerType(), True),
  StructField('foursConceded', IntegerType(), True),
  StructField('sixesConceded', IntegerType(), True),
  StructField('wides', IntegerType(), True),
  StructField('noballs', IntegerType(), True),
  StructField('captain', BooleanType(), True),
  StructField('href', StringType(), True)])

In [69]:
bowl_data = spark.read.option("header",True).schema(bowl_schema).csv("all_season_bowling_card.csv")

In [70]:
bowl_data.show()

+------+--------+----------+---------+---------+------------+--------------------+------+-------+----------+-------------+------------------+-----+-------+--------+-------+-----------+----+-------------+-------------+-----+-------+-------+--------------------+
|season|match_id|match_name|home_team|away_team|bowling_team|               venue|  city|country|innings_id|         name|          fullName|overs|maidens|conceded|wickets|economyRate|dots|foursConceded|sixesConceded|wides|noballs|captain|                href|
+------+--------+----------+---------+---------+------------+--------------------+------+-------+----------+-------------+------------------+-----+-------+--------+-------+-----------+----+-------------+-------------+-----+-------+-------+--------------------+
|  2022| 1304047| CSK v KKR|      CSK|      KKR|         KKR|Wankhede Stadium,...|Mumbai|  India|         1|     UT Yadav|       Umesh Yadav|  4.0|      0|      20|      2|        5.0|  15|            1|            1|

In [71]:
df2 = bowl_data.groupBy("season","fullName").agg(sum('wickets').cast('int').alias('TotalWickets'),sum('overs').cast('int').alias('Overs'),sum('conceded').cast('int').alias("RunsConceded"),avg('wickets').alias('AvgWickets'),count('fullName').alias('TotalMatches'),max('wickets').cast('int').alias('MaxWickets'),sum('foursConceded').cast('int').alias('FoursConceded'),sum('sixesConceded').cast('int').alias('SixesConceded'),avg('economyRate').alias('AvgEconomyRate'))

In [72]:
df2.show()

+------+--------------------+------------+-----+------------+------------------+------------+----------+-------------+-------------+------------------+
|season|            fullName|TotalWickets|Overs|RunsConceded|        AvgWickets|TotalMatches|MaxWickets|FoursConceded|SixesConceded|    AvgEconomyRate|
+------+--------------------+------------+-----+------------+------------------+------------+----------+-------------+-------------+------------------+
|  2020|       Krunal Pandya|           6|   50|         380|             0.375|          16|         2|           27|           11|          7.828125|
|  2020|      Marcus Stoinis|          13|   29|         283|               1.0|          13|         3|           23|           14|  9.19384611569918|
|  2016| Ravichandran Ashwin|          10|   44|         319|0.7142857142857143|          14|         4|           22|           10| 7.612857137407575|
|  2016|Mitchell McClenaghan|          17|   52|         436|1.2142857142857142|        

In [74]:
df2 = df2.withColumn("AvgRate",df2['RunsConceded']/df2['TotalWickets']).withColumn("StrRate",df2['Overs']*6/df2['TotalWickets'])

In [75]:
df2.show()

+------+--------------------+------------+-----+------------+------------------+------------+----------+-------------+-------------+------------------+------------------+------------------+
|season|            fullName|TotalWickets|Overs|RunsConceded|        AvgWickets|TotalMatches|MaxWickets|FoursConceded|SixesConceded|    AvgEconomyRate|           AvgRate|           StrRate|
+------+--------------------+------------+-----+------------+------------------+------------+----------+-------------+-------------+------------------+------------------+------------------+
|  2020|       Krunal Pandya|           6|   50|         380|             0.375|          16|         2|           27|           11|          7.828125|63.333333333333336|              50.0|
|  2020|      Marcus Stoinis|          13|   29|         283|               1.0|          13|         3|           23|           14|  9.19384611569918| 21.76923076923077|13.384615384615385|
|  2016| Ravichandran Ashwin|          10|   44|  

In [76]:
df2.groupby('season').max('TotalWickets').show()

+------+-----------------+
|season|max(TotalWickets)|
+------+-----------------+
|  2018|               24|
|  2015|               26|
|  2022|               27|
|  2013|               32|
|  2014|               23|
|  2019|               26|
|  2020|               30|
|  2012|               25|
|  2009|               23|
|  2016|               23|
|  2010|               21|
|  2011|               28|
|  2008|               22|
|  2017|               26|
|  2021|               32|
+------+-----------------+



In [77]:
fnldf = df2.dropna()

In [78]:
fnldf.write.option("header",True) \
        .mode("overwrite") \
        .csv("purplecapdata")

In [79]:
bowl_Df = fnldf.toPandas()

In [80]:
bowldf = bowl_Df[['TotalMatches','TotalWickets','Overs','RunsConceded','AvgWickets','MaxWickets','FoursConceded','SixesConceded','AvgEconomyRate','AvgRate','StrRate']]

In [81]:
bowldf.head()

Unnamed: 0,TotalMatches,TotalWickets,Overs,RunsConceded,AvgWickets,MaxWickets,FoursConceded,SixesConceded,AvgEconomyRate,AvgRate,StrRate
0,16,6,50,380,0.375,2,27,11,7.828125,63.333333,50.0
1,13,13,29,283,1.0,3,23,14,9.193846,21.769231,13.384615
2,14,10,44,319,0.714286,4,22,10,7.612857,31.9,26.4
3,14,17,52,436,1.214286,4,47,16,8.202143,25.647059,18.352941
4,14,9,49,392,0.642857,2,28,16,8.112857,43.555556,32.666667


In [94]:
X = bowldf[['TotalMatches','Overs','RunsConceded','AvgWickets','AvgRate','AvgEconomyRate','StrRate']]
y = bowldf[['TotalWickets']]

In [95]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=0)

In [96]:
model = LinearRegression()
model.fit(X_train, y_train)

In [97]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error", mse)

Mean Squared Error 2.808570687887071


In [98]:
#2023 Purple Cap Data 

In [120]:
bowl23_schema = StructType([StructField('fullName', StringType(), True),
  StructField('Matches', IntegerType(), True),
  StructField('TotalMatches', IntegerType(), True),
  StructField('Overs', FloatType(), True),
  StructField('RunsConceded', IntegerType(), True),
  StructField('TotalWickets', IntegerType(), True),
  StructField('AvgRate', FloatType(), True),
  StructField('AvgEconomyRate', FloatType(), True),
  StructField('StrRate', FloatType(), True),
  StructField('4w', IntegerType(), True),
  StructField('5w', IntegerType(), True)])

In [121]:
curr_data = spark.read.option("header",True).schema(bowl23_schema).csv("2023purplecap.csv")

In [122]:
curr_data.show()

+-------------------+-------+------------+-----+------------+------------+-------+--------------+-------+---+---+
|           fullName|Matches|TotalMatches|Overs|RunsConceded|TotalWickets|AvgRate|AvgEconomyRate|StrRate| 4w| 5w|
+-------------------+-------+------------+-----+------------+------------+-------+--------------+-------+---+---+
|   Tushar Deshpande|     11|          11| 38.2|         396|          19|  20.84|         10.33|   12.1|  0|  0|
|     Mohammad Shami|     10|          10| 39.0|         274|          18|  15.22|          7.02|   13.0|  1|  0|
|        Rashid Khan|     10|          10| 40.0|         322|          18|  17.88|          8.05|  13.33|  0|  0|
|      Piyush Chawla|     10|          10| 39.0|         280|          17|  16.47|          7.17|  13.76|  0|  0|
|     Arshdeep Singh|     10|          10| 36.5|         361|          16|  22.56|           9.8|  13.81|  1|  0|
|    Ravindra Jadeja|     11|          11| 39.0|         288|          15|   19.2|      

In [123]:
curr_data = curr_data.withColumn("AvgWickets", curr_data['TotalWickets']/curr_data['TotalMatches'])

In [124]:
curr_data.show()

+-------------------+-------+------------+-----+------------+------------+-------+--------------+-------+---+---+------------------+
|           fullName|Matches|TotalMatches|Overs|RunsConceded|TotalWickets|AvgRate|AvgEconomyRate|StrRate| 4w| 5w|        AvgWickets|
+-------------------+-------+------------+-----+------------+------------+-------+--------------+-------+---+---+------------------+
|   Tushar Deshpande|     11|          11| 38.2|         396|          19|  20.84|         10.33|   12.1|  0|  0|1.7272727272727273|
|     Mohammad Shami|     10|          10| 39.0|         274|          18|  15.22|          7.02|   13.0|  1|  0|               1.8|
|        Rashid Khan|     10|          10| 40.0|         322|          18|  17.88|          8.05|  13.33|  0|  0|               1.8|
|      Piyush Chawla|     10|          10| 39.0|         280|          17|  16.47|          7.17|  13.76|  0|  0|               1.7|
|     Arshdeep Singh|     10|          10| 36.5|         361|        

In [125]:
current_data = curr_data.toPandas()

In [126]:
X = current_data[['TotalMatches','Overs','RunsConceded','AvgWickets','AvgRate','AvgEconomyRate','StrRate']]
# y = bowldf[['TotalWickets']]

In [127]:
predicted_wickets = model.predict(X)

In [128]:
print(predicted_wickets)

[[14.43965727]
 [15.10169129]
 [15.01008449]
 [14.58195857]
 [13.21437821]
 [13.44085737]
 [13.21961601]
 [12.70951096]
 [12.28975904]
 [12.05687881]
 [11.42820047]
 [11.1992555 ]
 [10.77164255]
 [12.80528726]
 [ 9.70943579]
 [ 9.83240258]
 [ 9.94140369]
 [ 8.20712978]
 [ 7.70339254]
 [ 9.16736897]
 [ 8.83070673]
 [ 7.37894153]
 [ 9.21329615]
 [ 8.5336659 ]
 [ 8.6469002 ]
 [ 8.1207008 ]
 [ 8.37525943]
 [ 7.65885936]
 [ 7.76345931]
 [ 8.24851425]
 [ 8.16717373]
 [ 7.3295701 ]
 [ 8.04555434]
 [ 8.49424398]
 [ 7.42861813]
 [ 8.06249772]
 [ 8.56579572]
 [ 7.40612448]
 [ 5.88073884]
 [ 6.23366731]
 [ 6.18712893]
 [ 5.64987532]
 [ 6.96102644]
 [ 6.33662494]
 [ 6.36926028]
 [ 6.82340489]
 [ 6.22297523]
 [ 6.77227095]
 [ 5.0561609 ]
 [ 5.33618021]
 [ 5.75475316]
 [ 5.5408247 ]
 [ 5.23743308]
 [ 6.53408323]
 [ 5.00695619]
 [ 4.76004563]
 [ 4.28063722]
 [ 5.55122263]
 [ 3.94624381]
 [ 4.65516156]
 [ 3.08478205]
 [ 2.60096627]
 [ 4.64377117]
 [ 4.69293417]
 [ 3.32349777]
 [ 2.9950256 ]
 [ 4.90583