# IPL Score prediction

Predicting the runs scored in the first six overs of an IPL innings, using Apache Spark (PySpark, pre-built for Hadoop 3.2).

# Step 1: Install Apache Spark and Java in Google Colab

In [None]:
# install java
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# install spark (change the version number if needed)
!wget -q https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop3.2.tgz

# unzip the spark file to the current folder
!tar xf spark-3.0.0-bin-hadoop3.2.tgz

In [None]:
# Tell findspark/pyspark where the Java runtime and the unpacked Spark
# distribution live by exporting both locations as environment variables.
import os

os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.0.0-bin-hadoop3.2",
})

In [None]:
# install findspark using pip
!pip install -q findspark

# Step 2: Test installation


In [None]:
# Smoke-test the installation: locate Spark, then start (or reuse) a session
# against the local[*] master.
import findspark

findspark.init()

# pyspark is imported only after findspark.init() has run
from pyspark.sql import SparkSession

session_builder = SparkSession.builder.master("local[*]")
spark = session_builder.getOrCreate()

# Load Dataset


In [None]:
import pandas as pd
import numpy as np

# Ball-by-ball data for all matches (used for training).
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
# NOTE(review): the truncated output under this cell looks like the tail of a
# pandas warning emitted while reading - presumably a mixed-dtype warning; confirm.
df1 = pd.read_csv('all_matches.csv', low_memory=False)
df2 = pd.read_csv('deliveries.csv')

# Summary of the training frame: columns, non-null counts and dtypes
df1.info()

  exec(code_obj, self.user_global_ns, self.user_ns)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194354 entries, 0 to 194353
Data columns (total 22 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   match_id                194354 non-null  int64  
 1   season                  194354 non-null  object 
 2   start_date              194354 non-null  object 
 3   venue                   194354 non-null  object 
 4   innings                 194354 non-null  int64  
 5   ball                    194354 non-null  float64
 6   batting_team            194354 non-null  object 
 7   bowling_team            194354 non-null  object 
 8   striker                 194354 non-null  object 
 9   non_striker             194354 non-null  object 
 10  bowler                  194354 non-null  object 
 11  runs_off_bat            194354 non-null  int64  
 12  extras                  194354 non-null  int64  
 13  wides                   5884 non-null    float64
 14  noballs             

# Step 3: Preprocessing of the dataset

In [None]:
# Drop the dismissal-related fields, which are not needed downstream
matches = df1.drop(
    columns=['wicket_type', 'player_dismissed',
             'other_wicket_type', 'other_player_dismissed']
)

# Replace null values in the extras columns by 0.
# The result is assigned back: fillna(inplace=True) on a column selection can
# act on a temporary copy and silently leave `matches` unchanged.
extras_cols = ['wides', 'noballs', 'legbyes', 'byes', 'penalty']
matches[extras_cols] = matches[extras_cols].fillna(0.0)

# Only choose teams that are currently in the IPL: every delivery in which one
# of the franchises listed below bats or bowls is removed.
teams = [
    'Deccan Chargers', 'Pune Warriors', 'Gujarat Lions',
    'Rising Pune Supergiants', 'Kochi Tuskers Kerala',
    'Rising Pune Supergiant',
]
involves_dropped_team = (
    matches['batting_team'].isin(teams) | matches['bowling_team'].isin(teams)
)
# .copy() gives an independent frame so later column assignments are safe;
# the original row labels are kept, as with DataFrame.drop.
matches = matches.loc[~involves_dropped_team].copy()

# Delhi Daredevils is now Delhi Capitals.
# Series.replace returns a new Series, so the result must be stored back;
# otherwise the two names are later encoded as two different teams.
for team_col in ('batting_team', 'bowling_team'):
    matches[team_col] = matches[team_col].replace('Delhi Daredevils', 'Delhi Capitals')

matches.head()

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,bowler,runs_off_bat,extras,wides,noballs,byes,legbyes,penalty
0,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.1,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,BB McCullum,P Kumar,0,1,0.0,0.0,0.0,1.0,0.0
1,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.2,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,P Kumar,0,0,0.0,0.0,0.0,0.0,0.0
2,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.3,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,P Kumar,0,1,1.0,0.0,0.0,0.0,0.0
3,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.4,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,P Kumar,0,0,0.0,0.0,0.0,0.0,0.0
4,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.5,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,P Kumar,0,0,0.0,0.0,0.0,0.0,0.0


In [None]:
#Filling the values of city based on venue
cities = {
    'Rajiv Gandhi International Stadium, Uppal' : "Hyderabad",
    'Maharashtra Cricket Association Stadium':"Mumbai",
    'Saurashtra Cricket Association Stadium':"Rajkot",
    "Holkar Cricket Stadium":"Indore",
    "M Chinnaswamy Stadium":"Bengaluru",
    "Wankhede Stadium":"Mumbai",
     "Eden Gardens":"Kolkata",
     "Feroz Shah Kotla":"Delhi",
      "Punjab Cricket Association IS Bindra Stadium, Mohali":"Mohali",
      "Green Park":"Kanpur",
      "Punjab Cricket Association Stadium, Mohali":"Mohali",
      "Dr DY Patil Sports Academy":"Pune",
      "Sawai Mansingh Stadium":"Jaipur",
      "MA Chidambaram Stadium, Chepauk":"Chennai", 
      "Newlands":"Cape Town, SA",
      "St George's Park":"Port Elizabeth, SA" , 
      "Kingsmead":"Durban, SA", 
      "SuperSport Park":"Centurion, SA",
      "Buffalo Park":"Eastern Cape, SA",
      "New Wanderers Stadium":"Johannesburg, SA",
      "De Beers Diamond Oval":"Northern Cape. SA",
      "OUTsurance Oval":"Bloemfontein", 
      "Brabourne Stadium":"Mumbai",
      "Sardar Patel Stadium":"Ahemdabad", 
      "Barabati Stadium":"Cuttack", 
      "Vidarbha Cricket Association Stadium, Jamtha":"Jamtha",
      "Himachal Pradesh Cricket Association Stadium":"Dharamshala",
      "Nehru Stadium":"Chennai",
      "Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium":"Vizag",
      "Subrata Roy Sahara Stadium":"Pune",
      "Shaheed Veer Narayan Singh International Stadium":"Raipur",
      "JSCA International Stadium Complex":"Ranchi",
      "Sheikh Zayed Stadium":"Abu Dhabi",
      "Sharjah Cricket Stadium":"Sharjah",
      "Dubai International Cricket Stadium":"Dubai",
      "M. A. Chidambaram Stadium":"Chennai",
      "Feroz Shah Kotla Ground":"Delhi",
      "M. Chinnaswamy Stadium":"Bengaluru",
      "Rajiv Gandhi Intl. Cricket Stadium":"Hyderabad" ,
      "IS Bindra Stadium":"Mohali",
      "ACA-VDCA Stadium":"Vizag",
      "MA Chidambaram Stadium, Chepauk, Chennai":"Chennai",
      'M.Chinnaswamy Stadium':'Bengaluru',
      'MA Chidambaram Stadium':"Chennai",
      "Arun Jaitley Stadium":'Delhi',
      "Rajiv Gandhi International Stadium":"Hyderabad",
      "Punjab Cricket Association IS Bindra Stadium":"Mohali",
      "Wankhede Stadium, Mumbai":"Mumbai"
}    

# Replace stadium name by location
matches['venue'].replace(cities,inplace=True)

In [None]:
# Input variables: venue,innings,striker,non-striker,bowler,runs_off_bat

# Encode each string column as integer codes. For every column the codes and
# the lookup of original labels (the *_categories index) are kept in their own
# variables so a code can be mapped back to its name later.
# NOTE(review): every column is encoded independently, so the same team (or
# player) may get a different code in batting_team vs bowling_team (or striker
# vs non_striker) - confirm that this is intended.
venue_factorized, venue_categories = matches['venue'].factorize()
batting_factorized, batting_categories = matches['batting_team'].factorize()
bowling_factorized, bowling_categories = matches['bowling_team'].factorize()
striker_factorized, striker_categories = matches['striker'].factorize()
nonstriker_factorized, nonstriker_categories = matches['non_striker'].factorize()
bowler_factorized, bowler_categories = matches['bowler'].factorize()

# Overwrite the string columns with their integer codes
matches['venue'] = venue_factorized
matches['batting_team'] = batting_factorized
matches['bowling_team'] = bowling_factorized
matches['striker'] = striker_factorized
matches['non_striker'] = nonstriker_factorized
matches['bowler'] = bowler_factorized

# Display new dataframe with numeric values
matches.head()

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,bowler,runs_off_bat,extras,wides,noballs,byes,legbyes,penalty
0,335982,2007/08,2008-04-18,0,1,0.1,0,0,0,0,0,0,1,0.0,0.0,0.0,1.0,0.0
1,335982,2007/08,2008-04-18,0,1,0.2,0,0,1,1,0,0,0,0.0,0.0,0.0,0.0,0.0
2,335982,2007/08,2008-04-18,0,1,0.3,0,0,1,1,0,0,1,1.0,0.0,0.0,0.0,0.0
3,335982,2007/08,2008-04-18,0,1,0.4,0,0,1,1,0,0,0,0.0,0.0,0.0,0.0,0.0
4,335982,2007/08,2008-04-18,0,1,0.5,0,0,1,1,0,0,0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Keep only the deliveries of the first six overs of each innings.
# `ball` is stored as over.ball (0.1 is the first delivery), so every value
# below 6.1 belongs to overs 0-5.
is_first_six_overs = matches['ball'] < 6.1
test = matches[is_first_six_overs]
test.describe()

Unnamed: 0,match_id,venue,innings,ball,batting_team,bowling_team,striker,non_striker,bowler,runs_off_bat,extras,wides,noballs,byes,legbyes,penalty
count,47466.0,47466.0,47466.0,47466.0,47466.0,47466.0,47466.0,47466.0,47466.0,47466.0,47466.0,47466.0,47466.0,47466.0,47466.0,47466.0
mean,788291.9,8.147727,1.504888,2.843166,3.447415,3.446067,151.864261,149.844647,152.275418,1.158008,0.073695,0.042051,0.00375,0.004045,0.023743,0.000105
std,311468.3,7.989467,0.513617,1.71843,2.361438,2.361925,127.407756,126.981214,106.408693,1.655325,0.372108,0.271784,0.068287,0.11532,0.220619,0.02295
min,335982.0,0.0,1.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,501260.0,2.0,1.0,1.3,1.0,1.0,40.0,40.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,734023.0,5.0,2.0,2.7,3.0,3.0,124.0,121.0,156.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1136584.0,16.0,2.0,4.3,6.0,6.0,252.0,249.0,235.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1254060.0,26.0,6.0,5.9,8.0,8.0,487.0,481.0,377.0,6.0,7.0,5.0,5.0,4.0,5.0,5.0


In [None]:
# Sum the numeric columns per (match, innings) over the first-six-overs rows.
# numeric_only=True keeps the string columns (season, start_date) out of the
# aggregation instead of trying to "sum" them.
matches_sum = test.groupby(['match_id', 'innings']).sum(numeric_only=True)

# Per-innings total of runs scored off the bat
# NOTE(review): extras are not included in this total - confirm that the
# target is meant to be bat runs only.
total_runs = matches_sum[['runs_off_bat']]

# Attach the innings total to every delivery of that innings. Both frames have
# a runs_off_bat column, so the merge suffixes them: runs_off_bat_x is the
# per-ball value, runs_off_bat_y the innings total, which is renamed to
# runs_6_overs.
df = (
    pd.merge(test, total_runs, on=['match_id', 'innings'])
    .rename(columns={'runs_off_bat_y': 'runs_6_overs'})
)
df.head()

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,bowler,runs_off_bat_x,extras,wides,noballs,byes,legbyes,penalty,runs_6_overs
0,335982,2007/08,2008-04-18,0,1,0.1,0,0,0,0,0,0,1,0.0,0.0,0.0,1.0,0.0,51
1,335982,2007/08,2008-04-18,0,1,0.2,0,0,1,1,0,0,0,0.0,0.0,0.0,0.0,0.0,51
2,335982,2007/08,2008-04-18,0,1,0.3,0,0,1,1,0,0,1,1.0,0.0,0.0,0.0,0.0,51
3,335982,2007/08,2008-04-18,0,1,0.4,0,0,1,1,0,0,0,0.0,0.0,0.0,0.0,0.0,51
4,335982,2007/08,2008-04-18,0,1,0.5,0,0,1,1,0,0,0,0.0,0.0,0.0,0.0,0.0,51


In [None]:
# final_data holds all the preprocessed and newly added data.
# DataFrame.to_csv returns None when it is given a path, so the frame itself is
# bound to final_data first and then written out for Spark to read.
# The index is still written; Spark reads it back as the `_c0` column.
final_data = df
final_data.to_csv('all_data.csv')

# Step 4: Import models from Apache Spark MLlib

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

In [None]:
# Load the preprocessed CSV into a Spark DataFrame: the first row supplies the
# column names and the column types are inferred from the data.
csv_reader = spark.read.options(header=True, inferSchema=True)
dataset = csv_reader.csv('all_data.csv')

# Display data schema
dataset.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- match_id: integer (nullable = true)
 |-- season: string (nullable = true)
 |-- start_date: string (nullable = true)
 |-- venue: integer (nullable = true)
 |-- innings: integer (nullable = true)
 |-- ball: double (nullable = true)
 |-- batting_team: integer (nullable = true)
 |-- bowling_team: integer (nullable = true)
 |-- striker: integer (nullable = true)
 |-- non_striker: integer (nullable = true)
 |-- bowler: integer (nullable = true)
 |-- runs_off_bat_x: integer (nullable = true)
 |-- extras: integer (nullable = true)
 |-- wides: double (nullable = true)
 |-- noballs: double (nullable = true)
 |-- byes: double (nullable = true)
 |-- legbyes: double (nullable = true)
 |-- penalty: double (nullable = true)
 |-- runs_6_overs: integer (nullable = true)



# Step 5: Feature extraction

In [None]:
# Columns used as model inputs; VectorAssembler packs them, in this order,
# into the single vector column 'Attributes' that Spark ML estimators consume.
feature_columns = [
    'venue', 'innings', 'batting_team', 'bowling_team',
    'striker', 'non_striker', 'bowler', 'runs_off_bat_x',
]
assembler = VectorAssembler(outputCol='Attributes', inputCols=feature_columns)

output = assembler.transform(dataset)

# Keep only the feature vector (input) and the target (output)
finalized_data = output.select("Attributes", "runs_6_overs")

finalized_data.show()

+--------------------+------------+
|          Attributes|runs_6_overs|
+--------------------+------------+
|       (8,[1],[1.0])|          51|
|(8,[1,4,5],[1.0,1...|          51|
|(8,[1,4,5],[1.0,1...|          51|
|(8,[1,4,5],[1.0,1...|          51|
|(8,[1,4,5],[1.0,1...|          51|
|(8,[1,4,5],[1.0,1...|          51|
|(8,[1,4,5],[1.0,1...|          51|
|(8,[1,4,5,6],[1.0...|          51|
|[0.0,1.0,0.0,0.0,...|          51|
|[0.0,1.0,0.0,0.0,...|          51|
|[0.0,1.0,0.0,0.0,...|          51|
|[0.0,1.0,0.0,0.0,...|          51|
|(8,[1,4,5,6],[1.0...|          51|
|       (8,[1],[1.0])|          51|
|       (8,[1],[1.0])|          51|
|       (8,[1],[1.0])|          51|
|(8,[1,4,5,7],[1.0...|          51|
|(8,[1,4,5,7],[1.0...|          51|
|       (8,[1],[1.0])|          51|
|(8,[1,4,5,6],[1.0...|          51|
+--------------------+------------+
only showing top 20 rows



# Step 6: Model training

*  Linear Regression
*  Logistic Regression
*  Decision Tree Regressor
*  Random Forest Regression





**Linear Regression**

In [None]:
# Split into 75% training / 25% testing data.
# A fixed seed makes the split - and therefore every metric reported below -
# repeatable across runs.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

In [None]:
# Configure the estimator and fit it on the training set in one step;
# `regressor` ends up holding the fitted model.
regressor = LinearRegression(
    labelCol='runs_6_overs',
    featuresCol='Attributes',
).fit(train_data)

# Evaluate on the held-out set; the returned summary carries the predictions
pred = regressor.evaluate(test_data)

# Show label next to predicted value
test_predictions = pred.predictions
test_predictions.show()

+--------------------+------------+------------------+
|          Attributes|runs_6_overs|        prediction|
+--------------------+------------+------------------+
|(8,[0,1,3,6],[6.0...|          41| 40.52371093795165|
|(8,[0,1,3,6],[6.0...|          41| 40.53580397991817|
|(8,[0,1,3,6],[11....|          55| 40.17634236548799|
|(8,[0,1,4,5],[6.0...|          39|39.553644787540115|
|(8,[0,1,4,5],[10....|          39|  39.5395304506818|
|(8,[0,1,4,5],[10....|          39|  39.5395304506818|
|(8,[0,1,4,5],[10....|          39|  39.5395304506818|
|(8,[0,1,4,6],[6.0...|          43|40.024064119713756|
|(8,[0,1,4,6],[11....|          38|  38.8630392395166|
|(8,[0,1,5],[6.0,1...|          39|39.555710627790006|
|(8,[0,1,5,6],[6.0...|          39| 39.56780366975653|
|       (8,[1],[1.0])|          51| 40.41262470819753|
|(8,[1,2,4,5],[1.0...|          33|40.542307647401444|
|(8,[1,2,4,5],[1.0...|          44| 41.34336868774712|
|(8,[1,2,4,5],[1.0...|          44| 41.26201850148984|
|(8,[1,2,4

In [None]:
# Fitted weights of the linear model, one per entry of the Attributes vector
coeff = regressor.coefficients

# Constant term of the fitted model
intr = regressor.intercept

print("The coefficient of the model is : {!a}".format(coeff))
print("The Intercept of the model is : {:f}".format(intr))

The coefficient of the model is : DenseVector([-0.1446, 0.727, 0.018, -0.218, 0.0017, 0.0037, 0.0121, 1.3282])
The Intercept of the model is : 39.685661


In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
# NOTE: `eval` shadows the Python builtin of the same name; the name is kept
# because cells further down follow the same convention.
eval = RegressionEvaluator(
    metricName="rmse",
    labelCol="runs_6_overs",
    predictionCol="prediction",
)
linear_predictions = pred.predictions

# Root mean square error (the evaluator's configured metric),
# mean absolute error, and r2 - the coefficient of determination
rmse = eval.evaluate(linear_predictions)
mae = eval.evaluate(linear_predictions, {eval.metricName: "mae"})
r2 = eval.evaluate(linear_predictions, {eval.metricName: "r2"})

print("RMSE: %.3f" % rmse)
print("MAE: %.3f" % mae)
print("r2: %.3f" % r2)

RMSE: 11.405
MAE: 8.987
r2: 0.058


**Logistic Regression**

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorIndexer

In [None]:
# Logistic regression is a classifier: here runs_6_overs is used as the class
# label, with elastic-net regularisation.
# NOTE(review): the coefficient matrix printed further down is entirely zero
# and every displayed test row gets the same prediction, so this
# regularisation setting looks too strong for these features - confirm and
# retune.
lr = LogisticRegression(
    featuresCol='Attributes',
    labelCol='runs_6_overs',
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)

# Fit the model
lrModel = lr.fit(train_data)

In [None]:
# Score the held-out set and show inputs, true label and predicted class
lr_summary = lrModel.evaluate(test_data)
lrResults = lr_summary.predictions

shown_columns = ['Attributes', 'runs_6_overs', 'prediction']
lrResults.select(*shown_columns).show()

+--------------------+------------+----------+
|          Attributes|runs_6_overs|prediction|
+--------------------+------------+----------+
|(8,[0,1,3,6],[6.0...|          41|      46.0|
|(8,[0,1,3,6],[6.0...|          41|      46.0|
|(8,[0,1,3,6],[11....|          55|      46.0|
|(8,[0,1,4,5],[6.0...|          39|      46.0|
|(8,[0,1,4,5],[10....|          39|      46.0|
|(8,[0,1,4,5],[10....|          39|      46.0|
|(8,[0,1,4,5],[10....|          39|      46.0|
|(8,[0,1,4,6],[6.0...|          43|      46.0|
|(8,[0,1,4,6],[11....|          38|      46.0|
|(8,[0,1,5],[6.0,1...|          39|      46.0|
|(8,[0,1,5,6],[6.0...|          39|      46.0|
|       (8,[1],[1.0])|          51|      46.0|
|(8,[1,2,4,5],[1.0...|          33|      46.0|
|(8,[1,2,4,5],[1.0...|          44|      46.0|
|(8,[1,2,4,5],[1.0...|          44|      46.0|
|(8,[1,2,4,5],[1.0...|          63|      46.0|
|(8,[1,2,4,5],[2.0...|          25|      46.0|
|(8,[1,2,4,5],[2.0...|          25|      46.0|
|(8,[1,2,4,5]

In [None]:
# Fitted parameters of the logistic-regression model: the coefficient matrix
# and the intercept vector
coeff = lrModel.coefficientMatrix
intr = lrModel.interceptVector

print("The coefficient of LR model is : {!a}".format(coeff))
print("The Intercept of LR model is : {!a}".format(intr))

The coefficient of LR model is : SparseMatrix(104, 8, [0, 0, 0, 0, 0, 0, 0, 0, 0], [], [], 0)
The Intercept of LR model is : DenseVector([-4.1116, -2.7235, -1.6688, -4.1116, -4.1116, -1.4265, -2.3169, -2.3169, -0.4759, -1.6688, -1.3571, -1.4265, -0.7527, -2.1622, 0.0855, 0.7033, -0.6091, -0.6431, 0.8952, 0.617, 1.5362, 1.7589, 1.4176, 1.8461, 1.8682, 1.7723, 2.2761, 1.805, 2.2986, 2.4824, 2.3916, 2.8361, 2.5561, 3.0488, 2.8897, 3.0237, 2.6773, 2.8213, 3.282, 3.1117, 3.2482, 3.1393, 3.18, 2.9469, 3.176, 2.9398, 3.3637, 2.6892, 3.16, 2.7511, 3.0209, 2.9479, 2.8167, 2.7838, 2.7276, 2.6639, 2.0563, 2.3297, 2.0538, 2.2966, 2.1936, 1.7985, 2.1085, 1.948, 1.2763, 1.8399, 0.7033, 0.7912, 1.2552, -0.6783, -0.6431, -0.5762, 0.051, -0.7527, 0.5656, 0.051, -0.7148, -0.6783, -0.0225, -4.1116, -4.1116, -4.1116, -0.5443, -4.1116, -4.1116, -0.5134, -0.5134, -4.1116, -4.1116, -4.1116, -4.1116, -4.1116, -4.1116, -4.1116, -0.6091, -4.1116, -4.1116, -4.1116, -4.1116, -4.1116, -4.1116, -4.1116, -4.1116, -0

In [None]:
# Score the logistic-regression predictions with the same regression metrics
# as the other models.
eval = RegressionEvaluator(
    metricName="rmse",
    labelCol="runs_6_overs",
    predictionCol="prediction",
)

# rmse: root mean square error, mae: mean absolute error,
# r2: coefficient of determination
rmse, mae, r2 = (
    eval.evaluate(lrResults, {eval.metricName: metric})
    for metric in ("rmse", "mae", "r2")
)

print("RMSE: %.3f" % rmse)
print("MAE: %.3f" % mae)
print("r2: %.3f" % r2)

RMSE: 12.097
MAE: 9.618
r2: -0.060


**Decision Tree Regressor**

In [None]:
from pyspark.ml.regression import DecisionTreeRegressor

In [None]:
# Configure a regression tree on the assembled feature vector
tree_columns = {'featuresCol': 'Attributes', 'labelCol': 'runs_6_overs'}
dt = DecisionTreeRegressor(**tree_columns)

# Fit on the training split
modelDT = dt.fit(train_data)

# Predict on the held-out split and show the result
predictions = modelDT.transform(test_data)
predictions.show()

+--------------------+------------+-----------------+
|          Attributes|runs_6_overs|       prediction|
+--------------------+------------+-----------------+
|(8,[0,1,3,6],[6.0...|          41|42.12814070351759|
|(8,[0,1,3,6],[6.0...|          41|42.12814070351759|
|(8,[0,1,3,6],[11....|          55|39.31956689868523|
|(8,[0,1,4,5],[6.0...|          39|42.12814070351759|
|(8,[0,1,4,5],[10....|          39|39.31956689868523|
|(8,[0,1,4,5],[10....|          39|39.31956689868523|
|(8,[0,1,4,5],[10....|          39|39.31956689868523|
|(8,[0,1,4,6],[6.0...|          43|42.12814070351759|
|(8,[0,1,4,6],[11....|          38|39.31956689868523|
|(8,[0,1,5],[6.0,1...|          39|42.12814070351759|
|(8,[0,1,5,6],[6.0...|          39|42.12814070351759|
|       (8,[1],[1.0])|          51|42.12814070351759|
|(8,[1,2,4,5],[1.0...|          33|42.12814070351759|
|(8,[1,2,4,5],[1.0...|          44|42.12814070351759|
|(8,[1,2,4,5],[1.0...|          44|42.12814070351759|
|(8,[1,2,4,5],[1.0...|      

In [None]:
# Regression metrics for the decision-tree predictions
eval = RegressionEvaluator(
    metricName="rmse",
    labelCol="runs_6_overs",
    predictionCol="prediction",
)

# Root mean square error (the evaluator's configured metric)
rmse = eval.evaluate(predictions)
# Mean absolute error and r2 are requested by overriding metricName per call
mae = eval.evaluate(predictions, {eval.metricName: "mae"})
r2 = eval.evaluate(predictions, {eval.metricName: "r2"})

for line in ("RMSE: %.3f" % rmse, "MAE: %.3f" % mae, "r2: %.3f" % r2):
    print(line)

RMSE: 11.270
MAE: 8.920
r2: 0.080


In [None]:
# Print the fitted tree as nested if/else rules. "feature i" in the dump is the
# i-th entry (0-based) of the Attributes vector.
tree_rules = modelDT.toDebugString
print(tree_rules)

DecisionTreeRegressionModel: uid=DecisionTreeRegressor_4dc407cf624b, depth=5, numNodes=57, numFeatures=8
  If (feature 7 <= 2.5)
   If (feature 1 <= 2.5)
    If (feature 7 <= 0.5)
     If (feature 5 <= 298.5)
      If (feature 4 <= 299.5)
       Predict: 40.372089437162685
      Else (feature 4 > 299.5)
       Predict: 44.00387784779448
     Else (feature 5 > 298.5)
      If (feature 0 <= 18.5)
       Predict: 45.239763196842624
      Else (feature 0 > 18.5)
       Predict: 41.61683848797251
    Else (feature 7 > 0.5)
     If (feature 4 <= 299.5)
      If (feature 5 <= 298.5)
       Predict: 42.76272361669671
      Else (feature 5 > 298.5)
       Predict: 46.00153609831029
     Else (feature 4 > 299.5)
      If (feature 0 <= 18.5)
       Predict: 47.54980694980695
      Else (feature 0 > 18.5)
       Predict: 43.955012853470436
   Else (feature 1 > 2.5)
    If (feature 6 <= 139.5)
     If (feature 6 <= 28.5)
      Predict: 10.0
     Else (feature 6 > 28.5)
      If (feature 2 <= 1.5)
 

**Random Forest Regression**

In [None]:
from pyspark.ml.regression import RandomForestRegressor

In [None]:
# Random forest on the assembled feature vector.
# A fixed seed makes the fitted model, and the metrics computed from it,
# repeatable from run to run.
rf = RandomForestRegressor(featuresCol='Attributes', labelCol='runs_6_overs', seed=42)
modelRf = rf.fit(train_data)

# Predict on the held-out split and show the result
rfResults = modelRf.transform(test_data)
rfResults.show()

+--------------------+------------+------------------+
|          Attributes|runs_6_overs|        prediction|
+--------------------+------------+------------------+
|(8,[0,1,3,6],[6.0...|          41|41.235087035447336|
|(8,[0,1,3,6],[6.0...|          41|41.235087035447336|
|(8,[0,1,3,6],[11....|          55| 39.92702479583312|
|(8,[0,1,4,5],[6.0...|          39|  41.0738844804947|
|(8,[0,1,4,5],[10....|          39|39.667177503169924|
|(8,[0,1,4,5],[10....|          39|39.667177503169924|
|(8,[0,1,4,5],[10....|          39|39.667177503169924|
|(8,[0,1,4,6],[6.0...|          43|  41.0738844804947|
|(8,[0,1,4,6],[11....|          38| 39.76135843159318|
|(8,[0,1,5],[6.0,1...|          39|  41.0738844804947|
|(8,[0,1,5,6],[6.0...|          39|  41.0738844804947|
|       (8,[1],[1.0])|          51|42.022935911910075|
|(8,[1,2,4,5],[1.0...|          33| 41.93032606693343|
|(8,[1,2,4,5],[1.0...|          44|41.836145138510176|
|(8,[1,2,4,5],[1.0...|          44| 41.64747053469232|
|(8,[1,2,4

In [None]:
# Regression metrics for the random-forest predictions
eval = RegressionEvaluator(
    metricName="rmse",
    labelCol="runs_6_overs",
    predictionCol="prediction",
)

# Each metric is requested by overriding metricName for that call:
# rmse (root mean square error), mae (mean absolute error),
# r2 (coefficient of determination)
forest_scores = {
    metric: eval.evaluate(rfResults, {eval.metricName: metric})
    for metric in ("rmse", "mae", "r2")
}
rmse = forest_scores["rmse"]
mae = forest_scores["mae"]
r2 = forest_scores["r2"]

print("RMSE: %.3f" % rmse)
print("MAE: %.3f" % mae)
print("r2: %.3f" % r2)

RMSE: 11.109
MAE: 8.818
r2: 0.106
