In [None]:
#INSY 5376 Big Data Analytics - Project - IPL Player Performance Analysis
#Team Members :
# Amuluru, Sriram Sai
# Grandhi, Anish
# Potukuchi, Sameer Kumar
# Thanikonda, Pruthvi Sai Kumar

#Import all the required packages
from pyspark.ml.regression import LinearRegression,GeneralizedLinearRegression  
from pyspark.ml.feature import VectorAssembler  
from pyspark.ml.feature import StandardScaler, Normalizer  
from pyspark.ml import Pipeline  
from pyspark.sql.functions import *
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.ml.regression import DecisionTreeRegressor

#Build the Spark configuration (local master, app name) step by step, then
#create the SparkContext and the SQLContext that the later cells read data through.
conf = SparkConf()
conf.setMaster('local[*]')
conf.setAppName('IPL Average Prediction')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

In [None]:
#Toggle that selects which data set the modelling cells work on:
#'batsman' runs the batting branch, any other value runs the bowling branch.
player = 'batsman'

#All four CSV files are read with the same options: first row is a header,
#column types are inferred, fields are comma separated.
csvOptions = {'header': 'true', 'inferSchema': 'true', 'delimiter': ','}


def _readCsv(path):
    """Read one CSV file into a Spark SQL DataFrame using the shared options."""
    return sqlContext.read.options(**csvOptions).csv(path)


#Batsmen and bowler training and test data as Spark SQL dataframes.
batsmenTrainingData = _readCsv('batsmen_training_data.csv')
batsmenTestData = _readCsv('batsmen_test_data.csv')
bowlersTrainingData = _readCsv('bowlers_training_data.csv')
bowlersTestData = _readCsv('bowlers_test_data.csv')

In [None]:
#Pick the feature columns and the label column for the selected player type.
#The label is the batting average for batsmen and the bowling strike rate for bowlers;
#in both cases it is exposed to the models under the column name 'label'.
if player != 'batsman':
    bowlingFeatures = [
        "totalBalls", "totalWickets", "bowlerEconomy", "bowlingAverage",
        "bestBowlingWickets", "dot_balls", "num_wides", "num_noballs",
        "bowler_min_seasons",
    ]
    bowlingLabel = col("bowlingStrikeRate").alias('label')
    lr_bowling_train_data = bowlersTrainingData.select(bowlingLabel, *bowlingFeatures)
    lr_bowling_test_data = bowlersTestData.select(bowlingLabel, *bowlingFeatures)
else:
    features = [
        "balls", "batsmanRuns", "fours", "sixes", "strikeRate", "highestScore",
        "num_of_innings", "num_not_outs", "batsman_min_seasons",
        "num_fifties", "num_hundreds",
    ]
    battingLabel = col("battingAverage").alias('label')
    lr_train_data = batsmenTrainingData.select(battingLabel, *features)
    lr_test_data = batsmenTestData.select(battingLabel, *features)

In [None]:
#Build three pipelines that share the same preprocessing and differ only in the regressor.
#Preprocessing: assemble the feature columns into one vector, standard-scale it,
#then L2-normalize each row into the 'features' column the regressors read by default.
assemblerInputs = features if player == 'batsman' else bowlingFeatures
vectorAssembler = VectorAssembler(inputCols=assemblerInputs, outputCol="unscaled_features")
standardScaler = StandardScaler(inputCol="unscaled_features", outputCol="scaled_features")
normalizer = Normalizer(inputCol="scaled_features", outputCol="features", p=2.0)

#The regressors differ only in elasticNetParam:
#0.5 mixes the L1 and L2 penalties, 1.0 is pure L1 (lasso), 0.0 is pure L2 (ridge).
lr = LinearRegression(maxIter=100, regParam=.01, elasticNetParam=0.5)
lasso = LinearRegression(maxIter=100, regParam=.01, elasticNetParam=1.0)
ridge = LinearRegression(maxIter=100, regParam=.01, elasticNetParam=0.0)

#Each stage list is the shared preprocessing followed by one regressor.
preprocessingStages = [vectorAssembler, standardScaler, normalizer]
stages = preprocessingStages + [lr]
stages1 = preprocessingStages + [lasso]
stages2 = preprocessingStages + [ridge]

pipeline = Pipeline(stages=stages)
pipeline1 = Pipeline(stages=stages1)
pipeline2 = Pipeline(stages=stages2)

In [None]:
#Fit the three pipelines on the training frame that matches the selected player type.
trainingFrame = lr_train_data if player == 'batsman' else lr_bowling_train_data
model = pipeline.fit(trainingFrame)
lassoModel = pipeline1.fit(trainingFrame)
ridgeModel = pipeline2.fit(trainingFrame)

In [None]:
#Run each fitted pipeline on the test frame that matches the selected player type.
testFrame = lr_test_data if player == 'batsman' else lr_bowling_test_data
prediction = model.transform(testFrame)
lassoPrediction = lassoModel.transform(testFrame)
ridgePrediction = ridgeModel.transform(testFrame)

In [None]:
#Score the three prediction frames with RMSE, MSE, MAE and R-squared and print the results.
from pyspark.ml.evaluation import RegressionEvaluator
#NOTE(review): the name `eval` shadows Python's built-in eval() in the notebook namespace.
eval = RegressionEvaluator(metricName="rmse", labelCol="label", predictionCol="prediction")


def _scoreAllModels(metric):
    """Return the (linear, lasso, ridge) scores for the given evaluator metric name."""
    return tuple(
        eval.evaluate(frame, {eval.metricName: metric})
        for frame in (prediction, lassoPrediction, ridgePrediction)
    )


# Root Mean Square Error
rmse, lassoRmse, ridgeRmse = _scoreAllModels("rmse")
print("Linear Regression RMSE: %.3f" % rmse)
print("Lasso Regression RMSE: %.3f" % lassoRmse)
print("Ridge Regression RMSE: %.3f" % ridgeRmse)

# Mean Square Error
mse, lassoMse, ridgeMse = _scoreAllModels("mse")
print("Linear Regression MSE: %.3f" % mse)
print("Lasso Regression MSE: %.3f" % lassoMse)
print("Ridge Regression MSE: %.3f" % ridgeMse)

# Mean Absolute Error
mae, lassoMae, ridgeMae = _scoreAllModels("mae")
print("Linear Regression MAE: %.3f" % mae)
print("Lasso Regression MAE: %.3f" % lassoMae)
print("Ridge Regression MAE: %.3f" % ridgeMae)

# r2 - coefficient of determination
r2, lassoR2, ridgeR2 = _scoreAllModels("r2")
print("Linear Regression R-Squared: %.3f" % r2)
print("Lasso Regression R-Squared: %.3f" % lassoR2)
print("Ridge Regression R-Squared: %.3f" % ridgeR2)

In [None]:

#Write the batsmen test data together with each model's prediction to one CSV per model.
#
#Only runs when player == 'batsman': model, lassoModel and ridgeModel are fitted on
#whichever data set `player` selects, so for any other value they are bowling models
#and their output must not be attached to batsmen rows.
#
#The fitted pipelines are applied to the full test dataframe, so every original column
#(not just label + features) stays on the same row as its prediction. This avoids
#joining two dataframes on independently generated monotonically_increasing_id()
#values: those ids are unique and increasing but depend on partitioning, so they are
#not guaranteed to line up between dataframes, least of all after a join has already
#re-partitioned one side.
#
#NOTE(review): batsmenTestData itself is intentionally left unmodified, so it no
#longer carries a 'prediction' column into later cells - confirm nothing relies on it.
if player == 'batsman':
    #Vector columns added by the pipeline's preprocessing stages; not wanted in the CSV.
    pipelineColumns = ['unscaled_features', 'scaled_features', 'features']
    batsmenOutputs = [
        (model, 'batsmen_predicted_LR.csv'),
        (lassoModel, 'batsmen_predicted_Lasso.csv'),
        (ridgeModel, 'batsmen_predicted_Ridge.csv'),
    ]
    for fittedModel, outputFile in batsmenOutputs:
        #transform() appends a 'prediction' column after the original test columns.
        batsmenPredicted = fittedModel.transform(batsmenTestData).drop(*pipelineColumns)
        batsmenTestDataPandas = batsmenPredicted.toPandas()
        batsmenTestDataPandas.to_csv(outputFile)

In [None]:
#Write the bowlers test data together with each model's prediction to one CSV per model.
#
#Only runs when player is not 'batsman': model, lassoModel and ridgeModel are fitted on
#whichever data set `player` selects, so with player == 'batsman' they are batting
#models and their output must not be attached to bowler rows.
#
#The fitted pipelines are applied to the full test dataframe, so every original column
#(not just label + features) stays on the same row as its prediction. This avoids
#joining two dataframes on independently generated monotonically_increasing_id()
#values: those ids are unique and increasing but depend on partitioning, so they are
#not guaranteed to line up between dataframes, least of all after a join has already
#re-partitioned one side.
#
#NOTE(review): bowlersTestData itself is intentionally left unmodified, so it no
#longer carries a 'prediction' column into later cells - confirm nothing relies on it.
if player != 'batsman':
    #Vector columns added by the pipeline's preprocessing stages; not wanted in the CSV.
    pipelineColumns = ['unscaled_features', 'scaled_features', 'features']
    bowlersOutputs = [
        (model, 'bowlers_predicted_LR.csv'),
        (lassoModel, 'bowlers_predicted_Lasso.csv'),
        (ridgeModel, 'bowlers_predicted_Ridge.csv'),
    ]
    for fittedModel, outputFile in bowlersOutputs:
        #transform() appends a 'prediction' column after the original test columns.
        bowlersPredicted = fittedModel.transform(bowlersTestData).drop(*pipelineColumns)
        bowlersTestDataPandas = bowlersPredicted.toPandas()
        bowlersTestDataPandas.to_csv(outputFile)

In [None]:
#Prepare the batsmen training dataframe for the correlation heat map by removing
#the label column (battingAverage) and the 'batsman' column.
#NOTE(review): 'batsman' is presumably the non-numeric player identifier - confirm.
batsmenTrainingColumnsToDrop = ['battingAverage', 'batsman']
batsmenTrainingData = batsmenTrainingData.drop(*batsmenTrainingColumnsToDrop)


In [None]:
#Convert the Spark dataframe to a pandas DataFrame so that pandas' corr() can be used on it.
#toPandas() collects the whole dataframe to the driver.
batsmenTrainingDataPandas = batsmenTrainingData.toPandas()

In [None]:
#Compute the pairwise column correlations of the batsmen training data and
#draw them as a seaborn heat map, labelling both axes with the column names.
import seaborn as sns
import matplotlib.pyplot as plt

corr = batsmenTrainingDataPandas.corr()
axisLabels = corr.columns.values
#NOTE(review): the colour scale is fixed to [0, 1], so any negative correlation is
#drawn with the same colour as 0 - confirm this is intended.
sns.heatmap(corr, vmin=0, vmax=1, xticklabels=axisLabels, yticklabels=axisLabels)
plt.title("Batsmen Training Data Correlation Heatmap \n", fontdict={'fontsize': 14})
plt.show()

In [None]:
#Repeat the same preparation for the batsmen test data: remove the label column
#(battingAverage) and the 'batsman' column before computing correlations.
batsmenTestColumnsToDrop = ['battingAverage', 'batsman']
batsmenTestData = batsmenTestData.drop(*batsmenTestColumnsToDrop)

In [None]:
#Convert the prepared batsmen test dataframe to pandas for the correlation computation.
batsmenTestDataPandas = batsmenTestData.toPandas()

In [None]:
#Correlation heat map for the batsmen test data, labelled with the column names.
corr = batsmenTestDataPandas.corr()
axisLabels = corr.columns.values
#NOTE(review): the colour scale is fixed to [0, 1], so any negative correlation is
#drawn with the same colour as 0 - confirm this is intended.
sns.heatmap(corr, vmin=0, vmax=1, xticklabels=axisLabels, yticklabels=axisLabels)
plt.title("Batsmen Test Data Correlation Heatmap \n", fontdict={'fontsize': 14})
plt.show()

In [None]:
#Prepare the bowler training and test dataframes for the correlation heat maps by
#removing the 'bowlingAverage' and 'bowler' columns from both.
#NOTE(review): the bowling label used for modelling is bowlingStrikeRate, whereas the
#column dropped here is bowlingAverage - confirm which one was meant to be excluded.
bowlerColumnsToDrop = ['bowlingAverage', 'bowler']
bowlersTrainingData = bowlersTrainingData.drop(*bowlerColumnsToDrop)
bowlersTestData = bowlersTestData.drop(*bowlerColumnsToDrop)


In [None]:
#Convert the prepared bowlers training and test dataframes to pandas so that
#pandas' corr() can be used on them. toPandas() collects each dataframe to the driver.
bowlersTrainingDataPandas = bowlersTrainingData.toPandas()
bowlersTestDataPandas = bowlersTestData.toPandas()

In [None]:
#Correlation heat map for the bowlers training data, labelled with the column names.
corr = bowlersTrainingDataPandas.corr()
axisLabels = corr.columns.values
#NOTE(review): the colour scale is fixed to [0, 1], so any negative correlation is
#drawn with the same colour as 0 - confirm this is intended.
sns.heatmap(corr, vmin=0, vmax=1, xticklabels=axisLabels, yticklabels=axisLabels)
plt.title("Bowlers Training Data Correlation Heatmap \n", fontdict={'fontsize': 14})
plt.show()

In [None]:
#Correlation heat map for the bowlers test data, labelled with the column names.
corr = bowlersTestDataPandas.corr()
axisLabels = corr.columns.values
#NOTE(review): the colour scale is fixed to [0, 1], so any negative correlation is
#drawn with the same colour as 0 - confirm this is intended.
sns.heatmap(corr, vmin=0, vmax=1, xticklabels=axisLabels, yticklabels=axisLabels)
plt.title("Bowlers Test Data Correlation Heatmap \n", fontdict={'fontsize': 14})
plt.show()