In [None]:
#INSY 5376 Big Data Analytics - Project - IPL Player Performance Analysis
#Team Members :
# Amuluru, Sriram Sai
# Grandhi, Anish
# Potukuchi, Sameer Kumar
# Thanikonda, Pruthvi Sai Kumar

#Import the required packages.
from pyspark.ml.regression import LinearRegression,GeneralizedLinearRegression  
from pyspark.ml.feature import VectorAssembler  
from pyspark.ml.feature import StandardScaler, Normalizer  
from pyspark.ml import Pipeline  
from pyspark.sql.functions import *
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

#Initialize the spark conf, spark context and sql context.
#SparkContext.getOrCreate reuses a context that is already running in this kernel, so
#re-executing this cell no longer fails with "Cannot run multiple SparkContexts at once".
#NOTE: when a context already exists it is returned as-is and `conf` is not re-applied.
conf = SparkConf().setMaster('local[*]').setAppName('IPL Average Prediction')
sc = SparkContext.getOrCreate(conf = conf)
sqlContext = SQLContext(sc)

In [None]:
#Player toggle: 'batsman' runs the batsmen workflow, any other value runs the bowlers workflow.
player = 'batsman'

#Load the batsmen and bowlers training and test CSV files into Spark SQL dataframes.
#Every file is read with the same options: header row, inferred column types, comma delimiter.
def _readCsv(path):
    """Return the CSV file at `path` as a Spark SQL dataframe."""
    return sqlContext.read.options(header='true', inferSchema='true', delimiter = ',').csv(path)

batsmenTrainingData = _readCsv('batsmen_training_data.csv')
batsmenTestData = _readCsv('batsmen_test_data.csv')
bowlersTrainingData = _readCsv('bowlers_training_data.csv')
bowlersTestData = _readCsv('bowlers_test_data.csv')

In [None]:
#Pick the label, the feature columns and the vector assembler for the selected player type:
#batting average is the label for batsmen, bowling strike rate is the label for bowlers.
#The label column is renamed to 'label' and the assembler writes to 'unscaled_features'.
if player == 'batsman':
    features = [
        "balls", "batsmanRuns", "fours", "sixes", "strikeRate", "highestScore",
        "num_of_innings", "num_not_outs", "batsman_min_seasons", "num_fifties", "num_hundreds",
    ]
    battingLabel = col("battingAverage").alias('label')
    lr_train_data = batsmenTrainingData.select(battingLabel, *features)
    lr_test_data = batsmenTestData.select(battingLabel, *features)
    vectorAssembler = VectorAssembler(inputCols=features, outputCol="unscaled_features")
else:
    bowlingFeatures = [
        "totalBalls", "totalWickets", "bowlerEconomy", "bowlingAverage", "bestBowlingWickets",
        "dot_balls", "num_wides", "num_noballs", "bowler_min_seasons",
    ]
    bowlingLabel = col("bowlingStrikeRate").alias('label')
    lr_bowling_train_data = bowlersTrainingData.select(bowlingLabel, *bowlingFeatures)
    lr_bowling_test_data = bowlersTestData.select(bowlingLabel, *bowlingFeatures)
    vectorAssembler = VectorAssembler(inputCols=bowlingFeatures, outputCol="unscaled_features")

#Build the modelling pipeline. The stages run in this order:
#  vectorAssembler -> standardScaler -> normalizer (L2 norm, p=2.0) -> linear regression.
#Each stage reads the column written by the previous one; the normalizer writes 'features'.
standardScaler = StandardScaler(
    inputCol="unscaled_features",
    outputCol="scaled_features",
)
normalizer = Normalizer(
    inputCol="scaled_features",
    outputCol="features",
    p=2.0,
)
lr = LinearRegression()
stages = [vectorAssembler, standardScaler, normalizer, lr]
pipeline = Pipeline(stages=stages)

In [None]:
#Define the param grid for the three regularized linear models, all with regParam=0.01 and maxIter=100:
#  elasticNetParam 0.5 -> elastic net, 1.0 -> lasso (L1), 0.0 -> ridge (L2).
#ParamGridBuilder builds the cartesian product of the value lists, so every distinct value is listed
#only once. Repeating a value (e.g. [0.01, 0.01, 0.01]) produced 27 param maps, i.e. 9 identical
#copies of each of the 3 models, and multiplied the cross validation work by 9.
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01])
             .addGrid(lr.elasticNetParam, [0.5, 1.0, 0.0])
             .addGrid(lr.maxIter, [100])
             .build())

In [None]:
#Import the regression evaluator and configure 10-fold cross validation over the pipeline and param grid.
from pyspark.ml.evaluation import RegressionEvaluator

#The evaluator scores each param map by RMSE (spelled out here; it is also the evaluator's default).
#A fixed seed makes the random fold assignment, and therefore the selected model, repeatable across runs.
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=paramGrid,
                    evaluator=RegressionEvaluator(metricName="rmse"),
                    numFolds=10,
                    seed=42)

In [None]:
#Train the cross-validated model on the training set that matches the selected player type.
#Only the chosen side of the conditional expression is evaluated, so the other data set need not exist.
trainingSet = lr_train_data if player == 'batsman' else lr_bowling_train_data
cvModel = cv.fit(trainingSet)

In [None]:
#Apply the fitted cross-validation model to the test set that matches the selected player type.
#Only the chosen side of the conditional expression is evaluated, so the other data set need not exist.
testSet = lr_test_data if player == 'batsman' else lr_bowling_test_data
predictions = cvModel.transform(testSet)

In [None]:
#Display the predicted values (only the 'prediction' column of the scored test data).
predictedValues = predictions.select('prediction')
predictedValues.show()

In [None]:
#Score the test-set predictions of the best model with a regression evaluator.
#The metric is spelled out as rmse, which is also the evaluator's default metric.
evaluator = RegressionEvaluator(metricName="rmse")
evaluator.evaluate(predictions)