#IST718 Project - Google and NCAA Women's Basketball Tournament Prediction

#Datasets for Visualization and Interaction

@authors
Sanjana Rajagopala,
Shefali Vajramatti,
Apoorva Rajendra Angre,
Sandya Madhavan

In [2]:
#IMPORT ALL THE REQUIRED PACKAGES
import pandas as pd
from pyspark.ml import feature
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.regression import LinearRegression
from pyspark.ml import classification
from pyspark.ml import Pipeline
from pyspark.sql import SQLContext
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql import functions as fn
from pyspark.sql.types import IntegerType
import numpy as np
from pyspark.mllib.evaluation import BinaryClassificationMetrics as metric
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# package that makes it easy to build pipelines
from pyspark_pipes import pipe

In [3]:
#Load the competition CSV files that were uploaded to the FileStore (DBFS) of the Databricks environment
#Every file has a header row; column types are inferred by Spark
_csv_options = dict(header=True, inferSchema=True)

#The team table is read once for the winner side and once for the loser side (renamed separately later)
wteamDF = spark.read.csv("/FileStore/tables/WTeams.csv", **_csv_options)
lteamDF = spark.read.csv("/FileStore/tables/WTeams.csv", **_csv_options)

#Regular season results
regularSeasonsDF = spark.read.csv("/FileStore/tables/WRegularSeasonCompactResults.csv", **_csv_options)

#Tournament seeds and slots
seedsDF = spark.read.csv("/FileStore/tables/WNCAATourneySeeds.csv", **_csv_options)
slotsDF = spark.read.csv("/FileStore/tables/WNCAATourneySlots.csv", **_csv_options)

#Tournament results
CompactDF = spark.read.csv("/FileStore/tables/WNCAATourneyCompactResults.csv", **_csv_options)

#Team details, kept as a Spark DataFrame for the name joins
teamsDF = spark.read.csv("/FileStore/tables/WTeams.csv", **_csv_options)

In [4]:
#Convert into Pandas DF from sql.dataframe for initial manipulation of data
wteamDF = wteamDF.toPandas()
lteamDF = lteamDF.toPandas()
regularSeasonsDF = regularSeasonsDF.toPandas()
slotsDF = slotsDF.toPandas()
seedsDF = seedsDF.toPandas()
CompactDF = CompactDF.toPandas()

#Rename the columns so the winner and the loser lookup carry distinct names
wteamDF.columns = ['WTeamID', 'WTeamName']
lteamDF.columns = ['LTeamID', 'LTeamName']

#Maintain a copy of the original data
#A plain assignment would only bind a second name to the same object, so take a real copy
NewseedsDF = seedsDF.copy()


In [5]:
#Create the DICTIONARY - (season, team id) as key and the seed string as value
seedsdict = {
  (season, team_id): seed
  for season, team_id, seed in zip(NewseedsDF["Season"], NewseedsDF["TeamID"], NewseedsDF["Seed"])
}


In [6]:
#Look up the seed of the winning and of the losing team for every tournament game
#The lists keep the row order of CompactDF, one entry per game
temp_wCol = [
  seedsdict[(season, team_id)]
  for season, team_id in zip(CompactDF['Season'], CompactDF['WTeamID'])
]
temp_lCol = [
  seedsdict[(season, team_id)]
  for season, team_id in zip(CompactDF['Season'], CompactDF['LTeamID'])
]
  

In [7]:
#Add the corresponding seed values into the dataframe
#temp_wCol / temp_lCol hold one seed string per game, in the row order of CompactDF
CompactDF['WSeed'] = temp_wCol
CompactDF['LSeed'] = temp_lCol

In [8]:
#Map every seed number (1..16) to a weight: seed 1 gets the highest weight (16), seed 16 the lowest (1)
weights_dict = {seed: 17 - seed for seed in range(1, 17)}


In [9]:
#PRE_PROCESSING THE DATAFRAME
#Walk over every tournament game once and collect the target flag plus the engineered
#columns; the lists are attached to CompactDF afterwards, so each list must receive
#exactly one value per game.

temp_win = []
#wseed_num / lseed_num are declared by this cell but nothing below fills them
wseed_num = []
lseed_num = []
diff_seed = []
loc_col = []
diff_score = []

#Highest weight when played on the home ground, least when away, medium value on a neutral site
loc_weights = {'H': 3, 'N': 2, 'A': 1}


def _seed_weight(seed):
  """Return the weight of a seed string made of a one-character prefix and a seed number."""
  #Parse only the (at most two) characters after the prefix so that a trailing letter,
  #should the data contain one, cannot break int()  (TODO confirm the seed format)
  return weights_dict[int(seed[1:3])]


for _, game in CompactDF.iterrows():

  #Maintain the win column value as 1 if the team with lower teamID has won in the match
  temp_win.append(1 if game['WTeamID'] < game['LTeamID'] else 0)

  #An unknown location code used to append nothing, which silently shifted loc_col
  #against the other lists; stop with a clear message instead
  loc_val = game['WLoc']
  if loc_val not in loc_weights:
    raise ValueError("Unexpected WLoc value: {!r}".format(loc_val))
  loc_col.append(loc_weights[loc_val])

  #Maintain the (absolute) difference between the seed weights of the teams
  diff_seed.append(abs(_seed_weight(game['WSeed']) - _seed_weight(game['LSeed'])))

  #Maintain the column with the (absolute) difference between scores of the teams
  diff_score.append(abs(game['WScore'] - game['LScore']))


In [10]:
#Add the above obtained lists as columns into the DF
#WLProb is the 0/1 flag from the pre-processing loop (1 when the winner has the lower team ID)
CompactDF['WLProb'] = temp_win
CompactDF['Seed_Diff'] = diff_seed
CompactDF['Loc'] = loc_col
CompactDF['Score_Diff'] = diff_score

In [11]:
#Check the results of the pre-processing
#Preview the first five rows
CompactDF[:5]

In [12]:
#Move the pre-processed pandas frame back into a Spark SQL DataFrame
sqlCtx = SQLContext(sc)

#The result column is renamed to "label" so that all the algorithms can be applied without any problems
sql_compactDF = sqlCtx.createDataFrame(CompactDF).withColumnRenamed("WLProb", "label")

In [13]:
#Aggregate the tournament games per season and team (input for total matches / win percentage)
#wDF: one row per (Season, winning team); WCount is the number of games that team won, while
#     "won" sums the label column, i.e. it counts only the wins in which the winner had the lower team ID
#lDF: one row per (Season, losing team); LCount is the number of games that team lost

wDF = sql_compactDF.groupBy(['Season','WTeamID']).agg(fn.sum('label').alias('won'), fn.count('Season').alias('WCount'))
lDF = sql_compactDF.groupBy(['Season','LTeamID']).agg(fn.count('Season').alias('LCount'))

In [14]:
#Inspect the per-season loss counts computed in the previous cell
#(lDF_1 is only created near the end of the notebook, so it cannot be shown at this point)
lDF.show()


In [15]:
#Give both aggregates the same key column name (teamID) and a fixed column order
wDF = wDF.select(fn.col("WTeamID").alias("teamID"), "Season", "won", "WCount")
lDF = lDF.select(fn.col("LTeamID").alias("teamID"), "Season", "LCount")

In [16]:
#Combine the win and the loss counts per team and season
#The right join keeps every (team, season) row of lDF; the columns of both sides are kept side by side
same_team = wDF.teamID == lDF.teamID
same_season = wDF.Season == lDF.Season
matchDF = wDF.join(lDF, on=same_team & same_season, how='right')

In [17]:
#A team without a single win in a season has no row in wDF, so after the right join its WCount
#(and its wDF-side teamID/Season) is null; treat that as zero wins instead of letting null propagate
wins = fn.coalesce(fn.col('WCount'), fn.lit(0))
matchDF = matchDF.withColumn("totalMatches", wins + fn.col('LCount'))
#Computing the win percentage (wins / total matches) for the individual teams
matchDF = matchDF.withColumn("winPercentage", wins / fn.col('totalMatches'))
#Create Pandas DF only for this manipulation
#Renaming and selecting required data - avoiding redundancy
match_pd_DF = matchDF.toPandas()
#Column layout: 0-3 come from wDF (teamID, Season, won, WCount), 4-6 from lDF (teamID, Season, LCount),
#7 is totalMatches and 8 is winPercentage. The keys are taken from the lDF side (4, 5) because
#those are never null after the right join, unlike the wDF-side keys (0, 1).
#NOTE: a team that has wins but no loss in a season is not part of the right join result at all.
match_pd_DF = match_pd_DF.iloc[:, [4, 5, 8]]
matchDF = sqlCtx.createDataFrame(match_pd_DF)

In [18]:
#Count and display the DF to ensure the join has not missed any data rows and other details
#display() is not defined in this file; presumably the Databricks notebook helper that renders a table
display(matchDF)
#print(matchDF.count())
#Expect NaN because of the null values introduced during the join

In [19]:
#Add the season win percentage of both teams (from matchDF) to the game table

#Game columns that are carried unchanged through both joins
game_cols = ['WTeamID', 'WScore', 'LTeamID', "LScore", 'NumOT', "WSeed", 'LSeed', 'label', 'Seed_Diff', 'Loc', "Score_Diff"]

#First join: win percentage of the winning team in that season
winner_match = (matchDF.teamID== sql_compactDF.WTeamID) & (sql_compactDF.Season==matchDF.Season)
winPercentage_DF = sql_compactDF.join(matchDF, winner_match, how='left').select(
  ['DayNum', sql_compactDF.Season] + game_cols + [fn.col('winPercentage').alias('W_win_percentage')]
)

#Second join: win percentage of the losing team in that season
loser_match = (matchDF.teamID== winPercentage_DF.LTeamID) & (winPercentage_DF.Season==matchDF.Season)
winPercentage_DF = winPercentage_DF.join(matchDF, loser_match, how='left').select(
  ['DayNum', winPercentage_DF.Season] + game_cols + ['W_win_percentage', fn.col('winPercentage').alias('L_win_percentage')]
)

In [20]:
#Inspect the game table after both win-percentage joins
display(winPercentage_DF)

In [21]:
#Average the season win percentage per team over four periods:
#1998 to 2005; 2006 to 2010; 2011 to 2015; 2016 to 2017
#These averages are further used as features in the models

#Season as an integer, so the period bounds can be compared numerically (both bounds inclusive)
season_year = fn.col('Season').cast(IntegerType())

groupedTeams_DF_1 = (
  matchDF.where(season_year.between(1998, 2005))
  .groupBy('teamID')
  .agg(fn.avg('winPercentage').alias('1998_2005_win_percentage'))
)
#Repeat the grouping for the remaining periods
groupedTeams_DF_2 = (
  matchDF.where(season_year.between(2006, 2010))
  .groupBy('teamID')
  .agg(fn.avg('winPercentage').alias('2006_2010_win_percentage'))
)
groupedTeams_DF_3 = (
  matchDF.where(season_year.between(2011, 2015))
  .groupBy('teamID')
  .agg(fn.avg('winPercentage').alias('2011_2015_win_percentage'))
)
groupedTeams_DF_4 = (
  matchDF.where(season_year.between(2016, 2017))
  .groupBy('teamID')
  .agg(fn.avg('winPercentage').alias('2016_2017_win_percentage'))
)

In [22]:
#Removing null values 
#Rows whose team found no partner in the left joins carry a null win percentage; nulls become 0
winPercentage_DF=winPercentage_DF.na.fill(0)

#Display to check the final DF
display(winPercentage_DF)

In [23]:
#Attach the period averages of the winning team to every game row
#Each right join keeps all rows of the game table (winPercentage_DF)
first_period_cols = ['DayNum', winPercentage_DF.Season, 'WTeamID', 'WScore', 'LTeamID', "LScore", 'NumOT', "WSeed", 'LSeed', 'label', 'Seed_Diff', 'Loc' ,"Score_Diff",'W_win_percentage', 'L_win_percentage', '1998_2005_win_percentage']
winPercentage_DF = groupedTeams_DF_1.join(
  winPercentage_DF,
  winPercentage_DF.WTeamID == groupedTeams_DF_1.teamID,
  how='right',
).select(first_period_cols)

#The remaining periods are joined without a select, so each of these joins also brings its teamID column along
for period_DF in (groupedTeams_DF_2, groupedTeams_DF_3, groupedTeams_DF_4):
  winPercentage_DF = period_DF.join(winPercentage_DF, winPercentage_DF.WTeamID == period_DF.teamID, how='right')


In [24]:
#Removing redundant columns
#The period joins left teamID columns behind; the frame goes through pandas to drop them by label
wpandasDF=winPercentage_DF.toPandas()
wpandasDF=wpandasDF.drop('teamID', axis=1)

#Creating spark sql dataframe
winPercentage_DF=sqlCtx.createDataFrame(wpandasDF)
#Removing null values
#(a team without games in a period has no average for that period; it becomes 0 here)
winPercentage_DF=winPercentage_DF.na.fill(0)



In [25]:
#Display to check the results
display(winPercentage_DF)



In [26]:
#Join the teamsDF with final DF to add the name of the winning team
winPercentage_DF = winPercentage_DF.join(teamsDF, (teamsDF.TeamID == winPercentage_DF.WTeamID) , how='left')


In [27]:
#Keep the joined name as WTeamName, then repeat the join for the losing team (LTeamName)
#The TeamID key column of teamsDF is dropped after each join because WTeamID / LTeamID already hold it
winPercentage_DF = winPercentage_DF.drop('TeamID').withColumnRenamed('TeamName', 'WTeamName')
winPercentage_DF = winPercentage_DF.join(teamsDF, (teamsDF.TeamID == winPercentage_DF.LTeamID) , how='left')
winPercentage_DF = winPercentage_DF.drop('TeamID').withColumnRenamed('TeamName', 'LTeamName')


#Display the results
display(winPercentage_DF)

In [28]:
#INFERENCE OF THE ADDED FEATURES

#Checking the correlation between the win percentage of each period and the label
#Period 1998-2005
winPercentage_DF.select(fn.corr('1998_2005_win_percentage', 'label')).show()

In [29]:
#Period 2006-2010
winPercentage_DF.select(fn.corr('2006_2010_win_percentage', 'label')).show()

In [30]:
#Period 2011-2015
winPercentage_DF.select(fn.corr('2011_2015_win_percentage', 'label')).show()

In [31]:
#Period 2016-2017
winPercentage_DF.select(fn.corr('2016_2017_win_percentage', 'label')).show()

In [32]:
#MACHINE LEARNING
# Split dataset randomly into Training (60%), Validation (30%) and Test (10%) Datasets
# The fixed seed keeps the split - and with it every metric reported below - reproducible between runs
trainingData, validationData, testData = winPercentage_DF.randomSplit([0.6, 0.3, 0.1], seed=42)

In [33]:
#Feature Definition and Vector Assembler creation
##############################################   MODEL - 1 #####################################################
#Initial Set of Features with only the simple columns
#DayNum - A higher day number means a game played at a later stage of the tournament
#WTeamID - ID of the winning team (LTeamID is NOT part of this feature list)
#Score_Diff - Absolute difference between the winning and the losing score of the match
#Seed_Diff - Absolute difference between the seed weights of the two playing teams
#NumOT - Number of overtimes in the match
#Loc - Played at home (3), on a neutral site (2) or away (1)
#NOTE(review): Score_Diff is computed from the final scores, so it is only known after the game - confirm this is intended
featureCols_1 = ["DayNum", "WTeamID", "Score_Diff", "Loc", "Seed_Diff", "NumOT"]


#Assemble the listed columns into a single vector column named "features"
assembler_1 = feature.VectorAssembler(inputCols = featureCols_1, outputCol = "features")


In [34]:
# Configure a Logistic Regression model (10 iterations, elastic-net regularisation)
logisticReg_1 = LogisticRegression(maxIter=10, regParam=0.1, elasticNetParam=0.3)

# Chain the vector assembler and the Logistic Regression model
pipeline_1 = Pipeline(stages=[ assembler_1, logisticReg_1])

# Run the stages of the pipeline on the training split and train the model
model_1 = pipeline_1.fit(trainingData)

In [35]:
#Score the validation split (used for model selection)
val_predictions_1 = model_1.transform(validationData)

In [36]:
#Score the test split (used for the final model performance)
test_predictions_1 = model_1.transform(testData)

In [37]:
#Define the evaluator to obtain the areaUnderROC or the AUC score of the model
evaluator = BinaryClassificationEvaluator()

In [38]:
#Display the AUC of model-1 on the validation and on the test split
print("The AUC metric for the validation dataset of model-1", evaluator.evaluate(val_predictions_1))
print("The AUC metric for the testing dataset of model-1", evaluator.evaluate(test_predictions_1))


In [39]:
#Average predicted class on the validation split (share of rows predicted as 1)
val_predictions_1.select(fn.avg('prediction')).show()

In [40]:
#Checking the average prediction for the test data set ( A Balanced dataset)
test_predictions_1.select(fn.avg('prediction')).show()

In [41]:
#Inspect the scored test rows
display(test_predictions_1)

In [42]:
#####################################################MODEL 2 #########################################################
#Features: the ID of the winning team and its average win percentage in each of the four periods
#(the period columns were joined on WTeamID, so they describe the winning team only)
featureCols_2 = ['WTeamID','1998_2005_win_percentage', '2006_2010_win_percentage', '2011_2015_win_percentage', '2016_2017_win_percentage']


#Logistic Regression with its default parameters
logisticReg_2 = LogisticRegression()
assembler_2 = feature.VectorAssembler(inputCols=featureCols_2, outputCol="features")

pipeline_2 = Pipeline(stages=[ assembler_2, logisticReg_2])

# Run stages in pipeline and train model
model_2 = pipeline_2.fit(trainingData)

In [43]:
#Score the validation and the test split with model-2
val_predictions_2 = model_2.transform(validationData)
test_predictions_2=model_2.transform(testData)

In [44]:
#AUC of model-2 (period features) on the validation and on the test split
print("The AUC Metric for validation data set of model-2 ", evaluator.evaluate(val_predictions_2))
print("The AUC Metric for test data set of model-2 ", evaluator.evaluate(test_predictions_2))

In [45]:
#Average predicted class on the validation split
val_predictions_2.select(fn.avg('prediction')).show()

In [46]:
#Average predicted class on the test split
test_predictions_2.select(fn.avg('prediction')).show()

In [47]:
###############################MODEL - 3 ##################################################

#Definition of features - all the match and team features, the season win percentages of both teams and the period win percentage features
featureCols_3 = ['WTeamID','1998_2005_win_percentage', '2006_2010_win_percentage', '2011_2015_win_percentage', '2016_2017_win_percentage',"W_win_percentage","L_win_percentage","DayNum", "Score_Diff", "Loc", "Seed_Diff", "NumOT"]

#Logistic Regression for all features
logisticReg_3 = LogisticRegression(maxIter=10, regParam=0.2, elasticNetParam=0.2)

assembler_3=feature.VectorAssembler(inputCols=featureCols_3,outputCol="features")
# Chain the vector assembler and the Logistic Regression model in a pipeline
pipeline_3 = Pipeline(stages=[ assembler_3, logisticReg_3])

# Run stages in pipeline and train model
model_3 = pipeline_3.fit(trainingData)

In [48]:
#Score the validation and the test split with model-3
val_predictions_3 = model_3.transform(validationData)
test_predictions_3 = model_3.transform(testData)

In [49]:
#AUC of model-3 (all features) on the validation and on the test split
print("The AUC Metric for validation data set of model-3 ", evaluator.evaluate(val_predictions_3))
print("The AUC Metric for test data set of model-3 ", evaluator.evaluate(test_predictions_3))

In [50]:
#Average predicted class on the validation split
val_predictions_3.select(fn.avg('prediction')).show()

In [51]:
#Average predicted class on the test split
test_predictions_3.select(fn.avg('prediction')).show()

In [52]:
#Inspect the scored test rows
display(test_predictions_3)

In [53]:
#Inspect the full feature table
display(winPercentage_DF)

In [54]:
#Plotting the features before applying feature engineering
#Histogram of Score_Diff; the Spark frame is collected to pandas for plotting
plt.figure()
winPercentage_DF.toPandas().Score_Diff.hist()
plt.xlabel('Score_Difference')
plt.title("Distribution of difference between scores of Win and Lose teams")
display()

In [55]:
#Same Score_Diff histogram as in the previous cell, drawn without a title
plt.figure()
winPercentage_DF.toPandas().Score_Diff.hist()
plt.xlabel('Score_Difference')
display()

In [56]:

#Find the win percentage groups
#Collect the Spark frame to pandas ONCE and reuse it: every toPandas() call pulls the whole
#table onto the driver again
win_pd_DF = winPercentage_DF.toPandas()

first_group = win_pd_DF['1998_2005_win_percentage']
second_group = win_pd_DF['2006_2010_win_percentage']
third_group = win_pd_DF['2011_2015_win_percentage']
fourth_group = win_pd_DF['2016_2017_win_percentage']

win_label_group = win_pd_DF['label']

#Plot the scatter graph of the label against the 1998-2005 period average
win_pd_DF.plot('label', '1998_2005_win_percentage', color="green", label="1998_2005_win_percentage", kind='scatter')
#winPercentage_DF.toPandas().plot('label', '2006_2010_win_percentage', color="blue", label="2006_2010_win_percentage", kind='scatter')
#plt.plot(win_label_group, second_group, color="blue", label="2006_2010_win_percentage",kind='scatter')
#plt.plot(win_label_group, third_group, color="red", label="2011_2015_win_percentage",kind='scatter')
#plt.plot(win_label_group, fourth_group, color="black", label="2016_2017_win_percentage,kind='scatter'")


plt.xlabel("Win probablity")
plt.ylabel("Past Performance Percentage")
#plt.legend(["1998_2005_win_percentage", "2006_2010_win_percentage", "2011_2015_win_percentage","2016_2017_win_percentage" ])
plt.title("Winning Probablity versus Past Performance Percentage")

display()

In [57]:
#figure = plt.figure()
#plt.plot(x=win_label_group, y=winPercentage_DF.toPandas().Seed_Diff, color="green", kind = 'scatter')
#display()

In [58]:
#Feature Engineering technique 1
#Apply Bucketizer on Score_diff
#The histogram shows how many games fall into each score-difference bucket; the bucket index
#is read from the last column of the transformed frame
plt.figure()
feature.Bucketizer(splits=[0, 5, 10, 15,20, 25,35,40,50,60,70,80,100], inputCol='Score_Diff').transform(winPercentage_DF).toPandas().iloc[:, -1].hist()
#plt.xticks([]);
plt.xlabel('Score_diff bucket')
display()

#Reusable two-stage pipeline (vector assembler + bucketizer) for the Score_Diff column
#NOTE(review): neither stage sets an outputCol here - confirm how the stages are wired together downstream
score_diff_features = Pipeline(stages = [feature.VectorAssembler(inputCols=['Score_Diff']),feature.Bucketizer(splits=[0, 5, 10, 15,20, 25,35,40,50,60,70,80,100], inputCol='Score_Diff')])



In [59]:
#Applying Feature Engineering Technique 2
#Discretize the season win percentage of the winning team into 10 quantile buckets and plot the bucket counts

plt.figure()
feature.QuantileDiscretizer(numBuckets=10, inputCol='W_win_percentage').fit(winPercentage_DF).transform(winPercentage_DF).toPandas().iloc[:, -1].hist()
plt.xticks([-1, 0, 1, 2, 3, 4, 5]);
plt.xlabel('Win percentage quantiles for Winning teams')
display()


In [60]:
#Applying Feature Engineering Technique 3
#Same quantile bucketing for the season win percentage of the losing team

plt.figure()
feature.QuantileDiscretizer(numBuckets=10, inputCol='L_win_percentage').fit(winPercentage_DF).transform(winPercentage_DF).toPandas().iloc[:, -1].hist()
plt.xticks([-1, 0, 1, 2, 3, 4, 5]);
plt.xlabel('Win percentage quantiles for Losing teams')
display()

#Unfitted discretizer kept for reuse
l_percentage_features = feature.QuantileDiscretizer(numBuckets=10,inputCol='L_win_percentage')


In [61]:

#Applying the above features to logistic regression
#pipe() comes from pyspark_pipes; presumably it feeds the outputs of the grouped stages into the
#following VectorAssembler - confirm against the library documentation

all_modified_features = pipe((score_diff_features,feature.QuantileDiscretizer(numBuckets=10, inputCol='W_win_percentage'),feature.QuantileDiscretizer(numBuckets=10, inputCol='L_win_percentage')
),feature.VectorAssembler())
logisticReg_4 = classification.LogisticRegression()


In [62]:
#Show the parameters of the assembled feature stage
all_modified_features.explainParams()

In [63]:

#Full estimator: engineered features followed by the logistic regression
model_4 = pipe(all_modified_features,logisticReg_4)

In [64]:
#Evaluator that reads the label / raw prediction columns configured on logisticReg_4
evaluator = BinaryClassificationEvaluator(labelCol=logisticReg_4.getLabelCol(), rawPredictionCol=logisticReg_4.getRawPredictionCol())


In [65]:
#Hyper-parameter grid: 3 elastic-net mixing values x 4 regularisation strengths
paramGrid = ParamGridBuilder() \
    .addGrid(logisticReg_4.elasticNetParam, [0., 0.01, 0.1]) \
    .addGrid(logisticReg_4.regParam, [0.1, 0.01, 0.001, 0.0001]) \
    .build()

In [66]:
#2-fold cross validation over the grid, scored with the evaluator above
crossval = CrossValidator(estimator=model_4, 
                          estimatorParamMaps=paramGrid, 
                          evaluator=evaluator, 
                          numFolds=2)

In [67]:
#Train: run the cross validation on the training split
final_model_fitted = crossval.fit(trainingData)

In [68]:
#Evaluator metric of the cross-validated model on the test split
evaluator.evaluate(final_model_fitted.transform(testData))

In [69]:
#Fit model_4 once more without cross validation (default logistic regression parameters)
x = model_4.fit(trainingData)

In [70]:
#model_4.getStages()[-1].coefficients

In [71]:
#NOTE(review): the variable is named val_predictions_4 but it is computed from testData, whereas
#the other val_predictions_* variables use validationData - confirm which split is intended
val_predictions_4 = x.transform(testData)

In [72]:
#Inspect the scored rows
display(val_predictions_4)

In [73]:
#Evaluator metric of the non-tuned model on the rows scored above
evaluator.evaluate(val_predictions_4)

In [75]:
#MaxAbsScaler
#Assemble the 12 feature columns, scale them with MaxAbsScaler (fitted on the training split),
#transform the test split and stack the last column of the result into a pandas frame d
#(presumably the last column is the scaled feature vector - confirm)
plt.figure()
d = pd.DataFrame(np.vstack(pipe(feature.VectorAssembler(inputCols=['WTeamID','1998_2005_win_percentage', '2006_2010_win_percentage', '2011_2015_win_percentage', '2016_2017_win_percentage',"W_win_percentage","L_win_percentage","DayNum", "Score_Diff", "Loc", "Seed_Diff", "NumOT"]),
    feature.MaxAbsScaler()).fit(trainingData).transform(testData).toPandas().iloc[:, -1]))
#Name the columns of d in the same order as the assembler inputs
d.columns = ['WTeamID','1998_2005_win_percentage', '2006_2010_win_percentage', '2011_2015_win_percentage', '2016_2017_win_percentage',"W_win_percentage","L_win_percentage","DayNum", "Score_Diff", "Loc", "Seed_Diff", "NumOT"]
#Histogram of the scaled Seed_Diff values
d.Seed_Diff.hist()
plt.xlabel('Seed_Diff')
display()

In [76]:
#d is a pandas DataFrame of scaled feature values; it has no label / rawPrediction columns and is
#not a Spark DataFrame, so it cannot be passed to the BinaryClassificationEvaluator.
#Summarise the scaled features instead.
d.describe()

In [77]:

#Pair plot of the standardised features
std_feature_cols = ['WTeamID','1998_2005_win_percentage', '2006_2010_win_percentage', '2011_2015_win_percentage', '2016_2017_win_percentage',"W_win_percentage","L_win_percentage","DayNum", "Score_Diff", "Loc", "Seed_Diff", "NumOT"]

#Assemble the columns, standardise them (scaler fitted on the training split) and transform the test split
std_scaled_DF = pipe(feature.VectorAssembler(inputCols=std_feature_cols),
    feature.StandardScaler(withMean=True)).fit(trainingData).transform(testData)

#Stack the last column of the result (presumably the scaled feature vector of each row) into a 2-D frame
std_scaled_pd = pd.DataFrame(np.vstack(std_scaled_DF.toPandas().iloc[:, -1]))

#Name the columns BEFORE plotting so the axes of the pair plot show the feature names;
#previously the names were assigned afterwards, and to the MaxAbsScaler frame d
std_scaled_pd.columns = std_feature_cols
sns.pairplot(std_scaled_pd)



In [78]:
#############################Main
#Function for calculating AUC
def binary_evaluation(model_pipeline, model_fitted, data):
  """Score `data` with `model_fitted` and return the BinaryClassificationEvaluator metric.

  The label and raw-prediction column names are read from the last stage of
  `model_pipeline`, so the evaluator always matches the classifier configuration.
  """
  final_stage = model_pipeline.getStages()[-1]
  scorer = BinaryClassificationEvaluator(
      labelCol=final_stage.getLabelCol(),
      rawPredictionCol=final_stage.getRawPredictionCol())
  return scorer.evaluate(model_fitted.transform(data))

In [79]:
#Standard Scaler - centres every feature (withMean=True) before the logistic regression
#(standardising rescales the features; it does not make their distribution Gaussian)
model1_pipeline = pipe(feature.VectorAssembler(inputCols=['WTeamID','1998_2005_win_percentage', '2006_2010_win_percentage', '2011_2015_win_percentage', '2016_2017_win_percentage',"W_win_percentage","L_win_percentage","DayNum", "Score_Diff", "Loc", "Seed_Diff", "NumOT"]),
              feature.StandardScaler(withMean=True),
             classification.LogisticRegression(labelCol='label'))

In [80]:
#Train on the training split
model1_fitted=model1_pipeline.fit(trainingData)

In [81]:
#Evaluator metric on the test split
binary_evaluation(model1_pipeline, model1_fitted, testData)

In [82]:
#MaxAbsScaler - same 12 features, scaled with MaxAbsScaler before the logistic regression
model2_pipeline = pipe(feature.VectorAssembler(inputCols=['WTeamID','1998_2005_win_percentage', '2006_2010_win_percentage', '2011_2015_win_percentage', '2016_2017_win_percentage',"W_win_percentage","L_win_percentage","DayNum", "Score_Diff", "Loc", "Seed_Diff", "NumOT"]),feature.MaxAbsScaler(),classification.LogisticRegression(labelCol='label'))

In [83]:
#Train on the training split
model2_fitted=model2_pipeline.fit(trainingData)

In [84]:
#Evaluator metric on the test split
binary_evaluation(model2_pipeline, model2_fitted, testData)

In [85]:
#Quantile Discretizer
#Three 10-bucket quantile discretizers: season win percentage of the winner (f1) and of the
#loser (f2), and the 2016-2017 period average (f3)

#feature.QuantileDiscretizer(numBuckets=10, inputCol='L_win_percentage').fit(winPercentage_DF).transform(winPercentage_DF).toPandas().iloc[:, -1]
f1 = feature.QuantileDiscretizer(numBuckets=10,inputCol='W_win_percentage')
f2 = feature.QuantileDiscretizer(numBuckets=10,inputCol='L_win_percentage')
f3 = feature.QuantileDiscretizer(numBuckets=10,inputCol='2016_2017_win_percentage')

In [86]:
#Combine the three discretized columns with a VectorAssembler
#NOTE: logisticReg_3 (and model_3 below) replace the MODEL - 3 objects defined earlier
all_modified_features3 = pipe((f1, f2, f3
),feature.VectorAssembler())
logisticReg_3 = classification.LogisticRegression()

In [87]:
all_modified_features3.explainParams()
model_3 = pipe(all_modified_features3,logisticReg_3)

#Evaluator that reads the label / raw prediction columns configured on logisticReg_3
evaluator = BinaryClassificationEvaluator(labelCol=logisticReg_3.getLabelCol(), rawPredictionCol=logisticReg_3.getRawPredictionCol())

In [88]:
#Hyper-parameter grid for logisticReg_3: 3 elastic-net mixing values x 4 regularisation strengths
paramGrid = ParamGridBuilder() \
    .addGrid(logisticReg_3.elasticNetParam, [0., 0.01, 0.1]) \
    .addGrid(logisticReg_3.regParam, [0.1, 0.01, 0.001, 0.0001]) \
    .build()

In [89]:
#2-fold cross validation over the grid
crossval3 = CrossValidator(estimator=model_3, 
                          estimatorParamMaps=paramGrid, 
                          evaluator=evaluator, 
                          numFolds=2)

In [90]:
#Train: run the cross validation on the training split
final_model_fitted3 = crossval3.fit(trainingData)

In [91]:
#Evaluator metric of the cross-validated model on the test split
evaluator.evaluate(final_model_fitted3.transform(testData))

In [92]:
#ft1 discretizes Score_Diff into 10 quantile buckets; ft2 standardises W_win_percentage
#NOTE(review): ft2 uses the same name for inputCol and outputCol and reads a plain numeric column -
#confirm that StandardScaler accepts this configuration
ft1 = feature.QuantileDiscretizer(numBuckets=10,inputCol='Score_Diff')
ft2 = feature.StandardScaler(inputCol="W_win_percentage", outputCol="W_win_percentage", withMean=True)

In [93]:
#Combination of Quantile Discretizer (ft1) and Standard Scaler (ft2)
#NOTE: logisticReg_4 (and model_4 below) replace the objects of the same name defined earlier
all_modified_features4 = pipe((ft1, ft2),feature.VectorAssembler())
logisticReg_4 = classification.LogisticRegression()

In [94]:
all_modified_features4.explainParams()
model_4 = pipe(all_modified_features4,logisticReg_4)

#Evaluator that reads the label / raw prediction columns configured on logisticReg_4
evaluator4 = BinaryClassificationEvaluator(labelCol=logisticReg_4.getLabelCol(), rawPredictionCol=logisticReg_4.getRawPredictionCol())

In [95]:
#Hyper-parameter grid for logisticReg_4: 3 elastic-net mixing values x 4 regularisation strengths
paramGrid = ParamGridBuilder() \
    .addGrid(logisticReg_4.elasticNetParam, [0., 0.01, 0.1]) \
    .addGrid(logisticReg_4.regParam, [0.1, 0.01, 0.001, 0.0001]) \
    .build()

In [96]:
#2-fold cross validation of model_4 over the parameter grid
#Score with evaluator4, the evaluator that was built from logisticReg_4 for this model
#(the generic `evaluator` belongs to the previous model)
crossval4 = CrossValidator(estimator=model_4,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator4,
                          numFolds=2)

In [97]:
#Train: run the cross validation of the combined-feature model on the training split
final_model_fitted4 = crossval4.fit(trainingData)

In [98]:
#Average label per (Season, team) for each of the four periods,
#grouped once by the winning team (wDF_*) and once by the losing team (lDF_*)

def _period_label_avg(team_col, first_season, last_season):
  """Average `label` per Season and `team_col` for the seasons first_season..last_season (inclusive)."""
  season_year = fn.col('Season').cast(IntegerType())
  in_period = sql_compactDF.where(season_year.between(first_season, last_season))
  return in_period.groupBy(['Season', team_col]).agg(fn.avg('label'))


wDF_1 = _period_label_avg('WTeamID', 1998, 2005)

wDF_2 = _period_label_avg('WTeamID', 2006, 2010)

wDF_3 = _period_label_avg('WTeamID', 2011, 2015)

wDF_4 = _period_label_avg('WTeamID', 2016, 2017)

lDF_1 = _period_label_avg('LTeamID', 1998, 2005)

lDF_2 = _period_label_avg('LTeamID', 2006, 2010)

lDF_3 = _period_label_avg('LTeamID', 2011, 2015)

lDF_4 = _period_label_avg('LTeamID', 2016, 2017)

