#IST718 Project - Google and NCAA Women's Basketball Tournament Prediction

#Data Exploration

@authors
Sanjana Rajagopala,
Shefali Vajramatti,
Apoorva Rajendra Angre,
Sandya Madhavan

In [2]:
#IMPORT ALL THE REQUIRED PACKAGES
import pandas as pd
from pyspark.ml import feature
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import classification
from pyspark.ml import Pipeline
from pyspark.sql import SQLContext
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql import functions as fn
from pyspark.sql.types import IntegerType
import numpy as np
from pyspark.mllib.evaluation import BinaryClassificationMetrics as metric
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.feature import StringIndexer
import seaborn as sns

In [3]:
# Load every raw table from the FileStore (DBFS) of the Databricks environment
# as a Spark DataFrame.
def _load_csv(dbfs_path):
  """Read a CSV that has a header row, letting Spark infer the column types."""
  return spark.read.csv(dbfs_path, header=True, inferSchema=True)


# The teams file is loaded twice: one frame per side (winner / loser) of a game.
wteamDF = _load_csv("/FileStore/tables/WTeams.csv")
lteamDF = _load_csv("/FileStore/tables/WTeams.csv")

# Regular-season compact results
regularSeasonsDF = _load_csv("/FileStore/tables/WRegularSeasonCompactResults.csv")

# Tournament seeds and slots
seedsDF = _load_csv("/FileStore/tables/WNCAATourneySeeds_SampleTourney2018.csv")
slotsDF = _load_csv("/FileStore/tables/WNCAATourneySlots.csv")

# Detailed regular-season results (PrelimData2018 file)
detailedDF = _load_csv("/FileStore/tables/WRegularSeasonDetailedResults_PrelimData2018.csv")



In [4]:
#Initial manipulation of data
# Convert every Spark DataFrame to pandas for the row-wise processing that follows.
# NOTE: this rebinds the same names, so the cell cannot be re-run without
# re-running the loading cell first.
wteamDF = wteamDF.toPandas()
lteamDF = lteamDF.toPandas()
regularSeasonsDF = regularSeasonsDF.toPandas()
slotsDF = slotsDF.toPandas()
seedsDF = seedsDF.toPandas()
detailedDF = detailedDF.toPandas()

#Rename the columns so the teams table exists once per side (winner / loser)
wteamDF.columns = ['WTeamID', 'WTeamName']
lteamDF.columns = ['LTeamID', 'LTeamName']

#Maintain a copy of the original data
# .copy() is required here: a plain assignment only creates a second name for
# the same object, so any change made through NewseedsDF would also change seedsDF.
NewseedsDF = seedsDF.copy()

In [5]:
# Seed lookup table: key = (Season, TeamID), value = Seed
seedsdict = {
  (season, team_id): seed
  for season, team_id, seed in zip(NewseedsDF["Season"], NewseedsDF["TeamID"], NewseedsDF["Seed"])
}


In [6]:
#Look up the seed of the winning and the losing team for every game in detailedDF
#Append '0' if there is no seed for that (season, team) pair
# dict.has_key() no longer exists in Python 3; dict.get() with a default does
# the membership test and the lookup in one step and works on Python 2 and 3.
temp_wCol = []
temp_lCol = []
for year, wteamid, lteamid in zip(detailedDF['Season'], detailedDF['WTeamID'], detailedDF['LTeamID']):
  temp_wCol.append(seedsdict.get((year, wteamid), '0'))
  temp_lCol.append(seedsdict.get((year, lteamid), '0'))
  

In [7]:
# Attach the looked-up seeds to the game table, one column per side.
for seed_column, seed_values in (('WSeed', temp_wCol), ('LSeed', temp_lCol)):
  detailedDF[seed_column] = seed_values

In [8]:
# Seed -> weight lookup. Seed 1 carries the highest weight (16), seed 16 the
# lowest (1); key 0 stands for "no seed" and carries weight 0.
weights_dict = {0: 0}
weights_dict.update((seed, 17 - seed) for seed in range(1, 17))

In [9]:
#PRE_PROCESSING THE DATAFRAME
# One pass over the games, producing four per-game lists:
#   temp_win   - 1 when the winning team has the lower TeamID, otherwise 0
#   loc_col    - numeric weight of the winner's court location (WLoc)
#   diff_seed  - absolute difference between the two teams' seed weights
#   diff_score - absolute difference between the two scores

temp_win = []
wseed_num = []  # never filled in this cell; kept in case later code refers to it
lseed_num = []  # never filled in this cell; kept in case later code refers to it
diff_seed = []
loc_col = []
diff_score = []

#Give the highest weight when played on the home ground, the least when away, a medium value otherwise
loc_weights = {'H': 3, 'N': 2, 'A': 1}


def _seed_weight(seed):
  """Return the weight for a seed string; '0' (no seed) maps to weights_dict[0]."""
  if seed == '0':
    return weights_dict[0]
  # The seed number sits right after the first character. Slicing [1:3]
  # rather than [1:] still parses if a seed ever carries a trailing letter.
  return weights_dict[int(seed[1:3])]


for row in detailedDF.iterrows():
  game = row[1]

  team_1 = game['WTeamID']
  team_2 = game['LTeamID']
  loc_val = game['WLoc']
  wseed = game['WSeed']
  lseed = game['LSeed']

  #Maintain the win column value as 1 if the team with the lower teamID has won the match
  temp_win.append(1 if team_1 < team_2 else 0)

  # An unknown location used to append nothing, leaving loc_col shorter than
  # the other lists and shifting every later value. Fail at the offending row.
  if loc_val not in loc_weights:
    raise ValueError("Unexpected WLoc value %r" % (loc_val,))
  loc_col.append(loc_weights[loc_val])

  #Maintain the difference between the seed weights of the teams
  diff_seed.append(abs(_seed_weight(wseed) - _seed_weight(lseed)))

  #Maintain the column with the difference between the scores of the teams
  diff_score.append(abs(game['WScore'] - game['LScore']))
  

In [10]:
# Attach the per-game lists built during pre-processing as new columns
# (same column order as the individual assignments).
derived_columns = [
  ('WLProb', temp_win),
  ('Seed_Diff', diff_seed),
  ('Loc', loc_col),
  ('Score_Diff', diff_score),
]
for column_name, column_values in derived_columns:
  detailedDF[column_name] = column_values

In [11]:
# Sanity check: look at the first two pre-processed rows
detailedDF.head(2)

In [12]:
# Move the pre-processed pandas frame back into a Spark SQL DataFrame.
sqlCtx = SQLContext(sc)

# The result column "WLProb" is renamed to "label" so that all the algorithms
# can be applied without any problems.
sql_compactDF = (
  sqlCtx.createDataFrame(detailedDF)
        .withColumnRenamed("WLProb", "label")
)

In [13]:
#FEATURE ENGINEERING
# New features derived from the existing data: per-season game counts for each
# team, counted once on the winning side and once on the losing side.
# Note: 'won' is the sum of 'label', and 'label' is 1 only for games whose
# winner had the lower TeamID; WCount is the number of games the team won.

games_in_group = fn.count('Season')

wDF = (
  sql_compactDF
  .groupBy('Season', 'WTeamID')
  .agg(fn.sum('label').alias('won'), games_in_group.alias('WCount'))
)
lDF = (
  sql_compactDF
  .groupBy('Season', 'LTeamID')
  .agg(games_in_group.alias('LCount'))
)

In [14]:
# Print a preview of the winner-side aggregates (Season, WTeamID, won, WCount)
wDF.show()

In [15]:
# Print a preview of the loser-side aggregates (Season, LTeamID, LCount)
lDF.show()

In [16]:
# Give both aggregate frames the same key column name ("teamID") so they can
# be matched against each other.
wDF = wDF.select(fn.col("WTeamID").alias("teamID"), "Season", "won", "WCount")
lDF = lDF.select(fn.col("LTeamID").alias("teamID"), "Season", "LCount")

In [17]:
#Create a DF of matches with the above combined details
# A right join kept only team-seasons that appear on the losing side: teams
# without a loss in a season were dropped, and teams without a win got null
# winner-side keys and a null WCount. A full outer join keeps every
# team-season; keys are coalesced from both sides and a missing count is 0.
# The frames are aliased so the two 'teamID' / 'Season' columns can be
# referenced unambiguously.
w_side = wDF.alias('w')
l_side = lDF.alias('l')
team_key = fn.coalesce(fn.col('w.teamID'), fn.col('l.teamID')).alias('teamID')
season_key = fn.coalesce(fn.col('w.Season'), fn.col('l.Season')).alias('Season')
matchDF = (
  w_side.join(
    l_side,
    (fn.col('w.teamID') == fn.col('l.teamID')) & (fn.col('w.Season') == fn.col('l.Season')),
    how='full')
  # The key columns are emitted twice on purpose: this reproduces the
  # seven-column layout of the plain join (winner side, then loser side),
  # which the positional column selection in the following cell relies on.
  .select(
    team_key, season_key,
    fn.col('w.won').alias('won'),
    fn.coalesce(fn.col('w.WCount'), fn.lit(0)).alias('WCount'),
    team_key, season_key,
    fn.coalesce(fn.col('l.LCount'), fn.lit(0)).alias('LCount'))
)

In [18]:
# Games played = games won + games lost; win percentage = games won / games played.
matchDF = (
  matchDF
  .withColumn("totalMatches", fn.col('WCount') + fn.col('LCount'))
  .withColumn("winPercentage", fn.col('WCount') / fn.col('totalMatches'))
)
# The joined frame has repeated column names, so the required columns are
# picked by position in pandas: 0 = teamID, 1 = Season, 8 = winPercentage.
match_pd_DF = matchDF.toPandas().iloc[:, [0, 1, 8]]
matchDF = sqlCtx.createDataFrame(match_pd_DF)

In [19]:
#Display the DF to check that the join has not missed any data rows or other details
display(matchDF)
#NaN values may show up here wherever the join introduced nulls

In [20]:
# Attach each team's season win percentage to every game: first for the
# winning team (W_win_percentage), then for the losing team (L_win_percentage).

# Columns carried through both joins, in output order.
game_cols = ['WTeamID', 'WScore', 'LTeamID', "LScore", 'NumOT', "WSeed", 'LSeed', 'label', 'Seed_Diff', 'Loc', "Score_Diff"]
box_score_cols = ['WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR', 'WAst', 'WTO', 'WStl', 'WBlk', 'WPF',
                  'LFGM', 'LFGA', 'LFGM3', 'LFGA3', 'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF']

# Winner side: match on (team, season)
winner_joined = sql_compactDF.join(
  matchDF,
  (matchDF.teamID == sql_compactDF.WTeamID) & (sql_compactDF.Season == matchDF.Season),
  how='left')
winner_cols = (['DayNum', sql_compactDF.Season] + game_cols
               + [fn.col('winPercentage').alias('W_win_percentage')]
               + box_score_cols)
winPercentage_DF = winner_joined.select(*winner_cols)

# Loser side: same match, against the frame produced above
loser_joined = winPercentage_DF.join(
  matchDF,
  (matchDF.teamID == winPercentage_DF.LTeamID) & (winPercentage_DF.Season == matchDF.Season),
  how='left')
loser_cols = (['DayNum', winPercentage_DF.Season] + game_cols
              + ['W_win_percentage', fn.col('winPercentage').alias('L_win_percentage')]
              + box_score_cols)
winPercentage_DF = loser_joined.select(*loser_cols)

In [21]:
# Show the game table now that both win-percentage columns are attached
display(winPercentage_DF)

In [22]:
#Understand the relationship between the winning team's match characteristics - field goals, 3 pointers - and the location
winAll=winPercentage_DF
winAll_pd=winAll.toPandas()[['WFGM','WFGA','WFGM3','WFGA3','Loc']]
# DataFrame.rename returns a new frame. The result used to be thrown away, so
# the readable names never reached the plot; keep it and colour by the renamed
# location column.
winAll_pd = winAll_pd.rename(columns={"WFGM": "GoalsMade", "WFGA": "GoalsAttempted", 'WFGM3': "3pointersMade", "WFGA3": "3pointersAttempted", "Loc": "Location"})
sns.pairplot(winAll_pd,hue='Location')
display()

In [23]:
# Relationship between the winning team's free throws (WFTM, WFTA) and
# offensive / defensive rebounds (WOR, WDR), coloured by location.
free_throw_rebound_cols = ['WFTM', 'WFTA', 'WOR', 'WDR', 'Loc']
winAll = winPercentage_DF
winAll_pd = winAll.toPandas()[free_throw_rebound_cols]
sns.pairplot(winAll_pd, hue='Loc')
display()

In [24]:
# How the winning team's assists, turnovers, steals, blocks and personal fouls
# vary with the location.
playmaking_cols = ['WAst', 'WTO', 'WStl', 'WBlk', 'WPF', 'Loc']
winAll = winPercentage_DF.toPandas()
winAll = winAll[playmaking_cols]
sns.pairplot(data=winAll, hue='Loc')
display()

In [25]:
# Average win percentage per team over two periods (2010-2015 and 2016-2017).
# These are further used as features in the models.
def _period_win_percentage(first_season, last_season, output_name):
  """Average each team's winPercentage over the seasons first_season..last_season (inclusive)."""
  season = fn.col('Season').cast(IntegerType())
  in_period = (season >= first_season) & (season <= last_season)
  return (matchDF
          .where(in_period)
          .groupBy('teamID')
          .agg(fn.avg('winPercentage').alias(output_name)))


groupedTeams_DF_1 = _period_win_percentage(2010, 2015, '2010_2015_win_percentage')

groupedTeams_DF_2 = _period_win_percentage(2016, 2017, '2016_2017_win_percentage')

In [26]:
# Replace null values with 0
winPercentage_DF = winPercentage_DF.fillna(0)

# Display to check the resulting DF
display(winPercentage_DF)

In [27]:
# Attach the two period averages to every game, keyed on the winning team.

# Columns kept after the first join, in output order (this drops the extra
# teamID column that the grouped frame brings in).
kept_cols = ['DayNum', winPercentage_DF.Season, 'WTeamID', 'WScore', 'LTeamID', "LScore", 'NumOT', "WSeed", 'LSeed',
             'label', 'Seed_Diff', 'Loc', "Score_Diff", 'W_win_percentage', 'L_win_percentage',
             'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR', 'WAst', 'WTO', 'WStl', 'WBlk', 'WPF',
             'LFGM', 'LFGA', 'LFGM3', 'LFGA3', 'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF',
             '2010_2015_win_percentage']

# 2010-2015 average: a right join keeps every game row
with_first_period = groupedTeams_DF_1.join(
  winPercentage_DF,
  winPercentage_DF.WTeamID == groupedTeams_DF_1.teamID,
  how='right')
winPercentage_DF = with_first_period.select(*kept_cols)

# 2016-2017 average: no select here, so the grouped frame's teamID column stays in the result
winPercentage_DF = groupedTeams_DF_2.join(
  winPercentage_DF,
  winPercentage_DF.WTeamID == groupedTeams_DF_2.teamID,
  how='right')


In [28]:
# Drop the redundant teamID column (done in pandas)
wpandasDF = winPercentage_DF.toPandas().drop(['teamID'], axis='columns')




In [29]:
# Back to a Spark SQL DataFrame, with null values replaced by 0
winPercentage_DF = sqlCtx.createDataFrame(wpandasDF).fillna(0)


In [30]:
# Derive shooting ratios and a combined activity score for each side of the game.
def _ratio(made_col, attempted_col, output_name):
  """made / attempted as a named column."""
  return (fn.col(made_col) / fn.col(attempted_col)).alias(output_name)


def _accomplish(added_cols, subtracted_col, output_name):
  """Sum of added_cols (left to right) minus subtracted_col, as a named column."""
  total = fn.col(added_cols[0])
  for col_name in added_cols[1:]:
    total = total + fn.col(col_name)
  return (total - fn.col(subtracted_col)).alias(output_name)


# Existing columns carried over unchanged, in output order
base_cols = ['DayNum', 'WTeamID', 'WScore', 'LTeamID', "LScore", 'NumOT', "WSeed", 'LSeed', 'label',
             'Seed_Diff', 'Loc', "Score_Diff", 'W_win_percentage', 'L_win_percentage', 'Season',
             '2010_2015_win_percentage', '2016_2017_win_percentage']

winner_features = [
  _ratio('WFGM', 'WFGA', 'WGoals_ratio'),
  _ratio('WFGM3', 'WFGA3', 'W3pointers_ratio'),
  _ratio('WFTM', 'WFTA', 'WFreethrows_ratio'),
  _accomplish(['WOR', 'WDR', 'WAst', 'WTO', 'WStl', 'WBlk'], 'WPF', 'Win_accomplish'),
]
loser_features = [
  _ratio('LFGM', 'LFGA', 'LGoals_ratio'),
  _ratio('LFGM3', 'LFGA3', 'L3pointers_ratio'),
  _ratio('LFTM', 'LFTA', 'LFreethrows_ratio'),
  _accomplish(['LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk'], 'LPF', 'Lose_accomplish'),
]

temp_DF = winPercentage_DF.select(*(base_cols + winner_features + loser_features))

In [31]:
# Show the frame with the derived ratio and "accomplish" columns
display(temp_DF)

In [32]:
# From here on winPercentage_DF refers to the frame that includes the derived features
winPercentage_DF = temp_DF

In [33]:
#Display the frame to check the results (no row count is computed here)
display(winPercentage_DF)



In [34]:
# Understand the correlation between the shooting ratios of the winning and losing teams
ratio_cols = ['WGoals_ratio', 'W3pointers_ratio', 'WFreethrows_ratio',
              'LGoals_ratio', 'L3pointers_ratio', 'LFreethrows_ratio']
winPercentage_DF = winPercentage_DF.fillna(0)
winPercDF = winPercentage_DF.toPandas()[ratio_cols]
sns.pairplot(winPercDF)
display()