In [26]:
#Decision Trees and Random Forests
#Sukhada Sheth 

#Decision Trees - Classification Algorithm in Supervised Learning which can work with variety of features including 
#                  Categorical features
#Random Forest  - Can be further used to improve upon Decision trees

#Data Source - https://www.basketball-reference.com/leagues/NBA_2016_games.html 


# Predicting the winner of games of the National Basketball Association(NBA)

In [2]:

import pandas as pd

data_filename = "basketball.csv"
dataset = pd.read_csv(data_filename)

dataset.head() #print top 5 rows

Unnamed: 0,Date,Start (ET),Visitor/Neutral,PTS,Home/Neutral,PTS.1,Unnamed: 6,Unnamed: 7,Attend.,Notes
0,Tue Oct 27 2015,8:00p,Detroit Pistons,106,Atlanta Hawks,94,Box Score,,19187,
1,Tue Oct 27 2015,8:00p,Cleveland Cavaliers,95,Chicago Bulls,97,Box Score,,21957,
2,Tue Oct 27 2015,10:30p,New Orleans Pelicans,95,Golden State Warriors,111,Box Score,,19596,
3,Wed Oct 28 2015,7:30p,Philadelphia 76ers,95,Boston Celtics,112,Box Score,,18624,
4,Wed Oct 28 2015,7:30p,Chicago Bulls,115,Brooklyn Nets,100,Box Score,,17732,


In [3]:
#format the date column
dataset = pd.read_csv(data_filename, parse_dates=["Date"])
#label the columns of the dataset
dataset.columns = ["Date", "Start (ET)", "Visitor Team", "VisitorPts", "Home Team", "HomePts", "OT?", "Score Type", "Attend.", "Notes"]

dataset.head() #print top 5 rows

Unnamed: 0,Date,Start (ET),Visitor Team,VisitorPts,Home Team,HomePts,OT?,Score Type,Attend.,Notes
0,2015-10-27,8:00p,Detroit Pistons,106,Atlanta Hawks,94,Box Score,,19187,
1,2015-10-27,8:00p,Cleveland Cavaliers,95,Chicago Bulls,97,Box Score,,21957,
2,2015-10-27,10:30p,New Orleans Pelicans,95,Golden State Warriors,111,Box Score,,19596,
3,2015-10-28,7:30p,Philadelphia 76ers,95,Boston Celtics,112,Box Score,,18624,
4,2015-10-28,7:30p,Chicago Bulls,115,Brooklyn Nets,100,Box Score,,17732,


In [4]:
print(dataset.dtypes) #print datatype of each column of dataset

dataset["HomeWin"] = dataset["VisitorPts"] < dataset["HomePts"]  #add new feature/column depending on who has more points

dataset.head()

Date            datetime64[ns]
Start (ET)              object
Visitor Team            object
VisitorPts               int64
Home Team               object
HomePts                  int64
OT?                     object
Score Type              object
Attend.                  int64
Notes                   object
dtype: object


Unnamed: 0,Date,Start (ET),Visitor Team,VisitorPts,Home Team,HomePts,OT?,Score Type,Attend.,Notes,HomeWin
0,2015-10-27,8:00p,Detroit Pistons,106,Atlanta Hawks,94,Box Score,,19187,,False
1,2015-10-27,8:00p,Cleveland Cavaliers,95,Chicago Bulls,97,Box Score,,21957,,True
2,2015-10-27,10:30p,New Orleans Pelicans,95,Golden State Warriors,111,Box Score,,19596,,True
3,2015-10-28,7:30p,Philadelphia 76ers,95,Boston Celtics,112,Box Score,,18624,,True
4,2015-10-28,7:30p,Chicago Bulls,115,Brooklyn Nets,100,Box Score,,17732,,False


In [5]:
#creating a class array for later to be used in scikit-learn
y_true = dataset["HomeWin"].values #create list of class for homewinner
print(type(y_true))

dataset["HomeWin"].mean()  #mean of homewinner chances

print(y_true[0:5])

<class 'numpy.ndarray'>
[False  True  True  True False]


Now here we cannot use the game points to train the model becuase for the new game we would not know the points before the game happens. Hence creating new features which can be used to train the model

First new feature - 1.which team had won the previous game 

In [6]:
from collections import defaultdict
won_last = defaultdict(int)   #will use this dictionary to hold the previous game result.
#key - team name and value - won/last previous game

#create two new columns which would hold value of result of previous game
dataset["HomeLastWin"] = 0  
dataset["VisitorLastWin"] = 0



In [28]:
#for each row , update the winner of previous game 

#for index, row in enumerate(dataset):
for index, row in dataset.sort_values("Date").iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    row["HomeLastWin"] = won_last[home_team]
    dataset.at[index, "HomeLastWin"] = won_last[home_team]
    dataset.at[index, "VisitorLastWin"] =  won_last[visitor_team]
#also, update the won_last depending upon the current row data
    won_last[home_team] = int(row["HomeWin"])
    won_last[visitor_team] = 1 - int(row["HomeWin"])
    
dataset.tail(5)

#this entire process would set the valu 1 or 0 for home team and visitor team indicating if those team had won their
#previous game. At this moment we are only checking if team had won or not, we are not considering if it had won or lost
#against the same opponent

Unnamed: 0,Date,Start (ET),Visitor Team,VisitorPts,Home Team,HomePts,OT?,Score Type,Attend.,Notes,HomeWin,HomeLastWin,VisitorLastWin,HomeTeamRanksHigher,HomeTeamWonLast
1311,2016-06-08,9:00p,Golden State Warriors,90,Cleveland Cavaliers,120,Box Score,,20562,,True,0,1,0,0
1312,2016-06-10,9:00p,Golden State Warriors,108,Cleveland Cavaliers,97,Box Score,,20562,,False,1,0,0,1
1313,2016-06-13,9:00p,Cleveland Cavaliers,112,Golden State Warriors,97,Box Score,,19596,,False,1,0,1,1
1314,2016-06-16,9:00p,Golden State Warriors,101,Cleveland Cavaliers,115,Box Score,,20562,,True,1,0,0,1
1315,2016-06-19,8:00p,Cleveland Cavaliers,93,Golden State Warriors,89,Box Score,,19596,,False,0,1,1,0


In [8]:
dataset.iloc[1000:1005]

#create numpy array 
X_previouswins = dataset[["HomeLastWin", "VisitorLastWin"]].values
print(type(X_previouswins))

<class 'numpy.ndarray'>


Will use Decision Tree algorithm implementation of sklearn library which uses Classification and Regression Trees(CART)

In [9]:
####################decision tree application starts ##################

from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=14)


In [10]:

from sklearn.model_selection import cross_val_score
import numpy as np

scores = cross_val_score(clf, X_previouswins, y_true,scoring='accuracy',cv=3)
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Accuracy: 59.4%


Now here the score is just 59.4 which is almost same as average of number of times home team has won
which we calculated in previous steps. Hence we need to improve this. One of the way is feature engineering.

So for feature engineering considering two new questions :
   1.  which team is considered better generally - we will use standings(rankings) to decide this. New set of data is pulled from https://www.basketball-reference.com/leagues/NBA_2015_standings.html
   2. which team won their last encounter

In [11]:
#testing with different feature
import os
standings_filename = os.path.join("standings.csv")

standings = pd.read_csv(standings_filename, skiprows=1)

standings.head()

Unnamed: 0,Rk,Team,Overall,Home,Road,E,W,A,C,SE,...,Post,=3,=10,Oct,Nov,Dec,Jan,Feb,Mar,Apr
0,1,Golden State Warriors,67-15,39-2,28-13,25-5,42-10,9-1,7-3,9-1,...,25-6,5-3,45-9,1-0,13-2,11-3,12-3,8-3,16-2,6-2
1,2,Atlanta Hawks,60-22,35-6,25-16,38-14,22-8,12-6,14-4,12-4,...,17-11,6-4,30-10,0-1,9-5,14-2,17-0,7-4,9-7,4-3
2,3,Houston Rockets,56-26,30-11,26-15,23-7,33-19,9-1,8-2,6-4,...,20-9,8-4,31-14,2-0,11-4,9-5,11-6,7-3,10-6,6-2
3,4,Los Angeles Clippers,56-26,30-11,26-15,19-11,37-15,7-3,6-4,6-4,...,21-7,3-5,33-9,2-0,9-5,11-6,11-4,5-6,11-5,7-0
4,5,Memphis Grizzlies,55-27,31-10,24-17,20-10,35-17,8-2,5-5,7-3,...,16-13,9-3,26-13,2-0,13-2,8-6,12-4,7-4,9-8,4-3


Code to add new feature discussed above. which team is considered better generally

In [12]:
dataset["HomeTeamRanksHigher"] = 0
for index, row in dataset.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    home_rank = standings[standings["Team"] == home_team]["Rk"].values[0]
    visitor_rank = standings[standings["Team"] == visitor_team]["Rk"].values[0]
    dataset.at[index, "HomeTeamRanksHigher"] =  int(home_rank < visitor_rank)
    
X_homehigher = dataset[[ "HomeTeamRanksHigher", "HomeLastWin", "VisitorLastWin",]].values

In [13]:
clf = DecisionTreeClassifier(random_state=14, criterion="entropy")

scores = cross_val_score(clf, X_homehigher, y_true, scoring='accuracy',cv=3)

print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Accuracy: 60.9%


By adding additional feature of which team is standing higher in the rank has gained us little more improvement.
Will try to imrpove it even further by adding yet another feature. Who won in the last encounter

In [14]:
last_match_winner = defaultdict(int) #will hold pair of playing teams as a tuple
dataset["HomeTeamWonLast"] = 0 

for index, row in dataset.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    
    teams = tuple(sorted([home_team, visitor_team]))  # Sort for a consistent ordering

    # Set in the row, who won the last encounter
    home_team_won_last = 1 if last_match_winner[teams] == row["Home Team"] else 0
    dataset.at[index, "HomeTeamWonLast"] =  home_team_won_last
    # Who won this one?
    winner = row["Home Team"] if row["HomeWin"] else row["Visitor Team"]
    last_match_winner[teams] = winner

In [27]:
dataset.loc[400:405]

Unnamed: 0,Date,Start (ET),Visitor Team,VisitorPts,Home Team,HomePts,OT?,Score Type,Attend.,Notes,HomeWin,HomeLastWin,VisitorLastWin,HomeTeamRanksHigher,HomeTeamWonLast
400,2015-12-19,5:00p,Los Angeles Lakers,78,Oklahoma City Thunder,118,Box Score,,18203,,True,0,0,1,0
401,2015-12-19,7:00p,Charlotte Hornets,101,Washington Wizards,109,Box Score,,16987,,True,0,1,1,0
402,2015-12-20,1:00p,Minnesota Timberwolves,100,Brooklyn Nets,85,Box Score,,14552,,False,0,1,1,0
403,2015-12-20,3:30p,Philadelphia 76ers,86,Cleveland Cavaliers,108,Box Score,,20562,,True,1,0,1,1
404,2015-12-20,8:00p,New Orleans Pelicans,130,Denver Nuggets,125,Box Score,,13857,,False,0,0,0,1
405,2015-12-20,1:00p,Portland Trail Blazers,109,Miami Heat,116,Box Score,,19600,,True,0,0,0,0


In [16]:
X_lastwinner = dataset[[ "HomeTeamWonLast", "HomeTeamRanksHigher", "HomeLastWin", "VisitorLastWin",]].values
clf = DecisionTreeClassifier(random_state=14, criterion="entropy")

scores = cross_val_score(clf, X_lastwinner, y_true, scoring='accuracy',cv=3)

print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Accuracy: 62.2%


Accuracy has again improved by adding one more feature

In [17]:
############# experimenting with transformers 

## assigning numbers to teams instead of names
from sklearn.preprocessing import LabelEncoder
encoding = LabelEncoder()
encoding.fit(dataset["Home Team"].values)
home_teams = encoding.transform(dataset["Home Team"].values)
visitor_teams = encoding.transform(dataset["Visitor Team"].values)
home_teams


array([0, 4, 9, ..., 9, 5, 9])

In [18]:
X_teams = np.vstack([home_teams, visitor_teams]).T
X_teams




array([[ 0,  8],
       [ 4,  5],
       [ 9, 18],
       ...,
       [ 9,  5],
       [ 5,  9],
       [ 9,  5]])

In [19]:
# converting team names to categorical 
from sklearn.preprocessing import OneHotEncoder
onehot = OneHotEncoder(categories='auto')
X_teams = onehot.fit_transform(X_teams).todense()

clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_teams, y_true, scoring='accuracy',cv=3)
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Accuracy: 62.8%


result is stil pretty same and only the team names were fed to model..hence not so reliable

# Using Random Forest to overcome possibilties of overfitting

In case of random forest, smaller trees are built using randomly selected smaller subset and also randomly selected features 

In [20]:
#random forest 

from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=14 , n_estimators = 10)
scores = cross_val_score(clf, X_teams, y_true, scoring='accuracy', cv=3)
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Accuracy: 65.3%


In [21]:
#with all the features
X_all = np.hstack([X_lastwinner, X_teams])
clf = RandomForestClassifier(random_state=14, n_estimators = 10)
scores = cross_val_score(clf, X_all, y_true, scoring='accuracy' , cv=3)
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Accuracy: 63.3%


In [22]:
#with estimators 250
X_all = np.hstack([X_lastwinner, X_teams])
clf = RandomForestClassifier(random_state=14, n_estimators=250)
scores = cross_val_score(clf, X_all, y_true, scoring='accuracy' , cv=3)
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Accuracy: 64.5%


In [23]:
from sklearn.model_selection import GridSearchCV
parameter_space = {
    "max_features": [2, 10, 'auto'],
    "n_estimators": [100, 200],
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": [2, 4, 6],
}


In [24]:
clf = RandomForestClassifier(random_state=14)
grid = GridSearchCV(clf, parameter_space , cv=3)
grid.fit(X_all, y_true)
print("Accuracy: {0:.1f}%".format(grid.best_score_ * 100))

Accuracy: 67.4%


In [25]:
print(grid.best_estimator_)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features=2, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=14, verbose=0, warm_start=False)


# Summary : 
Accuracies achieved :
1. Decision tree - 

  a. Features - if team had won previously - 59.4%
  
  b. Features - if team had won previously , which team has higher rankings , which team had won in previous encounter - 60.9%
  
  
2. Random Forest -

  a. Features - only team participating - 65.3%
  
  b. Features - All the previosuly derived features - 63.3%
  
  c. With increased estimators - 64.5%
  
  d. Using model selection of GridSearchCV - 67.4%
        