In [1]:
"""
In this program we will look at predicting the winner of sports matches using a different type of classification algorithm: decision trees.
Advantages of decision trees:
a) They are readable by human, allowing for their use in human-driven decision making.
b) They work with a variety of features, including categorical.
We will look at predicting the winner of games of the NBA using an entry level basketball match prediction algorithm.
"""
import pandas as pd
data_filename = "basketball.csv"
dataset = pd.read_csv(data_filename)


In [2]:
# Testing dataset and showing first five rows
dataset.head(5)

Unnamed: 0,Date,Start (ET),Visitor/Neutral,PTS,Home/Neutral,PTS.1,Unnamed: 6,Unnamed: 7,Attend.,Notes
0,Tue Oct 27 2015,8:00p,Detroit Pistons,106,Atlanta Hawks,94,Box Score,,19187,
1,Tue Oct 27 2015,8:00p,Cleveland Cavaliers,95,Chicago Bulls,97,Box Score,,21957,
2,Tue Oct 27 2015,10:30p,New Orleans Pelicans,95,Golden State Warriors,111,Box Score,,19596,
3,Wed Oct 28 2015,7:00p,Washington Wizards,88,Orlando Magic,87,Box Score,,18846,
4,Wed Oct 28 2015,7:30p,Indiana Pacers,99,Toronto Raptors,106,Box Score,,19800,


In [3]:
# Changing the headings after loading the file
dataset = pd.read_csv(data_filename, parse_dates=["Date"])
dataset.columns = ["Date", "Start (ET)", "Visitor Team", "VisitorPts", "Home Team", "HomePts", "OT?", "Score Type", "Attend.", "Notes"]

# Testing dataset with those changes
dataset.head(5)

Unnamed: 0,Date,Start (ET),Visitor Team,VisitorPts,Home Team,HomePts,OT?,Score Type,Attend.,Notes
0,2015-10-27,8:00p,Detroit Pistons,106,Atlanta Hawks,94,Box Score,,19187,
1,2015-10-27,8:00p,Cleveland Cavaliers,95,Chicago Bulls,97,Box Score,,21957,
2,2015-10-27,10:30p,New Orleans Pelicans,95,Golden State Warriors,111,Box Score,,19596,
3,2015-10-28,7:00p,Washington Wizards,88,Orlando Magic,87,Box Score,,18846,
4,2015-10-28,7:30p,Indiana Pacers,99,Toronto Raptors,106,Box Score,,19800,


In [4]:
# Checking the data type of the data
print(dataset.dtypes)

Date            datetime64[ns]
Start (ET)              object
Visitor Team            object
VisitorPts               int64
Home Team               object
HomePts                  int64
OT?                     object
Score Type              object
Attend.                  int64
Notes                   object
dtype: object


In [5]:
# Specifying class value as 1 if the home team wins and 0 if the visitor team wins
dataset["HomeWin"] = dataset["VisitorPts"] < dataset["HomePts"]

In [6]:
# Extracting the values to use with scikit-learn
y_true = dataset["HomeWin"].values

# Checking home team advantage
dataset["HomeWin"].mean()

0.5942249240121581

In [7]:
# Creating feature to check if the two teams won their previous game, so we check if they are playing well
from collections import defaultdict

# Dictionary will be used to store the team's last result
won_last = defaultdict(int)

# We then create a new feature in our dataset to store the results of our new features
dataset["HomeLastWin"] = 0
dataset["VisitorLastWin"] = 0

for index, row in dataset.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    row["HomeLastWin"] = won_last[home_team]
    dataset.at[index, "HomeLastWin"] = won_last[home_team]
    dataset.at[index, "VisitorLastWin"] = won_last[visitor_team]
    won_last[home_team] = int(row["HomeWin"])
    won_last[visitor_team] = 1 - int(row["HomeWin"])
    

In [8]:
# Checking columns inclusion
dataset.head(6)

Unnamed: 0,Date,Start (ET),Visitor Team,VisitorPts,Home Team,HomePts,OT?,Score Type,Attend.,Notes,HomeWin,HomeLastWin,VisitorLastWin
0,2015-10-27,8:00p,Detroit Pistons,106,Atlanta Hawks,94,Box Score,,19187,,False,0,0
1,2015-10-27,8:00p,Cleveland Cavaliers,95,Chicago Bulls,97,Box Score,,21957,,True,0,0
2,2015-10-27,10:30p,New Orleans Pelicans,95,Golden State Warriors,111,Box Score,,19596,,True,0,0
3,2015-10-28,7:00p,Washington Wizards,88,Orlando Magic,87,Box Score,,18846,,False,0,0
4,2015-10-28,7:30p,Indiana Pacers,99,Toronto Raptors,106,Box Score,,19800,,True,0,0
5,2015-10-28,7:30p,Charlotte Hornets,94,Miami Heat,104,Box Score,,19724,,True,0,0


In [9]:
# Check last columns to see example of home team and visitor team that won their recent game
dataset.loc[1000:1005]

Unnamed: 0,Date,Start (ET),Visitor Team,VisitorPts,Home Team,HomePts,OT?,Score Type,Attend.,Notes,HomeWin,HomeLastWin,VisitorLastWin
1000,2016-03-15,8:30p,Los Angeles Clippers,87,San Antonio Spurs,108,Box Score,,18418,,True,1,0
1001,2016-03-15,10:30p,Sacramento Kings,106,Los Angeles Lakers,98,Box Score,,18997,,False,0,0
1002,2016-03-16,7:00p,Oklahoma City Thunder,130,Boston Celtics,109,Box Score,,18624,,False,0,1
1003,2016-03-16,7:00p,Orlando Magic,99,Charlotte Hornets,107,Box Score,,16148,,True,0,1
1004,2016-03-16,7:00p,Dallas Mavericks,98,Cleveland Cavaliers,99,Box Score,,20562,,True,0,1
1005,2016-03-16,7:00p,Chicago Bulls,96,Washington Wizards,117,Box Score,,19556,,True,1,1


In [10]:
# Importing tools to create a decision tree
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=14)

In [11]:
# Extracting the values of dataset
X_previouswins = dataset[["HomeLastWin", "VisitorLastWin"]].values

In [12]:
# Checking data values
X_previouswins

array([[0, 0],
       [0, 0],
       [0, 0],
       ...,
       [1, 0],
       [1, 0],
       [0, 1]], dtype=int64)

In [13]:
# Since decision trees are estimators, they have fit and predict methods. Using cross_val_score we can get the average score
from sklearn.model_selection import cross_val_score
import numpy as np

scores = cross_val_score(clf, X_previouswins, y_true, scoring='accuracy')
print(f'Accuracy: {np.mean(scores)*100:.1f}%')

Accuracy: 59.4%


In [14]:
"""
Using two features to get better results and answer the following questions:
1) Which team is considered better generally? 
    - A team will be considered better if it ranked higher in last season than other team.
2) Which team won their last encounter?

Obtaining the standings data to perform is the first step.
"""
import os

standing_filename = os.path.join("standings.csv")
standings = pd.read_csv(standing_filename, skiprows=1)

standings.head()

Unnamed: 0,Rk,Team,Overall,Home,Road,E,W,A,C,SE,...,Post,=3,=10,Oct,Nov,Dec,Jan,Feb,Mar,Apr
0,1,Golden State Warriors,67-15,39-2,28-13,25-5,42-10,9-1,7-3,9-1,...,25-6,5-3,45-9,1-0,13-2,11-3,12-3,8-3,16-2,6-2
1,2,Atlanta Hawks,60-22,35-6,25-16,38-14,22-8,12-6,14-4,12-4,...,17-11,6-4,30-10,0-1,9-5,14-2,17-0,7-4,9-7,4-3
2,3,Houston Rockets,56-26,30-11,26-15,23-7,33-19,9-1,8-2,6-4,...,20-9,8-4,31-14,2-0,11-4,9-5,11-6,7-3,10-6,6-2
3,4,Los Angeles Clippers,56-26,30-11,26-15,19-11,37-15,7-3,6-4,6-4,...,21-7,3-5,33-9,2-0,9-5,11-6,11-4,5-6,11-5,7-0
4,5,Memphis Grizzlies,55-27,31-10,24-17,20-10,35-17,8-2,5-5,7-3,...,16-13,9-3,26-13,2-0,13-2,8-6,12-4,7-4,9-8,4-3


In [15]:
# Creating a feature similar to the previous to get standings for home team and visitor team
dataset["HomeTeamRanksHigher"] = 0

for index, row in dataset.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    home_rank = standings[standings["Team"] == home_team]["Rk"].values[0]
    visitor_rank = standings[standings["Team"] == visitor_team]["Rk"].values[0]
    row["HomeTeamRanksHigher"] = int(home_rank > visitor_rank)
    dataset.at[index, "HomeTeamRanksHigher"] = int(home_rank < visitor_rank)

In [16]:
# Using cross_val_score to test the result
X_homehigher = dataset[["HomeLastWin", "VisitorLastWin", "HomeTeamRanksHigher"]].values

In [17]:
# Checking data values
X_homehigher

array([[0, 0, 1],
       [0, 0, 0],
       [0, 0, 1],
       ...,
       [1, 0, 1],
       [1, 0, 0],
       [0, 1, 1]], dtype=int64)

In [18]:
# Creating a new decision tree classifier to run the evaluation
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_homehigher, y_true, scoring='accuracy')
print(f'Accuracy: {np.mean(scores)*100:.1f}%')

Accuracy: 61.8%


In [19]:
# Creating a dictionary to store the winner of the past game and creating a new feature in the dataframe
last_match_winner = defaultdict(int)
dataset["HomeTeamWonLast"] = 0

for index, row in dataset.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    # Sort for a consistent ordering
    teams = tuple(sorted([home_team, visitor_team]))
    # Set in the row who won the last encounter
    home_team_won_last = 1 if last_match_winner[teams] == row["Home Team"] else 0
    dataset.at[index, "HomeTeamWonLast"] = home_team_won_last
    # Who won this one?
    winner = row["Home Team"] if row["HomeWin"] else row["Visitor Team"]
    last_match_winner[teams] = winner

In [20]:
# Evaluating the result
X_lastwinner = dataset[["HomeTeamWonLast", "HomeTeamRanksHigher", "HomeLastWin", "VisitorLastWin"]].values
clf = DecisionTreeClassifier(random_state=14, criterion="entropy")
scores = cross_val_score(clf, X_lastwinner, y_true, scoring='accuracy')
print(f'Accuracy: {np.mean(scores)*100:.1f}%')

Accuracy: 62.6%


In [21]:
# Then check what happens if a lot of data is thrown at the Decision Tree and see it can learn an effective model
# Converting string-based team names into assigned integer values
from sklearn.preprocessing import LabelEncoder

encoding = LabelEncoder()
encoding.fit(dataset["Home Team"].values)
home_teams = encoding.transform(dataset["Home Team"].values)
visitor_teams = encoding.transform(dataset["Visitor Team"].values)
X_teams = np.vstack([home_teams, visitor_teams]).T

In [22]:
# Transforming those integers into a number of binary features
from sklearn.preprocessing import OneHotEncoder

onehot = OneHotEncoder()
X_teams = onehot.fit_transform(X_teams).todense()

In [23]:
# Checking data values
X_teams

matrix([[1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [24]:
# Running decision tree on the new dataset
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_teams, y_true, scoring='accuracy')
print(f'Accuracy: {np.mean(scores)*100:.1f}%')

Accuracy: 63.3%


In [25]:
# Testing Random Forests algorithm to get accuracy
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=14)
scores = cross_val_score(clf, X_teams, y_true, scoring='accuracy')
print(f'Accuracy: {np.mean(scores)*100:.1f}%')

Accuracy: 64.3%


In [26]:
# Using subsets of the features with Random Forests
X_all = np.hstack([X_lastwinner, X_teams])
clf = RandomForestClassifier(random_state=14)
scores = cross_val_score(clf, X_all, y_true, scoring='accuracy')
print(f'Accuracy: {np.mean(scores)*100:.1f}%')

Accuracy: 65.1%


In [27]:
# Testing GridSearchCV algorithm to get accuracy
from sklearn.model_selection import GridSearchCV

parameter_space = {
    "max_features": [2, 10, 'auto'],
    "n_estimators": [100, 200],
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": [2, 4, 6],
}
clf = RandomForestClassifier(random_state=14)
grid = GridSearchCV(clf, parameter_space)
grid.fit(X_all, y_true)
print(f'Accuracy: {grid.best_score_*100:.1f}%')

Accuracy: 68.2%


In [28]:
# Checking best model found in the grid search
print(grid.best_estimator_)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features=10,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=14, verbose=0,
                       warm_start=False)
