<a href="https://colab.research.google.com/github/steensme/nba-predictions/blob/main/NBA_Predictions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
# https://medium.com/analytics-vidhya/how-to-web-scrape-tables-online-using-python-and-beautifulsoup-36d5bafeb982

import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://www.espn.com/nba/stats/team/_/view/team/table/offensive/sort/fieldGoalPct/dir/desc"'
requests.get(url)
page = requests.get(url)

soup = BeautifulSoup(page.text, 'lxml')
# print(soup)

table_data = soup.find('table', class_ = 'Table Table--align-right')

headers = []
for i in table_data.find_all('th'):
    title = i.text
    headers.append(title)

team_names = []

team = soup.find('tbody')
for teamlogo in team.find_all('div', class_='flex items-start mr7'):
  teamname = teamlogo.find('img', class_='Image Logo Logo__sm')['title']
  team_names.append(teamname)

stats = pd.DataFrame(columns = headers)

for j in table_data.find_all('tr')[1:]:
        row_data = j.find_all('td')
        row = [tr.text for tr in row_data]
        length = len(stats)
        stats.loc[length] = row

stats.insert(0, 'Team', team_names)

In [26]:
# Scrape the games from November
url = 'https://www.basketball-reference.com/leagues/NBA_2022_games-november.html'
requests.get(url)
page = requests.get(url)

soup = BeautifulSoup(page.text, 'lxml')
# print(soup)

table_data = soup.find('table', id = 'schedule') #class_ = 'suppress_glossary sortable stats_table now_sortable')

headers = []
for i in table_data.thead.find_all('th'):
    title = i.text
    headers.append(title)

scores_nov = pd.DataFrame(columns = headers[1:]) # Leave out 'Date' column becaues this is hard to scrape ('th' tag, not 'td')

for j in table_data.tbody.find_all('tr'):
  row_data = j.find_all('td')
  row = [tr.text for tr in row_data]
  length = len(scores_nov)
  scores_nov.loc[length] = row

In [27]:
# Scrape the games from December
url = 'https://www.basketball-reference.com/leagues/NBA_2022_games-december.html'
requests.get(url)
page = requests.get(url)

soup = BeautifulSoup(page.text, 'lxml')
# print(soup)

table_data = soup.find('table', id = 'schedule') #class_ = 'suppress_glossary sortable stats_table now_sortable')

headers = []
for i in table_data.thead.find_all('th'):
    title = i.text
    headers.append(title)

scores_dec = pd.DataFrame(columns = headers[1:]) # Leave out 'Date' column becaues this is hard to scrape ('th' tag, not 'td')

for j in table_data.tbody.find_all('tr'):
  row_data = j.find_all('td')
  row = [tr.text for tr in row_data]
  length = len(scores_dec)
  scores_dec.loc[length] = row

In [28]:
scores = pd.concat([scores_nov,scores_dec])

In [29]:
scores

Unnamed: 0,Start (ET),Visitor/Neutral,PTS,Home/Neutral,PTS.1,Unnamed: 6,Unnamed: 7,Attend.,Notes
0,7:00p,Cleveland Cavaliers,113,Charlotte Hornets,110,Box Score,,13889,
1,7:00p,San Antonio Spurs,118,Indiana Pacers,131,Box Score,,10227,
2,7:00p,Portland Trail Blazers,103,Philadelphia 76ers,113,Box Score,,20115,
3,7:30p,Washington Wizards,111,Atlanta Hawks,118,Box Score,,14632,
4,7:30p,Chicago Bulls,128,Boston Celtics,114,Box Score,,19156,
...,...,...,...,...,...,...,...,...,...
204,7:30p,Los Angeles Clippers,108,Toronto Raptors,116,Box Score,,0,
205,8:00p,San Antonio Spurs,105,Memphis Grizzlies,118,Box Score,,15412,
206,8:00p,New York Knicks,80,Oklahoma City Thunder,95,Box Score,,16451,
207,9:00p,Minnesota Timberwolves,108,Utah Jazz,120,Box Score,,18306,


In [30]:
# Drop Games that do not have scores yet
import numpy as np
scores['PTS'] = scores['PTS'].replace('', np.nan, inplace=False)
scores = scores.dropna(subset=['PTS'], inplace=False)

In [31]:
# Convert 'PTS' columns into integers and subtract them to see the score margin
scores['result_v'] = scores.iloc[:,2].astype("int64") - scores.iloc[:,4].astype("int64")

In [32]:
# make positive result a win for the visitor and a loss otherwise
scores.loc[scores['result_v'] > 0, 'visitor'] = 'W'
scores.loc[scores['result_v'] < 0, 'visitor'] = "L"

# make negative result v a win for the home and a loss otherwise
scores.loc[scores['result_v'] < 0, 'home'] = "W"
scores.loc[scores['result_v'] > 0, 'home'] = "L"

# Change Los Angeles Clippers to LA Clippers
scores["Visitor/Neutral"] = scores["Visitor/Neutral"].replace("Los Angeles Clippers", "LA Clippers", regex=True)
scores["Home/Neutral"] = scores["Home/Neutral"].replace("Los Angeles Clippers", "LA Clippers", regex=True)

In [33]:
# Make a table with only the teams, whether they were @ home and the win or loss
result1 = pd.DataFrame({'Team':scores['Visitor/Neutral'],'Site': 'V','result':scores['visitor']})
result2 = pd.DataFrame({'Team':scores['Home/Neutral'],'Site': 'H','result':scores['home']})

# combine the tables to get a probability of win based on stats
results = pd.concat([result1,result2])

In [34]:
# add the statistical data to the teams
merged = pd.merge(results, stats, on='Team', how='left')

In [35]:
# Select only the attributes that it makes sense to study
import numpy as np
results_stats = pd.DataFrame({'Site':merged['Site'].astype("category"), 'PTS':pd.to_numeric(merged['PTS']), 'FG%':pd.to_numeric(merged['FG%']), 
                              '3P%':pd.to_numeric(merged['3P%']), 'FT%':pd.to_numeric(merged['FT%']), 'OR':pd.to_numeric(merged['OR']), 
                              'DR':pd.to_numeric(merged['DR']), 'AST':pd.to_numeric(merged['AST']), 'STL':pd.to_numeric(merged['STL']), 
                              'BLK':pd.to_numeric(merged['BLK']), 'TO':pd.to_numeric(merged['TO']), 'PF':pd.to_numeric(merged['PF']), 
                              'Result':merged['result'].astype("category")})
results_stats.Site = results_stats.Site.map({'V': 0, 'H':1})
results_stats.Result = results_stats.Result.map({'L': 0, 'W':1})

In [36]:
# Check for null values
is_NaN = merged.isnull()
row_has_NaN = is_NaN.any(axis=1)
rows_with_NaN = merged[row_has_NaN]
print(rows_with_NaN)

Empty DataFrame
Columns: [Team, Site, result, GP, PTS, FGM, FGA, FG%, 3PM, 3PA, 3P%, FTM, FTA, FT%, OR, DR, REB, AST, STL, BLK, TO, PF]
Index: []


In [37]:
# Method from https://www.datacamp.com/community/tutorials/understanding-logistic-regression-python

cols=['Site', 'PTS', 'FG%', '3P%', 'FT%', 'OR', 'DR', 'AST', 'STL', 'BLK', 'TO', 'PF']
# cols=['Site', 'FG%', '3P%', 'OR', 'AST', 'TO']

X = results_stats[cols]
y = results_stats.Result

In [38]:
# split X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=0)

In [39]:
# import the class
from sklearn.linear_model import LogisticRegression

# instantiate the model (using the default parameters)
logreg = LogisticRegression(max_iter=500) # needed to add this for the model to converge

# fit the model with data
logreg.fit(X_train,y_train)

# Make predictions on the test set
y_pred=logreg.predict(X_test)

In [40]:
# import the metrics class to test the model accuracy
from sklearn import metrics
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix

array([[75, 38],
       [44, 60]])

In [41]:
# Try out predicting the probabilities of W v. L for just one item from a dataframe
one_sample = X_test.iloc[1,:].tolist()

logreg.predict_proba([one_sample])

  "X does not have valid feature names, but"


array([[0.57576987, 0.42423013]])

In [42]:
# Scrape the current days games
url = 'https://www.espn.com/nba/schedule'
requests.get(url)
page = requests.get(url)

soup = BeautifulSoup(page.text, 'lxml')
# print(soup)

table_data = soup.find('table', class_ = 'schedule has-team-logos align-left')

team_names = []

schedule = soup.find('tbody')

for team in schedule.find_all("abbr"):
  team_name = team['title']
  team_names.append(team_name)

# print(team_names)

# Break the list into several lists
games = pd.DataFrame([team_names[i:i + 2] for i in range(0, len(team_names), 2)]) # help from https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks

games

Unnamed: 0,0,1
0,New Orleans Pelicans,Milwaukee Bucks
1,San Antonio Spurs,Detroit Pistons
2,Chicago Bulls,Washington Wizards
3,LA Clippers,Brooklyn Nets
4,Denver Nuggets,Houston Rockets
5,Golden State Warriors,Utah Jazz


In [None]:
# Join the value in the first column to the stats and then predict the probability of winning
# Next, join the value in the second column to the stats and then predict the probability of winning
# Select the team with the higher probabiliyt as the team that will win

In [43]:
visitor = pd.DataFrame({'Team': games.iloc[:,0], 'Site': 'V'})
visitor_stats = pd.merge(visitor, stats, on='Team', how='left')
home = pd.DataFrame({'Team': games.iloc[:,1], 'Site': 'H'})
home_stats = pd.merge(home, stats, on='Team', how='left')

In [44]:
visitor_trans = pd.DataFrame({'Site':visitor_stats['Site'].astype("category"), 'PTS':pd.to_numeric(visitor_stats['PTS']), 'FG%':pd.to_numeric(visitor_stats['FG%']), 
                              '3P%':pd.to_numeric(visitor_stats['3P%']), 'FT%':pd.to_numeric(visitor_stats['FT%']), 'OR':pd.to_numeric(visitor_stats['OR']), 
                              'DR':pd.to_numeric(visitor_stats['DR']), 'AST':pd.to_numeric(visitor_stats['AST']), 'STL':pd.to_numeric(visitor_stats['STL']), 
                              'BLK':pd.to_numeric(visitor_stats['BLK']), 'TO':pd.to_numeric(visitor_stats['TO']), 'PF':pd.to_numeric(visitor_stats['PF'])})
visitor_trans.Site = visitor_trans.Site.map({'V': 0, 'H':1})
home_trans = pd.DataFrame({'Site':home_stats['Site'].astype("category"), 'PTS':pd.to_numeric(home_stats['PTS']), 'FG%':pd.to_numeric(home_stats['FG%']), 
                              '3P%':pd.to_numeric(home_stats['3P%']), 'FT%':pd.to_numeric(home_stats['FT%']), 'OR':pd.to_numeric(home_stats['OR']), 
                              'DR':pd.to_numeric(home_stats['DR']), 'AST':pd.to_numeric(home_stats['AST']), 'STL':pd.to_numeric(home_stats['STL']), 
                              'BLK':pd.to_numeric(home_stats['BLK']), 'TO':pd.to_numeric(home_stats['TO']), 'PF':pd.to_numeric(home_stats['PF'])})
home_trans.Site = home_trans.Site.map({'V': 0, 'H':1})

In [45]:
v_prob = logreg.predict_proba(visitor_trans[cols])
h_prob = logreg.predict_proba(home_trans[cols])

In [48]:
games_prob = pd.DataFrame({'Visitor': games.iloc[:,0], 'Prob_V': v_prob[:,1], 'Home': games.iloc[:,1], 'Prob_H':h_prob[:,1]})

In [49]:
games_prob

Unnamed: 0,Visitor,Prob_V,Home,Prob_H
0,New Orleans Pelicans,0.472756,Milwaukee Bucks,0.672599
1,San Antonio Spurs,0.401961,Detroit Pistons,0.297185
2,Chicago Bulls,0.611196,Washington Wizards,0.463177
3,LA Clippers,0.517797,Brooklyn Nets,0.714347
4,Denver Nuggets,0.432552,Houston Rockets,0.328106
5,Golden State Warriors,0.715466,Utah Jazz,0.773122


In [50]:
prediction = []
margin = []
for row in games_prob.iterrows():
  if row[1].Prob_V > row[1].Prob_H:
    prediction.append(row[1].Visitor)
    margin.append(row[1].Prob_V - row[1].Prob_H)
  else:
    prediction.append(row[1].Home)
    margin.append(row[1].Prob_H - row[1].Prob_V)

In [51]:
games_pred = pd.DataFrame({'Visitor': games.iloc[:,0], 'Home': games.iloc[:,1], 'Prediction': prediction, 'Margin': margin})

In [52]:
games_pred

Unnamed: 0,Visitor,Home,Prediction,Margin
0,New Orleans Pelicans,Milwaukee Bucks,Milwaukee Bucks,0.199843
1,San Antonio Spurs,Detroit Pistons,San Antonio Spurs,0.104776
2,Chicago Bulls,Washington Wizards,Chicago Bulls,0.148019
3,LA Clippers,Brooklyn Nets,Brooklyn Nets,0.19655
4,Denver Nuggets,Houston Rockets,Denver Nuggets,0.104446
5,Golden State Warriors,Utah Jazz,Utah Jazz,0.057656
