In [1]:
import os
import re
import datetime
import requests
import json
import pandas as pd
import numpy as np
import warnings
from scipy import stats
from sbrscrape import Scoreboard

warnings.filterwarnings("ignore")


In [2]:
def scrape_games(sport="NBA", date="", current_line=True, session=None, timeout=30):
    """Fetch the raw game rows from sportsbookreview.com's odds page for one day.

    The public odds page is loaded first to discover the site's current
    Next.js ``buildId``; that id is then used to request the matching
    ``_next/data`` JSON document, whose game rows are returned.

    Parameters
    ----------
    sport : str
        One of "NBA", "NFL", "NHL", "MLB" or "NCAAB".
    date : str
        Day to query, formatted ``YYYY-MM-DD``. An empty string means today.
    current_line : bool
        Accepted for backward compatibility only. It does not affect the
        result: the rows are returned unfiltered.
    session : object, optional
        Any object with a ``requests``-style ``get(url, timeout=...)`` method
        (for example a ``requests.Session``). Defaults to the ``requests``
        module itself.
    timeout : float
        Seconds to wait for each HTTP response.

    Returns
    -------
    list
        The ``gameRows`` list of the first odds table, or ``[]`` when the
        build id or the odds table cannot be found in the responses.

    Raises
    ------
    KeyError
        If ``sport`` is not one of the supported keys.
    """
    sport_dict = {"NBA": "nba-basketball",
                  "NFL": "nfl-football",
                  "NHL": "nhl-hockey",
                  "MLB": "mlb-baseball",
                  "NCAAB": "ncaa-basketball"}

    if date == "":
        # `datetime` is imported as a module in this file, so the class has
        # to be qualified; the bare `datetime.today()` raised AttributeError.
        date = datetime.datetime.today().strftime("%Y-%m-%d")

    http = requests if session is None else session
    # Looked up outside the try block so an unsupported sport still raises KeyError.
    sport_slug = sport_dict[sport]

    page_url = f"https://www.sportsbookreview.com/betting-odds/{sport_slug}/?date={date}"
    page = http.get(page_url, timeout=timeout)
    next_data = re.findall('__NEXT_DATA__" type="application/json">(.*?)</script>', page.text)
    try:
        build_id = json.loads(next_data[0])['buildId']
        data_url = f"https://www.sportsbookreview.com/_next/data/{build_id}/betting-odds/{sport_slug}.json?date={date}"
        payload = http.get(data_url, timeout=timeout).json()
        return payload['pageProps']['oddsTables'][0]['oddsTableModel']['gameRows']
    except (IndexError, KeyError, ValueError):
        # No embedded build id, no odds table for this day, or a response
        # that is not the expected JSON: treat all of these as "no games".
        print("Nope")
        return []

In [3]:
# Historical odds data downloaded from: https://github.com/kyleskom/NBA-Machine-Learning-Sports-Betting/tree/master/Odds-Data/Odds-Data-Clean
# Download 2015-2022 odds data from above source, and create a concatenated dataframe
# NOTE(review): this is an absolute, machine-specific path; point it at your local copy of the data.
directory = '/Users/ryanmitchell/nba_line_predictions/odds_data'

# Read every workbook first and concatenate once: growing a DataFrame inside
# the loop re-copies all previously loaded rows on every iteration.
odds_frames = []
for dirpath, dirnames, filenames in os.walk(directory):
    for filename in filenames:
        # Skip OS metadata (e.g. ".DS_Store") and Excel lock files ("~$..."),
        # which are not readable workbooks.
        if filename.startswith(('.', '~$')):
            continue
        full_path = os.path.join(dirpath, filename)
        odds_frames.append(pd.read_excel(full_path))

# pd.concat raises on an empty list, so fall back to an empty frame when no files were found.
full_odds_df = pd.concat(odds_frames) if odds_frames else pd.DataFrame()

In [4]:
def parse_game_date(date_str):
    """Convert a season-tagged date such as ``"2021-22-1019"`` to ``"YYYY-MM-DD"``.

    The input must consist of three hyphen-separated parts: the season's
    starting year, a second part that is ignored, and a ``MMDD`` string.
    Months 10-12 are dated in the starting year; every other month is dated
    in the following year.

    Parameters
    ----------
    date_str : str
        The season-tagged date string.

    Returns
    -------
    str
        The calendar date as ``"YYYY-MM-DD"``, or the sentinel string
        ``"Invalid Date Format"`` when the input does not have three parts,
        contains non-numeric pieces, or does not name a real calendar day.
    """
    try:
        start_year, _ignored, month_day = date_str.split('-')
        start_year = int(start_year)
        month, day = int(month_day[:2]), int(month_day[2:])

        # NOTE(review): a season that runs into October of its second calendar
        # year would be dated one year early by this rule - confirm against the data.
        year = start_year if month >= 10 else start_year + 1

        # Building a real date rejects impossible values (month 13, Feb 30, ...)
        # instead of returning them as a fabricated date string.
        datetime.date(year, month, day)
    except (ValueError, OverflowError):
        return "Invalid Date Format"
    return f"{year}-{month:02d}-{day:02d}"


In [5]:
# Full franchise name -> three-letter abbreviation.
team_abbreviation_mapping = {
    'Atlanta Hawks': 'ATL',
    'Boston Celtics': 'BOS',
    'Charlotte Bobcats': 'CHA',
    'Chicago Bulls': 'CHI',
    'Cleveland Cavaliers': 'CLE',
    'Dallas Mavericks': 'DAL',
    'Denver Nuggets': 'DEN',
    'Detroit Pistons': 'DET',
    'Golden State Warriors': 'GSW',
    'Houston Rockets': 'HOU',
    'Indiana Pacers': 'IND',
    'Los Angeles Clippers': 'LAC',
    'Los Angeles Lakers': 'LAL',
    'Memphis Grizzlies': 'MEM',
    'Miami Heat': 'MIA',
    'Milwaukee Bucks': 'MIL',
    'Minnesota Timberwolves': 'MIN',
    'Brooklyn Nets': 'BKN',
    'New Orleans Pelicans': 'NOP',
    'New York Knicks': 'NYK',
    'Orlando Magic': 'ORL',
    'Philadelphia 76ers': 'PHI',
    'Phoenix Suns': 'PHX',
    'Portland Trail Blazers': 'POR',
    'Sacramento Kings': 'SAC',
    'San Antonio Spurs': 'SAS',
    'Oklahoma City Thunder': 'OKC',
    'Toronto Raptors': 'TOR',
    'Utah Jazz': 'UTA',
    'Washington Wizards': 'WAS',
    # Alternate spellings of the same franchises. Existing keys are untouched;
    # these only prevent a KeyError if a data file uses the other name.
    'Charlotte Hornets': 'CHA',
    'LA Clippers': 'LAC',
}

In [6]:
def clean_spread(row):
    """Return the point spread signed from the home team's perspective.

    Parameters
    ----------
    row : mapping
        Must provide ``row['Spread']`` (a number, a numeric string, or
        ``'PK'``) and ``row['ML_Home']`` (the home money line).

    Returns
    -------
    float
        ``0.0`` for a pick 'em or a zero spread, a negative value when the
        home money line is below zero (home favored), and a positive value
        otherwise.
    """
    raw_spread = row['Spread']

    # PK = pick 'em. Tolerate case and surrounding whitespace so that e.g. 'pk'
    # does not fall through to float() and raise.
    if isinstance(raw_spread, str) and raw_spread.strip().upper() == 'PK':
        return 0.0

    magnitude = abs(float(raw_spread))
    if magnitude == 0:
        return 0.0

    # If Home Money-Line is < 0, then the HOME team is favored.
    if float(row['ML_Home']) < 0:
        return -magnitude
    # Home underdog: always positive, whatever sign the raw value carried.
    return magnitude

In [7]:
# Normalise the raw columns in place: parsed dates, team abbreviations, and a
# spread signed by clean_spread. This cell is not re-runnable on its own
# output, because the abbreviations it writes are not keys of team_abbreviation_mapping.
full_odds_df['Date'] = full_odds_df['Date'].apply(
    lambda raw_date: parse_game_date(str(raw_date))
)
full_odds_df['Home'] = full_odds_df['Home'].apply(team_abbreviation_mapping.__getitem__)
full_odds_df['Away'] = full_odds_df['Away'].apply(team_abbreviation_mapping.__getitem__)
full_odds_df['Spread'] = full_odds_df.apply(clean_spread, axis=1)
full_odds_df.head()

Unnamed: 0.1,Unnamed: 0,Date,Home,Away,OU,Spread,ML_Home,ML_Away,Points,Win_Margin
0,0,2021-10-19,MIL,BKN,240.5,0.0,-125,105,231,23
1,1,2021-10-19,LAL,GSW,230.5,-4.5,-160,140,235,-7
2,2,2021-10-20,CHA,IND,228.5,2.0,105,-125,245,1
3,3,2021-10-20,DET,CHI,220.5,2.5,190,-220,182,-6
4,4,2021-10-20,TOR,WAS,220.0,-3.5,-140,120,181,-15


In [8]:
# Create final dataframe with historical odds from 2015-16 through 2021-22 seasons. We will pull 2022-23 data separately using the scrape_games function.

# Select and rename in a single step so the result is an independent frame.
# Assigning new columns onto a column-subset of `full_odds_df` is the
# chained-assignment pattern pandas warns about (the warning is silenced at
# the top of this notebook).
final_odds_df = full_odds_df[['Date', 'Home', 'Away', 'Spread', 'Win_Margin']].rename(
    columns={'Spread': 'Home Spread', 'Win_Margin': 'Home Team Plus Minus'}
)
final_odds_df.head()

Unnamed: 0,Date,Home,Away,Home Spread,Home Team Plus Minus
0,2021-10-19,MIL,BKN,0.0,23
1,2021-10-19,LAL,GSW,-4.5,-7
2,2021-10-20,CHA,IND,2.0,1
3,2021-10-20,DET,CHI,2.5,-6
4,2021-10-20,TOR,WAS,-3.5,-15


In [11]:
# Load the final dataframe produced by main.ipynb so that we can collect every game date of the 2022-2023 NBA season
final_df = pd.read_csv('../../generated_datasets//final_stats_df.csv')
final_df['SEASON_ID'] = final_df['SEASON_ID'].astype(str)
# Rows whose SEASON_ID ends in the four characters "2022" are the ones kept.
gamedates_2022_2023 = set(
    final_df.loc[final_df['SEASON_ID'].str[-4:].eq('2022'), 'GAME_DATE']
)

In [12]:
# 2023-2024 season opener
# Scrape the day once and reuse the result instead of issuing the same requests twice.
opening_night_games = scrape_games(sport="NBA", date='2023-10-24')
first_game_odds_views = opening_night_games[0]['oddsViews']
# NOTE(review): these names assume a fixed sportsbook order in `oddsViews`;
# check each entry's 'sportsbook' field before relying on them.
betmgm = first_game_odds_views[0]
draftkings = first_game_odds_views[1]
betmgm

{'gameId': 290551,
 'sportsbook': 'fanduel',
 'sportsbookId': None,
 'viewType': 'PointspreadDataOpeningAndLatestOddsDataView',
 'openingLine': {'odds': None,
  'homeOdds': -110,
  'awayOdds': -110,
  'overOdds': None,
  'underOdds': None,
  'drawOdds': None,
  'homeSpread': -5.5,
  'awaySpread': 5.5,
  'total': None},
 'currentLine': {'odds': None,
  'homeOdds': -102,
  'awayOdds': -120,
  'overOdds': None,
  'underOdds': None,
  'drawOdds': None,
  'homeSpread': -5.5,
  'awaySpread': 5.5,
  'total': None},
 'moneyLineHistory': None,
 'spreadHistory': None,
 'totalHistory': None}

In [13]:
# Pull odds data from 2022-23 season (and future seasons) 

def _mode_or_nan(values):
    """Return the mode of ``values`` via ``scipy.stats.mode``, or NaN when ``values`` is empty."""
    if not values:
        return np.nan
    # This function returns the smallest value among modes in the case of a tie. If there is no mode, the function will return the smallest value in the dataset.
    return stats.mode(values)[0].item()


dates = []
home = []
away = []
# The money-line modes are collected but are not written into `lines` below.
home_odds_mode = []
away_odds_mode = []
home_spread_mode = []
home_team_plus_minus = []

# For each date, for each game (and each betting site available), add the odds data
for days_done, date in enumerate(list(gamedates_2022_2023), start=1):
    # Scrape each date exactly once; every game of the day comes from this one result.
    for json_data in scrape_games(sport="NBA", date=date):
        game_view = json_data['gameView']

        home_odds = []
        away_odds = []
        home_spread = []
        for odds_view in json_data['oddsViews']:
            try:
                current = odds_view['currentLine']
                site_values = (current['homeOdds'], current['awayOdds'], current['homeSpread'])
            except (KeyError, TypeError):
                # This site has no usable current line for the game (missing
                # entry or missing field); leave it out of the mode.
                continue
            # A None is a missing quote, not a value that should compete for the mode.
            for bucket, value in zip((home_odds, away_odds, home_spread), site_values):
                if value is not None:
                    bucket.append(value)

        dates.append(date)
        home.append(game_view['homeTeam']['shortName'])
        away.append(game_view['awayTeam']['shortName'])
        home_team_plus_minus.append(game_view['homeTeamScore'] - game_view['awayTeamScore'])

        home_odds_mode.append(_mode_or_nan(home_odds))
        away_odds_mode.append(_mode_or_nan(away_odds))
        home_spread_mode.append(_mode_or_nan(home_spread))

    if days_done % 20 == 0:
        print(days_done, 'days completed')

lines = pd.DataFrame({
    'Date': dates,
    'Home': home,
    'Away': away,
    'Home Spread': home_spread_mode,
    'Home Team Plus Minus': home_team_plus_minus,
})
lines

20 days completed
40 days completed
60 days completed
80 days completed
100 days completed
120 days completed
140 days completed
160 days completed


Unnamed: 0,Date,Home,Away,Home Spread,Home Team Plus Minus
0,2022-11-10,WAS,DAL,6.0,8
1,2022-11-10,ATL,PHI,-1.0,9
2,2022-11-10,MIA,CHA,-11.0,5
3,2022-11-10,NO,POR,-10.0,-11
4,2022-12-09,CHA,NY,4.0,-19
...,...,...,...,...,...
1226,2023-01-22,TOR,NY,-2.5,9
1227,2023-01-22,DEN,OKC,-3.0,-2
1228,2023-01-22,PHO,MEM,8.5,2
1229,2023-01-22,GS,BK,-7.5,-4


In [14]:
# Some of the team abbreviations don't match, so let's build another dictionary to map

# Scraped short name -> abbreviation used in the historical odds frame.
team_abbreviation_mapping2 = {
    'ATL': 'ATL',
    'BOS': 'BOS',
    'CHA': 'CHA',
    'CHI': 'CHI',
    'CLE': 'CLE',
    'DAL': 'DAL',
    'DEN': 'DEN',
    'DET': 'DET',
    'GS': 'GSW',
    'HOU': 'HOU',
    'IND': 'IND',
    'LAC': 'LAC',
    'LAL': 'LAL',
    'MEM': 'MEM',
    'MIA': 'MIA',
    'MIL': 'MIL',
    'MIN': 'MIN',
    'BK': 'BKN',
    'NO': 'NOP',
    'NY': 'NYK',
    'ORL': 'ORL',
    'PHI': 'PHI',
    'PHO': 'PHX',
    'POR': 'POR',
    'SAC': 'SAC',
    'SA': 'SAS',
    'OKC': 'OKC',
    'TOR': 'TOR',
    'UTA': 'UTA',
    'WAS': 'WAS'
}
# Also accept the target abbreviations themselves, so applying the mapping to
# values that were already converted is a no-op instead of a KeyError.
team_abbreviation_mapping2.update(
    {abbr: abbr for abbr in set(team_abbreviation_mapping2.values())}
)

In [15]:
# Translate the scraped short names in both team columns through
# team_abbreviation_mapping2; an unknown short name raises KeyError.
lines['Home'] = lines['Home'].apply(team_abbreviation_mapping2.__getitem__)
lines['Away'] = lines['Away'].apply(team_abbreviation_mapping2.__getitem__)

In [16]:
# Show the first rows of the historical odds frame before it is combined with `lines`.
final_odds_df.head()

Unnamed: 0,Date,Home,Away,Home Spread,Home Team Plus Minus
0,2021-10-19,MIL,BKN,0.0,23
1,2021-10-19,LAL,GSW,-4.5,-7
2,2021-10-20,CHA,IND,2.0,1
3,2021-10-20,DET,CHI,2.5,-6
4,2021-10-20,TOR,WAS,-3.5,-15


In [17]:
# Show the first rows of the scraped frame after the abbreviation mapping has been applied.
lines.head()

Unnamed: 0,Date,Home,Away,Home Spread,Home Team Plus Minus
0,2022-11-10,WAS,DAL,6.0,8
1,2022-11-10,ATL,PHI,-1.0,9
2,2022-11-10,MIA,CHA,-11.0,5
3,2022-11-10,NOP,POR,-10.0,-11
4,2022-12-09,CHA,NYK,4.0,-19


In [18]:
# Stack the historical and scraped games. ignore_index gives every row a
# unique label (both inputs carry their own overlapping row labels), and the
# sort is chained rather than done in place so the cell can be re-run safely.
final_odds = (
    pd.concat([final_odds_df, lines], ignore_index=True)
    .sort_values(by='Date', ascending=False)
)
final_odds.head(10)

Unnamed: 0,Date,Home,Away,Home Spread,Home Team Plus Minus
624,2023-04-09,MIA,ORL,-5.0,13
625,2023-04-09,CHI,DET,-8.5,22
618,2023-04-09,TOR,MIL,-3.5,16
619,2023-04-09,BOS,ATL,-4.5,6
620,2023-04-09,BKN,PHI,-2.0,-29
628,2023-04-09,DEN,SAC,5.5,14
629,2023-04-09,LAL,UTA,-16.5,11
630,2023-04-09,POR,GSW,17.0,-56
631,2023-04-09,PHX,LAC,9.5,-5
632,2023-04-09,MIN,NOP,-3.0,5


In [19]:
# Find outliers and drop them
# Only rows whose home spread has an absolute value of at most 35 are kept;
# the result is written to CSV without the row index.
final_odds_filtered = final_odds.loc[final_odds['Home Spread'].abs().le(35)]
final_odds_filtered.to_csv('../../generated_datasets/final_odds_df.csv', index=False)