In [78]:
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
from urllib.request import urlopen, Request
import datetime
import time


# This notebook creates the beginning of the dataframe with data from www.basketball-reference.com. It uses BeautifulSoup to scrape the data

In [111]:
#returns a list of strings all of the months since january
#if lower=True will return non-capitalized months
def getPastMonths(lower=False):    
    all_months = ['January', 'February', 'March', 'April', 'May', 'June', 'July','August', 'September',
                  'October', 'November', 'December']
    current_month = datetime.datetime.now().strftime('%B')
    if lower==True:
        all_months = [i.lower() for i in all_months]
        current_month = datetime.datetime.now().strftime('%B').lower()

    months = all_months[0:all_months.index(current_month)+1]
    return(months)

In [112]:
#returns a list of strings all of the months since the start of the nba season in october
#if lower=True will return non-capitalized months
def getPastMonths_nba(lower=False):    
    all_months = ['October', 'November', 'December', 'January', 'March', 'April', 'May','June' ]
    current_month = datetime.datetime.now().strftime('%B')

    if lower==True:
        all_months = [i.lower() for i in all_months]
        current_month = datetime.datetime.now().strftime('%B').lower()
        
    if current_month not in all_months:
        months = all_months
    else:
        months = all_months[0:all_months.index(current_month)+1]
    return(months)

In [30]:
months = getPastMonths_nba(lower=True)

In [31]:
months

['october', 'november', 'december', 'january', 'march', 'april']

In [102]:
base_url = 'https://www.basketball-reference.com/leagues/NBA_2020_games-'
end_url = '.html'
url = base_url+months[0]+end_url
url

'https://www.basketball-reference.com/leagues/NBA_2020_games-october.html'

In [103]:
html = urlopen(url)
soup = BeautifulSoup(html,"html.parser")

In [55]:
#testing dates
dt = soup.find_all('th',{'data-stat':'date_game','scope':'row'})[0].text
dt

'Tue, Oct 22, 2019'

In [93]:
#Formating dates to be uniform. Can be converted back into datetime by datetime.datetime.strptime(dt,'%Y-%m-%d')
datetime.datetime.strptime(dt, '%a, %b %d, %Y').strftime('%Y-%m-%d')

'2019-10-22'

In [78]:
#now doing tmes. 
starttime = soup.find_all('td', {'class':'right', 'data-stat':'game_start_time'})[0].text
starttime

'8:00p'

In [98]:
#converting to 24hr
if starttime[-1]=='p':
    timetail = 'PM'
else:
    timetail='AM'
datetime.datetime.strptime(starttime[:-1]+timetail,'%I:%M%p').strftime('%H:%M:%S')

'20:00:00'

In [101]:
#visiting teams
vis_team = soup.find_all('td', {'class':'left', 'data-stat':'visitor_team_name'})[0].text
vis_team

'New Orleans Pelicans'

In [105]:
#home teams
home_team = soup.find_all('td', {'class':'left', 'data-stat':'home_team_name'})[0].text
home_team

'Toronto Raptors'

In [110]:
#visiting points 
vis_points = soup.find_all('td' ,{'class':'right', 'data-stat':'visitor_pts'})[0].text
vis_points = int(vis_points)
vis_points


122

In [113]:
#home points
home_points = soup.find_all('td' ,{'class':'right', 'data-stat':'home_pts'})[0].text
home_points = int(home_points)
home_points

130

In [104]:
#Overtime?
ot = soup.find_all('td' ,{'class':'center', 'data-stat':'overtimes'})[0].text
ot

'OT'

In [132]:
#Attendance
attendance = soup.find_all('td' ,{'class':'right', 'data-stat':'attendance'})[0].text
attendance = int(attendance.replace(',', ''))
attendance

20787

# OK now i know how to get all of the variables. I'm going to make a dataframe of every game since october


In [118]:
months = getPastMonths_nba(lower=True)
months = months[:-2]
months

['october', 'november', 'december', 'january', 'march']

In [119]:
base_url = 'https://www.basketball-reference.com/leagues/NBA_2020_games-'
end_url = '.html'
urls = [base_url+i+end_url for i in months]
urls

['https://www.basketball-reference.com/leagues/NBA_2020_games-october.html',
 'https://www.basketball-reference.com/leagues/NBA_2020_games-november.html',
 'https://www.basketball-reference.com/leagues/NBA_2020_games-december.html',
 'https://www.basketball-reference.com/leagues/NBA_2020_games-january.html',
 'https://www.basketball-reference.com/leagues/NBA_2020_games-march.html']

In [120]:
#creating lists to go into dataframe
dates = []
starttimes = []
visteams = []
hometeams = []
ots = []
attendances = []
vispoints = []
homepoints = []

In [121]:
for path in urls:
    html = urlopen(path)
    soup = BeautifulSoup(html,"html.parser")
    #date
    dts = soup.find_all('th',{'data-stat':'date_game','scope':'row'})
    for dt in dts:
        dates.append(datetime.datetime.strptime(dt.text, '%a, %b %d, %Y').strftime('%Y-%m-%d'))
    
    #time
    st = soup.find_all('td', {'class':'right', 'data-stat':'game_start_time'})
    for s in st:
        starttime = s.text
        if starttime[-1]=='p':
            timetail = 'PM'
        else:
            timetail='AM'
        starttimes.append(datetime.datetime.strptime(starttime[:-1]+timetail,'%I:%M%p').strftime('%H:%M:%S'))

    #vis team
    vis_team = soup.find_all('td', {'class':'left', 'data-stat':'visitor_team_name'})
    for vt in vis_team:
        visteams.append(vt.text)
        
    #home team
    home_team = soup.find_all('td', {'class':'left', 'data-stat':'home_team_name'})
    for ht in home_team:
        hometeams.append(ht.text)
        
    #visiting points 
    vis_points = soup.find_all('td' ,{'class':'right', 'data-stat':'visitor_pts'})
    for vp in vis_points:
        try:
            vispoints.append(int(vp.text))     
        except:
            vispoints.append(np.nan)

    #home points 
    home_points = soup.find_all('td' ,{'class':'right', 'data-stat':'home_pts'})
    for hp in home_points:
        try:
            homepoints.append(int(hp.text))
        except:
            homepoints.append(np.nan)
        
    #Overtime
    o = soup.find_all('td' ,{'class':'center', 'data-stat':'overtimes'})
    for ot in o:
        if np.logical_and(ot.text =='',~np.isnan(homepoints[-1])):
            otval = 0
        elif ot.text=='OT':
            otval =1
        else:
            otval = np.nan
        ots.append(otval)

    #Attendance
    attendance = soup.find_all('td' ,{'class':'right', 'data-stat':'attendance'})
    for a in attendance:
        try:
            attendances.append(int(a.text.replace(',', '')))
        except:
            attendances.append(np.nan)


In [122]:
nba = pd.DataFrame({'date':dates,'start_time':starttimes,'vis_team':visteams,'home_team':hometeams,
                    'vis_points':vispoints,'home_points':homepoints,'ot':ots,'attendance':attendances})

In [125]:
nba.head(10)

Unnamed: 0,date,start_time,vis_team,home_team,vis_points,home_points,ot,attendance
0,2019-10-22,20:00:00,New Orleans Pelicans,Toronto Raptors,122,130,1.0,20787
1,2019-10-22,22:30:00,Los Angeles Lakers,Los Angeles Clippers,102,112,0.0,19068
2,2019-10-23,19:00:00,Chicago Bulls,Charlotte Hornets,125,126,0.0,15424
3,2019-10-23,19:00:00,Detroit Pistons,Indiana Pacers,119,110,0.0,17923
4,2019-10-23,19:00:00,Cleveland Cavaliers,Orlando Magic,85,94,0.0,18846
5,2019-10-23,19:30:00,Minnesota Timberwolves,Brooklyn Nets,127,126,1.0,17732
6,2019-10-23,19:30:00,Memphis Grizzlies,Miami Heat,101,120,0.0,19600
7,2019-10-23,19:30:00,Boston Celtics,Philadelphia 76ers,93,107,0.0,20422
8,2019-10-23,20:30:00,Washington Wizards,Dallas Mavericks,100,108,0.0,19816
9,2019-10-23,20:30:00,New York Knicks,San Antonio Spurs,111,120,0.0,18354


In [126]:
nba.to_csv('nba_games_2020.csv',index=False)