In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
import itertools as itertools
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn import tree
from copy import deepcopy
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from bs4 import BeautifulSoup
import requests
import time

In [16]:
#function to get all game ids for one team
def year_schedule(months, year):
    # game_ids assigns number to a particular game to keep track
    game_ids = []
    for m in months:
        url1 = "https://www.basketball-reference.com/leagues/NBA_"+ year + "_games-"
        req = requests.get(url1 + m + '.html')
        page = req.text
        soup = BeautifulSoup(page, 'html.parser')
        print(m, year)
        div = soup.find('div', {'id': 'div_schedule'})
        if div != None:
            table = div.find('table', {'id': 'schedule'})
            rows = table.find_all('tr')
            for row in rows:
                center_cells = row.find_all('td', 'center', {'data-stat': 'box_score_text'})
                for center_cell in center_cells:
                    if center_cell.get_text() == "Box Score":
                        link = center_cell.find_all('a')[0]
                        url = link.get('href')
                        game_id = url.split('/')[2]
                        game_ids.append(game_id)
    return game_ids

def to_pbp_url(game_id):
    return "https://www.basketball-reference.com/boxscores/pbp/" + game_id

In [15]:
months = ['october', 'november', 'december', 'january', 'february', 'march', 'april', 'may', 'june']

In [13]:
# creating the month, year combination to later scrape the games
years = ['2009', '2010', '2011', '2012', '2013', '2016', '2017']
schedule = []
for year in years:
    schedule = schedule + year_schedule(months, year)
    time.sleep(2)
len(schedule)

october 2009
november 2009
december 2009
january 2009
february 2009
march 2009
april 2009
may 2009
june 2009
october 2010
november 2010
december 2010
january 2010
february 2010
march 2010
april 2010
may 2010
june 2010
october 2011
november 2011
december 2011
january 2011
february 2011
march 2011
april 2011
may 2011
june 2011
october 2012
november 2012
december 2012
january 2012
february 2012
march 2012
april 2012
may 2012
june 2012
october 2013
november 2013
december 2013
january 2013
february 2013
march 2013
april 2013
may 2013
june 2013
october 2016
november 2016
december 2016
january 2016
february 2016
march 2016
april 2016
may 2016
june 2016
october 2017
november 2017
december 2017
january 2017
february 2017
march 2017
april 2017
may 2017
june 2017


8951

In [17]:
# changes the game id to a full play-by-play url
schedule1 = [to_pbp_url(x) for x in schedule]

In [6]:
# populates the dataframe with data scraped
dataframes1 = []

start = time.time()
for url in schedule1[450:600]:
    r = requests.get(url)
    if r.status_code != 404:
        pbp = pd.read_html(url,header=1)[0]
        dataframes1.append(pbp)
    time.sleep(2)
print(time.time() - start)

dataframes1[0].head()

744.8523976802826


Unnamed: 0,Time,Orlando,Unnamed: 2,Score,Unnamed: 4,Detroit
0,12:00.0,Start of 1st quarter,,,,
1,12:00.0,Jump ball: A. Johnson vs. D. Howard (R. Wallac...,,,,
2,11:43.0,,,0-0,,R. Wallace misses 3-pt shot from 23 ft
3,11:42.0,Defensive rebound by H. Turkoglu,,0-0,,
4,11:31.0,Turnover by J. Nelson (bad pass; steal by A. I...,,0-0,,


In [8]:
# creates test copies of the dataframes for test use below
test1 = [i.copy() for i in dataframes1]
test2 = [i.copy() for i in dataframes1]

In [9]:
# turns time string into an integer measured in seconds
def time_to_int(t):
    if len(t) > 5:
        numbers = t.split(":")
        seconds = numbers[1].split(".")
        t_sec = (int(numbers[0])*60) + int(seconds[0])
    else:
        t_sec = None
    return t_sec

In [18]:
#function to clean and format scraped data
def format_frames(pbp):
    # getting information of what team is away and what team is home
    awayteam = pbp.columns[1]
    hometeam = pbp.columns[5]
    pbp.columns = ['time', 'awayevents','awaypts','score','homepts','homeevents']
    
    # assigning these values to column
    pbp['awayteam'] = awayteam
    pbp['hometeam'] = hometeam
    
    # merging the events to create one column with all the events of both home and away
    events = pbp['awayevents']
    events = events.fillna(pbp['homeevents'])
    pbp['event'] = events
    
    #possession with 1 indicating that the away team is making the play and 0 indicating the opposite
    pbp['possession'] = 1-pd.isnull(pbp['awayevents'])
    #first need to replace scores at the beginning of games and the rows that just mark the end of quarters 
    pbp['score'] = pbp['score'].replace(to_replace='Score',method='ffill')
    pbp['score'] = pbp['score'].fillna(method='ffill')
    pbp['score'] = pbp['score'].fillna(method='bfill')

    #then we split these to the away team scores and home team scores
    pbp['score'] = [str(x) for x in pbp['score']]
    score = [x.split('-') for x in pbp['score']]
    awayscore,homescore = np.transpose(np.array(score))
    awayscore = [int(x) for x in awayscore]
    homescore = [int(x) for x in homescore]
    pbp['awayscore'] = awayscore
    pbp['homescore'] = homescore

    #now drop the redundant variables
    pbp = pbp.drop(['awayevents','awaypts','score','homepts','homeevents'], axis=1)
    pbp1 = pbp[pd.notnull(pbp['event'])]
    #adding quarter to dataframe
    quarter = []
    count2 = 1
    for x, y in zip(pbp1['event'], pbp1['time']):
        if 'End of' in x and '0:00.0' in y:
            count2 += 1;
        quarter.append(count2)

    pbp1['quarter'] = quarter
    pbp1['time'] = pbp1['time'].apply(time_to_int)
    pbp2 = pbp1[pd.notnull(pbp1['time'])]
    total_time = []
    for x, y in zip(pbp2['quarter'], pbp2['time']):
        if y == 0:
            time = 720*(x)
            if time == 0:
                total_time.append(2880)
            else:
                total_time.append(time)
        else:
            total_time.append(2880 - ((5-x)*y))
    
    # creating total time column 
    pbp2["total_time"] = total_time
    z_count = -1
    for time in pbp2['total_time']:
        if time == 0:
            z_count += 1
    pbp2 = pbp2.iloc[z_count:]
    away_final = np.max(pbp2['awayscore'])
    home_final = np.max(pbp2['homescore'])
    if away_final > home_final:
        pbp2['win'] = [1 for x in pbp2['possession']]
    else: 
        pbp2['win'] = [0 for x in pbp2['possession']]
    return pbp2

In [12]:
# formatting the data scraped
formatted_frames = list(map(format_frames, test1))
for i, dataframe in enumerate(formatted_frames):
    dataframe["game_id"] = i
season_df = pd.concat(formatted_frames, ignore_index = True)
season_df
season_reference = season_df[['game_id', 'time', 'quarter', 'event', 'total_time']].copy()
season_reference.head()
season_df = season_df.drop(['time', 'event', 'quarter'], axis=1)
season_df.head()

# saving the data as a csv file
season_df.to_csv("Rosa1_seasons.csv", sep = ',')
season_reference.to_csv("Rosa1_season_reference.csv", sep = ',')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
