In [107]:
#NBA Player Game Log Scraper v1
#Copyright 2018 Rakesh Bhatia

#This scraper demonstrates the use of Python with BeautifulSoup,
#requests, and Pandas. NBA player game log data is scraped from
#www.basketball-reference.com and stored in a Pandas dataframe.

import numpy as np
import pandas as pd
import requests
import scipy
import seaborn as sns
from bs4 import BeautifulSoup
from collections import Counter
from scipy.stats import ttest_ind
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
%matplotlib inline

# root of the site; every team page URL below is built on top of it
base_url = 'https://www.basketball-reference.com'

# (abbreviation used as the dict key, team code used in the site's URL),
# kept in the original five-per-group order; the two values differ only
# for BKN -> BRK and CHA -> CHO
_team_codes = (
    ('BOS', 'BOS'), ('BKN', 'BRK'), ('NYK', 'NYK'), ('PHI', 'PHI'), ('TOR', 'TOR'),

    ('CHI', 'CHI'), ('CLE', 'CLE'), ('DET', 'DET'), ('IND', 'IND'), ('MIL', 'MIL'),

    ('ATL', 'ATL'), ('CHA', 'CHO'), ('MIA', 'MIA'), ('ORL', 'ORL'), ('WAS', 'WAS'),

    ('GSW', 'GSW'), ('LAC', 'LAC'), ('LAL', 'LAL'), ('PHO', 'PHO'), ('SAC', 'SAC'),

    ('DAL', 'DAL'), ('HOU', 'HOU'), ('MEM', 'MEM'), ('NOP', 'NOP'), ('SAS', 'SAS'),

    ('DEN', 'DEN'), ('MIN', 'MIN'), ('OKC', 'OKC'), ('POR', 'POR'), ('UTA', 'UTA'),
)

# team abbreviation -> URL of that team's 2019 page
teams = {abbr: base_url + '/teams/' + code + '/2019.html'
         for abbr, code in _team_codes}

# read_html is handed a file-like object below; newer pandas releases
# deprecate passing literal HTML text directly
from io import StringIO

# labels assigned to the columns of each player's game log table
game_log_columns = ['Rk', 'G', 'Date', 'Age', 'Tm', 'Location', 'Opp', 'Result', 'GS', 'MP',
                    'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB',
                    'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'GmSc', '+/-']

# indicate which columns to treat as float
columns_float = ['FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB',
                 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'GmSc', '+/-']

# For each team: fetch the team page, collect the game log link of every
# player that has a jersey number, print the roster table, then fetch each
# game log, add a 'Fanduel Pts' column and print the resulting dataframe.
for team, url in teams.items():
    # only process one team for now (for test purposes)
    if team != 'BOS':
        continue

    res = requests.get(url, timeout=5)
    if not res.ok:
        # report and move on instead of trying to parse an error page
        print('skipping {}: HTTP {} for {}'.format(team, res.status_code, url))
        continue

    soup = BeautifulSoup(res.content, 'lxml')

    # the roster is expected to be the first table on the team page
    table = soup.find('table')
    tbody = table.find('tbody') if table is not None else None
    if tbody is None:
        print('skipping {}: no roster table found'.format(team))
        continue

    player_links = []

    for row in tbody.find_all('tr'):
        # find player's jersey number
        jersey_number = row.find('th', attrs={'data-stat': 'number'})

        # a row without a jersey number cell is not a player row
        if jersey_number is None:
            continue

        # if jersey number field is empty, player is inactive
        if jersey_number.get_text(strip=True) == '':
            print('player inactive')
            continue

        # get the link to each player page
        a = row.find('a')
        if a is None or not a.get('href'):
            continue
        player_links.append(base_url + a['href'].replace('.html', '/gamelog/2019'))

    # read roster table into pandas dataframe
    roster = pd.read_html(StringIO(str(table)))[0]
    print('read_html output\n')
    print(roster)

    for link in player_links:
        res = requests.get(link, timeout=5)
        if not res.ok:
            # one unavailable player page should not abort the whole run
            print('skipping {}: HTTP {}'.format(link, res.status_code))
            continue

        soup = BeautifulSoup(res.content, 'lxml')
        table = soup.find_all('table', attrs={'class': 'row_summable sortable stats_table'})

        # player has no game log table on this page
        if not table:
            continue

        # load stats table into dataframe (first matching table only, rather
        # than the string form of the whole result list)
        df = pd.read_html(StringIO(str(table[0])))[0]

        # set the columns of dataframe; a table with a different number of
        # columns cannot be labelled safely, so report it and skip
        if len(df.columns) != len(game_log_columns):
            print('skipping {}: expected {} columns, found {}'.format(
                link, len(game_log_columns), len(df.columns)))
            continue
        df.columns = game_log_columns

        # NOTE(review): the game log may repeat its header row inside the
        # table body (assumption about the site's markup); dropping such rows
        # is a no-op when none are present
        df = df[df['Rk'].astype(str) != 'Rk'].reset_index(drop=True)

        # non-numeric cells (e.g. 'Did Not Play', 'Inactive') become NaN
        # instead of raising during the float conversion
        df[columns_float] = df[columns_float].apply(pd.to_numeric, errors='coerce').astype(float)

        # add new column that calculates fanduel points earned for each game
        df['Fanduel Pts'] = (2*df['FG'] + df['FT'] + df['3P'] + 1.2*df['TRB']
                             + 1.5*df['AST'] + 3*df['STL'] + 3*df['BLK'] - df['TOV'])

        print(df)

player inactive
player inactive
player inactive
read_html output

     No.                Player Pos    Ht   Wt         Birth Date Unnamed: 6  \
0   11.0          Kyrie Irving  PG   6-3  193     March 23, 1992         au   
1   13.0         Marcus Morris  PF   6-9  235  September 2, 1989         us   
2   12.0          Terry Rozier  PG   6-2  190     March 17, 1994         us   
3   42.0            Al Horford   C  6-10  245       June 3, 1986         do   
4    7.0          Jaylen Brown  SG   6-7  225   October 24, 1996         us   
5    0.0          Jayson Tatum  SF   6-8  205      March 3, 1998         us   
6   20.0        Gordon Hayward  SF   6-8  226     March 23, 1990         us   
7   37.0          Semi Ojeleye  PF   6-7  241   December 5, 1994         us   
8   30.0    Guerschon Yabusele  PF   6-7  260  December 17, 1995         fr   
9   27.0          Daniel Theis   C   6-9  215      April 4, 1992         de   
10   9.0        Brad Wanamaker  PG   6-4  210      July 25, 1989 

    Rk   G        Date     Age   Tm Location  Opp   Result  GS     MP  \
0    1   1  2018-10-16  24-213  BOS      NaN  PHI  W (+18)   0  26:31   
1    2   2  2018-10-19  24-216  BOS        @  TOR  L (-12)   0  23:13   
2    3   3  2018-10-20  24-217  BOS        @  NYK   W (+2)   0  22:54   
3    4   4  2018-10-22  24-219  BOS      NaN  ORL   L (-3)   0  15:29   
4    5   5  2018-10-25  24-222  BOS        @  OKC   W (+6)   0  24:57   
5    6   6  2018-10-27  24-224  BOS        @  DET  W (+20)   0  27:21   
6    7   7  2018-10-30  24-227  BOS      NaN  DET   W (+3)   0  19:28   
7    8   8  2018-11-01  24-229  BOS      NaN  MIL   W (+4)   0  26:08   
8    9   9  2018-11-03  24-231  BOS        @  IND   L (-1)   0  18:56   
9   10  10  2018-11-05  24-233  BOS        @  DEN   L (-8)   0  22:21   
10  11  11  2018-11-08  24-236  BOS        @  PHO   W (+7)   0  20:35   
11  12  12  2018-11-09  24-237  BOS        @  UTA   L (-8)   1  36:09   
12  13  13  2018-11-11  24-239  BOS        @  POR  

    Rk   G        Date     Age   Tm Location  Opp   Result  GS     MP  \
0    1   1  2018-10-16  20-227  BOS      NaN  PHI  W (+18)   1  28:56   
1    2   2  2018-10-19  20-230  BOS        @  TOR  L (-12)   1  36:47   
2    3   3  2018-10-20  20-231  BOS        @  NYK   W (+2)   1  35:11   
3    4   4  2018-10-22  20-233  BOS      NaN  ORL   L (-3)   1  34:25   
4    5   5  2018-10-25  20-236  BOS        @  OKC   W (+6)   1  34:26   
5    6   6  2018-10-27  20-238  BOS        @  DET  W (+20)   1  28:19   
6    7   7  2018-10-30  20-241  BOS      NaN  DET   W (+3)   1  30:06   
7    8   8  2018-11-01  20-243  BOS      NaN  MIL   W (+4)   1  31:39   
8    9   9  2018-11-03  20-245  BOS        @  IND   L (-1)   1  33:17   
9   10  10  2018-11-05  20-247  BOS        @  DEN   L (-8)   1  34:42   
10  11  11  2018-11-08  20-250  BOS        @  PHO   W (+7)   1  29:40   
11  12  12  2018-11-09  20-251  BOS        @  UTA   L (-8)   1  36:13   
12  13  13  2018-11-11  20-253  BOS        @  POR  

    Rk    G        Date     Age   Tm Location  Opp   Result            GS  \
0    1  1.0  2018-10-16  22-303  BOS      NaN  PHI  W (+18)             0   
1    2  NaN  2018-10-19  22-306  BOS        @  TOR  L (-12)  Did Not Play   
2    3  2.0  2018-10-20  22-307  BOS        @  NYK   W (+2)             0   
3    4  3.0  2018-10-22  22-309  BOS      NaN  ORL   L (-3)             0   
4    5  NaN  2018-10-25  22-312  BOS        @  OKC   W (+6)  Did Not Play   
5    6  4.0  2018-10-27  22-314  BOS        @  DET  W (+20)             0   
6    7  5.0  2018-10-30  22-317  BOS      NaN  DET   W (+3)             0   
7    8  NaN  2018-11-01  22-319  BOS      NaN  MIL   W (+4)  Did Not Play   
8    9  NaN  2018-11-03  22-321  BOS        @  IND   L (-1)  Did Not Play   
9   10  NaN  2018-11-05  22-323  BOS        @  DEN   L (-8)  Did Not Play   
10  11  6.0  2018-11-08  22-326  BOS        @  PHO   W (+7)             0   
11  12  7.0  2018-11-09  22-327  BOS        @  UTA   L (-8)             0   

    Rk    G        Date     Age   Tm Location  Opp   Result            GS  \
0    1  NaN  2018-10-16  20-364  BOS      NaN  PHI  W (+18)      Inactive   
1    2  NaN  2018-10-19  21-002  BOS        @  TOR  L (-12)      Inactive   
2    3  NaN  2018-10-20  21-003  BOS        @  NYK   W (+2)  Did Not Play   
3    4  1.0  2018-10-22  21-005  BOS      NaN  ORL   L (-3)             0   
4    5  NaN  2018-10-25  21-008  BOS        @  OKC   W (+6)  Did Not Play   
5    6  2.0  2018-10-27  21-010  BOS        @  DET  W (+20)             0   
6    7  3.0  2018-10-30  21-013  BOS      NaN  DET   W (+3)             0   
7    8  4.0  2018-11-01  21-015  BOS      NaN  MIL   W (+4)             0   
8    9  NaN  2018-11-03  21-017  BOS        @  IND   L (-1)  Did Not Play   
9   10  5.0  2018-11-05  21-019  BOS        @  DEN   L (-8)             0   
10  11  NaN  2018-11-08  21-022  BOS        @  PHO   W (+7)  Did Not Play   
11  12  NaN  2018-11-09  21-023  BOS        @  UTA   L (-8)  Did Not Play   