In [2]:
import sys
sys.path.append("/Users/rohanramesh/Documents/GitHub/Basketball_predictions/lib/")
import requests
from bs4 import BeautifulSoup
import numpy as np
from IPython.core.debugger import set_trace
import re
import difflib
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import time
from scipy import stats
import pickle
import get_player_team_data as ptd

# Single player statistics

For single player data, I will be scraping www.basketball-reference.com for the 2008 through 2018 seasons. There are three different types of statistics I will be scraping. Per game statistics are how many points, assists, minutes, etc. a player averaged per game; the number of minutes a player plays heavily influences these numbers. Per 100 possession statistics do not depend on the number of minutes played, but instead extrapolate as if each player had the same number of possessions. These are useful numbers, but can suffer from small sample sizes for players who do not play a lot. Finally, advanced statistics are composite metrics that combine several single player statistics.

In [3]:
# Download the yearly player-stat pages from basketball-reference.com
# seasons included in the analysis
years_to_grab = range(2008,2019)

# addresses of the 2012 pages, one per kind of player table
per_game_stats = 'https://www.basketball-reference.com/leagues/NBA_2012_per_game.html'
advanced_stats = 'https://www.basketball-reference.com/leagues/NBA_2012_advanced.html'
per_100_pos = 'https://www.basketball-reference.com/leagues/NBA_2012_per_poss.html'

# one store per kind of table, each keyed by the season as a string
all_soups = {}
all_adv_soups = {}
all_100pos_soups = {}
player_soup_stores = (all_soups, all_adv_soups, all_100pos_soups)
for curr_year in years_to_grab:
    # NOTE(review): presumably get_soups swaps the 2012 in each address for curr_year - confirm in lib
    output_soups = ptd.get_soups([per_game_stats, advanced_stats, per_100_pos], curr_year)
    # the soups are read back under the string keys '0', '1', '2', in request order
    for page_position, soup_store in enumerate(player_soup_stores):
        soup_store[str(curr_year)] = output_soups[str(page_position)]


In [4]:
# Pull the data table and its column headers out of every season's soups
yearly_data = {}
yearly_data_adv = {}
yearly_data_100pos = {}
yearly_headers = {}
yearly_headers_adv = {}
yearly_headers_100pos = {}
# (source soups, data store, header store) for each kind of player table
player_tables = (
    (all_soups, yearly_data, yearly_headers),
    (all_adv_soups, yearly_data_adv, yearly_headers_adv),
    (all_100pos_soups, yearly_data_100pos, yearly_headers_100pos),
)
for curr_year in years_to_grab:
    year_key = str(curr_year)
    header_rows = []
    for soups, data_store, _ in player_tables:
        data_store[year_key], header_row = ptd.get_data_table(soups[year_key])
        header_rows.append(header_row)
    # drop the ranking label because it is meaningless
    for header_row in header_rows:
        header_row.remove('Rk')
    for (_, _, header_store), header_row in zip(player_tables, header_rows):
        header_store[year_key] = header_row

In [5]:
# Turn each season's parsed player tables into DataFrames
# (per the table formatting, the data goes through a dict before becoming a DataFrame)
Yearly_df = {}
Yearly_df_adv = {}
Yearly_df_100pos = {}
player_frame_stores = (Yearly_df, Yearly_df_adv, Yearly_df_100pos)
# one pass per season
for curr_year in years_to_grab:
    print(curr_year)
    output_df = ptd.build_dataframe(
        [yearly_data, yearly_data_adv, yearly_data_100pos],
        [yearly_headers, yearly_headers_adv, yearly_headers_100pos],
        curr_year,
    )
    # results are read back by position, in the same order as the inputs
    for table_position, frame_store in enumerate(player_frame_stores):
        frame_store[str(curr_year)] = output_df[table_position]


2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018


This is what the per game DataFrame for a single season looks like. The output below shows the 2018 Cleveland Cavaliers, sorted by minutes played:

In [6]:
# a handful of per game columns for the 2018 Cleveland roster, most minutes first
per_game_2018 = Yearly_df['2018']
idx = per_game_2018['Tm'] == 'CLE'
columns_to_show = ['Player', 'Pos', 'MP', 'PS/G', 'AST', '2P%', '3P%']
print(per_game_2018.loc[idx, columns_to_show].sort_values('MP', ascending=False))

               Player Pos    MP  PS/G  AST    2P%    3P%
303      LeBron James  PF  36.9  27.5  9.1  0.603  0.367
561        J.R. Smith  SG  28.1   8.3  1.8  0.453  0.375
375        Kevin Love   C  28.0  17.6  1.7  0.494  0.415
266       George Hill  PG  27.9   9.4  2.8  0.509  0.351
584     Isaiah Thomas  PG  27.1  14.7  4.5  0.449  0.253
141       Jae Crowder  SF  25.4   8.6  1.1  0.508  0.328
276       Rodney Hood  SG  25.3  10.8  1.4  0.492  0.352
226        Jeff Green  PF  23.4  10.8  1.3  0.540  0.312
609       Dwyane Wade  SG  23.2  11.2  3.5  0.479  0.329
118   Jordan Clarkson  SG  22.6  12.6  1.7  0.486  0.407
347       Kyle Korver  SG  21.6   9.2  1.2  0.537  0.436
450       Larry Nance   C  20.8   8.9  1.0  0.572  0.125
588  Tristan Thompson   C  20.2   5.8  0.6  0.562  0.000
551     Iman Shumpert  SG  19.7   4.4  1.2  0.469  0.269
534      Derrick Rose  PG  19.3   9.8  1.6  0.481  0.250
98      Jose Calderon  PG  16.0   4.5  2.1  0.545  0.464
497  Kendrick Perkins   C  15.0

In [10]:
# Add shot-mix columns: the share of made shots, points, and attempts coming from 2s and 3s
for curr_year in years_to_grab:
    per_game = Yearly_df[str(curr_year)]
    made_2p = per_game['2P']
    made_3p = per_game['3P']
    derived_columns = [
        # share of made field goals
        ('2P/FG', made_2p/per_game['FG']),
        ('3P/FG', made_3p/per_game['FG']),
        # share of points per game (a made 2 is worth 2 points, a made 3 is worth 3)
        ('2P/PS', (made_2p*2)/per_game['PS/G']),
        ('3P/PS', (made_3p*3)/per_game['PS/G']),
        # share of field goal attempts
        ('3PA/FGA', per_game['3PA']/per_game['FGA']),
        ('2PA/FGA', per_game['2PA']/per_game['FGA']),
    ]
    for column_name, column_values in derived_columns:
        per_game[column_name] = column_values
    


In [12]:
# Remove doubled-up player rows in every table for every season
# NOTE(review): going by its name, the helper keeps the row with the most minutes - confirm in lib
for curr_year in years_to_grab:
    print(curr_year)
    year_key = str(curr_year)
    for frames_by_year in (Yearly_df, Yearly_df_adv, Yearly_df_100pos):
        frames_by_year[year_key] = ptd.keep_duplicate_player_most_minutes(frames_by_year[year_key])

In [14]:
# Add columns describing how a player's stats change from one year to the next
def _add_change_columns(frames_by_year, stat_names):
    """Run ptd.add_column_for_change_in_stat once per stat, feeding each result into the next call."""
    for stat_name in stat_names:
        frames_by_year = ptd.add_column_for_change_in_stat(frames_by_year, stat_name)
    return frames_by_year

# advanced stats
Yearly_df_adv = _add_change_columns(Yearly_df_adv, ['PER','VORP','USG%','WS','MP'])

# basic (per game) stats
Yearly_df = _add_change_columns(
    Yearly_df, ['3PA','3P%','3P','3P/PS','AST','2PA','2P%','PS/G','MP'])

# per 100 possession stats
Yearly_df_100pos = _add_change_columns(
    Yearly_df_100pos, ['3PA','3P%','3P','AST','2PA','2P%','PTS','MP','DRtg','ORtg','FG%'])

In [15]:
# Merge the three player tables into a single DataFrame per season
df_all_stats = {}
labels_for_each_df = ['_pergame','_adv','_per100']
# the first season in years_to_grab is left out of the combined data
for curr_year in years_to_grab[1:]:
    year_key = str(curr_year)
    frames_for_year = [frames_by_year[year_key]
                       for frames_by_year in (Yearly_df, Yearly_df_adv, Yearly_df_100pos)]
    df_all_stats[year_key] = ptd.combine_df(frames_for_year, labels_for_each_df)

In [None]:
# Write the per-table player dicts and the combined dict to disk, one pickle each
pickle_targets = [
    ('/Users/rohanramesh/Documents/SportsData/NBA/YearlyBasicData.pickle', Yearly_df),
    ('/Users/rohanramesh/Documents/SportsData/NBA/YearlyAdvData.pickle', Yearly_df_adv),
    ('/Users/rohanramesh/Documents/SportsData/NBA/Yearly100possData.pickle', Yearly_df_100pos),
    # new combined df
    ('/Users/rohanramesh/Documents/SportsData/NBA/AllYearlyData_2008_2018.pickle', df_all_stats),
]
for pickle_path, frames_by_year in pickle_targets:
    with open(pickle_path, 'wb') as handle:
        pickle.dump(frames_by_year, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Team statistics

For team statistics we will scrape the data from www.foxsports.com. We will scrape 4 different types of team statistics: scoring statistics, shooting statistics, assist statistics, and record (wins and losses). Unfortunately, the formatting differs slightly between some of the team pages, so we will have to edit some of the labels after scraping, using a few heuristics, to keep the dataset consistent and build a common dataFrame.

In [17]:
# Download the yearly team pages from foxsports.com (scoring, shooting, assists, standings)
scoring_page = 'https://www.foxsports.com/nba/team-stats?season=2012&category=SCORING&group=1&time=0'
shooting_page = 'https://www.foxsports.com/nba/team-stats?season=2012&category=SHOOTING&group=1&time=0'
assists_page = 'https://www.foxsports.com/nba/team-stats?season=2012&category=ASSISTS&group=1&time=0'
wins_page = 'https://www.foxsports.com/nba/standings?season=2012&seasonType=1&grouping=3&advanced=0'

# one store per kind of page, each keyed by the season as a string
all_scoring_soups = {}
all_shooting_soups = {}
all_ast_soups = {}
all_wins_soups = {}
team_stat_soup_stores = (all_scoring_soups, all_shooting_soups, all_ast_soups)
for curr_year in years_to_grab:
    year_key = str(curr_year)
    # scoring, shooting, and assists are requested together
    output_soups = ptd.get_soups([scoring_page, shooting_page, assists_page], curr_year)
    for page_position, soup_store in enumerate(team_stat_soup_stores):
        soup_store[year_key] = output_soups[str(page_position)]
    # because of the formatting of the wins page it has to be requested with curr_year-1
    output_soups2 = ptd.get_soups([wins_page], curr_year-1)
    all_wins_soups[year_key] = output_soups2['0']
    

In [18]:
# Parse the foxsports soups into data + headers, stored in one dict per table keyed by season
yearly_scoring_data = {}
yearly_scoring_headers = {}
yearly_shooting_data = {}
yearly_shooting_headers = {}
yearly_ast_data = {}
yearly_ast_headers = {}
yearly_wins_data = {}
yearly_wins_headers = {}
# (source soups, data store, header store, extra parser options) for each kind of table;
# only the wins page is parsed with wins_tag=True
team_tables = (
    (all_scoring_soups, yearly_scoring_data, yearly_scoring_headers, {}),
    (all_shooting_soups, yearly_shooting_data, yearly_shooting_headers, {}),
    (all_ast_soups, yearly_ast_data, yearly_ast_headers, {}),
    (all_wins_soups, yearly_wins_data, yearly_wins_headers, {'wins_tag': True}),
)
for curr_year in years_to_grab:
    year_key = str(curr_year)
    for soups, data_store, header_store, parse_options in team_tables:
        table_rows, table_headers = ptd.get_yearly_data_foxsports(soups[year_key], **parse_options)
        data_store[year_key] = table_rows
        header_store[year_key] = table_headers

In [19]:
# Build the team DataFrames - one per season for each kind of team table.
# Because of table formatting the data for a year is gathered in a dict first
# and converted to a dataFrame as the last step.
Yearly_df_team_scoring = {}
Yearly_df_team_shooting = {}
Yearly_df_team_ast = {}
Yearly_df_team_wins = {}
# Do independently for each year
for curr_year in years_to_grab:
    print(curr_year)
    output_df = ptd.build_dataframe([yearly_scoring_data, yearly_shooting_data, yearly_ast_data, yearly_wins_data],
                [yearly_scoring_headers, yearly_shooting_headers, yearly_ast_headers, yearly_wins_headers], curr_year)
    # output_df is indexed in the same order as the inputs above:
    # 0 = scoring, 1 = shooting, 2 = assists, 3 = wins.
    # (Scoring and shooting used to be stored under each other's name.)
    Yearly_df_team_scoring[str(curr_year)] = output_df[0]
    Yearly_df_team_shooting[str(curr_year)] = output_df[1]
    Yearly_df_team_ast[str(curr_year)] = output_df[2]
    Yearly_df_team_wins[str(curr_year)] = output_df[3]
    
    


2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018


In [20]:
# Rewrite the team labels of the wins tables so they match the labels used by the other team tables
for curr_year in years_to_grab:
    year_key = str(curr_year)
    teams = Yearly_df_team_scoring[year_key]['Tm'].tolist()
    wins_labels = Yearly_df_team_wins[year_key]['Tm'].tolist()
    # keep only the acronym, i.e. whatever follows the last space of a label
    wins_acronyms = [label.rsplit(" ", 1)[-1] for label in wins_labels]
    reference_acronyms = [label.rsplit(" ", 1)[-1] for label in teams]

    # the team moved from New Jersey to Brooklyn and the website is not up to date,
    # so for this season NJ is rewritten to BKN before matching
    if curr_year == 2012:
        wins_acronyms = [acronym.replace('NJ', 'BKN') for acronym in wins_acronyms]

    # for every wins acronym take the closest reference acronym and use its full label
    new_team_aligned = [
        teams[reference_acronyms.index(
            difflib.get_close_matches(acronym, reference_acronyms, n=1)[0])]
        for acronym in wins_acronyms
    ]

    Yearly_df_team_wins[year_key]['Tm'] = new_team_aligned

In [21]:
# Merge the four team tables into one DataFrame per season holding all relevant stats
Yearly_df_team = {}
team_frame_labels = ['_score','_shoot','_ast','_wins']
# the first season in years_to_grab is left out of the combined data
for curr_year in years_to_grab[1:]:
    year_key = str(curr_year)
    frames_for_year = [
        Yearly_df_team_scoring[year_key],
        Yearly_df_team_shooting[year_key],
        Yearly_df_team_ast[year_key],
        Yearly_df_team_wins[year_key],
    ]
    Yearly_df_team[year_key] = ptd.reorder_team_df_and_combine(frames_for_year, team_frame_labels)


For a single team we can now easily extract multiple statistics about that team over time. Shown below are the team statistics for the Cleveland Cavaliers and the Golden State Warriors. The stats shown here include wins (W), losses (L), points per game (PPG), assists per game (APG), field goal percentage (FG%), and three-point field goals attempted per game (3FGA/G).

In [22]:
# print the selected 2018 stats, one table per team
team_2018 = Yearly_df_team['2018']
stats_show = ['Tm', 'W', 'L', 'PPG', 'APG', 'FG%', '3FGA/G']
for team_label in ('Cleveland CLE', 'Golden State GS'):
    idx = team_2018['Tm'] == team_label
    print(team_2018.loc[idx, stats_show])

              Tm     W     L    PPG   APG    FG%  3FGA/G
3  Cleveland CLE  50.0  32.0  110.9  23.4  0.476    32.1
                Tm     W     L    PPG   APG    FG%  3FGA/G
0  Golden State GS  58.0  24.0  113.5  29.3  0.503    28.9


In [63]:
# every column available in the combined 2018 team DataFrame
team_columns_2018 = list(Yearly_df_team['2018'].columns)
print(team_columns_2018)

['3FG%', '3FG% ALLOW', '3FG%_shoot', '3FGA', '3FGA/G', '3FGM', '3FGM/G', 'APG', 'APG ALLOW', 'AST', 'AST/TO', 'Away', 'Conf', 'Diff', 'Div', 'FG%', 'FG% ALLOW', 'FG%_shoot', 'FGA', 'FGA/G', 'FGM', 'FGM/G', 'FT%', 'FT% ALLOW', 'FT%_shoot', 'FTA', 'FTA/G', 'FTM', 'FTM/G', 'GB', 'GP', 'GP_ast', 'GP_shoot', 'Home', 'L', 'L10', 'PA', 'PF', 'PPG', 'PPG ALLOW', 'PPG DIFF', 'PPG_shoot', 'PPS', 'PPS_shoot', 'PTS IN PAINT', 'PTS OFF TO', 'PTS/POSS', 'Pct', 'SEC CHANCE PTS', 'Strk', 'TO', 'TO%', 'TPG', 'TPG ALLOW', 'TPG DIFF', 'TS%', 'Tm', 'W', 'eFG%']


In [505]:
# Persist the combined team DataFrames (a dict keyed by season) as a single pickle
team_pickle_path = '/Users/rohanramesh/Documents/SportsData/NBA/YearlyTeamData.pickle'
with open(team_pickle_path, 'wb') as team_pickle_file:
    pickle.dump(Yearly_df_team, team_pickle_file, protocol=pickle.HIGHEST_PROTOCOL)