In [1]:
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
from datetime import datetime
from time import sleep
import sys

In [2]:
def gather_data(team, year):
    """Scrape one team's per-game schedule stats from hoopsstats.com.

    Parameters
    ----------
    team : str
        Key into the module-level ``teams`` dict; also stored verbatim in
        the ``home_team`` column.
    year : int
        Two-digit season-ending year as used in the site URL
        (e.g. 17 for the 2016-17 season).

    Returns
    -------
    pandas.DataFrame
        One row per game with ``home_team``, ``away_team``, ``date``
        (``mm/dd/YYYY`` string), ``result``, ``min`` and the ``H_*`` / ``A_*``
        stat columns, all as scraped text. The helper columns ``score``,
        ``dummy1``, ``dummy2`` and ``dummy_min`` are dropped before returning.

    Notes
    -----
    Exits via ``sys.exit(1)`` if the response status is not 200. Cells that
    cannot be parsed are reported with ``print`` and left as NaN.
    """
    # Layout of the <center> cells on the page.
    header_cells = 21  # cells in one header row (the first one starts the page)
    game_cells = 38    # cells holding one game's stats
    # Games in these months belong to the season-ending calendar year.
    spring_months = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun')

    # Load URL
    # Customize for specific url
    team_idx, team_slug = teams[team]
    url = 'http://www.hoopsstats.com/basketball/fantasy/nba/%s/team/schedule/%d/%d/1-2-2-date' % (team_slug, year, team_idx)
    # The context manager closes the connection once the page has been parsed.
    with urllib.request.urlopen(url) as words:
        if words.code != 200:
            print(team)
            print(url)
            sys.exit(1)
        words2 = BeautifulSoup(words, 'html.parser')

    # Get all of the game tables
    # This includes the opponent and when the game was
    stats = [tab for tab in words2.find_all('table')
             if tab.has_attr('class') and tab['class'][0] == 'statscontent']
    # Get all of the stats in the rows
    stats2 = list(words2.find_all('center'))

    # Create data frame; every other 'statscontent' table is a blank spacer.
    nrow = len(stats) // 2
    columns = ['home_team', 'away_team', 'date', 'result', 'score', 'min',
               'H_pts', 'H_reb', 'H_ast', 'H_stl', 'H_blk', 'H_to',
               'H_pf', 'H_dreb', 'H_oreb', 'H_fgma', 'H_fgpct', 'H_3gma', 'H_3pct',
               'H_ftma', 'H_ftpct', 'H_eff',
               'dummy1', 'dummy2', 'dummy_min',
               'A_pts', 'A_reb', 'A_ast', 'A_stl', 'A_blk', 'A_to',
               'A_pf', 'A_dreb', 'A_oreb', 'A_fgma', 'A_fgpct', 'A_3gma', 'A_3pct',
               'A_ftma', 'A_ftpct', 'A_eff']
    games = pd.DataFrame(columns=columns, index=range(nrow))
    # Home Team
    games['home_team'] = team

    # Fill with first stats
    ## Away team and date
    # Integer arithmetic so that e.g. year=10 yields 2009, not the string '209'.
    season_end_year = 2000 + year
    game_num = 0
    for g in stats:
        # Reset per table so a table without <td> cells counts as blank
        # instead of reusing the previous table's last value.
        val = ''
        # For each game, the info is in td tags
        for k, j in enumerate(g.find_all('td')):
            val = j.get_text()
            # Every other game is a blank line, break out and go to next game
            if val == "":
                break
            if k == 0:
                game_year = season_end_year if val[0:3] in spring_months else season_end_year - 1
                dated = val + '/' + str(game_year)
                try:
                    # Convert date into a date object
                    d = datetime.strptime(dated, '%b %d/%Y')
                except ValueError:
                    # Unparseable date: report it and leave the cell empty
                    # rather than writing a stale date from an earlier game.
                    print(team)
                    print(dated)
                else:
                    games.at[game_num, 'date'] = d.strftime("%m/%d/%Y")
            # Strip the vs from the Away team and add to df.
            elif k == 1:
                try:
                    games.at[game_num, 'away_team'] = re.findall(r'\s.+$', val)[0].strip()
                except IndexError:
                    # The regex found no whitespace-prefixed team name.
                    print(game_num)
                    print(val)
        # If we weren't dealing with the blank table.
        if val != '':
            game_num += 1

    # Fill with the rest of the stats
    first_stat_col = columns.index('result')
    game_num = 0
    idx = 0
    game_stats = stats2[header_cells:]  # The first 21 values are the column headers
    while game_num < len(stats):
        # There are 38 columns to fill with the current games stats
        this_game = game_stats[idx:idx + game_cells]
        # We have probably just reached the end
        if not this_game:
            break
        # If it is one of the repeated header rows, skip past it to the next game
        if this_game[0].get_text() == "Date":
            idx += header_cells
            this_game = game_stats[idx:idx + game_cells]
        # Loop over the stats for that specific game
        # Add to data frame
        for offset, cell in enumerate(this_game):
            col_num = first_stat_col + offset
            try:
                games.iloc[game_num, col_num] = cell.get_text()
            except IndexError:
                # Position lies outside the frame; report the cell that did not fit.
                print(game_num, col_num, cell.get_text())
        # Go to the next game
        idx += game_cells
        # Increment game number
        game_num += 1

    # Get rid of dummy cols
    return games.drop(['score', 'dummy1', 'dummy2', 'dummy_min'], axis=1)

In [3]:
# Team lookup: name -> (numeric id, URL slug), both of which gather_data
# substitutes into the schedule URL. The name itself is stored as home_team.
# NOTE(review): "Orlando Magic" is the only key that carries the nickname;
# confirm it matches the opponent names scraped into away_team.
teams = {
    name: (team_id, slug)
    for team_id, name, slug in (
        (1, "Atlanta", "atlanta-hawks"),
        (18, "Brooklyn", "brooklyn-nets"),
        (2, "Boston", "boston-celtics"),
        (3, "Charlotte", "charlotte-hornets"),
        (4, "Chicago", "chicago-bulls"),
        (5, "Cleveland", "cleveland-cavaliers"),
        (6, "Dallas", "dallas-mavericks"),
        (7, "Denver", "denver-nuggets"),
        (8, "Detroit", "detroit-pistons"),
        (9, "Golden State", "golden-state-warriors"),
        (10, "Houston", "houston-rockets"),
        (11, "Indiana", "indiana-pacers"),
        (12, "L.A. Clippers", "los-angeles-clippers"),
        (13, "L.A. Lakers", "los-angeles-lakers"),
        (14, "Memphis", "memphis-grizzlies"),
        (15, "Miami", "miami-heat"),
        (16, "Milwaukee", "milwaukee-bucks"),
        (17, "Minnesota", "minnesota-timberwolves"),
        (19, "New Orleans", "new-orleans-pelicans"),
        (20, "New York", "new-york-knicks"),
        (21, "Oklahoma City", "oklahoma-city-thunder"),
        (22, "Orlando Magic", "orlando-magic"),
        (23, "Philadelphia", "philadelphia-76ers"),
        (24, "Phoenix", "phoenix-suns"),
        (25, "Portland", "portland-trail-blazers"),
        (26, "Sacramento", "sacramento-kings"),
        (27, "San Antonio", "san-antonio-spurs"),
        (28, "Toronto", "toronto-raptors"),
        (29, "Utah", "utah-jazz"),
        (30, "Washington", "washington-wizards"),
    )
}

In [390]:
# Scrape Atlanta's games for year=17; this frame is written out as the first file.
atl = gather_data("Atlanta", 17)

In [385]:
# First file: Atlanta's rows create the CSV and supply its only header row.
# Mode 'w' (not 'a') so re-running this cell starts the file afresh instead of
# appending a second header and duplicate rows; newline='' is what pandas asks
# for when handed an open text handle.
with open('stats551data.csv', 'w', newline='') as f:
    atl.to_csv(f, header=True, index=False)

In [386]:
# For rest of teams: append each team's games (without a header) to the CSV.
for team in teams:
    if team == "Atlanta":
        # Atlanta was scraped and written separately as the first file.
        continue
    sleep(3)  # pause between requests
    df = gather_data(team, 17)
    with open('stats551data.csv', 'a', newline='') as f:
        df.to_csv(f, header=False, index=False)

Read in the data to do a little more data management.

In [4]:
# Build a two-column lookup (team name in 'index', numeric id in 'team_idx')
# from the teams dict; the URL slug column is not needed for the merge.
teamsdf = (
    pd.DataFrame(teams)
    .transpose()
    .reset_index()
    .rename(index=str, columns={0: 'team_idx', 1: 'website'})
    .drop('website', axis=1)
)
teamsdf.head()

Unnamed: 0,index,team_idx
0,Atlanta,1
1,Boston,2
2,Brooklyn,18
3,Charlotte,3
4,Chicago,4


In [68]:
# Load the scraped game rows.
# NOTE(review): the scraping cells write 'stats551data.csv' to the working
# directory, while this reads from '../Data/' — confirm the file was moved there.
data = pd.read_csv("../Data/stats551data.csv")

In [5]:
def clean_data(data, teamsdf, reg_season_date, all_star_date):
    '''Add team index columns, split the made-attempted columns and flag dates.

    Parameters
    ----------
    data : pandas.DataFrame
        Game rows with ``home_team``, ``away_team``, ``date`` and the
        ``H_``/``A_`` prefixed ``fgma``, ``3gma`` and ``ftma`` columns holding
        ``"made-attempted"`` strings. Not modified.
    teamsdf : pandas.DataFrame
        Lookup with the team name in ``index`` and its id in ``team_idx``.
    reg_season_date, all_star_date : str
        Cut-off dates; must be strings of the form %m/%d/%Y.

    Returns
    -------
    pandas.DataFrame
        ``data`` plus ``home_team_idx`` / ``away_team_idx`` (NaN for names
        missing from ``teamsdf``), the split ``*_FGM/FGA``, ``*_3M/3A`` and
        ``*_FTM/FTA`` columns (still strings), ``date`` as datetime, and the
        boolean columns ``reg_season`` and ``pre_allstar``, each True when
        the game date is strictly before the corresponding cut-off. The six
        combined ``*ma`` columns are dropped.
    '''
    # Merge the data with the teams, once per side. Renaming the lookup
    # columns first means no leftover 'index' key columns need dropping.
    lookup = teamsdf[['index', 'team_idx']]
    data2 = data
    for side in ('home', 'away'):
        team_col = side + '_team'
        side_lookup = lookup.rename(columns={'index': team_col, 'team_idx': team_col + '_idx'})
        data2 = pd.merge(data2, side_lookup, how="left", on=team_col)
    # Row labels are returned as strings, as this function always has.
    data2 = data2.rename(index=str)

    # Split the FG, 3p and FT columns
    combined_cols = []
    for combined, made, attempted in (('fgma', 'FGM', 'FGA'),
                                      ('3gma', '3M', '3A'),
                                      ('ftma', 'FTM', 'FTA')):
        for side in ('H', 'A'):
            source = '%s_%s' % (side, combined)
            # expand=True gives one column per piece; reindex guarantees both
            # pieces exist even when no value contains a '-'.
            parts = data2[source].str.split('-', n=1, expand=True).reindex(columns=[0, 1])
            data2['%s_%s' % (side, made)] = parts[0]
            data2['%s_%s' % (side, attempted)] = parts[1]
            combined_cols.append(source)

    data2 = data2.drop(combined_cols, axis=1)

    # Now make indicators for regular season games
    data2['date'] = pd.to_datetime(data2['date'])
    reg_season_cutoff = datetime.strptime(reg_season_date, '%m/%d/%Y')
    all_star_cutoff = datetime.strptime(all_star_date, '%m/%d/%Y')
    data2['reg_season'] = data2['date'] < reg_season_cutoff
    data2['pre_allstar'] = data2['date'] < all_star_cutoff

    return data2

Now make indicators for regular-season games and for games before/after the All-Star Game.

In [None]:
# Clean the 2016-17 data; the two dates are the regular-season and All-Star cut-offs.
data2_1617 = clean_data(data, teamsdf, '4/13/2017', '2/19/2017')

In [78]:
# Export the cleaned 2016-17 frame (data2 only exists inside clean_data).
data2_1617.to_csv('../Data/stats551data_updated.csv', index=False)

Let's also get the data from the 2015-2016 season.

In [6]:
# First file: scrape Atlanta's games for year=16 (the 2015-2016 season).
atl = gather_data("Atlanta", 16)

In [7]:
# Atlanta's rows create the 2015-16 CSV and supply its only header row.
# Mode 'w' (not 'a') so re-running this cell does not append a second header
# and duplicate rows; newline='' is what pandas asks for with a text handle.
with open('../Data/stats551data_2015_2016.csv', 'w', newline='') as f:
    atl.to_csv(f, header=True, index=False)

In [8]:
# For rest of teams: append each team's 2015-16 games (without a header).
for team in teams:
    if team == "Atlanta":
        # Atlanta was scraped and written separately as the first file.
        continue
    sleep(3)  # pause between requests, as in the 2016-17 loop
    df = gather_data(team, 16)
    with open('../Data/stats551data_2015_2016.csv', 'a', newline='') as f:
        df.to_csv(f, header=False, index=False)

In [9]:
# Read back in data, keeping the raw frame under its own name so this cell
# can be re-run without feeding already-cleaned data into clean_data.
raw_1516 = pd.read_csv("../Data/stats551data_2015_2016.csv")
data2_1516 = clean_data(raw_1516, teamsdf, '4/14/2016', '2/14/2016')
# index=False matches the 2016-17 export and avoids writing an unnamed index column.
data2_1516.to_csv('../Data/stats551data_1516_updated.csv', index=False)

In [10]:
# Preview the first rows of the cleaned 2015-16 frame.
data2_1516.head()

Unnamed: 0,home_team,away_team,date,result,min,H_pts,H_reb,H_ast,H_stl,H_blk,...,H_3M,H_3A,A_3M,A_3A,H_FTM,H_FTA,A_FTM,A_FTA,reg_season,pre_allstar
0,Atlanta,Cleveland,2016-05-08,L,48,99,42,22,9,7,...,9,31,16,37,10,14,8,13,False,False
1,Atlanta,Cleveland,2016-05-06,L,48,108,28,29,8,7,...,16,34,21,39,10,12,16,27,False,False
2,Atlanta,Boston,2016-04-26,W,48,110,51,30,13,3,...,14,36,7,29,12,16,18,25,False,False
3,Atlanta,Boston,2016-04-19,W,48,89,49,20,7,15,...,11,29,5,28,14,16,11,12,False,False
4,Atlanta,Boston,2016-04-16,W,48,102,53,23,4,9,...,5,27,11,35,27,35,16,19,False,False
