In [374]:
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
from datetime import datetime
from time import sleep

In [387]:
# Layout of the <center> cells on a schedule page, as the parsing below assumes it.
_HEADER_CELLS = 21      # cells taken up by one row of column headers
_CELLS_PER_GAME = 38    # cells per game; they fill 'result' through 'A_eff'
_FIRST_STAT_COL = 3     # positional index of the 'result' column

_GAME_COLUMNS = [
    'home_team', 'away_team', 'date', 'result', 'score', 'min',
    'H_pts', 'H_reb', 'H_ast', 'H_stl', 'H_blk', 'H_to',
    'H_pf', 'H_dreb', 'H_oreb', 'H_fgma', 'H_fgpct', 'H_3gma', 'H_3pct',
    'H_ftma', 'H_ftpct', 'H_eff',
    'dummy1', 'dummy2', 'dummy_min',
    'A_pts', 'A_reb', 'A_ast', 'A_stl', 'A_blk', 'A_to',
    'A_pf', 'A_dreb', 'A_oreb', 'A_fgma', 'A_fgpct', 'A_3gma', 'A_3pct',
    'A_ftma', 'A_ftpct', 'A_eff',
]
# Scratch columns that are filled while parsing and removed from the result.
_DROPPED_COLUMNS = ['score', 'dummy1', 'dummy2', 'dummy_min']


def _fetch_schedule(url):
    """Download `url` and return it parsed with BeautifulSoup's 'html.parser'."""
    # The context manager closes the HTTP response even if parsing fails.
    with urllib.request.urlopen(url) as response:
        return BeautifulSoup(response.read(), 'html.parser')


def _fill_matchups(games, tables):
    """Write the 'date' and 'away_team' of each game table into `games` (in place)."""
    row = 0
    for table in tables:
        # Reset per table so a table without any <td> counts as blank rather
        # than reusing the text left over from the previous table.
        last_text = ''
        for position, cell in enumerate(table.find_all('td')):
            last_text = cell.get_text()
            # Every other table is a blank line: stop and go to the next table.
            if last_text == '':
                break
            if position == 0:
                # Parse with an explicit leap year: without a year strptime
                # defaults to 1900, which rejects "Feb 29". Only month and day
                # are kept, so the year chosen does not affect the output.
                parsed = datetime.strptime(last_text + ' 2000', '%b %d %Y')
                games.at[row, 'date'] = parsed.strftime('%m/%d')
            elif position == 1:
                # Keep everything after the first whitespace, i.e. strip the
                # leading "vs"-style token from the opponent text.
                opponent = re.findall(r'\s.+$', last_text)
                if opponent:
                    games.at[row, 'away_team'] = opponent[0].strip()
                else:
                    # Nothing follows a whitespace character: report and move on.
                    print(row)
                    print(last_text)
        # Advance only if we were not dealing with a blank table.
        if last_text != '':
            row += 1


def _fill_box_scores(games, cells, max_games):
    """Copy the per-game stat cells into `games`, one row per game (in place)."""
    body = cells[_HEADER_CELLS:]    # the leading cells are the column headers
    row = 0
    pos = 0
    while row < max_games:
        chunk = body[pos:pos + _CELLS_PER_GAME]
        if not chunk:
            # Ran out of cells: every game has been consumed.
            break
        # A header row repeated inside the listing starts with "Date"; skip it.
        if chunk[0].get_text() == "Date":
            pos += _HEADER_CELLS
            chunk = body[pos:pos + _CELLS_PER_GAME]
        for col, cell in enumerate(chunk, start=_FIRST_STAT_COL):
            text = cell.get_text()
            try:
                games.iloc[row, col] = text
            except IndexError:
                # More games/cells than the frame has room for: report the
                # position and the value that could not be stored.
                print(row, col, text)
        pos += _CELLS_PER_GAME
        row += 1


def gather_data(team, year):
    """Scrape one team's schedule page from hoopsstats.com into a DataFrame.

    Parameters
    ----------
    team : str
        Key of the module-level ``teams`` dict; its (id, slug) pair is used
        to build the URL and the key itself is stored as ``home_team``.
    year : int
        Number inserted into the URL with ``%d`` (the notebook passes 17;
        presumably the two-digit season year -- TODO confirm).

    Returns
    -------
    pandas.DataFrame
        One row per game with the columns ``home_team``, ``away_team``,
        ``date`` ("MM/DD"), ``result``, ``min`` and the ``H_*`` / ``A_*``
        stat columns. Scraped values are stored as the strings found in
        the page; no numeric conversion is done.

    Raises
    ------
    KeyError
        If ``team`` is not a key of ``teams``.
    ValueError
        If a date cell does not match the "%b %d" format.
    urllib.error.URLError
        If the page cannot be downloaded.
    """
    team_id, slug = teams[team]
    url = 'http://www.hoopsstats.com/basketball/fantasy/nba/%s/team/schedule/%d/%d/1-2-2-date' % (slug, year, team_id)
    soup = _fetch_schedule(url)

    # Game tables: they hold the opponent and the date of each game.
    # Only the first class of a table is compared; an empty class list is skipped.
    tables = []
    for tab in soup.find_all('table'):
        classes = tab.get('class') or []
        if classes and classes[0] == 'statscontent':
            tables.append(tab)

    # Stat cells: every <center> tag on the page, in document order.
    cells = list(soup.find_all('center'))

    # Game tables alternate with blank ones, hence half as many rows.
    games = pd.DataFrame(columns=_GAME_COLUMNS, index=range(len(tables) // 2))

    _fill_matchups(games, tables)
    # Set after the matchups so that a row added by `.at` (it creates a
    # missing row label instead of failing) is labelled with the team too.
    games['home_team'] = team
    _fill_box_scores(games, cells, len(tables))

    # Get rid of the scratch columns.
    return games.drop(_DROPPED_COLUMNS, axis=1)

In [376]:
# Team lookup used to build the hoopsstats.com URLs.
# key   -> team label; written verbatim into the `home_team` column
# value -> (numeric team id used in the URL, URL slug)
# Keys use the short city-style label for every team (no nicknames) so that
# all 30 follow one naming convention.
teams = {
    "Atlanta": (1, "atlanta-hawks"),
    "Brooklyn": (18, "brooklyn-nets"),
    "Boston": (2, "boston-celtics"),
    "Charlotte": (3, "charlotte-hornets"),
    "Chicago": (4, "chicago-bulls"),
    "Cleveland": (5, "cleveland-cavaliers"),
    "Dallas": (6, "dallas-mavericks"),
    "Denver": (7, "denver-nuggets"),
    "Detroit": (8, "detroit-pistons"),
    "Golden State": (9, "golden-state-warriors"),
    "Houston": (10, "houston-rockets"),
    "Indiana": (11, "indiana-pacers"),
    "L.A. Clippers": (12, "los-angeles-clippers"),
    "L.A. Lakers": (13, "los-angeles-lakers"),
    "Memphis": (14, "memphis-grizzlies"),
    "Miami": (15, "miami-heat"),
    "Milwaukee": (16, "milwaukee-bucks"),
    "Minnesota": (17, "minnesota-timberwolves"),
    "New Orleans": (19, "new-orleans-pelicans"),
    "New York": (20, "new-york-knicks"),
    "Oklahoma City": (21, "oklahoma-city-thunder"),
    "Orlando": (22, "orlando-magic"),
    "Philadelphia": (23, "philadelphia-76ers"),
    "Phoenix": (24, "phoenix-suns"),
    "Portland": (25, "portland-trail-blazers"),
    "Sacramento": (26, "sacramento-kings"),
    "San Antonio": (27, "san-antonio-spurs"),
    "Toronto": (28, "toronto-raptors"),
    "Utah": (29, "utah-jazz"),
    "Washington": (30, "washington-wizards")
}

In [390]:
# Scrape Atlanta's schedule (year value 17); this frame is written first to the CSV.
atl = gather_data(team="Atlanta", year=17)

In [385]:
# First file: create the CSV, header row included, from the Atlanta frame.
# Mode 'w' makes the cell safe to re-run: appending here would add a second
# header line and duplicate rows on every execution.
# newline='' leaves line endings to the CSV writer, as pandas asks for when
# it is handed an open file object.
with open('stats551data.csv', 'w', newline='') as f:
    atl.to_csv(f, header=True, index=False)

In [386]:
# For rest of teams: scrape each one and append its rows (no header) to the CSV.
for team in teams:
    # Atlanta is already in the file. `continue` is the skip statement; a bare
    # `next` only names the builtin and does nothing.
    if team == "Atlanta":
        continue
    # Pause between requests so the site is not hit in a tight loop.
    sleep(3)
    df = gather_data(team, 17)
    # Append per team so finished teams are on disk even if a later request
    # fails; newline='' leaves line endings to the CSV writer.
    with open('stats551data.csv', 'a', newline='') as f:
        df.to_csv(f, header=False, index=False)