In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


# Player columns selected (in this order) from processed/all_players.csv.
PLAYER_HEADERS = [
    "id", "age", "ast", "blk", "drb", "efg_pct",
    "fg", "fg2", "fg2_pct", "fg2a",
    "fg3", "fg3_pct", "fg3a",
    "fg_pct", "fga", "first_name",
    "ft", "ft_pct", "fta",
    "g", "gs", "last_name", "mp", "orb", "pf",
    "pos", "pts", "stl", "teams", "tov", "trb", "season",
]

# Dunk columns selected (in this order) from processed/all_dunks.csv.
DUNK_HEADERS = [
    "assister_id", "date", "dunker_id", "game_id", "make",
    "quarter", "team", "time", "id", "season",
]

def read(data_type):
    """Load one of the processed CSV exports as a DataFrame.

    Parameters
    ----------
    data_type : str
        ``"players"`` reads ``processed/all_players.csv``;
        ``"dunks"`` reads ``processed/all_dunks.csv``.

    Returns
    -------
    pandas.DataFrame
        The file contents, with the first CSV column used as the index.

    Raises
    ------
    ValueError
        If ``data_type`` is neither ``"players"`` nor ``"dunks"``.
    """
    file_names = {"players": "all_players.csv", "dunks": "all_dunks.csv"}
    if data_type not in file_names:
        # An unknown type used to fall through both branches and crash on an
        # unbound `path`; fail with a message that names the problem instead.
        raise ValueError("unknown data_type %r (expected 'players' or 'dunks')" % (data_type,))

    path = os.path.join("processed", file_names[data_type])
    # Documented replacement for the deprecated DataFrame.from_csv(path).
    return pd.read_csv(path, index_col=0, parse_dates=True)

def aggregate_and_annotate(players, dunks):
    """Total dunk makes/misses per player-season and per team-season.

    Side effects: writes ``processed/team_dunk_totals.csv`` and
    ``processed/player_dunk_totals.csv`` (the ``processed`` directory must
    already exist).

    Parameters
    ----------
    players : pandas.DataFrame
        Season stats with at least ``id`` and ``season`` columns; every column
        is carried through to the player totals.
    dunks : pandas.DataFrame
        One row per dunk attempt with ``dunker_id``, ``team``, ``season`` and
        ``make`` columns (``make == 0`` is counted as a miss). Not modified.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(player_totals, team_totals)``. ``player_totals`` holds the player
        columns (``id`` renamed to ``dunker_id``) followed by ``make`` and
        ``miss``; ``team_totals`` holds ``team``, ``make``, ``miss``, ``season``.
    """
    # Work on a copy so the caller's frame does not silently gain a column.
    attempts = dunks.copy()
    attempts["miss"] = attempts["make"] == 0

    def totals_by(key):
        # Sum makes/misses per `key`, one season at a time so the output keeps
        # the season-by-season row order of the per-season loop.
        per_season = []
        for season in attempts["season"].unique():
            in_season = attempts[attempts["season"] == season]
            # Select the two counters before summing so text columns are
            # never part of the aggregation.
            counts = in_season.groupby(key)[["make", "miss"]].sum()
            counts["season"] = season
            per_season.append(counts)
        return pd.concat(per_season).reset_index()

    p_df = totals_by("dunker_id")
    t_df = totals_by("team")

    t_df.to_csv(os.path.join("processed", "team_dunk_totals.csv"))

    # Join on the (player, season) key. The stats used to be glued to the
    # totals by row position, so a player missing from (or duplicated in)
    # `players` shifted every later total onto the wrong player.
    player_stats = players.rename(columns={"id": "dunker_id"})
    annotated = p_df.merge(player_stats, on=["dunker_id", "season"], how="inner")

    # Player columns first, then the two counters.
    ordered_columns = list(player_stats.columns) + ["make", "miss"]
    df = annotated[ordered_columns].reset_index(drop=True)
    df.to_csv(os.path.join("processed", "player_dunk_totals.csv"))

    return df, t_df

def summarize_player_dunk_stats(df):
    """Derive per-player dunk rates and save them to ``player_dunk_summary.csv``.

    ``df`` needs ``make``, ``miss``, ``g``, ``mp``, ``fg2``, ``season``,
    ``dunker_id``, ``first_name``, ``last_name`` and ``teams`` columns.
    The file is written to the current working directory; nothing is returned.
    """
    makes = df["make"]

    # (output column name, values) pairs, in output order.
    columns = [
        ("dunk_pct", makes / (makes + df["miss"])),
        ("dpg", makes / df["g"]),
        ("dpm", makes / df["mp"]),
        ("dpfg2", makes / df["fg2"]),
        ("season", df["season"]),
        ("dunker_id", df["dunker_id"]),
        ("first_name", df["first_name"]),
        ("last_name", df["last_name"]),
        ("teams", df["teams"]),
    ]
    summary = pd.concat(
        [values for _, values in columns],
        axis=1,
        keys=[name for name, _ in columns],
    )
    summary.to_csv("player_dunk_summary.csv")
    
def summarize_team_dunk_stats(df):
    """Add a dunk success rate to the team totals and save the result.

    ``df`` needs ``make`` and ``miss`` columns and is left unmodified. The
    output keeps every input column and appends ``dunk_pct``
    (``make / (make + miss)``). It is written to ``team_dunk_summary.csv`` in
    the current working directory; nothing is returned.
    """
    dunk_pct = df["make"] / (df["make"] + df["miss"])
    # Name the ratio up front instead of letting concat label it `0` and
    # renaming that positional label afterwards.
    dunk_pct.name = "dunk_pct"

    summary = pd.concat([df, dunk_pct], axis=1)
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(summary.head())
    summary.to_csv("team_dunk_summary.csv")
    
if __name__ == "__main__":
    # Build the totals and summary CSVs from the processed exports.
    players = read("players")
    dunks = read("dunks")

    # Keep only the declared columns, in the declared order. `.ix` is
    # deprecated; reindex(columns=...) is the label-based equivalent
    # (presumably, like `.ix`, a label absent from the file becomes an
    # all-NaN column rather than an error -- confirm against the pandas
    # version in use).
    dunks = dunks.reindex(columns=DUNK_HEADERS)
    players = players.reindex(columns=PLAYER_HEADERS)

    df, t_df = aggregate_and_annotate(players, dunks)
    summarize_player_dunk_stats(df)
    summarize_team_dunk_stats(t_df)
    

In [None]:
# Average each team's per-season dunk totals and rank teams by made dunks.
# (`plt` comes from the notebook's top import cell, so this cell also runs on
# a fresh kernel.)
df = pd.read_csv("team_dunk_summary.csv", index_col=0, parse_dates=True)
team_dunks = df.groupby("team").mean().sort_values("make", ascending=False).drop("season", axis=1)

print(team_dunks)
ax = plt.subplot(111)
ax.bar(np.arange(len(team_dunks)), team_dunks["make"], tick_label=team_dunks.index)
# Rotate the team labels so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=90)

In [None]:
# calculating dunks per minute played for Nicolas Batum
df = pd.DataFrame.from_csv(os.path.join("processed","player_dunk_totals.csv"))

dpm = df[(df["first_name"] == "Nicolas") & (df["last_name"] == "Batum")].apply(lambda x: x["make"]/x["mp"], axis=1)

%matplotlib inline
nb_dunks = df[(df["first_name"] == "Nicolas") & (df["last_name"] == "Batum")]
dpm.index = nb_dunks.season

dpm.plot()

In [None]:
# calculating dunks per games played for Nicolas Batum
df = pd.DataFrame.from_csv(os.path.join("processed","player_dunk_totals.csv"))

dpg = df[(df["first_name"] == "Nicolas") & (df["last_name"] == "Batum")].apply(lambda x: x["make"]/x["g"], axis=1)

%matplotlib inline
nb_dunks = df[(df["first_name"] == "Nicolas") & (df["last_name"] == "Batum")]
dpg.index = nb_dunks.season

dpg.plot()

In [None]:
# Per-player summary used by the cells below.
# NOTE(review): no visible cell writes "dunk_summary.csv" (the summary function
# above writes "player_dunk_summary.csv") -- confirm this is the intended file.
df = pd.read_csv("dunk_summary.csv", index_col=0, parse_dates=True)

In [None]:
# Show every summary row whose first_name is "Marquese".
df[df["first_name"] == "Marquese"]

In [None]:
# Per-player slices of the summary frame (Wade, James, Bosh), matched on
# first and last name.
dw_dunks = df.loc[(df["last_name"] == "Wade") & (df["first_name"] == "Dwyane")]
lbj_dunks = df.loc[(df["last_name"] == "James") & (df["first_name"] == "LeBron")]
cb_dunks = df.loc[(df["last_name"] == "Bosh") & (df["first_name"] == "Chris")]

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

ax = plt.subplot(111)
ax.plot(dw_dunks.ix[:,["dpm"]], label="DW")
ax.plot(lbj_dunks.ix[:,["dpm"]], label="LBJ")
ax.plot(cb_dunks.ix[:,["dpm"]], label="CB")

In [None]:
# Per-player slices of the summary frame (Irving, James, Love).
ki_dunks = df[(df["first_name"] == "Kyrie") & (df["last_name"] == "Irving")]
lbj_dunks = df[(df["first_name"] == "LeBron") & (df["last_name"] == "James")]
kl_dunks = df[(df["first_name"] == "Kevin") & (df["last_name"] == "Love")]

# Plain column selection replaces the deprecated `.ix[:, ["dpm"]]`.
ax = plt.subplot(111)
ax.plot(ki_dunks["dpm"], label="KI")
ax.plot(lbj_dunks["dpm"], label="LBJ")
ax.plot(kl_dunks["dpm"], label="KL")
ax.legend()

In [None]:
def plot_player_summary(stat, dunker_names = None, dunker_ids = None):
    """Plot one summary statistic per season for the selected players.

    Reads ``dunk_summary.csv`` from the current working directory and draws
    one line per player on a shared axis, with the legend placed to the right
    of the plot.

    Parameters
    ----------
    stat : str
        Column of the summary file to plot (e.g. ``"dpm"``). For
        ``"dunk_pct"`` the y axis is fixed to 0-1.1.
    dunker_names : list of str, optional
        Names written ``"First Last"``; each is split at the first space and
        matched against ``first_name`` / ``last_name``. Takes precedence over
        ``dunker_ids`` when non-empty.
    dunker_ids : list, optional
        Values matched against the ``dunker_id`` column.

    Raises
    ------
    ValueError
        If neither argument selects anything (re-raised after printing a
        hint), or if a name contains no space.
    """
    df = pd.read_csv("dunk_summary.csv", index_col=0, parse_dates=True)

    # One (possibly empty) frame per requested player. A boolean-mask filter
    # only fails when the file lacks the column, which should surface rather
    # than be printed and ignored.
    dfs = []
    if dunker_names is not None and len(dunker_names) != 0:
        for dunker_name in dunker_names:
            first_name, last_name = dunker_name.split(" ", 1)
            dfs.append(df[(df["first_name"] == first_name) & (df["last_name"] == last_name)])
    elif dunker_ids is not None and len(dunker_ids) != 0:
        for dunker_id in dunker_ids:
            dfs.append(df[df["dunker_id"] == dunker_id])

    try:
        # The loop variable is not called `df`: a Python 2 list comprehension
        # would rebind the outer name.
        seasons = np.unique(np.concatenate([player["season"] for player in dfs]))
    except ValueError:
        print("couldn't find any dunk records for the names or ids passed in")
        raise

    ax = plt.subplot(111)
    if stat == "dunk_pct":
        ax.set_ylim(0, 1.1)
    for dunks in dfs:
        if dunks.empty:
            # Nothing matched this name/id: report it and plot the others.
            print("one of the passed names or ids was not accurate")
            continue
        # One entry per season on the shared axis; seasons the player has no
        # row for contribute an empty array.
        per_season = [dunks[dunks["season"] == season][stat].values for season in seasons]
        label = dunks["first_name"].values[0] + " " + dunks["last_name"].values[0]
        ax.plot(pd.DataFrame(per_season, index=seasons), label=label)
    ax.legend(loc=(1.05, .5))

In [None]:
# Plot the "dpfg2" column (made dunks divided by fg2) for Marquese Chriss.
plot_player_summary(dunker_names=["Marquese Chriss"], stat="dpfg2")

In [None]:
def plot_team_trends(stat, teams=None):
    """Plot the season-by-season average of ``stat`` for the ten best teams.

    Reads ``dunk_summary.csv`` from the current working directory. For every
    team, the numeric columns of all player rows listing that team are
    averaged per season. The ten teams with the highest all-season mean of
    ``stat`` are then drawn as one line each.

    Parameters
    ----------
    stat : str
        Numeric column of the summary file to average and plot.
    teams : iterable of str, optional
        Team abbreviations to consider; defaults to every abbreviation found
        in the ``teams`` column.
    """
    df = pd.read_csv("dunk_summary.csv", index_col=0, parse_dates=True)

    def parse_teams(cell):
        # Each cell is presumably the printed form of a list, such as
        # "[u'POR', u'ATL']". Strip the quoting, brackets and spaces, then
        # split on commas.
        return "".join(c for c in cell if c not in "'u][ ").split(",")

    # Parse every row once and reuse the result for both team discovery and
    # membership tests. The membership test used to keep spaces, so " ATL" in
    # a multi-team row never matched "ATL".
    team_lists = [parse_teams(cell) for cell in df["teams"].values]

    if teams is None:
        teams = np.unique(np.concatenate(team_lists))

    # Only numeric columns can be averaged.
    numeric = df.select_dtypes(include=[np.number])

    dfs = []
    for team in teams:
        on_team = np.array([team in team_list for team_list in team_lists], dtype=bool)
        team_df = numeric[on_team].groupby("season").mean()
        team_df["team"] = team
        dfs.append(team_df)

    combined = pd.concat(dfs)
    print(combined.sort_values(stat))
    top_teams = combined.groupby("team").mean().sort_values(stat, ascending=False)[:10].index

    plt.figure(figsize = (10,20))
    ax = plt.subplot(111)
    if stat == "dunk_pct":
        # A fixed .5-1.1 window used to be applied to every stat, overriding
        # the percentage-only limit; it now applies to the percentage only.
        ax.set_ylim(.5, 1.1)
    for team, team_df in zip(teams, dfs):
        if team in top_teams:
            ax.plot(list(team_df.index), team_df[stat], label=team)
    ax.legend(loc=(1.05, .5))

In [None]:
def plot_teams(stat, teams=None):
    """Bar chart of ``stat`` for every team-season, sorted ascending.

    Reads ``dunk_summary.csv`` from the current working directory. For every
    team, the numeric columns of all player rows listing that team are
    averaged per season. Each (season, team) pair becomes one bar, labelled
    ``"<season> <team>"``.

    Parameters
    ----------
    stat : str
        Numeric column of the summary file to average and plot. For
        ``"dunk_pct"`` the y axis is fixed to 0-1.1.
    teams : iterable of str, optional
        Team abbreviations to include; defaults to every abbreviation found
        in the ``teams`` column.
    """
    df = pd.read_csv("dunk_summary.csv", index_col=0, parse_dates=True)

    def parse_teams(cell):
        # Each cell is presumably the printed form of a list, such as
        # "[u'POR', u'ATL']". Strip the quoting, brackets and spaces, then
        # split on commas.
        return "".join(c for c in cell if c not in "'u][ ").split(",")

    # Parse every row once and reuse the result for both team discovery and
    # membership tests. The membership test used to keep spaces, so " ATL" in
    # a multi-team row never matched "ATL".
    team_lists = [parse_teams(cell) for cell in df["teams"].values]

    if teams is None:
        teams = np.unique(np.concatenate(team_lists))

    # Only numeric columns can be averaged.
    numeric = df.select_dtypes(include=[np.number])

    dfs = []
    for team in teams:
        on_team = np.array([team in team_list for team_list in team_lists], dtype=bool)
        team_df = numeric[on_team].groupby("season").mean()
        team_df["team"] = team
        dfs.append(team_df)

    # "season" is the index after the groupby; make it a column again.
    top_teams = pd.concat(dfs).sort_values(stat).reset_index()

    plt.figure(figsize = (100,10))
    ax = plt.subplot(111)
    if stat == "dunk_pct":
        ax.set_ylim(0, 1.1)

    print(top_teams)
    tick_labels = [str(season) + " " + team
                   for season, team in zip(top_teams["season"], top_teams["team"])]
    ax.bar(np.arange(len(top_teams)), top_teams[stat], tick_label=tick_labels)
    plt.setp(ax.get_xticklabels(), rotation=90)
    # No legend: the bars carry no labels, so there is nothing to list.

In [None]:
df = pd.read_csv("dunk_summary.csv", index_col=0, parse_dates=True)
# 2016 rows whose team list mentions POR. Each comparison needs its own
# parentheses because `&` binds tighter than `==`. `"POR" in series` tests the
# index labels, not the values, so a string-contains mask is used instead.
df[(df["season"] == 2016) & df["teams"].str.contains("POR", na=False)]

In [None]:
# Bar chart of the "dpg" column for every team-season.
plot_teams(stat="dpg")

In [None]:
# Season-by-season "dunk_pct" lines for the ten highest-ranked teams.
plot_team_trends(stat = "dunk_pct")

In [None]:
# NOTE(review): `cp3_dunks` is not assigned in any cell visible in this
# notebook, so this raises NameError on a fresh kernel -- define it or drop
# this cell.
cp3_dunks

In [None]:
def read(data_type):
    """Load one of the processed CSV exports as a DataFrame.

    NOTE(review): this cell redefines ``read``; a definition with the same
    name already exists in the notebook's first cell.

    Parameters
    ----------
    data_type : str
        ``"players"`` reads ``processed/all_players.csv``;
        ``"dunks"`` reads ``processed/all_dunks.csv``.

    Returns
    -------
    pandas.DataFrame
        The file contents, with the first CSV column used as the index.

    Raises
    ------
    ValueError
        If ``data_type`` is neither ``"players"`` nor ``"dunks"``.
    """
    file_names = {"players": "all_players.csv", "dunks": "all_dunks.csv"}
    if data_type not in file_names:
        # An unknown type used to fall through both branches and crash on an
        # unbound `path`; fail with a message that names the problem instead.
        raise ValueError("unknown data_type %r (expected 'players' or 'dunks')" % (data_type,))

    path = os.path.join("processed", file_names[data_type])
    # Documented replacement for the deprecated DataFrame.from_csv(path).
    return pd.read_csv(path, index_col=0, parse_dates=True)

In [None]:
# Load processed/all_players.csv (this rebinds the notebook-level `df`).
df = read("players")

In [None]:
# Show the 2010 row(s) for player id "walljo01".
df[(df["id"] == "walljo01") & (df["season"] == 2010)]

In [None]:
import dataset

# Open the SQLite file dunks.db (relative to the working directory) through
# the `dataset` wrapper.
db = dataset.connect("sqlite:///dunks.db")

# Handle on the "dunks" table.
table = db["dunks"]

In [None]:
# Print every record of the dunks table, one per line.
results = table.find()
for record in results:
    print(record)

In [6]:
%%writefile scraper.py

import pandas as pd
import numpy as np
import urllib2
import collections
import json
import bs4
import datetime
import re
import time
import wptools
import os
import scipy
import scipy.stats
from fuzzywuzzy import process
import matplotlib.pyplot as plt
import argparse
import dataset



def scrape_all(year):

    def players_for_year(year, force_query=False):
        """Return the season-totals table for every player of one season.

        The table is scraped from basketball-reference.com and cached as
        ``data/<year>/player_list.csv``. Later calls read the cache unless
        ``force_query`` is set.

        Parameters
        ----------
        year : int
            Year in which the season starts (the page requested is
            ``NBA_<year + 1>_totals.html``).
        force_query : bool
            When true, download and re-cache even if a cached copy exists.

        Returns
        -------
        pandas.DataFrame
            One row per player, indexed by player id. A freshly scraped frame
            holds ``teams`` as lists; a frame read from the cache holds
            whatever text the CSV stored for them.
        """
        # Single source of truth for the cache location. The reader used to
        # look in "<year>/player_list.csv" while the writer saved under
        # "data/<year>/", so the cache was never found.
        cache_path = os.path.join("data", str(year), "player_list.csv")

        def request_players_from_web(year):
            # Open the totals page; the caller parses the response object.
            player_stat_page = "http://www.basketball-reference.com/leagues/NBA_{0}_totals.html".format(str(year+1))
            return urllib2.urlopen(player_stat_page)

        def backup_players_to_disk(year, df):
            # Save the parsed table as data/<year>/player_list.csv.
            year_dir = os.path.join("data", str(year))
            if not os.path.exists(year_dir):
                os.makedirs(year_dir)

            with open(os.path.join(year_dir, "player_list.csv"), "w") as f:
                df.to_csv(f)

        def parse_player_page(page):
            # Turn the totals table into one row per player.
            table = bs4.BeautifulSoup(page, "lxml").find("div", {"id":"div_totals_stats"}).table
            stat_dict = collections.defaultdict(list)
            for row in table.find_all("tr", {"class":["full_table", "partial_table"]})[1:]:
                row_player_id = row.find("td")["data-append-csv"]
                if row_player_id not in stat_dict["player_id"]:
                    # First row seen for this player: record all of his stats.
                    for col in row.find_all("td"):
                        if not col.has_attr("data-stat"):
                            continue
                        stat_name = col["data-stat"]

                        if stat_name == "team_id":
                            if col.text != "TOT":
                                # This player played on just one team this season.
                                stat_dict["teams"].append([col.text])
                            else:
                                # Multiple teams; they are appended from his
                                # later rows.
                                stat_dict["teams"].append([])
                        elif stat_name == "player":
                            # "csk" holds the name as "Last,First".
                            player = col["csk"]
                            stat_dict["first_name"].append(player.split(",")[1])
                            stat_dict["last_name"].append(player.split(",")[0])
                            stat_dict["player_id"].append(col["data-append-csv"])
                        else:
                            stat_dict[stat_name].append(col.text)
                else:
                    # Already recorded: this row only adds one more team.
                    index = stat_dict["player_id"].index(row_player_id)
                    stat_dict["teams"][index].append(row.find("td", {"data-stat":"team_id"}).a.text)
            df = pd.DataFrame(stat_dict)
            df.set_index("player_id", drop=True, inplace=True)
            return df

        if not force_query:
            try:
                # Use the local copy when there is one. The index holds player
                # ids, so no date parsing is requested.
                with open(cache_path, "r") as f:
                    return pd.read_csv(f, index_col=0)
            except IOError:
                # No local copy yet: fall through and download it.
                pass

        page = request_players_from_web(year)
        df = parse_player_page(page)
        backup_players_to_disk(year, df)
        return df


    def scrape_pbp(year, players):
        # scrapes ESPN and wikipedia

        ###### Helper Methods #######

        def getPlayerId(last_name, first_name, players):
            """Return the id (index label of ``players``) of the named player.

            An exact match on ``first_name`` and ``last_name`` wins. Otherwise
            the closest fuzzy match on the full "First Last" name is returned,
            so some id always comes back even for a poor match.
            """
            exact = players[(players["last_name"] == last_name) & (players["first_name"] == first_name)]
            if not exact.empty:
                return exact.index[0]

            # Build the candidate names from the frame that is indexed below.
            # This used to read the enclosing scope's `player_list` instead of
            # the `players` argument.
            full_names = pd.Series(players["first_name"] + " " + players["last_name"]).values
            # Query in the same "First Last" shape as the candidates; the two
            # parts used to be joined without a space.
            wanted = (first_name + " " + last_name).strip()
            best_name = process.extract(wanted, full_names, limit=1)[0][0]
            position = np.where(full_names == best_name)[0][0]
            return players.index[position]

        def parse_dunk(dunk, players, game_id, title, teams):
            """Turn one play-by-play line that mentions a dunk into a record.

            NOTE: besides its arguments this reads ``tr`` (the table row being
            processed) and ``q_id`` (the id of the most recent div that has
            one) from the enclosing ``scrape_pbp`` scope, so it can only be
            called from that loop.

            Parameters
            ----------
            dunk : str
                Text of the play.
            players : pandas.DataFrame
                Player table used to resolve names to ids.
            game_id
                Identifier of the game; stored and used in the record id.
            title
                The page's ``<title>`` element; the game date is read from the
                third " - " separated part of its text.
            teams : list of str
                Team abbreviations of the game, used when the row has no
                usable team logo.

            Returns
            -------
            dict or None
                A record with day/month/year, minute/second, game_id, quarter,
                make, player_id, assister_id, team and id. ``None`` when the
                wording names no dunker (e.g. the possessive form used for a
                blocked dunk).
            """
            # Who dunked, and did it go in? The checks keep the original
            # precedence: old-style wording ("missed"/"made") first, then
            # new-style ("miss…"/"make…"), then alley-oops.
            if "missed" in dunk:
                dunker_name, make = dunk.split("miss")[0], 0
            elif "made" in dunk:
                # A made dunk used to be stored with make = 0.
                dunker_name, make = dunk.split("made")[0], 1
            elif "miss" in dunk:
                dunker_name, make = dunk.split("miss")[0], 0
            elif "make" in dunk:
                dunker_name, make = dunk.split("make")[0], 1
            elif "alley-oop" in dunk:
                dunker_name, make = dunk.split("alley-oop")[0], 1
            else:
                # Possessive form ("player 1 blocks player 2's slam dunk") or
                # wording we do not recognise: skip the play.
                return None

            # Some players have no last name.
            name_parts = dunker_name.split(" ", 1)
            first_name = name_parts[0].strip()
            last_name = name_parts[1].strip() if len(name_parts) == 2 else ""

            if len(first_name+last_name) != 0:
                player_id = getPlayerId(last_name, first_name, players)
            else:
                player_id = np.nan

            # The last character of the accordion div id is stored as the quarter.
            quarter = q_id[-1]

            # Game date from the page title. A title with fewer parts, or a
            # third part that is not a date, leaves the date empty instead of
            # aborting the scrape.
            try:
                date_from_title = title.text.split(" - ")[2].strip().encode("ascii", "ignore")
                date = datetime.datetime.strptime(date_from_title, "%B %d, %Y")
                day, month, year = date.day, date.month, date.year
            except (IndexError, ValueError):
                day = month = year = np.nan

            # In-game clock of the play.
            time_stamp = tr.find("td", {"class":"time-stamp"}).text
            clock = datetime.datetime.strptime(time_stamp, "%M:%S").time()
            minute = clock.minute
            second = clock.second

            def assister_id_for(full_name):
                # Resolve "first last" to a player id; "" when the name is
                # empty. Only a two-word name contributes a last name.
                parts = full_name.split(" ")
                first = parts[0].strip()
                last = parts[1].strip() if len(parts) == 2 else ""
                if len(first+last) != 0:
                    return getPlayerId(last, first, players)
                return ""

            # The assist is written either as "(name assists)" or as
            # "... assisted by name".
            parenthesised = re.findall(re.compile(r"\([\w\s]*\)"), dunk)
            if parenthesised:
                a_player_id = assister_id_for(
                    parenthesised[0].strip("(").strip(")").split("assists")[0].strip())
            elif "assisted" in dunk:
                a_player_id = assister_id_for(dunk.lower().split("assisted by")[1].strip())
            else:
                a_player_id = ""

            # Team from the logo image in the row. When there is no usable
            # logo, fall back to the teams the players are listed with.
            try:
                team_id = tr.find("img")["src"].split("/500/")[1].split(".png")[0].upper()
            except (IndexError, TypeError, KeyError):
                team_id = predict_team(player_id, a_player_id, players, teams)

            dunk_id = str(game_id)+str(quarter)+str(minute)+str(second)

            return dict(day=day,
                        month=month,
                        year=year,
                        minute=minute,
                        second=second,
                        game_id=game_id,
                        quarter=quarter,
                        make=make,
                        player_id=player_id,
                        assister_id=a_player_id,
                        team=team_id,
                        id=dunk_id)

        def season_duration(year):
            """Look up the first and last day of an NBA season on Wikipedia.

            Reads the "duration" infobox field of the ``<year>–<yy>_NBA_season``
            page through ``wptools`` and parses the first two
            "Month D, YYYY" dates found in it.

            Returns ``(start_date, end_date)`` as ``datetime.datetime`` objects.
            """
            print("looking for season for year: " + str(year))
            page_title = "{0}%E2%80%93{1}_NBA_season".format(str(year), str(year+1)[2:])

            duration_field = wptools.page(page_title).get_parse().infobox["duration"]

            # Drop non-ASCII characters before searching for the dates.
            dates = re.findall(r"[A-Za-z]+\s[0-9]+,\s[0-9]{4}",
                               duration_field.encode('ascii','ignore'))
            first_day = datetime.datetime.strptime(dates[0], "%B %d, %Y")
            last_day = datetime.datetime.strptime(dates[1], "%B %d, %Y")

            return first_day, last_day

        def predict_team(dunker, assister, player_list, teams):
            """Guess which of a game's teams a dunk belongs to.

            Parameters
            ----------
            dunker
                Index label of the dunker in ``player_list`` (may be NaN).
            assister
                Index label of the assisting player, or a falsy value (e.g.
                ``""``) when there was no assist.
            player_list : pandas.DataFrame
                Indexed by player id, with a ``teams`` column holding either a
                list of abbreviations or a string containing them.
            teams : iterable of str
                Abbreviations of the teams playing the game.

            Returns
            -------
            str or float
                With an assister: the first entry of ``teams`` listed for both
                players. Without one: the first entry listed for the dunker.
                NaN when nothing matches or the dunker is unknown.
            """
            def teams_text(player_id):
                # Teams of one player as a single string for substring tests.
                listed = player_list.loc[player_id, "teams"]
                if isinstance(listed, list):
                    listed = ",".join(listed)
                return listed

            # An unidentified dunker (e.g. NaN) cannot be looked up; answer
            # "unknown" instead of raising.
            if dunker not in player_list.index:
                return np.nan

            dunker_teams = teams_text(dunker)

            if assister:
                assister_teams = teams_text(assister)
                for t in teams:
                    if t in dunker_teams and t in assister_teams:
                        return t
            else:
                for t in teams:
                    if t in dunker_teams:
                        return t

            return np.nan



        def parse_schedule_page(page, start_date, end_date):
            """Collect the ids of in-season games linked from a schedule page.

            A game id is read as ``YYMMDD…``: characters 0-1 are a season-year
            code, 2-3 the month and 4-5 the day. Games whose code matches the
            season's starting year are kept from the start date onwards. All
            other games are kept up to the end date.

            Parameters
            ----------
            page : str
                HTML of the schedule page.
            start_date, end_date : datetime.datetime
                First and last day of the season.

            Returns
            -------
            list of str
                Matching game ids, in page order.
            """
            def year_to_espn_season_code(year):
                # Two-digit code: 20 for the year 2000, counting up from there.
                return 20 + (year - 2000)

            start_code = year_to_espn_season_code(start_date.year)

            # Name the parser explicitly, as the other BeautifulSoup calls in
            # this file do.
            soup = bs4.BeautifulSoup(page, "lxml")
            links = []
            for game in soup.find_all("a", {"name":"&lpos=nba:schedule:score"}):
                game_id = game["href"].split("=")[1]
                game_month = int(game_id[2:4])
                game_day = int(game_id[4:6])
                game_year_code = int(game_id[0:2])

                if game_year_code == start_code:
                    # Beginning of season: on or after the start date.
                    in_season = (game_month, game_day) >= (start_date.month, start_date.day)
                else:
                    # End of season: on or before the end date.
                    in_season = (game_month, game_day) <= (end_date.month, end_date.day)

                if in_season:
                    links.append(game_id)
            print("games found: " + str(len(links)))
            return links

        def request_schedule_page(date):
            """Download ESPN's NBA schedule page for ``date`` and return its body.

            ``date`` is a ``datetime`` (anything with ``strftime``); it is sent
            as ``YYYYMMDD``.
            """
            url = "http://www.espn.com/nba/schedule/_/date/{date}".format(
                date=date.strftime("%Y%m%d"))

            response = urllib2.urlopen(url)
            try:
                return response.read()
            finally:
                # Release the connection even if reading fails.
                response.close()

        
        def get_game_ids_for_season(season):
            """List the ESPN game ids of the regular season starting in ``season``.

            The season's start and end dates come from ``season_duration``.
            For seasons starting after October 2012 the ids are taken to be
            sequential from the first game on the start date's schedule page.
            For earlier seasons one schedule page per week is downloaded (with
            a random 2-6 s pause before each request) and filtered by
            ``parse_schedule_page``.

            Parameters
            ----------
            season : int
                Year in which the season starts. This used to be ignored in
                favour of the enclosing scope's ``year``.

            Returns
            -------
            numpy.ndarray of int (after 2012) or list of str (before).

            Raises
            ------
            ValueError
                If no game link is found on the first schedule page.
            """
            start_date, end_date = season_duration(season)

            dataset_path = "data/"+str(season)+"/datasets"
            if not os.path.exists(dataset_path):
                os.makedirs(dataset_path)

            # ESPN starts using sequential game_ids starting Oct 2012, making
            # it very easy to iterate through all games.
            if start_date > datetime.datetime.strptime("October, 2012", "%B, %Y"):
                print("after 2012")
                soup = bs4.BeautifulSoup(request_schedule_page(start_date), "lxml")

                pbp_path = "data/"+str(season)+"/pbp_reg"
                if not os.path.exists(pbp_path):
                    os.makedirs(pbp_path)

                first_game_id = ""
                for tr in soup.find("div", {"id":"sched-container"}).find_all("tr"):
                    link = tr.find("a", {"name":"&lpos=nba:schedule:score"})
                    if link is not None:
                        first_game_id = link["href"].split("=")[1]
                        break
                if not first_game_id:
                    raise ValueError("no game link found on the schedule page for %s" % start_date)

                # Games per team times teams, halved because every game
                # involves two teams. The two special cases are kept as
                # written, although start dates before October 2012 never
                # reach this branch. Floor division keeps the count an int.
                if season == 2011:
                    reg_season_games = (66*30)//2
                elif season == 1998:
                    reg_season_games = (50*29)//2
                else:
                    reg_season_games = (82*30)//2
                return np.arange(int(first_game_id), int(first_game_id)+reg_season_games)

            print("before 2012")
            weeks_in_season = (end_date - start_date).days//7

            game_ids = []
            for week in range(weeks_in_season+1):
                time.sleep(np.random.randint(2,7))
                page = request_schedule_page(start_date+datetime.timedelta(days=week*7))
                # Same filtering as the loop that used to be inlined here. The
                # helper prints one count per page instead of one line per game.
                game_ids.extend(parse_schedule_page(page, start_date, end_date))
            return game_ids
        
        ###### END OF HELPER METHODS ######

        # Main flow of scrape_pbp: find the season's game ids, skip games that
        # already have dunks in the database, then fetch/parse each remaining
        # play-by-play page and store every dunk found in it.
        db = dataset.connect("sqlite:///dunks.db")
        table = db["dunks"]
        
        GAME_ROOT_URL = "http://www.espn.com/nba/playbyplay?gameId={0}"            
        
        # The list of game ids is cached (pickled) per season.
        game_ids_path = os.path.join("data", str(year), "game_ids")
        
        # NOTE(review): function-local import; the rest of the file imports at
        # the top.
        import pickle
        if not os.path.exists(game_ids_path):
            print year
            game_ids = get_game_ids_for_season(year)

            with open(game_ids_path, 'wb') as fp:
                pickle.dump(game_ids, fp)
        else:
            with open (game_ids_path, 'rb') as fp:
                game_ids = pickle.load(fp)

        # Check if we have any dunks from games this season already in the database
        results = list(table.find(season=year))
        print len(results)
        recorded_g_ids = set([int(dunk["game_id"]) for dunk in results])
        all_g_ids = set([int(game_id) for game_id in game_ids])
        # Games with at least one stored dunk are treated as done. (The result
        # is a list in one branch and a set in the other; both are only
        # measured and iterated below.)
        if len(recorded_g_ids) > 0:
            missing_g_ids = list(all_g_ids - recorded_g_ids)
        else:
            missing_g_ids = all_g_ids
        
        print "There are ", len(missing_g_ids), " non-recorded games in the existing dunk data."
        # Ids whose page could not be downloaded; collected but not used
        # further in this block.
        games_not_found = []
        
        pbp_path = os.path.join("data", str(year), "pbp_reg")

        if not os.path.exists(pbp_path):
            os.makedirs(pbp_path)
    
        for game_id in missing_g_ids:  
            print game_id
            # Prefer the copy of the page saved on disk; download otherwise.
            try:
                with open(os.path.join(pbp_path, "pbp_"+str(game_id)+".txt"), "r") as f:
                    g = f.read()
                    g_soup = bs4.BeautifulSoup(g, "lxml")
                print "We've got this on disk"
            except IOError:
                print "Don't have this play-by-play yet"
                # Random 2-6 second pause before each download.
                time.sleep(np.random.randint(2,7))
                try:
                    g = urllib2.urlopen(GAME_ROOT_URL.format(game_id)).read()
                except urllib2.URLError:
                    print "could not get page"
                    games_not_found.append(game_id)
                    continue


            # Parsed (again, for pages read from disk) and written back to the
            # on-disk copy in every case.
            g_soup = bs4.BeautifulSoup(g, "lxml")
            with open(os.path.join(pbp_path, "pbp_"+str(game_id)+".txt"), "wb") as f:
                f.write(str(g_soup))
            try:
                title = g_soup.title
                print "Checking for dunks: ", " ".join(title.text.split(" - ")[0:3])
            except AttributeError:
                # Raised when the page has no <title> (title is None).
                print "Something went wrong. Possibly the backed up file did not save properly."


            play_by_play =  g_soup.find("article", {"class":"play-by-play"})


            # Abbreviations of the teams in this game, passed to parse_dunk.
            teams = [team.text for team in g_soup.find_all("span", {"class":"abbrev"})]

            try:
                for acc in play_by_play.find_all("li", {"class":"accordion-item"}):
                    # q_id and tr are read by parse_dunk from this scope; q_id
                    # ends up as the id of the last div in the item that has one.
                    for div in acc.find_all("div"):
                        if div.has_attr('id'):
                            q_id = div["id"]
                    for tr in acc.find_all("tr"):
                        details = tr.find("td", {"class":"game-details"})
                        if details != None:
                            # The play text is lower-cased before it is parsed.
                            play = details.string.lower()
                            if "dunk" in play:
                                # This play was a dunk!
                                parsed_dict = parse_dunk(play, players, game_id, title, teams)
                                if parsed_dict != None:
                                    parsed_dict["season"] = year
                                    try:
                                        # Keyed on "id".
                                        table.insert_ignore(parsed_dict, ["id"])
                                        db.commit()
                                        print "dunk saved"
                                    except Exception as err:
                                        db.rollback()
                                        print err.message, "dunk not saved"
                                else:
                                    continue
            except AttributeError:
                # Also reached when play_by_play is None (no matching <article>)
                # or details.string is None; the rest of the game is skipped.
                print "Play-by-play parsing failed. Probably got incorrect page."
                continue

    player_list = players_for_year(year)
    scrape_pbp(year, player_list)

if __name__ == "__main__":
    # Command line: scraper.py -s YEAR [YEAR ...]
    parser = argparse.ArgumentParser(description='Get dunks for any given NBA season')
    parser.add_argument(
        '-s', '--season',
        type=int,
        nargs='+',
        required=True,
        help='The year to scrape (the year passed represents the beginning of the season)',
    )

    args = parser.parse_args()
    print(args.season)
    # Scrape the requested seasons one after another, in the order given.
    for season in args.season:
        scrape_all(season)

Overwriting scraper.py


In [None]:
%run scraper.py -s 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016

[2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016]
5119
There are  222  non-recorded games in the existing dunk data.
230123009
We've got this on disk
Checking for dunks:  Nets vs. Warriors Play-By-Play January 23, 2003
230205014
We've got this on disk
Checking for dunks:  Trail Blazers vs. Heat Play-By-Play February 5, 2003
230115002
We've got this on disk
Checking for dunks:  Hawks vs. Celtics Play-By-Play January 15, 2003
230123020
We've got this on disk
Checking for dunks:  Mavericks vs. 76ers Play-By-Play January 23, 2003
230122001
We've got this on disk
Checking for dunks:  Trail Blazers vs. Hawks Play-By-Play January 22, 2003
230122002
We've got this on disk
Checking for dunks:  Bucks vs. Celtics Play-By-Play January 22, 2003
230315027
We've got this on disk
Checking for dunks:  Heat vs. Wizards Play-By-Play March 15, 2003
230123029
We've got this on disk
Checking for dunks:  Kings vs. Grizzlies Play-By-Play January 23, 2003
230122008
We'v

In [4]:
db = dataset.connect("sqlite:///dunks.db")
table = db["dunks"]

# Load every row into a list so it can be counted (later cells reuse
# `results`).
results = list(table.all())
len(results)

89635

In [14]:
# Number of distinct game ids among the stored dunks of season 2010.
len(np.unique([dunk["game_id"] for dunk in list(db["dunks"].find(season=2010))]))


1217

In [1085]:
# Blank the assister_id of every loaded row and write it back, keyed on "id".
# NOTE(review): the recorded output of this cell is a StatementError ("could
# not convert string to float") -- presumably a column typed as float is being
# given an empty string. Confirm the table schema before re-running this.
for result in results:
    result["assister_id"] = ""
    table.update(result, ["id"])

StatementError: (exceptions.ValueError) could not convert string to float:  [SQL: u'UPDATE dunks SET quarter=?, season=?, make=?, team=?, month=?, second=?, year=?, player_id=?, game_id=?, assister_id=?, day=?, minute=? WHERE dunks.id = ?']

In [15]:
!ls

Dunk Dataset.ipynb      database.pyc            player_dunk_summary.csv
[34m__pycache__[m[m             dunk.db                 [34mprocessed[m[m
annotate.ipynb          dunk_summary.csv        scraper.py
annotate.py             dunks.db                scraper.pyc
assemble.py             fix_dunks.sql           team_dunk_summary.csv
[34mdata[m[m                    parser.py               test_scraper.py
database.py             parser.pyc


In [19]:
import pickle

# Load the cached game ids of the 2003 season. The file is opened in a `with`
# block so the handle is closed again.
# NOTE: pickle.load can execute code embedded in the file; only load id files
# written by scraper.py.
with open("data/2003/game_ids", "rb") as fp:
    games_2003 = pickle.load(fp)

In [21]:
# Dump the whole list of cached 2003 game ids.
print games_2003

['231028020', '231028024', '231028013', '231029002', '231029003', '231029007', '231029016', '231029026', '231029004', '231029009', '231029023', '231029008', '231029018', '231029028', '231030010', '231030019', '231030025', '231030021', '231031012', '231031028', '231031029', '231031004', '231031011', '231031014', '231031023', '231031017', '231101001', '231101003', '231101007', '231101016', '231101019', '231101021', '231101022', '231101027', '231101006', '231101009', '231101010', '231101015', '231101024', '231102013', '231103019', '231103022', '231103026', '231103029', '231103001', '231103004', '231103006', '231104011', '231104015', '231104017', '231104024', '231105016', '231105019', '231105020', '231105026', '231105027', '231105008', '231105009', '231105018', '231105022', '231105005', '231106017', '231106028', '231106024', '231107003', '231107007', '231107021', '231107027', '231107002', '231107004', '231107009', '231107011', '231107018', '231107025', '231107008', '231107019', '231108003'