In [1]:
import requests
import pandas as pd
from pyquery import PyQuery as pq
import helpers

In [2]:
# League identifiers that get substituted into the stats URL.
leagues = [
    "euroleague",
    "eurocup",
    "acb",
    "italian",
    "french",
    "greek",
]
# Stat table types that are scraped for every league.
stats = [
    "basic",
    "usage",
    "efficiency",
]

In [3]:
# URL pattern for one page of a stats table; the league, year, stat type and
# page number placeholders are filled in with str.format.
url_template = (
    "http://www.draftexpress.com/stats/{league}/{year}/all/{stat}/"
    "pace/0/all/all/{pg}"
)

In [4]:
# Container for the scrape results, keyed by league and then by stat type.
# Each league/stat pair owns a "dfs" list (scraped DataFrames) and an
# "errors" list (details of requests that failed).
data_dict = {}
for league_name in leagues:
    data_dict[league_name] = {
        stat_name: {"dfs": [], "errors": []} for stat_name in stats
    }

In [5]:
# Display the freshly initialised container (all lists are still empty here).
data_dict

{'acb': {'basic': {'dfs': [], 'errors': []},
  'efficiency': {'dfs': [], 'errors': []},
  'usage': {'dfs': [], 'errors': []}},
 'eurocup': {'basic': {'dfs': [], 'errors': []},
  'efficiency': {'dfs': [], 'errors': []},
  'usage': {'dfs': [], 'errors': []}},
 'euroleague': {'basic': {'dfs': [], 'errors': []},
  'efficiency': {'dfs': [], 'errors': []},
  'usage': {'dfs': [], 'errors': []}},
 'french': {'basic': {'dfs': [], 'errors': []},
  'efficiency': {'dfs': [], 'errors': []},
  'usage': {'dfs': [], 'errors': []}},
 'greek': {'basic': {'dfs': [], 'errors': []},
  'efficiency': {'dfs': [], 'errors': []},
  'usage': {'dfs': [], 'errors': []}},
 'italian': {'basic': {'dfs': [], 'errors': []},
  'efficiency': {'dfs': [], 'errors': []},
  'usage': {'dfs': [], 'errors': []}}}

In [6]:
# Column-name lists from helpers, keyed by stat type; the entry for a stat
# type is what gets passed to helpers.create_df when scraping that table.
col_dict = dict(
    basic=helpers.basic_stats_cols,
    usage=helpers.usage_cols,
    efficiency=helpers.eff_cols,
)

In [7]:
# Print the mapping to eyeball the column names used for each stat type.
print(col_dict)

{'basic': ['box', 'Player', 'Team_Logo', 'Team', 'G', 'MP', 'PTS', 'FG_2P', 'FG_2PA', 'FG_2P_Pct', 'FG_3P', 'FGA_3P', 'FG_3P_Pct', 'FT', 'FTA', 'FT_Pct', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TO', 'PF'], 'usage': ['box', 'Player', 'Team_Logo', 'Team', 'G', 'MP', 'PER', 'EFF', 'EFF_per_40', 'EWA', 'Poss_per_G', 'Tm_Poss_per_G', 'Pct_of_Tm_Poss', 'Pts_per_Poss', 'FGA_per_Poss', 'FTA_per_Poss', 'AST_per_Poss', 'TO_per_Poss'], 'efficiency': ['box', 'Player', 'Team_Logo', 'Team', 'G', 'MP', 'PTS', 'FGA', 'PTS_per_Play', 'TS_Pct', 'eFG_Pct', 'FT_Rate', 'Three_Pt_Rate', 'AST', 'AST_to_FGA_Ratio', 'AST_to_TO_Ratio', 'PPR', 'STL', 'BLK', 'PF']}


In [8]:
# Scrape every stats page for each league / stat type / season combination,
# collecting one DataFrame per page and logging any failures.
for lg in leagues:
    for stat in stats:
        frames = data_dict[lg][stat]["dfs"]
        failures = data_dict[lg][stat]["errors"]
        for season in range(2003, 2018):
            # Page 1 of a season is used to look up the last page number.
            # If that lookup fails, the failure is logged and the loop moves
            # on to the next season.
            first_pg_url = url_template.format(league=lg, year=season,
                                               stat=stat, pg=1)
            try:
                last_pg = helpers.get_last_pg(first_pg_url)
            except Exception as err:
                failures.append([lg, stat, season, first_pg_url, err])
            else:
                # NOTE(review): range() excludes its upper bound, so page
                # number int(last_pg) itself is never requested -- confirm
                # against what helpers.get_last_pg actually returns.
                for page_num in range(1, int(last_pg)):
                    page_url = url_template.format(league=lg, year=season,
                                                   stat=stat, pg=page_num)
                    try:
                        page_df = helpers.create_df(page_url, col_dict[stat])
                        # Tag each row with the season and league it came from.
                        page_df["Season"] = season
                        page_df["League"] = lg
                        page_df.drop(["box", "Team_Logo"], axis="columns",
                                     inplace=True)
                        # Pull the ID segments out of the link columns.
                        player_ids = page_df.Player_Link.str.extract(
                            "/.*/(.*)/", expand=False)
                        page_df["DX_Player_ID"] = player_ids
                        team_ids = page_df.Team_Link.str.extract(
                            "/.*/.*/.*/(.*)", expand=False)
                        page_df["DX_Team_ID"] = team_ids
                        # Remove surrounding whitespace from team names.
                        page_df["Team"] = page_df.Team.str.strip()
                        frames.append(page_df)
                    except Exception as err:
                        failures.append([lg, stat, season, page_url, err])

In [9]:
# Number of logged scraping errors for each league/stat pair, one per line.
for league_name in leagues:
    per_stat = data_dict[league_name]
    for stat_name in stats:
        error_count = len(per_stat[stat_name]["errors"])
        print(error_count)

0
0
0
0
0
0
0
0
0
0
0
0
2
2
2
6
6
6


In [10]:
# Number of scraped page DataFrames for each league/stat pair, one per line.
for league_name in leagues:
    per_stat = data_dict[league_name]
    for stat_name in stats:
        page_count = len(per_stat[stat_name]["dfs"])
        print(page_count)

194
194
194
278
278
278
157
157
157
154
154
154
127
127
127
84
84
84


In [11]:
# Dump the logged error records for each league/stat pair, followed by a
# blank line so the pairs are visually separated.
for league_name in leagues:
    for stat_name in stats:
        logged_errors = data_dict[league_name][stat_name]["errors"]
        print(logged_errors, "\n")

[] 

[] 

[] 

[] 

[] 

[] 

[] 

[] 

[] 

[] 

[] 

[] 

[['french', 'basic', 2003, 'http://www.draftexpress.com/stats/french/2003/all/basic/pace/0/all/all/1', IndexError('list index out of range',)], ['french', 'basic', 2004, 'http://www.draftexpress.com/stats/french/2004/all/basic/pace/0/all/all/1', IndexError('list index out of range',)]] 

[['french', 'usage', 2003, 'http://www.draftexpress.com/stats/french/2003/all/usage/pace/0/all/all/1', IndexError('list index out of range',)], ['french', 'usage', 2004, 'http://www.draftexpress.com/stats/french/2004/all/usage/pace/0/all/all/1', IndexError('list index out of range',)]] 

[['french', 'efficiency', 2003, 'http://www.draftexpress.com/stats/french/2003/all/efficiency/pace/0/all/all/1', IndexError('list index out of range',)], ['french', 'efficiency', 2004, 'http://www.draftexpress.com/stats/french/2004/all/efficiency/pace/0/all/all/1', IndexError('list index out of range',)]] 

[['greek', 'basic', 2003, 'http://www.draftexpress.co

In [12]:
# Combine all scraped pages for each league/stat pair into one DataFrame and
# write it to raw_data/ as a single CSV per pair.
for lg in leagues:
    for stat in stats:
        page_dfs = data_dict[lg][stat]["dfs"]
        # pd.concat raises ValueError when given an empty list, which would
        # abort the export for every remaining league/stat pair. Skip pairs
        # with nothing scraped and report them instead.
        if not page_dfs:
            print("No data scraped for {} {}; skipping CSV export".format(
                lg, stat))
            continue
        df = pd.concat(page_dfs)
        if stat == "basic":
            # The file name for the basic stats carries an extra "pace_adj" tag.
            csv_path = ("raw_data/draft_express_{}_player_basic_stats_pace_adj_"
                        "07_11_17.csv").format(lg)
        else:
            csv_path = "raw_data/draft_express_{}_player_{}_07_11_17.csv".format(
                lg, stat)
        # index=False leaves the DataFrame index out of the CSV.
        df.to_csv(csv_path, index=False)