In [65]:
from lxml import html
import requests
from bs4 import BeautifulSoup
import urllib
import urllib2
import pandas as pd
import numpy as np

In [66]:
# fetch the BUDA "past leagues" page and parse it with Python's built-in HTML parser
past_leagues_url = 'http://www.buda.org/leagues/past-leagues'
r = urllib.urlopen(past_leagues_url)
soup = BeautifulSoup(r, 'html.parser')

In [67]:
# the league links are read from the document embedded in the first iframe of
# the page, so fetch and parse whatever that iframe's `src` points to
iframe = soup.find_all('iframe')[0]
response = urllib2.urlopen(iframe.attrs['src'])
try:
    # Name the parser explicitly. With no parser argument BeautifulSoup picks
    # the "best" one installed (lxml when it is available, which the lxml
    # import at the top of this notebook requires), so stating it keeps the
    # parse identical while making it independent of the environment.
    iframe_soup = BeautifulSoup(response, 'lxml')
finally:
    # always release the HTTP connection, even if parsing fails
    response.close()

In [68]:
# collect the href of the first link inside every "infobody" table cell
infobody_cells = iframe_soup.find_all("td", class_="infobody")
leaguelinks = [cell.a['href'] for cell in infobody_cells]

In [85]:
# define the dictionary that will contain all player ratings
# (maps the text of a roster cell to the list of ratings of the teams it appeared on)
all_players = {}

# define base ratings by division (arbitrarily assigned based on my experience)
# kept outside the loop because it is the same for every league
divratings = {'4/3 Div 1': 1800, '4/3 Div 2': 1400, '4/3 Div 3': 1000, '4/3 Div 4': 900,
              '5/2 Div 1': 1700, '5/2 Div 2': 1300, '5/2 Div 3': 900, '5/2 Div 4': 800,
              'Open Div 1': 1400, 'Open Div 2': 1200}

# columns of the scraped score table that the rating computation reads
required_columns = set(['Team', 'Record', 'Plus/Minus'])


def _fetch_soup(url):
    """Download `url` and return its content parsed as a BeautifulSoup tree.

    The parser is named explicitly ('lxml', which BeautifulSoup auto-selects
    when lxml is installed) so parsing does not depend on the environment,
    and the HTTP response is closed even if parsing raises.
    """
    response = urllib2.urlopen(url)
    try:
        return BeautifulSoup(response, 'lxml')
    finally:
        response.close()


# loop over all leagues in the BUDA database
for link in leaguelinks:

    # extract the league id for this league (everything after 'league=')
    leagueid = link[link.index('league=') + 7:]

    # scrape the scores for this league
    leaguescoreurl = 'http://www.buda.org/hatleagues/scores.php?section=showLeagueSchedule&league=' + leagueid + '&byDivision=1&showGames=0'
    leaguescore_soup = _fetch_soup(leaguescoreurl)

    # assemble the data of team ratings for this league
    # (the scores live in the second table of class "info")
    try:
        table = leaguescore_soup.find_all('table', attrs={'class': 'info'})[1]
    except IndexError:
        print("Unable to find a database of scores for league {}".format(leagueid))
        continue
    data = []
    for row in table.find_all('tr'):
        cols = [ele.text.strip() for ele in row.find_all('th')]
        data.append([ele for ele in cols if ele])  # Get rid of empty values

    # an empty table, or one without the expected header, cannot be rated;
    # skip it instead of aborting the whole scrape with an exception
    if not data or not required_columns <= set(data[0]):
        print("Unable to find a database of scores for league {}".format(leagueid))
        continue

    # convert to dataframe; the first scraped row holds the column names
    dfdata = pd.DataFrame(data)
    dfdata.columns = dfdata.iloc[0, :]
    dfdata = dfdata.drop(0).reset_index(drop=True)

    # fill na's with -99 to facilitate division dividers
    dfdata = dfdata.fillna(-99)

    # get the list of divisions in this league: a row without a record is a
    # divider whose 'Team' cell holds the division name
    divider_positions = np.flatnonzero((dfdata['Record'] == -99).values)
    divnames = dfdata['Team'].values[divider_positions]
    if len(divnames) == 0:
        print("No divisions found, skipping league {}".format(leagueid))
        continue

    # Skip the league as soon as any of its divisions lacks a base rating.
    # Previously only a missing rating for the *last* division skipped the
    # league; teams of other unrated divisions silently kept a base rating of 0.
    unrated = [name for name in divnames if name not in divratings]
    if unrated:
        for name in unrated:
            print("No base rating for {}, skipping league {}".format(name, leagueid))
        continue

    # Assign every row the base rating of the divider that precedes it. Each
    # division spans from its own divider to the next one (or to the end of the
    # table), so a single-division league no longer relies on a `divend` left
    # over from an earlier league. Rows ahead of the first divider keep 0.
    bounds = list(divider_positions) + [len(dfdata)]
    div = np.zeros(len(dfdata))
    for name, start, stop in zip(divnames, bounds[:-1], bounds[1:]):
        div[start + 1:stop] = divratings[name]
    dfdata['div'] = div

    # remove the division dividers from the dataframe
    dfdata = dfdata.drop(dfdata.index[divider_positions])

    # generate the average goal differential column
    # ('Record' is parsed as "<wins>-<losses>"; anything after a second '-' is ignored)
    dfdata['wins'] = dfdata['Record'].apply(lambda x: int(x.split('-')[0]))
    dfdata['losses'] = dfdata['Record'].apply(lambda x: int(x.split('-')[1]))
    dfdata['games'] = dfdata['wins'] + dfdata['losses']
    # a team with no recorded games has no meaningful average (0/0 -> NaN);
    # drop it so it is reported as unmatched below instead of storing NaN/inf
    dfdata = dfdata[dfdata['games'] > 0].copy()
    dfdata['avgplusminus'] = dfdata['Plus/Minus'].astype('float') / dfdata['games']

    # assert that an average goal differential per game of +5 gives +300 rating points.
    dfdata['rating'] = dfdata['div'] + 60. * dfdata['avgplusminus']

    # scrape the list of teams for this league
    teamsurl = 'http://www.buda.org/hatleagues/rosters.php?section=showTeams&league=' + leagueid
    teams_soup = _fetch_soup(teamsurl)

    # generate list of (team id, team name) pairs for this league
    teams = []
    for td in teams_soup.find_all('td', class_='infobody'):
        try:
            url = td.a['href']
            idindex = url.index('team=')
            whichindex = url.index('which=')
        except (TypeError, KeyError, ValueError):
            # cell without a link (TypeError), link without href (KeyError),
            # or href that is not a team link (ValueError): not a team cell
            continue
        teams.append((url[idindex + 5:whichindex - 1], td.a.get_text()))

    # find all players associated with each team
    # link the team rating to each player on that team
    for teamid, teamname in teams:
        matches = dfdata.loc[dfdata['Team'] == teamname.strip(' '), 'rating'].values
        if len(matches) == 0:
            print("Couldn't match {} to scores database, skipping this team.".format(teamname))
            continue
        teamrating = matches[0]

        teamurl = 'http://www.buda.org/hatleagues/rosters.php?section=showTeamRoster&team=' + teamid
        roster_soup = _fetch_soup(teamurl)

        # every "infobody" cell text is treated as a player; empty cells end up
        # under the '' key, which a later cell of this notebook removes
        for td in roster_soup.find_all("td", class_="infobody"):
            all_players.setdefault(td.get_text(), []).append(teamrating)
    print("Finished successfully with leage {}".format(leagueid))

Unable to find a database of scores for league 40491
Unable to find a database of scores for league 40278
Unable to find a database of scores for league 40273
Unable to find a database of scores for league 40268
Unable to find a database of scores for league 40264
Unable to find a database of scores for league 40258
Unable to find a database of scores for league 40253
Unable to find a database of scores for league 40249
Unable to find a database of scores for league 40245
No base rating for Northborough Open, skipping league 39633
No base rating for Danvers Weeknight, skipping league 39633
No base rating for Danvers Weekend, skipping league 39633
Unable to find a database of scores for league 39960
Unable to find a database of scores for league 39939
Unable to find a database of scores for league 39904
Unable to find a database of scores for league 39678
Unable to find a database of scores for league 39673
Finished successfully with leage 39641
No base rating for Northborough Open, ski

In [87]:
# Drop the ratings collected under the empty player name (blank roster cells)
# and show them. The default makes the cell safe to re-run and safe when no
# blank name was scraped: it then shows [] instead of raising KeyError.
all_players.pop('', [])

[1635.0,
 1500.0,
 1455.0,
 1940.0,
 1908.0,
 2020.0,
 1944.0,
 2115.0,
 1740.0,
 1688.5714285714287,
 1662.8571428571429,
 1785.0,
 1750.0,
 1025.0,
 1205.0,
 800.0,
 1408.5714285714287,
 1254.2857142857142,
 1845.7142857142858,
 1040.0,
 1592.0,
 1415.0,
 1220.0,
 1751.4285714285713,
 1208.0,
 1280.0,
 1412.0,
 1340.0,
 1230.0,
 1460.0,
 1357.1428571428571,
 1580.0,
 1648.5714285714287,
 1520.0,
 1500.0,
 1452.5,
 922.85714285714289,
 1168.0,
 1390.0,
 620.0,
 616.0,
 890.0,
 905.71428571428578,
 957.14285714285711,
 1248.5714285714287,
 1135.0,
 725.71428571428578,
 1408.0,
 900.0,
 1317.1428571428571,
 1328.0,
 1004.0,
 1440.0,
 1797.5,
 1380.0,
 680.0,
 1920.0,
 1424.0,
 1632.5,
 390.0,
 1813.3333333333333,
 1868.5714285714287,
 2005.7142857142858,
 2061.1764705882351,
 1812.0,
 1860.0,
 1687.5,
 1626.0,
 2070.0,
 1202.0,
 1516.25,
 1657.1428571428571,
 1463.3333333333333,
 1526.3157894736842,
 1262.0,
 1373.3333333333333,
 1271.4285714285716,
 1386.1538461538462,
 1164.2857142857

In [89]:
# display the accumulated mapping of player name -> list of team ratings
all_players

{u'Salomon, Dana': [760.0],
 u'Hofrichter, Lee': [1650.0, 1897.5, 1845.0, 1670.0],
 u'du Vair, Erin': [760.0],
 u'Light, Rebecca': [-22.5],
 u'Khouri, David': [640.0],
 u'Robinson, Matthew': [1592.0,
  1445.8823529411766,
  1348.5714285714287,
  1460.0,
  1101.5384615384614,
  1568.75,
  477.5,
  658.18181818181824],
 u'Brender, Matthew': [573.33333333333337, 1358.75],
 u'Kane, Ryan': [1493.8461538461538, 1603.75, 1700.0],
 u'Zhang, Jing Jing': [1828.5714285714287, 1190.0],
 u'Crimmin, Erik': [712.0],
 u'Stern, Keith': [1944.0,
  1812.0,
  1660.0,
  1977.1428571428571,
  1680.0,
  1586.0,
  1980.0],
 u'Monnier, Kurti': [490.0],
 u'Henderson, Laura': [724.0, 646.66666666666663, 480.0],
 u'Meyer, Joseph': [760.0],
 u'Oberg, Dave': [1492.7272727272727, 1754.0, 1891.25, 1644.0],
 u'Berliner, Martha': [616.0],
 u'Iskandar, Michelle': [1522.0],
 u'Molina, Jenny': [1400.0, 1400.0, 1320.0, 1313.75],
 u'Richards, Brady': [953.33333333333337,
  1366.25,
  590.0,
  1293.3333333333333,
  790.0,
  