In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
#generate url list
def getUrl(year):
    """Build the monthly schedule-page URLs for one season.

    ``year`` is incremented by one before it is placed in the URL, so
    ``getUrl(2016)`` yields the ``NBA_2017_games-<month>.html`` pages.

    Returns a list with one URL per month, October through June, in that
    order.
    """
    monthNames = ("october", "november", "december", "january", "february",
                  "march", "april", "may", "june")
    prefix = "https://www.basketball-reference.com/leagues/NBA_" + str(year + 1) + "_games-"
    return [prefix + name + ".html" for name in monthNames]

In [3]:
def compareScore(team1, score1, team2, score2):
    """Return ``(winner, loser)`` for one game.

    ``team1`` is the winner only when ``score1`` is strictly greater than
    ``score2``; on equal scores ``team2`` is returned first.
    """
    winner, loser = (team1, team2) if score1 > score2 else (team2, team1)
    return (winner, loser)

In [6]:

#download html pages from urlList and parse the data
def collectAndParse(urlList):
    """Download each schedule page and split its games into two tables.

    Every page in ``urlList`` is fetched with ``requests`` and the
    ``<tbody>`` of the table with id ``schedule`` is scanned row by row.
    Rows seen before a row whose header text contains ``"Playoffs"`` go
    into the first table; rows seen after it go into the second.

    Parameters
    ----------
    urlList : iterable of str
        Page URLs, in chronological order (e.g. the output of ``getUrl``).

    Returns
    -------
    (reg, post) : tuple of pandas.DataFrame
        Both frames have the columns ``date, away, awayScore, home,
        homeScore, winTeam, loseTeam``. If no ``"Playoffs"`` marker row is
        found, every game is returned in ``reg`` and ``post`` is empty.

    Raises
    ------
    ValueError
        If a page contains no ``schedule`` table body, or a non-empty score
        cell is not an integer.
    """
    columns = ["date", "away", "awayScore", "home", "homeScore", "winTeam", "loseTeam"]
    statNames = ("visitor_team_name", "visitor_pts", "home_team_name", "home_pts")

    # Collect plain dicts and build each DataFrame once at the end; growing
    # a frame row by row is quadratic and DataFrame.append no longer exists
    # in pandas 2.x.
    regularGames = []
    playoffGames = []
    currentGames = regularGames

    for url in urlList:
        result = requests.get(url).text
        table = BeautifulSoup(result, 'html.parser').find("table", {"id" : "schedule"})
        body = table.find("tbody") if table is not None else None
        if body is None:
            raise ValueError("no schedule table found at " + url)

        for game in body.find_all("tr"):
            header = game.find("th")
            if header is None:
                continue
            date = header.get_text()

            if "Playoffs" in date:
                # Marker row: everything after it belongs to the post-season.
                currentGames = playoffGames
                continue

            cells = {stat: game.find("td", {"data-stat" : stat}) for stat in statNames}
            if any(cell is None for cell in cells.values()):
                # Not a game row (e.g. a repeated header row without <td> cells).
                continue

            awayPts = cells["visitor_pts"].get_text().strip()
            homePts = cells["home_pts"].get_text().strip()
            if not awayPts or not homePts:
                # Score cells are empty, so there is no result to record.
                continue

            gameDetail = {
                "date": date,
                "away": cells["visitor_team_name"].get_text(),
                "awayScore": int(awayPts),
                "home": cells["home_team_name"].get_text(),
                "homeScore": int(homePts),
            }
            gameDetail["winTeam"], gameDetail["loseTeam"] = compareScore(
                gameDetail["away"], gameDetail["awayScore"],
                gameDetail["home"], gameDetail["homeScore"])
            currentGames.append(gameDetail)

    reg = pd.DataFrame(regularGames, columns=columns)
    post = pd.DataFrame(playoffGames, columns=columns)
    return (reg, post)

In [7]:
#Reference: https://github.com/sublee/trueskill#
import math
def cdf(x, mu=0, sigma=1):
    """Cumulative distribution function of a normal distribution.

    Evaluates ``0.5 * erfc(-(x - mu) / (sigma * sqrt(2)))`` using the
    standard library's ``math.erfc`` rather than a polynomial
    approximation, so the result is accurate to double precision.

    Parameters
    ----------
    x : float
        Point at which to evaluate the CDF.
    mu : float
        Mean of the distribution (default 0).
    sigma : float
        Standard deviation of the distribution (default 1); must be non-zero.

    Returns
    -------
    float
        Probability mass at or below ``x``.
    """
    return 0.5 * math.erfc(-(x - mu) / (sigma * math.sqrt(2)))


def pdf(x, mu=0, sigma=1):
    """Probability density function of a normal distribution.

    Computes ``exp(-((x - mu) / |sigma|)**2 / 2) / (|sigma| * sqrt(2*pi))``.

    Parameters
    ----------
    x : float
        Point at which to evaluate the density.
    mu : float
        Mean of the distribution (default 0).
    sigma : float
        Standard deviation (default 1); its absolute value is used and it
        must be non-zero.

    Returns
    -------
    float
        Density at ``x``.
    """
    scale = abs(sigma)
    # The normalising constant divides by sigma. Writing it as
    # ``1 / sqrt(2*pi) * sigma`` would multiply instead, which is only
    # correct when sigma == 1.
    return (1 / (math.sqrt(2 * math.pi) * scale) *
            math.exp(-(((x - mu) / scale) ** 2 / 2)))

def erfc(x):
    """Approximate the complementary error function.

    Polynomial-in-``t`` approximation (via `http://bit.ly/zOLqbc`_), where
    ``t = 1 / (1 + |x| / 2)``. Negative arguments are handled through the
    identity ``erfc(-x) = 2 - erfc(x)``.
    """
    magnitude = abs(x)
    t = 1. / (1. + magnitude / 2.)

    # Coefficients of t**1 .. t**9 inside the exponent, lowest order first.
    coefficients = (
        1.00002368, 0.37409196, 0.09678418, -0.18628806, 0.27886807,
        -1.13520398, 1.48851587, -0.82215223, 0.17087277,
    )
    # Horner evaluation from the highest-order coefficient inwards.
    nested = coefficients[-1]
    for coefficient in reversed(coefficients[:-1]):
        nested = coefficient + t * nested

    tail = t * math.exp(-magnitude * magnitude - 1.26551223 + t * nested)
    if x < 0:
        return 2. - tail
    return tail

In [170]:
def TrueSkill(data, TrueSkillTable = 0, beta = 0, eps = 0):
    """Apply sequential win/loss rating updates for every game in ``data``.

    Games are processed date by date, in the order the dates first appear
    in ``data.date``. For each game the winner's ``mu`` is raised, the
    loser's ``mu`` is lowered and both ``sigma`` values are shrunk, using
    the module-level ``pdf`` and ``cdf`` helpers.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``date``, ``away``, ``winTeam`` and
        ``loseTeam``. Each ``date`` string is parsed as
        ``date[5:]`` with the format ``'%b %d, %Y'``.
    TrueSkillTable : pandas.DataFrame, optional
        Existing ratings with columns ``team``, ``mu`` and ``sigma``. It is
        copied, so the caller's frame is left untouched. When anything
        other than a DataFrame is passed, a fresh table is created with
        ``mu = 1200`` and ``sigma = 400`` for every team in ``data``.
    beta : float, optional
        Used in ``c = sqrt(2*beta**2 + winSigma**2 + loseSigma**2)``.
        NOTE(review): when a fresh table is built, ``beta`` is overwritten
        with 200 regardless of the argument; when a table is supplied, the
        argument is used as given, so the default is 0. Confirm this
        asymmetry is intended.
    eps : float, optional
        Subtracted from the normalised mean difference before evaluating
        ``pdf``/``cdf`` (presumably the draw margin of the TrueSkill model).

    Returns
    -------
    (TrueSkillTable, overtimeTable) : tuple of pandas.DataFrame
        ``TrueSkillTable`` holds the final ratings sorted by ``mu``,
        highest first. ``overtimeTable`` has one row per parsed date
        (sorted ascending) and one column per team, holding the ``mu``
        values after that date's games.
    """
    if isinstance(TrueSkillTable, pd.DataFrame):
        # Work on a private copy: the .loc assignments below would otherwise
        # rewrite the caller's table, so re-running a cell would keep
        # compounding updates on the same frame.
        TrueSkillTable = TrueSkillTable.copy()
    else:
        # Start from the away teams (keeps the row order stable), then add
        # any team that only ever shows up as a winner or loser, so every
        # lookup in the loop below finds a row.
        teams = list(data.away.unique())
        seen = set(teams)
        for name in pd.concat([data.winTeam, data.loseTeam]).unique():
            if name not in seen:
                seen.add(name)
                teams.append(name)
        TrueSkillTable = pd.DataFrame(
            {"team": teams, "mu": 1200.0, "sigma": 1200.0/3},
            columns = ["team", "mu", "sigma"])
        beta = 1200.0/3/2

    # Snapshot of mu after each date, keyed by the parsed date.
    overtimeTable = {"team": TrueSkillTable.team}

    for date in data.date.unique():
        currentData = data[data.date == date]

        for winTeam, loseTeam in zip(currentData.winTeam, currentData.loseTeam):
            winRows = TrueSkillTable.team == winTeam
            loseRows = TrueSkillTable.team == loseTeam

            winMu = float(TrueSkillTable.loc[winRows, 'mu'].values[0])
            winSigma = float(TrueSkillTable.loc[winRows, 'sigma'].values[0])
            loseMu = float(TrueSkillTable.loc[loseRows, 'mu'].values[0])
            loseSigma = float(TrueSkillTable.loc[loseRows, 'sigma'].values[0])

            c = (2.0 * beta**2 + winSigma ** 2 + loseSigma ** 2)**(.5)
            t = (winMu - loseMu)/c
            v = pdf(t - eps)/cdf(t - eps)
            w = v * (v + t - eps)

            TrueSkillTable.loc[winRows, 'mu'] = winMu + (winSigma ** 2) / c * v
            TrueSkillTable.loc[loseRows, 'mu'] = loseMu - (loseSigma ** 2) / c * v
            TrueSkillTable.loc[winRows, 'sigma'] = (winSigma ** 2 * (1 - (winSigma ** 2) / (c ** 2) * w)) **(.5)
            TrueSkillTable.loc[loseRows, 'sigma'] = (loseSigma ** 2 * (1 - (loseSigma ** 2) / (c ** 2) * w)) **(.5)

        # Drop the first five characters of the date string before parsing.
        parsedDate = pd.to_datetime(date[5:], format = '%b %d, %Y')
        overtimeTable[parsedDate] = TrueSkillTable.mu.copy()

    overtimeTable = (pd.DataFrame(overtimeTable)
                     .set_index('team')
                     .transpose()
                     .sort_index())
    TrueSkillTable = TrueSkillTable.sort_values(by = 'mu', ascending = False)
    return (TrueSkillTable, overtimeTable)

In [175]:
# Download the schedule only when it is not already in the kernel, so the
# notebook runs from a fresh kernel without re-fetching on every re-run.
# NOTE(review): getUrl(2016) requests the NBA_2017 pages; the "2015" in
# these variable names looks stale - confirm.
try:
    reg2015, post2015
except NameError:
    reg2015, post2015 = collectAndParse(getUrl(2016))

ranking, timetable = TrueSkill(reg2015)

In [176]:
# Continue from the regular-season ratings through the playoff games.
rankingPost, timetablePost = TrueSkill(
    data=post2015,
    TrueSkillTable=ranking,
)

In [177]:
# Ratings returned by the playoff run, sorted by mu (highest first).
rankingPost

Unnamed: 0,team,mu,sigma
1,San Antonio Spurs,1331.928217,5.839811
5,Houston Rockets,1321.171684,5.847208
2,Utah Jazz,1311.859629,10.257497
15,Los Angeles Clippers,1300.705752,10.25954
23,Memphis Grizzlies,1293.464363,18.45364
10,Oklahoma City Thunder,1288.81632,20.403702
18,Golden State Warriors,1287.258235,6.188998
20,Cleveland Cavaliers,1280.779421,3.986429
14,Boston Celtics,1274.107477,3.446029
13,Washington Wizards,1271.356819,4.182257
