In [6]:
import requests
import numpy as np
import pandas as pd
from collections import Counter

In [7]:
# Maps each team name to the person it is assigned to.
# Team names were collected with set(df.team1).union(df.team2).
# An empty string marks a team with no owner (Arizona and Cleveland).
team_assignment = {
    'Arizona': '',
    'Atlanta': 'Matt',
    'Baltimore': 'John',
    'Buffalo': 'Ryan',
    'Carolina': 'Ryan',
    'Chicago': 'Aaron',
    'Cincinnati': 'Jon',
    'Cleveland': '',
    'Dallas': 'Brad',
    'Denver': 'Micah',
    'Detroit': 'Colin',
    'Green Bay': 'Jon',
    'Houston': 'Colin',
    'Indianapolis': 'Colin',
    'Jacksonville': 'Brian',
    'Kansas City': 'Ryan',
    'L.A. Chargers': 'Matt',
    'L.A. Rams': 'John',
    'Miami': 'Brad',
    'Minnesota': 'Josh',
    'N.Y. Giants': 'John',
    'N.Y. Jets': 'Josh',
    'New England': 'Aaron',
    'New Orleans': 'Micah',
    'Oakland': 'Matt',
    'Philadelphia': 'Brad',
    'Pittsburgh': 'Brian',
    'San Francisco': 'Aaron',
    'Seattle': 'Jon',
    'Tampa Bay': 'Brian',
    'Tennessee': 'Micah',
    'Washington': 'Josh',
}

In [8]:
# Distinct owners in alphabetical order; the '' placeholder used for
# unassigned teams is excluded.
persons = sorted({owner for owner in team_assignment.values() if owner != ''})
persons

['Aaron',
 'Brad',
 'Brian',
 'Colin',
 'John',
 'Jon',
 'Josh',
 'Matt',
 'Micah',
 'Ryan']

In [9]:
# get full html of page
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops here on an HTTP error instead of letting an
# error page be parsed as "zero games" further down.
r = requests.get(
    "https://projects.fivethirtyeight.com/2018-nfl-predictions/games/",
    timeout=30,
)
r.raise_for_status()

raw = r.text

# html for each game table
# first element is not relevant (it is the text before the first game table)
games = raw.split('<table class="game-body">')[1:]
len(games)

256

In [10]:
def team_split(game):
    """Return the raw html fragments that follow each team cell in a game table.

    The game html is cut at every team-cell opening tag; the text before the
    first team cell is dropped.
    """
    pieces = game.split('<td class="td text team')
    del pieces[0]
    return pieces

def team_name(team_raw):
    """Return the team name found in a raw team fragment.

    The name is the text between the first '">' and the following '</td>',
    with surrounding whitespace removed. Raises IndexError when the fragment
    contains no '">'.
    """
    # Only the piece between the first and second '">' is needed.
    after_tag = team_raw.split('">', 2)[1]
    name, _, _ = after_tag.partition('</td>')
    return name.strip()

def team_probability(team_raw):
    """Return the win probability for a raw team fragment.

    Completed games are detected through the 'loser' / 'winner' class names the
    page puts on the team cell: a loser yields 0 and a winner yields 1.
    Otherwise the percentage printed just before the score cell is parsed and
    returned as a fraction.
    """
    # 'loser' is checked before 'winner', so it wins if both markers appear.
    for marker, outcome in (('loser">', 0), ('winner">', 1)):
        if marker in team_raw:
            return outcome

    before_score = team_raw.split('%</td><td class="td number score">', 1)[0]
    percent_text = before_score.rsplit('>', 1)[-1]
    return float(percent_text) / 100

In [11]:
# function example usage: parse one game, then show both team names
# followed by both win probabilities
game = games[100]
team1, team2 = team_split(game)
print(*(team_name(side) for side in (team1, team2)))
print(*(team_probability(side) for side in (team1, team2)))

Indianapolis Oakland
0.34 0.66


In [12]:
# build game probabilities data frame
# One row per game: both team names, both win probabilities and both owners.
cols = ["team1", "team2", "prob1", "prob2", "person1", "person2"]
data = []
for game in games:
    team1, team2 = team_split(game)
    team1_name = team_name(team1)
    team2_name = team_name(team2)
    person1 = team_assignment[team1_name]
    person2 = team_assignment[team2_name]
    data.append([
        team1_name,
        team2_name,
        team_probability(team1),
        team_probability(team2),
        person1,
        person2,
    ])

df = pd.DataFrame(data=data, columns=cols)
df.head(5)

Unnamed: 0,team1,team2,prob1,prob2,person1,person2
0,Baltimore,Cincinnati,0.5,0.5,John,Jon
1,Carolina,Atlanta,0.38,0.62,Ryan,Matt
2,Cleveland,New Orleans,0.15,0.85,,Micah
3,Houston,Tennessee,0.3,0.7,Colin,Micah
4,Indianapolis,Washington,0.25,0.75,Colin,Josh


In [13]:
# current and best possible score by person
# A probability of exactly 1 marks a game already won (team_probability returns
# 1 for a 'winner' cell); any probability above 0 is a game that is won or can
# still be won.
current = []
for p in persons:
    # Ownership masks are computed once and reused for both tallies.
    owns_team1 = df.person1 == p
    owns_team2 = df.person2 == p
    certain_win = (owns_team1 & (df.prob1 == 1)) | (owns_team2 & (df.prob2 == 1))
    possible_win = (owns_team1 & (df.prob1 > 0)) | (owns_team2 & (df.prob2 > 0))
    current.append([p, certain_win.sum(), possible_win.sum()])

df_current = pd.DataFrame(current, columns=['person', 'wins', 'max_possible']).set_index("person")
# Leaders first: most wins, ties broken by the higher best-possible score.
df_formatted = df_current.sort_values(['wins', 'max_possible'], ascending=False)
df_formatted.to_csv("scores.txt", sep=" ", header=False)
df_formatted

Unnamed: 0_level_0,wins,max_possible
person,Unnamed: 1_level_1,Unnamed: 2_level_1
Josh,3,47
John,2,47
Ryan,2,47
Jon,2,46
Brad,2,45
Brian,2,45
Micah,1,46
Aaron,1,44
Colin,0,43
Matt,0,43


In [14]:
# simulate seasons
# Each game is drawn n_sims times; the draw is the person owning the winning
# team ('' when that team is unassigned). Draws are not seeded, so results
# vary slightly from run to run.
n_sims = 10000
sims = []
for index, g in df.iterrows():
    a = [g['person1'], g['person2']]
    p = np.array([g['prob1'], g['prob2']], dtype=float)
    total = p.sum()
    if total > 0:
        # Normalise so rounded percentages that do not add up to exactly 1
        # are still accepted by np.random.choice.
        game_sims = np.random.choice(a, n_sims, p=p / total)
    else:
        # Neither team has a win probability: credit nobody. Previously the
        # prior game's draws were appended again here (or a NameError was
        # raised on the first game).
        game_sims = np.full(n_sims, '')
    sims.append(game_sims)

# rows = simulated seasons, columns = games
sims = np.array(sims).T
sims.shape

(10000, 256)

In [15]:
# aggregate seasons into scores for each person
# reindex keeps exactly the `persons` columns (dropping the '' bucket of
# unassigned teams) and, unlike plain column selection, does not fail when a
# person never wins in any simulation. A person absent from a single season's
# Counter becomes 0 wins instead of NaN, so the later integer casts stay valid.
df_wins = (
    pd.DataFrame([Counter(s) for s in sims])
    .reindex(columns=persons)
    .fillna(0)
    .astype(int)
)
print(df_wins.shape)
df_wins[:5]

(10000, 10)


Unnamed: 0,Aaron,Brad,Brian,Colin,John,Jon,Josh,Matt,Micah,Ryan
0,29,26,28,20,22,26,24,21,20,30
1,29,31,30,14,22,23,26,19,26,27
2,29,25,27,20,25,21,28,31,13,25
3,19,29,27,17,29,26,34,25,18,24
4,17,31,22,14,29,25,32,22,23,34


In [16]:
# Rank the persons within each simulated season (1 = most wins).
# method="min" gives tied persons the best rank of their group and skips the
# ranks that follow (e.g. 1, 1, 3); "dense" was mistakenly used at first,
# which would not skip them.
df_rank = (
    df_wins
    .rank(axis="columns", method="min", ascending=False)
    .astype(int)
)
df_rank.head(5)

Unnamed: 0,Aaron,Brad,Brian,Colin,John,Jon,Josh,Matt,Micah,Ryan
0,2,4,3,9,7,4,6,8,9,1
1,3,1,2,10,8,7,5,9,5,4
2,2,5,4,9,5,8,3,1,10,5
3,8,2,4,10,2,5,1,6,9,7
4,9,3,7,10,4,5,2,7,6,1


In [17]:
# Share of simulated seasons in which each person finishes first, in the
# top two, and in the top three.
rank1 = df_rank.eq(1).sum(axis=0) / n_sims
rank2 = df_rank.le(2).sum(axis=0) / n_sims
rank3 = df_rank.le(3).sum(axis=0) / n_sims

probs = pd.DataFrame({"1": rank1, "2": rank2, "3": rank3})
probs.sort_values(by="1", ascending=False).style.format("{:.1%}")

Unnamed: 0,1,2,3
Josh,27.0%,46.8%,62.2%
Brian,25.9%,45.6%,61.4%
Brad,25.7%,45.2%,61.4%
Ryan,21.5%,39.8%,55.9%
Aaron,6.2%,15.6%,26.8%
John,5.3%,13.2%,22.9%
Jon,5.2%,13.1%,22.6%
Matt,2.6%,7.3%,14.3%
Micah,2.1%,6.0%,11.8%
Colin,0.0%,0.2%,0.3%


In [211]:
# Export the placement probabilities as percent strings with one decimal,
# most likely winner first, space-separated and without a header row.
formatted = (
    probs.mul(100)
    .sort_values(by="1", ascending=False)
    .round(1)
    .astype(float)
    .astype(str)
) + '%'
formatted.to_csv("probabilities.txt", sep=" ", header=False)

In [200]:
# score distribution for individual players
# Fraction of simulated seasons that end with each win total for Josh.
josh_share = df_wins.groupby("Josh").size() / n_sims
pd.DataFrame(josh_share).style.format("{:.2%}")

Unnamed: 0_level_0,0
Josh,Unnamed: 1_level_1
16,0.03%
17,0.06%
18,0.09%
19,0.28%
20,0.71%
21,1.28%
22,2.33%
23,4.43%
24,6.64%
25,9.64%
