In [165]:
import requests
import numpy as np
import pandas as pd
from collections import Counter

In [166]:
# Who owns which team, keyed by team name. The keys have to match the names
# parsed from the predictions page (they were originally collected with
# set(df.team1).union(df.team2)).
# Arizona and Cleveland are unassigned and map to ''.
team_assignment = dict(sorted(
    (team, person)
    for person, teams in {
        '': ['Arizona', 'Cleveland'],
        'Aaron': ['Chicago', 'New England', 'San Francisco'],
        'Brad': ['Dallas', 'Miami', 'Philadelphia'],
        'Brian': ['Jacksonville', 'Pittsburgh', 'Tampa Bay'],
        'Colin': ['Detroit', 'Houston', 'Indianapolis'],
        'John': ['Baltimore', 'L.A. Rams', 'N.Y. Giants'],
        'Jon': ['Cincinnati', 'Green Bay', 'Seattle'],
        'Josh': ['Minnesota', 'N.Y. Jets', 'Washington'],
        'Matt': ['Atlanta', 'L.A. Chargers', 'Oakland'],
        'Micah': ['Denver', 'New Orleans', 'Tennessee'],
        'Ryan': ['Buffalo', 'Carolina', 'Kansas City'],
    }.items()
    for team in teams
))

In [167]:
# Alphabetical list of participants; '' (the unassigned marker) is not a person.
persons = sorted(set(team_assignment.values()) - {''})

In [168]:
# Download the FiveThirtyEight 2018 NFL game-predictions page.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of letting an error
# page be parsed into zero games further down.
r = requests.get("https://projects.fivethirtyeight.com/2018-nfl-predictions/games/", timeout=30)
r.raise_for_status()

raw = r.text

# Every game starts with a <table class="game-body"> tag; the text before the
# first such tag is dropped.
games = raw.split('<table class="game-body">')[1:]
len(games)

256

In [169]:
def team_split(game):
    """Cut one game's HTML into per-team chunks.

    Returns a list with the text that follows each '<td class="td text team'
    marker in `game`; whatever precedes the first marker is discarded, so the
    list is empty when the marker does not occur.
    """
    _, *team_chunks = game.split('<td class="td text team')
    return team_chunks

def team_name(team_raw):
    """Extract the team name from one chunk produced by team_split.

    The name is the text between the first '">' and the next '</td>' (or the
    next '">' if that comes first), with surrounding whitespace removed.
    Raises IndexError when `team_raw` contains no '">' at all.
    """
    # maxsplit=2 still yields the same second piece as an unlimited split
    after_open_tag = team_raw.split('">', 2)[1]
    name, _, _ = after_open_tag.partition('</td>')
    return name.strip()

def team_probability(team_raw):
    """Return the win probability encoded in one team chunk.

    A chunk containing 'loser">' yields 0 and one containing 'winner">' yields 1
    (loser is checked first). Otherwise the number printed right before
    '%</td><td class="td number score">' is parsed and divided by 100.
    Raises ValueError when that text is not a number.
    """
    for marker, outcome in (('loser">', 0), ('winner">', 1)):
        if marker in team_raw:
            return outcome

    # keep everything up to the percent sign, then take what follows the last '>'
    before_percent = team_raw.partition('%</td><td class="td number score">')[0]
    percent_text = before_percent.rpartition('>')[2]
    return float(percent_text) / 100

In [203]:
# Demo of the parsing helpers on a single game: first line prints both team
# names, second line prints both win probabilities.
game = games[100]
team1, team2 = team_split(game)
print(*map(team_name, (team1, team2)))
print(*map(team_probability, (team1, team2)))

Indianapolis Oakland
0.34 0.66


In [171]:
# Game-probabilities table: one row per game holding both team names, both
# win probabilities and the person each team is assigned to.
cols = ["team1", "team2", "prob1", "prob2", "person1", "person2"]

data = []
for game in games:
    team1, team2 = team_split(game)
    names = [team_name(side) for side in (team1, team2)]
    # raises KeyError if the page uses a team name missing from team_assignment
    owners = [team_assignment[name] for name in names]
    chances = [team_probability(side) for side in (team1, team2)]
    data.append(names + chances + owners)

df = pd.DataFrame(data, columns=cols)
df.head()

Unnamed: 0,team1,team2,prob1,prob2,person1,person2
0,Baltimore,Cincinnati,0.5,0.5,John,Jon
1,Carolina,Atlanta,0.38,0.62,Ryan,Matt
2,Cleveland,New Orleans,0.15,0.85,,Micah
3,Houston,Tennessee,0.3,0.7,Colin,Micah
4,Indianapolis,Washington,0.25,0.75,Colin,Josh


In [180]:
# Current wins and best possible final score for every person.
current = []
for p in persons:
    owns_team1 = df.person1.eq(p)
    owns_team2 = df.person2.eq(p)
    # team_probability gives exactly 1 for a side marked as the winner
    certain_win = (owns_team1 & df.prob1.eq(1)) | (owns_team2 & df.prob2.eq(1))
    # any non-zero probability: already won, or not lost yet
    possible_win = (owns_team1 & df.prob1.gt(0)) | (owns_team2 & df.prob2.gt(0))
    current.append([p, certain_win.sum(), possible_win.sum()])

df_current = pd.DataFrame(current, columns=['person', 'wins', 'max_possible']).set_index("person")
# leaders first: most wins, ties broken by the higher ceiling
df_formatted = df_current.sort_values(by=['wins', 'max_possible'], ascending=False)
df_formatted.to_csv("scores.txt", sep=" ", header=False)
df_formatted

Unnamed: 0_level_0,wins,max_possible
person,Unnamed: 1_level_1,Unnamed: 2_level_1
Josh,3,47
John,2,47
Ryan,2,47
Jon,2,46
Brad,2,45
Brian,2,45
Micah,1,46
Aaron,1,44
Colin,0,43
Matt,0,43


In [232]:
# Monte Carlo simulation of the season: every game is drawn n_sims times and
# the draw records the person who owns the winning team ('' for an unassigned
# team). The result has one row per simulation and one column per game.
n_sims = 10000
np.random.seed(2018)  # fixed seed so re-running the notebook reproduces the tables below

sims = []
for owner1, owner2, chance1, chance2 in zip(df.person1, df.person2, df.prob1, df.prob2):
    owners = [owner1, owner2]
    weights = np.array([chance1, chance2], dtype=float)
    total = weights.sum()
    if total > 0:
        # normalise so np.random.choice never rejects probabilities that do not sum to exactly 1
        game_sims = np.random.choice(owners, n_sims, p=weights / total)
    else:
        # No probability mass on either side: credit nobody. Without this branch
        # the previous game's draws would be appended a second time.
        game_sims = np.array([''] * n_sims)
    sims.append(game_sims)

sims = np.array(sims).T
sims.shape

(10000, 256)

In [233]:
# Wins per person in every simulation (rows: simulations, columns: persons).
# Counting matches directly always yields an integer, including 0 for a person
# who wins nothing in a simulation, so no NaN or missing column can reach the
# integer ranking step. The '' label of unassigned teams is not in `persons`
# and is therefore not counted.
df_wins = pd.DataFrame({person: (sims == person).sum(axis=1) for person in persons})
print(df_wins.shape)
df_wins[:5]

(10000, 10)


Unnamed: 0,Aaron,Brad,Brian,Colin,John,Jon,Josh,Matt,Micah,Ryan
0,22,28,30,19,25,24,25,23,23,27
1,26,31,30,12,22,27,21,24,27,22
2,28,24,29,15,24,26,30,23,20,29
3,23,25,31,19,25,22,30,21,29,22
4,21,29,27,19,25,24,28,23,25,27


In [234]:
# Placement of every person within each simulation (1 = most wins).
# method="min": tied persons share the best place and the following places are
# skipped, because ties take from the following placements. "dense" does not
# skip places and was used by mistake at first.
df_rank = (
    df_wins
    .rank(axis="columns", method="min", ascending=False)
    .astype(int)
)
df_rank.head()

Unnamed: 0,Aaron,Brad,Brian,Colin,John,Jon,Josh,Matt,Micah,Ryan
0,9,2,1,10,4,6,4,7,7,3
1,5,1,2,10,7,3,9,6,3,7
2,4,6,2,10,6,5,1,8,9,2
3,6,4,1,10,4,7,2,9,3,7
4,9,1,3,10,5,7,2,8,5,3


In [235]:
# Fraction of simulations in which each person places first, in the top two,
# and in the top three.
rank1 = df_rank.eq(1).sum(axis=0) / n_sims
rank2 = df_rank.le(2).sum(axis=0) / n_sims
rank3 = df_rank.le(3).sum(axis=0) / n_sims

In [236]:
# One table with the three placement probabilities per person; the column
# labels "1", "2", "3" are supplied as concat keys.
probs = pd.concat([rank1, rank2, rank3], axis=1, keys=["1", "2", "3"])
probs.sort_values(by="1", ascending=False).style.format("{:.1%}")

Unnamed: 0,1,2,3
Josh,26.2%,45.9%,61.4%
Brad,26.1%,46.4%,62.4%
Brian,26.0%,45.4%,61.1%
Ryan,22.4%,40.7%,56.4%
Aaron,6.0%,14.8%,25.6%
John,5.5%,13.2%,23.1%
Jon,5.1%,12.7%,22.8%
Matt,2.5%,7.3%,13.8%
Micah,2.4%,6.5%,12.2%
Colin,0.0%,0.1%,0.4%


In [211]:
# Same table as plain text percentages (one decimal, best first-place chance
# on top), written space-separated without a header row.
formatted = (
    (probs * 100)
    .sort_values(by="1", ascending=False)
    .round(1)
    .astype(float)
    .astype(str)
) + '%'
formatted.to_csv("probabilities.txt", sep=" ", header=False)

In [200]:
# Distribution of simulated win totals for one player: the share of
# simulations that end with each number of wins.
df_wins.groupby("Josh").size().div(n_sims).to_frame().style.format("{:.2%}")

Unnamed: 0_level_0,0
Josh,Unnamed: 1_level_1
16,0.03%
17,0.06%
18,0.09%
19,0.28%
20,0.71%
21,1.28%
22,2.33%
23,4.43%
24,6.64%
25,9.64%
