### A Hierarchical Bayesian Model of the Premier League
Based on the blog post at http://danielweitzenfeld.github.io/passtheroc/blog/2014/10/28/bayes-premier-league/

In [1]:
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline
# Ask the inline backend for 'retina' (high-resolution) figure output.
%config InlineBackend.figure_format = 'retina'

In [2]:
import os
import math
import warnings
warnings.filterwarnings('ignore')

from IPython.display import Image, HTML
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pymc3 as pm

In [3]:
# Project folders, resolved against the current working directory
# (i.e. wherever the kernel was started from).
DATA_DIR, CHART_DIR = (
    os.path.join(os.getcwd(), folder) for folder in ('data/', 'charts/')
)

In [4]:
# Echo the resolved data directory as a quick sanity check.
DATA_DIR

'/Users/rubenrodriguez/Documents/anaconda/premiereLeague/data/'

In [5]:
# Load the 2013-14 results grid: one row per home team, one column per away
# team, each cell holding a "home–away" score string.
# The path is built from DATA_DIR so the data location is configured in
# exactly one place instead of being repeated as a literal here.
data_file = os.path.join(DATA_DIR, 'premiere_league_13_14.csv')
df = pd.read_csv(data_file, index_col=0)
df.head()

Unnamed: 0_level_0,ARS,AVL,CAR,CHE,CRY,EVE,FUL,HUL,LIV,MCI,MUN,NEW,NOR,SOU,STK,SUN,SWA,TOT,WBA,WHU
Home \ Away,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Arsenal,—,1–3,2–0,0–0,2–0,1–1,2–0,2–0,2–0,1–1,0–0,3–0,4–1,2–0,3–1,4–1,2–2,1–0,1–0,3–1
Aston Villa,1–2,—,2–0,1–0,0–1,0–2,1–2,3–1,0–1,3–2,0–3,1–2,4–1,0–0,1–4,0–0,1–1,0–2,4–3,0–2
Cardiff City,0–3,0–0,—,1–2,0–3,0–0,3–1,0–4,3–6,3–2,2–2,1–2,2–1,0–3,1–1,2–2,1–0,0–1,1–0,0–2
Chelsea,6–0,2–1,4–1,—,2–1,1–0,2–0,2–0,2–1,2–1,3–1,3–0,0–0,3–1,3–0,1–2,1–0,4–0,2–2,0–0
Crystal Palace,0–2,1–0,2–0,1–0,—,0–0,1–4,1–0,3–3,0–2,0–2,0–3,1–1,0–1,1–0,3–1,0–2,0–1,3–1,1–0


In [6]:
# List the column labels of the results grid (the away-team codes).
print(df.columns)

Index(['ARS', 'AVL', 'CAR', 'CHE', 'CRY', 'EVE', 'FUL', 'HUL', 'LIV', 'MCI',
       'MUN', 'NEW', 'NOR', 'SOU', 'STK', 'SUN', 'SWA', 'TOT', 'WBA', 'WHU'],
      dtype='object')


In [7]:
# Re-label the rows with the same three-letter codes used for the columns,
# so home and away teams share a single naming scheme.
df.index = df.columns

# Flatten the home-by-away grid into one record per fixture.
rows = []
for home in df.index:
    for away in df.columns:
        if home == away:
            continue  # diagonal cell: a team does not play itself
        # Scores are written "H–A", separated by an en dash (U+2013),
        # not an ASCII hyphen.
        goals = df.loc[home, away].split('–')
        # Store the goal counts as integers rather than the raw text, so
        # that means, logs and count likelihoods computed from these
        # columns operate on numbers instead of strings.
        rows.append([home, away, int(goals[0]), int(goals[1])])
df = pd.DataFrame(rows, columns=['home', 'away', 'home_score', 'away_score'])
df.head()

Unnamed: 0,home,away,home_score,away_score
0,ARS,AVL,1,3
1,ARS,CAR,2,0
2,ARS,CHE,0,0
3,ARS,CRY,2,0
4,ARS,EVE,1,1


In [8]:
# Lookup table of clubs: one row per distinct home team, in order of first
# appearance, with column `i` holding that row's integer position.
teams = (
    pd.DataFrame({'team': df['home'].unique()})
    .assign(i=lambda frame: frame.index)
)
teams.head()

Unnamed: 0,team,i
0,ARS,0
1,AVL,1
2,CAR,2
3,CHE,3
4,CRY,4


In [9]:
# Attach each side's integer team id to every fixture. Both joins are left
# joins on the team code, so all fixture rows are kept.
# The helper `team` column is dropped with the explicit `columns=` keyword:
# passing the axis positionally (`drop('team', 1)`) is deprecated and no
# longer accepted by pandas 2.x.
df = pd.merge(df, teams, left_on='home', right_on='team', how='left')
df = df.rename(columns={'i': 'i_home'}).drop(columns='team')
df = pd.merge(df, teams, left_on='away', right_on='team', how='left')
df = df.rename(columns={'i': 'i_away'}).drop(columns='team')
df.head()

Unnamed: 0,home,away,home_score,away_score,i_home,i_away
0,ARS,AVL,1,3,0,1
1,ARS,CAR,2,0,0,2
2,ARS,CHE,0,0,0,3
3,ARS,CRY,2,0,0,4
4,ARS,EVE,1,1,0,5


In [10]:
# Pull the per-fixture columns out of the table as plain arrays.
home_team = df['i_home'].values
away_team = df['i_away'].values
observed_home_goals = df['home_score'].values
observed_away_goals = df['away_score'].values

# Problem dimensions: number of fixtures and number of distinct home-team ids.
num_games = len(home_team)
num_teams = len(df['i_home'].unique())

In [11]:
# Starting values derived from the observed away goals.
# NOTE(review): both lines need `away_score` to be numeric, and a group
# whose mean is 0 would produce -inf / +inf from the log — confirm upstream.

# Log of the mean goals each team scored in its away fixtures.
g = df.groupby('i_away')
att_starting_points = np.log(g.away_score.mean())
# Negated log of the mean goals each team conceded to visitors at home.
g = df.groupby('i_home')
def_starting_points = -np.log(g.away_score.mean())