In [2]:
# Mean goals per match for each side, selected by column name: this keeps the team-name
# columns out of the mean and avoids positional indexing into a label-indexed Series.
goal_means = epl[['HomeGoals', 'AwayGoals']].mean()

# construct the Poisson pmf (0..7 goals) for each mean goals value; column 0 = home, column 1 = away
poisson_pred = np.column_stack([[poisson.pmf(i, goal_means[side]) for i in range(8)]
                                for side in ('HomeGoals', 'AwayGoals')])

# plot histogram of actual goals
# `density=True` replaces the `normed=True` keyword, which current matplotlib no longer accepts
plt.hist(epl[['HomeGoals', 'AwayGoals']].values, range(9),
         alpha=0.7, label=['Home', 'Away'], density=True, color=["#FFA07A", "#20B2AA"])

# add lines for the Poisson distributions, drawn at the centre of each unit-width bin
bin_centres = [i - 0.5 for i in range(1, 9)]
pois1, = plt.plot(bin_centres, poisson_pred[:, 0],
                  linestyle='-', marker='o', label="Home", color='#CD5C5C')
pois2, = plt.plot(bin_centres, poisson_pred[:, 1],
                  linestyle='-', marker='o', label="Away", color='#006400')

leg = plt.legend(loc='upper right', fontsize=13, ncol=2)
leg.set_title("Poisson           Actual        ", prop={'size': '14', 'weight': 'bold'})

# one label per tick position (8 bin centres -> goal counts 0..7); the original passed 9 labels
plt.xticks(bin_centres, [i for i in range(8)])
plt.xlabel("Goals per Match", size=13)
plt.ylabel("Proportion of Matches", size=13)
plt.title("Number of Goals per Match (EPL 2019/20 Season)", size=14, fontweight='bold')
plt.ylim([-0.004, 0.4])
plt.tight_layout()
plt.show()

In [3]:
# League-wide mean goals per match, looked up by column name rather than by position in
# `epl.mean()` (which also tries to average the team-name columns).
home_mean = epl['HomeGoals'].mean()
away_mean = epl['AwayGoals'].mean()

# probability of draw between home and away team: Skellam pmf at a goal difference of 0
draw_odds = skellam.pmf(0.0, home_mean, away_mean)
print("probability of draw between home and away team for the entire league", draw_odds)


# probability of home team winning by one goal: Skellam pmf at a goal difference of +1
win_by_1 = skellam.pmf(1, home_mean, away_mean)
print("probability of home team winning by one goal", win_by_1)


In [4]:
# Skellam pmf of (home goals - away goals) for differences -6..7, parameterised by the
# league-wide mean home and away goals (selected by column name, not by position).
goal_diffs = list(range(-6, 8))
home_mean = epl['HomeGoals'].mean()
away_mean = epl['AwayGoals'].mean()
skellam_pred = [skellam.pmf(i, home_mean, away_mean) for i in goal_diffs]

# Observed goal differences. Bin edges run -6..8 so that every plotted difference (-6..7)
# has its own unit-width bin; the original edges stopped at 7 and left the last point without a bar.
# `density=True` replaces the `normed=True` keyword, which current matplotlib no longer accepts.
plt.hist(epl[['HomeGoals']].values - epl[['AwayGoals']].values, range(-6, 9),
         alpha=0.7, label='Actual', density=True)

# Skellam points are drawn at the centre of each bin
bin_centres = [i + 0.5 for i in goal_diffs]
plt.plot(bin_centres, skellam_pred,
         linestyle='-', marker='o', label="Skellam", color='#CD5C5C')
plt.legend(loc='upper right', fontsize=13)
plt.xticks(bin_centres, goal_diffs)
plt.xlabel("Home Goals - Away Goals", size=13)
plt.ylabel("Proportion of Matches", size=13)
plt.title("Difference in Goals Scored (Home Team vs Away Team)", size=14, fontweight='bold')
plt.ylim([-0.004, 0.26])
plt.tight_layout()
plt.show()

In [5]:
# Poisson fits for two teams, faceted into goals scored at home (top panel) and
# goals scored away (bottom panel).
fig, (ax1, ax2) = plt.subplots(2, 1)

team1_name, team2_name = 'Chelsea', 'Liverpool'
goal_values = list(range(8))  # goal counts 0..7 at which every curve is evaluated


def _goal_share(team, team_col, goals_col):
    """Return a one-column frame giving, for each `goals_col` value, the share of the
    rows of `epl` where `team_col` equals `team`."""
    goals = epl.loc[epl[team_col] == team, goals_col]
    # Series.value_counts replaces the deprecated top-level pd.value_counts
    return goals.value_counts(normalize=True).to_frame(goals_col)


def _poisson_fit(goal_share):
    """Poisson pmf over `goal_values`, with the rate set to the mean of the observed shares."""
    shares = goal_share.iloc[:, 0].values
    mean_goals = float(np.dot(goal_share.index.values, shares))  # sum(goals * share) = mean goals
    return [poisson.pmf(i, mean_goals) for i in goal_values]


team1_home = _goal_share(team1_name, 'HomeTeam', 'HomeGoals')
team1_home_pois = _poisson_fit(team1_home)
team2_home = _goal_share(team2_name, 'HomeTeam', 'HomeGoals')
team2_home_pois = _poisson_fit(team2_home)

team1_away = _goal_share(team1_name, 'AwayTeam', 'AwayGoals')
team1_away_pois = _poisson_fit(team1_away)
team2_away = _goal_share(team2_name, 'AwayTeam', 'AwayGoals')
team2_away_pois = _poisson_fit(team2_away)

# Bars for the observed proportions are currently disabled; re-enable per panel if wanted:
#ax1.bar(team1_home.index-0.4,team1_home.iloc[:,0],width=0.4,color="#034694",label=team1_name)
#ax1.bar(team2_home.index,team2_home.iloc[:,0],width=0.4,color="#EB172B",label=team2_name)
#ax2.bar(team1_away.index-0.4,team1_away.iloc[:,0],width=0.4,color="#034694",label=team1_name)
#ax2.bar(team2_away.index,team2_away.iloc[:,0],width=0.4,color="#EB172B",label=team2_name)

# Each panel compares the two teams, so the lines are labelled with the team names
# (they were previously labelled "Home Team"/"Away Team", which is what the panels encode).
for ax, team1_pois, team2_pois in ((ax1, team1_home_pois, team2_home_pois),
                                   (ax2, team1_away_pois, team2_away_pois)):
    ax.plot(goal_values, team1_pois,
            linestyle='-', marker='o', label=team1_name, color="#0a7bff")
    ax.plot(goal_values, team2_pois,
            linestyle='-', marker='o', label=team2_name, color="#ff7c89")
    ax.set_xlim([-0.5, 7.5])
    ax.set_ylim([-0.01, 0.65])

# Only Poisson curves are drawn, so the legend title no longer advertises an "Actual" column.
leg = ax1.legend(loc='upper right', fontsize=12, ncol=2)
leg.set_title("Poisson", prop={'size': '14', 'weight': 'bold'})
ax1.set_xticklabels([])  # the x axis is shared visually; only the bottom panel shows tick labels

# mimicking the facet plots in ggplot2 with a bit of a hack: a rotated text box beside each panel
ax1.text(7.65, 0.585, '                Home                ', rotation=-90,
         bbox={'facecolor': '#ffbcf6', 'alpha': 0.5, 'pad': 5})
ax2.text(7.65, 0.585, '                Away                ', rotation=-90,
         bbox={'facecolor': '#ffbcf6', 'alpha': 0.5, 'pad': 5})

ax1.set_title("Number of Goals per Match (EPL 2019/20 Season)", size=14, fontweight='bold')
ax2.set_xlabel("Goals per Match", size=13)
# single y label for both panels, placed by hand in ax2 data coordinates
ax2.text(-1.15, 0.9, 'Proportion of Matches', rotation=90, size=13)
plt.tight_layout()
plt.show()

In [6]:
# importing the tools required for the Poisson regression model
import statsmodels.api as sm
import statsmodels.formula.api as smf
# Counters used by the odds-scraping cell further down, which increments them while walking
# the HTML table; they are initialised here because that cell expects them to exist.
n_columns = 0
n_rows = 0

# Long-format training data: every match contributes two rows, one from each side's point
# of view, with `home` flagging whether `team` played at home.
home_rows = epl[['HomeTeam', 'AwayTeam', 'HomeGoals']].assign(home=1).rename(
    columns={'HomeTeam': 'team', 'AwayTeam': 'opponent', 'HomeGoals': 'goals'})
away_rows = epl[['AwayTeam', 'HomeTeam', 'AwayGoals']].assign(home=0).rename(
    columns={'AwayTeam': 'team', 'HomeTeam': 'opponent', 'AwayGoals': 'goals'})
goal_model_data = pd.concat([home_rows, away_rows])

# Poisson GLM: goals explained by home advantage, the scoring team and the opponent.
poisson_model = smf.glm(formula="goals ~ home + team + opponent", data=goal_model_data,
                        family=sm.families.Poisson()).fit()
# (a stray `pd.DataFrame(...)` expression whose result was discarded has been removed here)

poisson_model.summary()

0,1,2,3
Dep. Variable:,goals,No. Observations:,318.0
Model:,GLM,Df Residuals:,278.0
Model Family:,Poisson,Df Model:,39.0
Link Function:,log,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-438.57
Date:,"Mon, 09 Dec 2019",Deviance:,275.55
Time:,19:18:46,Pearson chi2:,226.0
No. Iterations:,8,,

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
Intercept,0.3761,0.313,1.200,0.230,-0.238 0.990
team[T.Aston Villa],0.0442,0.305,0.145,0.885,-0.554 0.642
team[T.Bournemouth],-0.2204,0.324,-0.680,0.496,-0.855 0.414
team[T.Brighton],-0.1723,0.315,-0.547,0.584,-0.790 0.445
team[T.Burnley],-0.0927,0.312,-0.298,0.766,-0.704 0.518
team[T.Chelsea],0.3280,0.285,1.151,0.250,-0.230 0.886
team[T.Crystal Palace],-0.4831,0.348,-1.389,0.165,-1.165 0.199
team[T.Everton],-0.1897,0.318,-0.596,0.551,-0.813 0.434
team[T.Leicester],0.5352,0.273,1.963,0.050,0.001 1.069


In [7]:
import urllib.request
from bs4 import BeautifulSoup
# Scrape the Premier League fixtures and their odds from oddschecker into `df`.
# NOTE(review): needs network access, and the parsing assumes the first <table> on the page
# holds one fixture per <tr> with five <p> cells (home, away, home/draw/away odds) — confirm
# against the live page, since the site layout can change.
user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'

url = "https://www.oddschecker.com/football/english/premier-league"
headers = {'User-Agent': user_agent}

request = urllib.request.Request(url, None, headers)  # the assembled request
# The context manager closes the HTTP connection even if reading the body fails.
with urllib.request.urlopen(request) as response:
    data = response.read()  # raw HTML of the odds page
soup = BeautifulSoup(data, "lxml")
tables = soup.find_all('table')
if not tables:
    raise ValueError("no <table> element found at " + url + "; the page layout may have changed")
table = tables[0]

header = ['hometeam', 'awayteam', 'homeodds', 'drawodds', 'awayodds', 'id']
n_text_columns = len(header) - 1  # every column except the generated 'id'

# Reset the counters here so that re-running this cell does not keep growing them.
n_rows = 0      # number of table rows that carry data
n_columns = 0   # number of <p> cells in the first data row
records = []
for row in table.find_all('tr'):
    cells = [p.get_text() for p in row.find_all('p')]
    if not cells:
        continue  # rows without <p> cells carry no fixture data
    if n_columns == 0:
        n_columns = len(cells)
    # Pad short rows with NaN and ignore surplus cells, so the running id always lands in the
    # 'id' column (previously it was written to whichever column followed the last cell, and
    # a row with more than five cells raised IndexError).
    cells = (cells + [np.nan] * n_text_columns)[:n_text_columns]
    records.append(cells + [n_rows])
    n_rows += 1

df = pd.DataFrame(records, columns=header)


In [8]:
# Convert the scraped fixtures in `df` into implied probabilities, with team names mapped
# to the spellings used in `epl`.
hometeam = []
awayteam = []
homeodds = []
drawodds = []
awayodds = []
match_ids = []  # named so that the builtin id() is not shadowed for the rest of the notebook
# sorted so that the prefix matching below gives the same answer on every run
all_names = sorted(set(epl["HomeTeam"].dropna()))


def _clean_team_name(scraped_name):
    """Return the first `epl` team name sharing its first five characters with `scraped_name`.

    Falls back to the scraped name itself when nothing matches, so exactly one name is produced
    per fixture and the team lists stay aligned with the odds lists. (The previous loop zipped
    the candidate names with `df['hometeam']`, which cut the candidates down to the number of
    fixtures, and appended zero or several names per fixture.)
    """
    for cleanname in all_names:
        if scraped_name[:5] == cleanname[:5]:
            return cleanname
    return scraped_name


def _implied_probability(fractional_odds):
    """Convert fractional odds text such as "5/2" (or a plain number) to 1 / (odds + 1).

    The text is parsed arithmetically instead of being passed to eval(), because it comes
    from a scraped web page and must not be executed as code.
    """
    numerator, _, denominator = str(fractional_odds).partition('/')
    odds = float(numerator) / float(denominator) if denominator else float(numerator)
    return 1 / (odds + 1)


for (hometeam_v, awayteam_v, home_fraction, draw_fraction, away_fraction, ids) in zip(
        df['hometeam'], df['awayteam'], df['homeodds'], df['drawodds'], df['awayodds'], df['id']):
    hometeam.append(_clean_team_name(hometeam_v))
    awayteam.append(_clean_team_name(awayteam_v))
    homeodds.append(_implied_probability(home_fraction))
    drawodds.append(_implied_probability(draw_fraction))
    awayodds.append(_implied_probability(away_fraction))
    match_ids.append(ids)


percentagedf = pd.DataFrame(list(zip(hometeam, awayteam, homeodds, drawodds, awayodds, match_ids)),
                            columns=['hometeam', 'awayteam', 'homeodds', 'drawodds', 'awayodds', 'id'])

In [9]:
# Display percentagedf: one row per fixture with the team names and the 1/(odds+1) values
# computed from the scraped home/draw/away odds.
percentagedf

Unnamed: 0,hometeam,awayteam,homeodds,drawodds,awayodds,id
0,West Ham,Arsenal,0.274194,0.230769,0.491525,0
1,Liverpool,Watford,0.818182,0.12987,0.058824,1
2,Burnley,Newcastle,0.47619,0.277778,0.255319,2
3,Chelsea,Bournemouth,0.75,0.152778,0.083333,3
4,Leicester,Norwich,0.8,0.140845,0.066667,4
5,Sheffield United,Aston Villa,0.5,0.260274,0.232558,5
6,Southampton,West Ham,0.512821,0.246753,0.25641,6
7,Man United,Everton,0.551724,0.263158,0.197917,7
8,Wolves,Tottenham,0.294118,0.275,0.434783,8
9,Arsenal,Man City,0.16,0.181818,0.666667,9


In [10]:
def simulate_match(foot_model, homeTeam, awayTeam, max_goals=10):
    """Return the score-line probability matrix for a single fixture.

    Parameters
    ----------
    foot_model : object with a ``predict(DataFrame)`` method
        Model that takes a one-row frame with columns ``team``, ``opponent`` and ``home``
        and returns the expected number of goals for ``team``.
    homeTeam, awayTeam : str
        Team names as the model knows them.
    max_goals : int, default 10
        Largest goal count considered for either side.

    Returns
    -------
    numpy.ndarray of shape (max_goals + 1, max_goals + 1)
        Entry ``[i, j]`` is the product of the Poisson probabilities of the home side scoring
        ``i`` and the away side scoring ``j`` (the two goal counts are treated as independent).
    """
    def _expected_goals(team, opponent, home):
        fixture = pd.DataFrame(data={'team': team, 'opponent': opponent, 'home': home}, index=[1])
        # predict() may return an ndarray or a Series labelled by the frame's index (1 here);
        # going through np.asarray takes the first value by position in both cases, whereas
        # `[0]` on such a Series is a label lookup and raises KeyError.
        return np.asarray(foot_model.predict(fixture)).ravel()[0]

    home_goals_avg = _expected_goals(homeTeam, awayTeam, 1)
    away_goals_avg = _expected_goals(awayTeam, homeTeam, 0)

    goal_counts = np.arange(max_goals + 1)
    home_pmf = poisson.pmf(goal_counts, home_goals_avg)
    away_pmf = poisson.pmf(goal_counts, away_goals_avg)
    return np.outer(home_pmf, away_pmf)

# Model probabilities of home win / draw / away win for every fixture in `percentagedf`,
# joined back onto the odds-derived probabilities.
hometeam = []
awayteam = []
home_prob = []
draw_prob = []
away_prob = []
idprob = []

# Columns are read by name, and each fixture keeps the id it already has in `percentagedf`,
# so the merge below pairs every model row with the odds row of the same fixture.
for (fixture_id, ht, at) in zip(percentagedf['id'], percentagedf['hometeam'], percentagedf['awayteam']):
    # str.replace leaves names without "Utd" unchanged, so no membership test is needed
    ht = ht.replace("Utd", "United")
    at = at.replace("Utd", "United")
    odds_matrix = simulate_match(poisson_model, ht, at, max_goals=10)
    # rows index home goals and columns index away goals, so entries below the diagonal are
    # home wins, the diagonal is draws and entries above the diagonal are away wins
    home_probability = np.sum(np.tril(odds_matrix, -1))
    draw_probability = np.sum(np.diag(odds_matrix))
    away_probability = np.sum(np.triu(odds_matrix, 1))
    hometeam.append(ht)
    awayteam.append(at)
    home_prob.append(home_probability)
    draw_prob.append(draw_probability)
    away_prob.append(away_probability)
    idprob.append(fixture_id)

percentile_list = pd.DataFrame(
    {'id': idprob,
     'hometeam': hometeam,
     'awayteam': awayteam,
     'home_prob': home_prob,
     'draw_prob': draw_prob,
     'away_prob': away_prob
    })

# both frames carry hometeam/awayteam, so the merge suffixes them with _x (model) and _y (odds)
Total_Odds = percentile_list.merge(percentagedf, on='id')

In [11]:
# Display the merged table: model probabilities (home_prob / draw_prob / away_prob) next to
# the odds-derived values (homeodds / drawodds / awayodds) for each fixture id.
Total_Odds

Unnamed: 0,away_prob,awayteam_x,draw_prob,home_prob,hometeam_x,id,hometeam_y,awayteam_y,homeodds,drawodds,awayodds
0,0.439058,Arsenal,0.243287,0.317654,West Ham,0,West Ham,Arsenal,0.274194,0.230769,0.491525
1,0.013459,Watford,0.055616,0.930021,Liverpool,1,Liverpool,Watford,0.818182,0.12987,0.058824
2,0.341233,Newcastle,0.271564,0.387202,Burnley,2,Burnley,Newcastle,0.47619,0.277778,0.255319
3,0.18756,Bournemouth,0.200152,0.612271,Chelsea,3,Chelsea,Bournemouth,0.75,0.152778,0.083333
4,0.013801,Norwich,0.044286,0.939161,Leicester,4,Leicester,Norwich,0.8,0.140845,0.066667
5,0.260634,Aston Villa,0.26888,0.470485,Sheffield United,5,Sheffield United,Aston Villa,0.5,0.260274,0.232558
6,0.375033,West Ham,0.242824,0.382142,Southampton,6,Southampton,West Ham,0.512821,0.246753,0.25641
7,0.149049,Everton,0.204035,0.646906,Man United,7,Man United,Everton,0.551724,0.263158,0.197917
8,0.376974,Tottenham,0.242406,0.380618,Wolves,8,Wolves,Tottenham,0.294118,0.275,0.434783
9,0.653825,Man City,0.170861,0.175206,Arsenal,9,Arsenal,Man City,0.16,0.181818,0.666667
