Premier League Table Ranking based on Google's Page Ranking Algorithm

Hi there, here I simply investigate whether or not the PageRank algorithm famously used by Google could give any insight into predicted the Premier League 

In [None]:
import pandas as pd
import numpy as np
import copy
import requests
import io
import datetime

In [2]:
#Functions
def smoothing(table):
    outTable = pd.DataFrame(index=table.index,columns=table.columns)
    for i in table.index:
        for j in table.columns:
            outTable[i][j] = np.where(table[i][j]==0,0.5,table[i][j]/(table[i][j]+table[j][i]))
    return outTable
    

def generateTable(data,selectStr):
    crossTable = pd.crosstab(data['HomeTeam'],data['AwayTeam']) #Cross tabulation of home and away teams
    outputTable = pd.DataFrame(index=crossTable.index,columns=crossTable.columns) #Copy of crosstab table but to be outputted
    for team_i in crossTable.index:
        outTeamI = data[data['HomeTeam']==team_i]
        for team_j in crossTable.columns:
            if team_i != team_j and crossTable[team_j][team_i] != 0:
                goals = outTeamI[outTeamI['AwayTeam']==team_j][selectStr].values[0]
                outputTable[team_j][team_i] = np.where(goals != 0, goals, 0.0)
    outputTable.fillna(0, inplace=True)  #Finally, get rid of NaN's by replacing with 0 (required for next stage)  
    return outputTable
    
    
def generateGoalsTable(data,rankby='xG'):
    goalsTable = pd.crosstab(data['HomeTeam'],data['AwayTeam'])
    goalsTable_df = pd.DataFrame(index=goalsTable.index,columns=goalsTable.columns)
    
    if rankby == "xG" or rankby == "xGA":
        homeTable = generateTable(data, "HomexG")
        awayTable = generateTable(data, "AwayxG")
    elif rankby == "goals":
        homeTable = generateTable(data, "score1")
        awayTable = generateTable(data, "score2")
    
    # Full-time goals table
    for team_i in goalsTable.index:  #HomeTeams
        for team_j in goalsTable.columns:  #AwayTeams
            if team_i != team_j:
                xG = homeTable[team_j][team_i] + awayTable[team_i][team_j]
                xGA = homeTable[team_i][team_j] + awayTable[team_j][team_i]
                goals = homeTable[team_j][team_i] + awayTable[team_i][team_j]
                
                if rankby == 'xG':
                    goalsTable_df[team_j][team_i] = xG
                elif rankby == 'xGA':
                    goalsTable_df[team_j][team_i] = xGA
                elif rankby == 'goals':
                    goalsTable_df[team_j][team_i] = xG

                    
    #return smoothing(goalsTable_df.fillna(0))
    return goalsTable_df.fillna(0)
                
#generateTable(data,"HomexG")
#generateGoalsTable(data)

In [3]:
#Import data from FiveThirtyEight
url = "https://projects.fivethirtyeight.com/soccer-api/club/spi_matches.csv"
s=requests.get(url).content
df = pd.read_csv(io.StringIO(s.decode('utf-8')))
df.head()  #Show the top five results to get a feel for the data

Unnamed: 0,date,league_id,league,team1,team2,spi1,spi2,prob1,prob2,probtie,...,importance1,importance2,score1,score2,xg1,xg2,nsxg1,nsxg2,adj_score1,adj_score2
0,2016-08-12,1843,French Ligue 1,Bastia,Paris Saint-Germain,51.16,85.68,0.0463,0.838,0.1157,...,32.4,67.7,0.0,1.0,0.97,0.63,0.43,0.45,0.0,1.05
1,2016-08-12,1843,French Ligue 1,AS Monaco,Guingamp,68.85,56.48,0.5714,0.1669,0.2617,...,53.7,22.9,2.0,2.0,2.45,0.77,1.75,0.42,2.1,2.1
2,2016-08-13,2411,Barclays Premier League,Hull City,Leicester City,53.57,66.81,0.3459,0.3621,0.2921,...,38.1,22.2,2.0,1.0,0.85,2.77,0.17,1.25,2.1,1.05
3,2016-08-13,2411,Barclays Premier League,Everton,Tottenham Hotspur,68.02,73.25,0.391,0.3401,0.2689,...,31.9,48.0,1.0,1.0,0.73,1.11,0.88,1.81,1.05,1.05
4,2016-08-13,2411,Barclays Premier League,Burnley,Swansea City,58.98,59.74,0.4482,0.2663,0.2854,...,36.5,29.1,0.0,1.0,1.24,1.84,1.71,1.56,0.0,1.05


In [13]:
##### Select only the subset of the data you want #####
league = "Barclays Premier League"
#start_date = "2018-08-10"
#end_date = datetime.datetime.today().strftime('%Y-%m-%d')
#start_date = "2018-08-09"
start_data = "2018-01-01"
end_date = "2019-05-13"

data = df[(df['league']==league)&(df['date']>=start_date)&(df['date']<=end_date)]

#The columns that are most important for us are the 'team1','team2','xg1' and 'xg2' columns
#Sometimes FiveThirtyEight include games that havent been played yet and places 'NaN' instead, which could mess
#up our PageRank! Therefore, we have to make sure these columns aren't null.
data = data[(pd.notnull(data['team1']))&(pd.notnull(data['team2']))&(pd.notnull(data['xg1']))&(pd.notnull(data['xg2']))]

data.replace("AFC Bournemouth","Bournemouth",inplace=True)
#Rename these columns to something that makes more sense than what FiveThirtyEight have used
data.rename(columns={'team1': 'HomeTeam', 'team2': 'AwayTeam','xg1': 'HomexG', 'xg2': 'AwayxG'}, inplace=True)

data.head() #Show the top five results to get a feel for the data subset

Unnamed: 0,date,league_id,league,HomeTeam,AwayTeam,spi1,spi2,prob1,prob2,probtie,...,importance1,importance2,score1,score2,HomexG,AwayxG,nsxg1,nsxg2,adj_score1,adj_score2
11686,2018-08-10,2411,Barclays Premier League,Manchester United,Leicester City,82.66,69.64,0.64,0.1385,0.2215,...,60.3,21.3,2.0,1.0,1.5,1.72,0.71,1.8,1.78,1.05
11705,2018-08-11,2411,Barclays Premier League,Newcastle,Tottenham Hotspur,67.21,85.75,0.201,0.5565,0.2425,...,23.8,57.8,1.0,2.0,1.08,2.15,1.6,1.5,1.05,2.1
11721,2018-08-11,2411,Barclays Premier League,Bournemouth,Cardiff City,62.75,59.47,0.4886,0.2451,0.2664,...,36.3,40.0,2.0,0.0,2.2,0.96,2.28,1.16,1.58,0.0
11737,2018-08-11,2411,Barclays Premier League,Huddersfield Town,Chelsea,60.2,84.39,0.1644,0.5945,0.2411,...,36.1,58.8,0.0,3.0,0.45,2.16,0.94,1.35,0.0,2.9
11744,2018-08-11,2411,Barclays Premier League,Watford,Brighton and Hove Albion,66.07,63.08,0.4693,0.247,0.2837,...,29.5,33.6,2.0,0.0,1.66,0.32,2.11,0.98,2.1,0.0


In [16]:
xGGoalsTable1819 = generateGoalsTable(data,'goals')
xGranking1819 = [np.linalg.eig(xGGoalsTable1819.astype(float))[1][i][0] for i in range(0,len(xGGoalsTable1819))]
xGranking1819_ordered = sorted(range(len(xGranking1819)), key=lambda k: xGranking1819[k])
teams = list(xGGoalsTable1819.columns)
rankedTable = [teams[j] for j in xGranking1819_ordered][::-1]

#Print the predicted table
p = 0
for team in rankedTable:
    p+=1
    print(p,": ",team)

1 :  Huddersfield Town
2 :  Fulham
3 :  Cardiff City
4 :  Brighton and Hove Albion
5 :  Newcastle
6 :  Burnley
7 :  Southampton
8 :  Watford
9 :  West Ham United
10 :  Leicester City
11 :  Wolverhampton
12 :  Everton
13 :  Bournemouth
14 :  Crystal Palace
15 :  Chelsea
16 :  Manchester United
17 :  Tottenham Hotspur
18 :  Arsenal
19 :  Liverpool
20 :  Manchester City


In [19]:
homeTable = generateTable(data, "HomexG")
awayTable = generateTable(data, "AwayxG")

homeTable_Against = generateTable(data, "HomexG")
awayTable_Against = generateTable(data, "AwayxG")


homeTable['Liverpool']['Arsenal'] + awayTable['Arsenal']['Liverpool']   #xG
#1.59 + 0.96
#team_i = Arsenal
#team_j = Liverpool
#homeTable_Against['Arsenal']['Liverpool'] + awayTable_Against['Liverpool']['Arsenal']   #xGA
#1.46 + 3.88
#data[data['AwayTeam']=='Arsenal']

2.55

In [76]:
goalsTable = pd.crosstab(data['HomeTeam'],data['AwayTeam'])
goalsTable['Arsenal']['Burnley']
#goalsTable
goalsTable_df = pd.DataFrame(index=goalsTable.index,columns=goalsTable.columns)
#goalsTable_df.columns#[(goalsTable['Arsenal'])&(goalsTable['Liverpool'])]
goalsTable_df['Arsenal']['Liverpool']#[goalsTable_df['HomeTeam']=='Arsenal']

nan

In [95]:
# Parameter M adjacency matrix where M_i,j represents the link from 'j' to 'i', such that for all 'j'
# sum(i, M_i,j) = 1
# Parameter d damping factor (default value 0.85)
# Parameter eps quadratic error for v (default value 1.0e-8)
# Return v, a vector of ranks such that v_i is the i-th rank from [0, 1]

import numpy as np

def pagerank(M, eps=1.0e-8, d=0.85):
    N = M.shape[1]
    v = np.random.rand(N, 1)
    v = v / np.linalg.norm(v, 1)
    last_v = np.ones((N, 1), dtype=np.float32) * 100
    
    while np.linalg.norm(v - last_v, 2) > eps:
        last_v = v
        v = d * np.matmul(M, v) + (1 - d) / N
    return v

M = np.array([[0, 0, 0, 0, 1],
              [0.5, 0, 0, 0, 0],
              [0.5, 0, 0, 0, 0],
              [0, 1, 0.5, 0, 0],
              [0, 0, 0.5, 1, 0]])
v = pagerank(M, 0.001, 0.85)

In [96]:
xGGoalsTable1819 = generateGoalsTable(data,'goals')
M = xGGoalsTable1819.as_matrix()
v = pagerank(M, 0.001, 0.85)
v

  


array([[            inf],
       [1.11912699e+308],
       [8.25173399e+307],
       [            inf],
       [1.00048339e+308],
       [1.24589546e+308],
       [7.63306835e+307],
       [9.99003133e+307],
       [            inf],
       [            inf],
       [1.05246841e+308],
       [5.57618997e+307],
       [8.05660848e+307],
       [8.18341496e+307],
       [5.95809832e+307],
       [9.73582553e+307],
       [            inf],
       [8.84679492e+307],
       [8.37487471e+307],
       [9.48042294e+307]])