# 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# 2. Features to be Used

Below are the preliminary features that will be used to build the model. More will be added on afterwards.

**Fatigue**
- Days since Home Team played last match 
- Days since Away Team played last match

**Home Team Form**
- Goals difference of home team in the last x matches    
- Goals difference of home team in the last x home matches    
- Average number of points gained by home team in the last x matches
- Average number of points gained by home team in the last x home matches
- Home Team Win streak 

**Away Team Form**
- Goals difference of away team in the last x matches  
- Goals difference of away team in the last x away matches
- Average number of points gained by away team in the last x matches
- Average number of points gained by away team in the last x away matches
- Away Team Win streak

**Home Team Performance Index**
- Home Defense Performance Index
- Home Midfield Performance Index
- Home Attack Performance Index

**Away Team Performance Index**
- Away Defense Performance Index
- Away Midfield Performance Index
- Away Attack Performance Index

**Betting Odds**
- B365H
- B365D
- B365A

In [3]:
df = pd.read_csv('datasets/dataset.csv')
df.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A,Result,HxG,AxG,HPoss,HDaysLastPlayed,ADaysLastPlayed
0,2017-08-11,arsenal,leicester,4,3,27,6,10,3,9,12,9,4,0,1,0,0,1.53,4.5,6.5,W,2.5,1.5,68.0,5,10
1,2017-08-12,brighton,city,0,2,6,14,2,4,6,9,3,10,0,2,0,0,11.0,5.5,1.33,L,0.3,1.9,23.0,10,10
2,2017-08-12,chelsea,burnley,2,3,19,10,6,5,16,11,8,5,3,3,2,0,1.25,6.5,15.0,L,1.5,0.6,62.0,6,10
3,2017-08-12,palace,huddersfield,0,3,14,8,4,6,7,19,12,9,1,3,0,0,1.83,3.6,5.0,L,1.1,1.5,56.0,10,10
4,2017-08-12,everton,stoke,1,0,9,9,4,1,13,10,6,7,1,1,0,0,1.7,3.8,5.75,W,0.6,0.4,60.0,9,10


In [4]:
df.shape

(1900, 26)

# 3. EDA

# 4. Feature Engineering

## Goals difference of home team in the last x matches

In [5]:
def getGoalsScoredByTeam(row, team):
    if row['HomeTeam'] == team:
        return row['FTHG']
    elif row['AwayTeam'] == team:
        return row['FTAG']

In [6]:
def getGoalsScoredByOpponent(row, team):
    if row['HomeTeam'] == team:
        return row['FTAG']
    elif row['AwayTeam'] == team:
        return row['FTHG']

In [7]:
temp_df = pd.DataFrame()

for team in df['HomeTeam'].unique():
    team_df = df[(df['HomeTeam'] == team) | (df['AwayTeam'] == team)]
    team_df = team_df.sort_values(by = 'Date')
    
    number_of_previous_matches = 5
    column1 = 0;
    column2 = 0;
    
    team_df['GoalsScoredByTeam'] = team_df.apply(lambda row: getGoalsScoredByTeam(row, team), axis=1)
    team_df['GoalsScoredByOpponent'] = team_df.apply(lambda row: getGoalsScoredByOpponent(row, team), axis=1)
    
    for i in range(1, number_of_previous_matches+1):
        column1 += team_df['GoalsScoredByTeam'].shift(i)
        column2 += team_df['GoalsScoredByOpponent'].shift(i)
    
    team_df['HxGD in Last 5 Matches'] = column1 - column2
    team_df.drop(columns=['GoalsScoredByTeam', 'GoalsScoredByOpponent'], inplace=True)
    team_df = team_df[team_df['HomeTeam'] == team]
    temp_df = pd.concat([temp_df, team_df]).reset_index(drop=True)

df = temp_df

In [8]:
df.head(10)

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A,Result,HxG,AxG,HPoss,HDaysLastPlayed,ADaysLastPlayed,HxGD in Last 5 Matches
0,2017-08-11,arsenal,leicester,4,3,27,6,10,3,9,12,9,4,0,1,0,0,1.53,4.5,6.5,W,2.5,1.5,68.0,5,10,
1,2017-09-09,arsenal,bournemouth,3,0,17,7,9,2,14,10,10,3,0,1,0,0,1.36,5.5,9.0,W,2.2,0.6,58.0,10,10,
2,2017-09-25,arsenal,west-brom,2,0,16,7,6,3,8,17,7,4,1,4,0,0,1.33,5.5,11.0,W,2.2,0.9,69.0,5,5,-1.0
3,2017-10-01,arsenal,brighton,2,0,25,9,12,1,7,8,6,5,0,2,0,0,1.2,6.5,13.0,W,2.4,0.4,64.0,3,7,0.0
4,2017-10-28,arsenal,swansea,2,1,17,4,5,2,9,9,5,2,0,0,0,0,1.25,7.0,12.0,W,2.0,0.9,72.0,4,4,6.0
5,2017-11-18,arsenal,tottenham,2,0,14,14,5,4,11,16,7,4,4,1,0,0,2.5,3.6,2.89,W,2.1,0.7,43.0,10,10,3.0
6,2017-11-29,arsenal,huddersfield,5,0,21,7,7,2,10,12,7,2,0,1,0,0,1.19,7.5,17.0,W,4.0,0.5,69.0,3,3,5.0
7,2017-12-02,arsenal,united,1,3,33,8,15,4,11,10,12,1,3,2,0,1,2.5,3.39,3.0,L,4.7,1.8,75.0,3,4,7.0
8,2017-12-16,arsenal,newcastle,1,0,23,10,5,2,13,9,7,5,2,1,0,0,1.25,6.5,14.0,W,1.8,0.3,71.0,3,3,4.0
9,2017-12-22,arsenal,liverpool,3,3,11,14,4,9,9,11,7,8,1,0,0,0,2.54,3.6,2.79,D,1.2,2.0,54.0,3,5,4.0


In [9]:
df.shape

(1900, 27)

## Goals difference of home team in the last x home matches

In [10]:
temp_df = pd.DataFrame()

for team in df['HomeTeam'].unique():
    team_df = df[df['HomeTeam'] == team]
    
    number_of_previous_matches = 5
    column1 = 0;
    column2 = 0
    
    team_df = team_df.sort_values(by = 'Date')
    for i in range(1, number_of_previous_matches+1):
        column1 += team_df['FTHG'].shift(i)
        column2 += team_df['FTAG'].shift(i)

    team_df['HxGD in Last 5 Home Matches'] = column1 - column2
    temp_df = pd.concat([temp_df, team_df]).reset_index(drop=True)

df = temp_df

In [11]:
df.head(10)

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A,Result,HxG,AxG,HPoss,HDaysLastPlayed,ADaysLastPlayed,HxGD in Last 5 Matches,HxGD in Last 5 Home Matches
0,2017-08-11,arsenal,leicester,4,3,27,6,10,3,9,12,9,4,0,1,0,0,1.53,4.5,6.5,W,2.5,1.5,68.0,5,10,,
1,2017-09-09,arsenal,bournemouth,3,0,17,7,9,2,14,10,10,3,0,1,0,0,1.36,5.5,9.0,W,2.2,0.6,58.0,10,10,,
2,2017-09-25,arsenal,west-brom,2,0,16,7,6,3,8,17,7,4,1,4,0,0,1.33,5.5,11.0,W,2.2,0.9,69.0,5,5,-1.0,
3,2017-10-01,arsenal,brighton,2,0,25,9,12,1,7,8,6,5,0,2,0,0,1.2,6.5,13.0,W,2.4,0.4,64.0,3,7,0.0,
4,2017-10-28,arsenal,swansea,2,1,17,4,5,2,9,9,5,2,0,0,0,0,1.25,7.0,12.0,W,2.0,0.9,72.0,4,4,6.0,
5,2017-11-18,arsenal,tottenham,2,0,14,14,5,4,11,16,7,4,4,1,0,0,2.5,3.6,2.89,W,2.1,0.7,43.0,10,10,3.0,9.0
6,2017-11-29,arsenal,huddersfield,5,0,21,7,7,2,10,12,7,2,0,1,0,0,1.19,7.5,17.0,W,4.0,0.5,69.0,3,3,5.0,10.0
7,2017-12-02,arsenal,united,1,3,33,8,15,4,11,10,12,1,3,2,0,1,2.5,3.39,3.0,L,4.7,1.8,75.0,3,4,7.0,12.0
8,2017-12-16,arsenal,newcastle,1,0,23,10,5,2,13,9,7,5,2,1,0,0,1.25,6.5,14.0,W,1.8,0.3,71.0,3,3,4.0,8.0
9,2017-12-22,arsenal,liverpool,3,3,11,14,4,9,9,11,7,8,1,0,0,0,2.54,3.6,2.79,D,1.2,2.0,54.0,3,5,4.0,7.0


In [12]:
df.shape

(1900, 28)

## Average number of points gained by home team in the last x matches

In [13]:
def getPointsGained(row, team):
    if row['HomeTeam'] == team:
        if row['Result'] == 'W':
            return 3
        elif row['Result'] == 'D':
            return 1
        elif row['Result'] == 'L':
            return 0
    else:
        if row['Result'] == 'W':
            return 0
        elif row['Result'] == 'D':
            return 1
        elif row['Result'] == 'L':
            return 3

In [14]:
temp_df = pd.DataFrame()

for team in df['HomeTeam'].unique():
    team_df = df[(df['HomeTeam'] == team) | (df['AwayTeam'] == team)]
    team_df = team_df.sort_values(by = 'Date')
    
    team_df['Points Gained'] = team_df.apply(lambda row: getPointsGained(row, team), axis=1)
    
    number_of_previous_matches = 5
    column = 0
    
    
    for i in range(1, number_of_previous_matches+1):
        column += team_df['Points Gained'].shift(i)
    
    team_df['HAvgPts in Last 5 Matches'] = column / number_of_previous_matches
    team_df.drop(columns=['Points Gained'], inplace=True)
    
    team_df = team_df[team_df['HomeTeam'] == team]
    temp_df = pd.concat([temp_df, team_df]).reset_index(drop=True)

df = temp_df

In [15]:
df.head(10)

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A,Result,HxG,AxG,HPoss,HDaysLastPlayed,ADaysLastPlayed,HxGD in Last 5 Matches,HxGD in Last 5 Home Matches,HAvgPts in Last 5 Matches
0,2017-08-11,arsenal,leicester,4,3,27,6,10,3,9,12,9,4,0,1,0,0,1.53,4.5,6.5,W,2.5,1.5,68.0,5,10,,,
1,2017-09-09,arsenal,bournemouth,3,0,17,7,9,2,14,10,10,3,0,1,0,0,1.36,5.5,9.0,W,2.2,0.6,58.0,10,10,,,
2,2017-09-25,arsenal,west-brom,2,0,16,7,6,3,8,17,7,4,1,4,0,0,1.33,5.5,11.0,W,2.2,0.9,69.0,5,5,-1.0,,1.4
3,2017-10-01,arsenal,brighton,2,0,25,9,12,1,7,8,6,5,0,2,0,0,1.2,6.5,13.0,W,2.4,0.4,64.0,3,7,0.0,,1.4
4,2017-10-28,arsenal,swansea,2,1,17,4,5,2,9,9,5,2,0,0,0,0,1.25,7.0,12.0,W,2.0,0.9,72.0,4,4,6.0,,2.0
5,2017-11-18,arsenal,tottenham,2,0,14,14,5,4,11,16,7,4,4,1,0,0,2.5,3.6,2.89,W,2.1,0.7,43.0,10,10,3.0,9.0,1.8
6,2017-11-29,arsenal,huddersfield,5,0,21,7,7,2,10,12,7,2,0,1,0,0,1.19,7.5,17.0,W,4.0,0.5,69.0,3,3,5.0,10.0,2.4
7,2017-12-02,arsenal,united,1,3,33,8,15,4,11,10,12,1,3,2,0,1,2.5,3.39,3.0,L,4.7,1.8,75.0,3,4,7.0,12.0,2.4
8,2017-12-16,arsenal,newcastle,1,0,23,10,5,2,13,9,7,5,2,1,0,0,1.25,6.5,14.0,W,1.8,0.3,71.0,3,3,4.0,8.0,1.6
9,2017-12-22,arsenal,liverpool,3,3,11,14,4,9,9,11,7,8,1,0,0,0,2.54,3.6,2.79,D,1.2,2.0,54.0,3,5,4.0,7.0,1.6


In [16]:
df.shape

(1900, 29)

## Average number of points gained by home team in the last x home matches

In [17]:
def getPointsGained(row):
    if row['Result'] == 'W':
        return 3
    elif row['Result'] == 'D':
        return 1
    elif row['Result'] == 'L':
        return 0

In [18]:
temp_df = pd.DataFrame()

for team in df['HomeTeam'].unique():
    team_df = df[(df['HomeTeam'] == team)]
    team_df['Points Gained'] = team_df.apply(lambda row: getPointsGained(row), axis=1)
    
    number_of_previous_matches = 5
    column = 0
    
    team_df = team_df.sort_values(by = 'Date')
    for i in range(1, number_of_previous_matches+1):
        column += team_df['Points Gained'].shift(i)
    
    team_df['HAvgPts in Last 5 Home Matches'] = column / number_of_previous_matches
    team_df.drop(columns=['Points Gained'], inplace=True)
    
    team_df = team_df[team_df['HomeTeam'] == team]
    temp_df = pd.concat([temp_df, team_df]).reset_index(drop=True)

df = temp_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  team_df['Points Gained'] = team_df.apply(lambda row: getPointsGained(row), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  team_df['Points Gained'] = team_df.apply(lambda row: getPointsGained(row), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  team_df['Points Gained'] = team_df.appl

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  team_df['Points Gained'] = team_df.apply(lambda row: getPointsGained(row), axis=1)


In [19]:
df.head(10)

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A,Result,HxG,AxG,HPoss,HDaysLastPlayed,ADaysLastPlayed,HxGD in Last 5 Matches,HxGD in Last 5 Home Matches,HAvgPts in Last 5 Matches,HAvgPts in Last 5 Home Matches
0,2017-08-11,arsenal,leicester,4,3,27,6,10,3,9,12,9,4,0,1,0,0,1.53,4.5,6.5,W,2.5,1.5,68.0,5,10,,,,
1,2017-09-09,arsenal,bournemouth,3,0,17,7,9,2,14,10,10,3,0,1,0,0,1.36,5.5,9.0,W,2.2,0.6,58.0,10,10,,,,
2,2017-09-25,arsenal,west-brom,2,0,16,7,6,3,8,17,7,4,1,4,0,0,1.33,5.5,11.0,W,2.2,0.9,69.0,5,5,-1.0,,1.4,
3,2017-10-01,arsenal,brighton,2,0,25,9,12,1,7,8,6,5,0,2,0,0,1.2,6.5,13.0,W,2.4,0.4,64.0,3,7,0.0,,1.4,
4,2017-10-28,arsenal,swansea,2,1,17,4,5,2,9,9,5,2,0,0,0,0,1.25,7.0,12.0,W,2.0,0.9,72.0,4,4,6.0,,2.0,
5,2017-11-18,arsenal,tottenham,2,0,14,14,5,4,11,16,7,4,4,1,0,0,2.5,3.6,2.89,W,2.1,0.7,43.0,10,10,3.0,9.0,1.8,3.0
6,2017-11-29,arsenal,huddersfield,5,0,21,7,7,2,10,12,7,2,0,1,0,0,1.19,7.5,17.0,W,4.0,0.5,69.0,3,3,5.0,10.0,2.4,3.0
7,2017-12-02,arsenal,united,1,3,33,8,15,4,11,10,12,1,3,2,0,1,2.5,3.39,3.0,L,4.7,1.8,75.0,3,4,7.0,12.0,2.4,3.0
8,2017-12-16,arsenal,newcastle,1,0,23,10,5,2,13,9,7,5,2,1,0,0,1.25,6.5,14.0,W,1.8,0.3,71.0,3,3,4.0,8.0,1.6,2.4
9,2017-12-22,arsenal,liverpool,3,3,11,14,4,9,9,11,7,8,1,0,0,0,2.54,3.6,2.79,D,1.2,2.0,54.0,3,5,4.0,7.0,1.6,2.4


In [20]:
df.shape

(1900, 30)

## Home Team Win streak

## Goals difference of away team in the last x matches  

In [21]:
def getGoalsScoredByTeam(row, team):
    if row['HomeTeam'] == team:
        return row['FTHG']
    elif row['AwayTeam'] == team:
        return row['FTAG']

In [22]:
def getGoalsScoredByOpponent(row, team):
    if row['HomeTeam'] == team:
        return row['FTAG']
    elif row['AwayTeam'] == team:
        return row['FTHG']

In [23]:
temp_df = pd.DataFrame()

for team in df['AwayTeam'].unique():
    team_df = df[(df['HomeTeam'] == team) | (df['AwayTeam'] == team)]
    team_df = team_df.sort_values(by = 'Date')
    
    number_of_previous_matches = 5
    column1 = 0;
    column2 = 0;
    
    team_df['GoalsScoredByTeam'] = team_df.apply(lambda row: getGoalsScoredByTeam(row, team), axis=1)
    team_df['GoalsScoredByOpponent'] = team_df.apply(lambda row: getGoalsScoredByOpponent(row, team), axis=1)
    
    for i in range(1, number_of_previous_matches+1):
        column1 += team_df['GoalsScoredByTeam'].shift(i)
        column2 += team_df['GoalsScoredByOpponent'].shift(i)
    
    team_df['AxGD in Last 5 Matches'] = column1 - column2
    team_df.drop(columns=['GoalsScoredByTeam', 'GoalsScoredByOpponent'], inplace=True)
    team_df = team_df[team_df['AwayTeam'] == team]
    temp_df = pd.concat([temp_df, team_df]).reset_index(drop=True)

df = temp_df

In [24]:
df.head(10)

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A,Result,HxG,AxG,HPoss,HDaysLastPlayed,ADaysLastPlayed,HxGD in Last 5 Matches,HxGD in Last 5 Home Matches,HAvgPts in Last 5 Matches,HAvgPts in Last 5 Home Matches,AxGD in Last 5 Matches
0,2017-08-11,arsenal,leicester,4,3,27,6,10,3,9,12,9,4,0,1,0,0,1.53,4.5,6.5,W,2.5,1.5,68.0,5,10,,,,,
1,2017-08-26,united,leicester,2,0,22,11,7,4,8,7,9,3,1,2,0,0,1.33,5.5,11.0,W,2.8,0.9,69.0,7,4,,,,,
2,2017-09-16,huddersfield,leicester,1,1,15,10,3,1,6,6,8,4,1,1,0,0,3.5,3.4,2.25,D,0.7,1.3,59.0,5,7,,,,,
3,2017-09-30,bournemouth,leicester,0,0,19,8,4,1,6,10,7,5,1,0,0,0,2.8,3.25,2.5,D,1.4,0.6,61.0,7,7,-6.0,,0.6,,-2.0
4,2017-10-21,swansea,leicester,1,2,19,11,4,6,6,9,7,6,0,1,0,0,2.79,3.25,2.75,L,0.8,1.9,56.0,7,5,-1.0,,0.8,,-2.0
5,2017-11-04,stoke,leicester,2,2,11,14,7,4,12,8,4,10,0,0,0,0,2.7,3.29,2.87,D,1.0,1.5,47.0,7,6,-8.0,-3.0,1.2,1.4,2.0
6,2017-11-24,west-ham,leicester,1,1,8,7,4,2,9,12,5,5,1,1,0,0,2.7,3.39,2.75,D,0.9,0.4,50.0,5,6,-8.0,-4.0,0.4,1.2,1.0
7,2017-12-09,newcastle,leicester,2,3,10,13,4,5,10,9,6,5,2,1,0,0,2.62,3.29,2.89,L,0.9,0.5,38.0,7,7,-9.0,-2.0,0.2,1.4,0.0
8,2017-12-13,southampton,leicester,1,4,12,16,4,11,13,9,9,9,1,0,0,0,2.1,3.39,3.89,L,1.5,1.8,53.0,3,4,-1.0,3.0,1.0,1.6,1.0
9,2017-12-26,watford,leicester,2,1,11,11,1,5,7,13,8,8,3,3,0,0,2.54,3.5,2.87,W,1.2,1.3,49.0,3,3,-6.0,-4.0,0.2,0.8,2.0


## Goals difference of away team in the last x away matches

In [25]:
temp_df = pd.DataFrame()

for team in df['AwayTeam'].unique():
    team_df = df[df['AwayTeam'] == team]
    team_df = team_df.sort_values(by = 'Date')
    
    number_of_previous_matches = 5
    column1 = 0;
    column2 = 0
    
    for i in range(1, number_of_previous_matches+1):
        column1 += team_df['FTHG'].shift(i)
        column2 += team_df['FTAG'].shift(i)

    team_df['AxGD in Last 5 Away Matches'] = column2 - column1
    temp_df = pd.concat([temp_df, team_df]).reset_index(drop=True)

df = temp_df

In [26]:
df.head(10)

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A,Result,HxG,AxG,HPoss,HDaysLastPlayed,ADaysLastPlayed,HxGD in Last 5 Matches,HxGD in Last 5 Home Matches,HAvgPts in Last 5 Matches,HAvgPts in Last 5 Home Matches,AxGD in Last 5 Matches,AxGD in Last 5 Away Matches
0,2017-08-11,arsenal,leicester,4,3,27,6,10,3,9,12,9,4,0,1,0,0,1.53,4.5,6.5,W,2.5,1.5,68.0,5,10,,,,,,
1,2017-08-26,united,leicester,2,0,22,11,7,4,8,7,9,3,1,2,0,0,1.33,5.5,11.0,W,2.8,0.9,69.0,7,4,,,,,,
2,2017-09-16,huddersfield,leicester,1,1,15,10,3,1,6,6,8,4,1,1,0,0,3.5,3.4,2.25,D,0.7,1.3,59.0,5,7,,,,,,
3,2017-09-30,bournemouth,leicester,0,0,19,8,4,1,6,10,7,5,1,0,0,0,2.8,3.25,2.5,D,1.4,0.6,61.0,7,7,-6.0,,0.6,,-2.0,
4,2017-10-21,swansea,leicester,1,2,19,11,4,6,6,9,7,6,0,1,0,0,2.79,3.25,2.75,L,0.8,1.9,56.0,7,5,-1.0,,0.8,,-2.0,
5,2017-11-04,stoke,leicester,2,2,11,14,7,4,12,8,4,10,0,0,0,0,2.7,3.29,2.87,D,1.0,1.5,47.0,7,6,-8.0,-3.0,1.2,1.4,2.0,-2.0
6,2017-11-24,west-ham,leicester,1,1,8,7,4,2,9,12,5,5,1,1,0,0,2.7,3.39,2.75,D,0.9,0.4,50.0,5,6,-8.0,-4.0,0.4,1.2,1.0,-1.0
7,2017-12-09,newcastle,leicester,2,3,10,13,4,5,10,9,6,5,2,1,0,0,2.62,3.29,2.89,L,0.9,0.5,38.0,7,7,-9.0,-2.0,0.2,1.4,0.0,1.0
8,2017-12-13,southampton,leicester,1,4,12,16,4,11,13,9,9,9,1,0,0,0,2.1,3.39,3.89,L,1.5,1.8,53.0,3,4,-1.0,3.0,1.0,1.6,1.0,2.0
9,2017-12-26,watford,leicester,2,1,11,11,1,5,7,13,8,8,3,3,0,0,2.54,3.5,2.87,W,1.2,1.3,49.0,3,3,-6.0,-4.0,0.2,0.8,2.0,5.0


In [27]:
df.shape

(1900, 32)

## Average number of points gained by away team in the last x matches

In [28]:
def getPointsGained(row, team):
    if row['HomeTeam'] == team:
        if row['Result'] == 'W':
            return 3
        elif row['Result'] == 'D':
            return 1
        elif row['Result'] == 'L':
            return 0
    else:
        if row['Result'] == 'W':
            return 0
        elif row['Result'] == 'D':
            return 1
        elif row['Result'] == 'L':
            return 3

In [29]:
temp_df = pd.DataFrame()

for team in df['AwayTeam'].unique():
    team_df = df[(df['HomeTeam'] == team) | (df['AwayTeam'] == team)]
    team_df = team_df.sort_values(by = 'Date')
    
    team_df['Points Gained'] = team_df.apply(lambda row: getPointsGained(row, team), axis=1)
    
    number_of_previous_matches = 5
    column = 0
    
    for i in range(1, number_of_previous_matches+1):
        column += team_df['Points Gained'].shift(i)
    
    team_df['AAvgPts in Last 5 Matches'] = column / number_of_previous_matches
    team_df.drop(columns=['Points Gained'], inplace=True)
    
    team_df = team_df[team_df['AwayTeam'] == team]
    temp_df = pd.concat([temp_df, team_df]).reset_index(drop=True)

df = temp_df

In [30]:
df.head(10)

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A,Result,HxG,AxG,HPoss,HDaysLastPlayed,ADaysLastPlayed,HxGD in Last 5 Matches,HxGD in Last 5 Home Matches,HAvgPts in Last 5 Matches,HAvgPts in Last 5 Home Matches,AxGD in Last 5 Matches,AxGD in Last 5 Away Matches,AAvgPts in Last 5 Matches
0,2017-08-11,arsenal,leicester,4,3,27,6,10,3,9,12,9,4,0,1,0,0,1.53,4.5,6.5,W,2.5,1.5,68.0,5,10,,,,,,,
1,2017-08-26,united,leicester,2,0,22,11,7,4,8,7,9,3,1,2,0,0,1.33,5.5,11.0,W,2.8,0.9,69.0,7,4,,,,,,,
2,2017-09-16,huddersfield,leicester,1,1,15,10,3,1,6,6,8,4,1,1,0,0,3.5,3.4,2.25,D,0.7,1.3,59.0,5,7,,,,,,,
3,2017-09-30,bournemouth,leicester,0,0,19,8,4,1,6,10,7,5,1,0,0,0,2.8,3.25,2.5,D,1.4,0.6,61.0,7,7,-6.0,,0.6,,-2.0,,0.8
4,2017-10-21,swansea,leicester,1,2,19,11,4,6,6,9,7,6,0,1,0,0,2.79,3.25,2.75,L,0.8,1.9,56.0,7,5,-1.0,,0.8,,-2.0,,0.6
5,2017-11-04,stoke,leicester,2,2,11,14,7,4,12,8,4,10,0,0,0,0,2.7,3.29,2.87,D,1.0,1.5,47.0,7,6,-8.0,-3.0,1.2,1.4,2.0,-2.0,1.6
6,2017-11-24,west-ham,leicester,1,1,8,7,4,2,9,12,5,5,1,1,0,0,2.7,3.39,2.75,D,0.9,0.4,50.0,5,6,-8.0,-4.0,0.4,1.2,1.0,-1.0,1.6
7,2017-12-09,newcastle,leicester,2,3,10,13,4,5,10,9,6,5,2,1,0,0,2.62,3.29,2.89,L,0.9,0.5,38.0,7,7,-9.0,-2.0,0.2,1.4,0.0,1.0,1.6
8,2017-12-13,southampton,leicester,1,4,12,16,4,11,13,9,9,9,1,0,0,0,2.1,3.39,3.89,L,1.5,1.8,53.0,3,4,-1.0,3.0,1.0,1.6,1.0,2.0,2.0
9,2017-12-26,watford,leicester,2,1,11,11,1,5,7,13,8,8,3,3,0,0,2.54,3.5,2.87,W,1.2,1.3,49.0,3,3,-6.0,-4.0,0.2,0.8,2.0,5.0,2.0


In [31]:
df.shape

(1900, 33)

## Average number of points gained by away team in the last x away matches

In [32]:
def getPointsGained(row):
    if row['Result'] == 'W':
        return 0
    elif row['Result'] == 'D':
        return 1
    elif row['Result'] == 'L':
        return 3

In [33]:
temp_df = pd.DataFrame()

for team in df['AwayTeam'].unique():
    team_df = df[(df['AwayTeam'] == team)]
    team_df['Points Gained'] = team_df.apply(lambda row: getPointsGained(row), axis=1)
    team_df = team_df.sort_values(by = 'Date')
    
    number_of_previous_matches = 5
    column = 0
    
    for i in range(1, number_of_previous_matches+1):
        column += team_df['Points Gained'].shift(i)
    
    team_df['AAvgPts in Last 5 Away Matches'] = column / number_of_previous_matches
    team_df.drop(columns=['Points Gained'], inplace=True)
    
    team_df = team_df[team_df['AwayTeam'] == team]
    temp_df = pd.concat([temp_df, team_df]).reset_index(drop=True)

df = temp_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  team_df['Points Gained'] = team_df.apply(lambda row: getPointsGained(row), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  team_df['Points Gained'] = team_df.apply(lambda row: getPointsGained(row), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  team_df['Points Gained'] = team_df.appl

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  team_df['Points Gained'] = team_df.apply(lambda row: getPointsGained(row), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  team_df['Points Gained'] = team_df.apply(lambda row: getPointsGained(row), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  team_df['Points Gained'] = team_df.appl

In [34]:
df.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A,Result,HxG,AxG,HPoss,HDaysLastPlayed,ADaysLastPlayed,HxGD in Last 5 Matches,HxGD in Last 5 Home Matches,HAvgPts in Last 5 Matches,HAvgPts in Last 5 Home Matches,AxGD in Last 5 Matches,AxGD in Last 5 Away Matches,AAvgPts in Last 5 Matches,AAvgPts in Last 5 Away Matches
0,2017-08-11,arsenal,leicester,4,3,27,6,10,3,9,12,9,4,0,1,0,0,1.53,4.5,6.5,W,2.5,1.5,68.0,5,10,,,,,,,,
1,2017-08-26,united,leicester,2,0,22,11,7,4,8,7,9,3,1,2,0,0,1.33,5.5,11.0,W,2.8,0.9,69.0,7,4,,,,,,,,
2,2017-09-16,huddersfield,leicester,1,1,15,10,3,1,6,6,8,4,1,1,0,0,3.5,3.4,2.25,D,0.7,1.3,59.0,5,7,,,,,,,,
3,2017-09-30,bournemouth,leicester,0,0,19,8,4,1,6,10,7,5,1,0,0,0,2.8,3.25,2.5,D,1.4,0.6,61.0,7,7,-6.0,,0.6,,-2.0,,0.8,
4,2017-10-21,swansea,leicester,1,2,19,11,4,6,6,9,7,6,0,1,0,0,2.79,3.25,2.75,L,0.8,1.9,56.0,7,5,-1.0,,0.8,,-2.0,,0.6,


In [35]:
df.shape

(1900, 34)

## Away Team Win streak

## Home Team Performance Index
- Home Defense Performance Index
- Home Midfield Performance Index
- Home Attack Performance Index

In [36]:
fifa_ratings_df = pd.read_csv('datasets/data-source-3/cleaned_dataset3.csv')
fifa_ratings_df.head()

Unnamed: 0,Version,Date,Team,Attack,Midfield,Defense
0,FIFA 22,2022-08-18,liverpool,86,84,85
1,FIFA 22,2022-08-18,city,84,87,86
2,FIFA 22,2022-08-18,chelsea,84,85,83
3,FIFA 22,2022-08-18,united,82,83,81
4,FIFA 22,2022-08-18,tottenham,83,81,78


In [37]:
def getHomeTeamRatingIndex(row):
    date = row['Date']
    hometeam = row['HomeTeam']
    
    ratings_df = fifa_ratings_df[(fifa_ratings_df['Team'] == hometeam) & (fifa_ratings_df['Date'] < date)]
    ratings_row = ratings_df.sort_values(by = ['Date', 'Version'], ascending=[False, False]).iloc[0, :]
    
    row['HDef'] = ratings_row['Defense']
    row['HMid'] = ratings_row['Midfield']
    row['HAtt'] = ratings_row['Attack']
    
    return row

In [38]:
df = df.apply(getHomeTeamRatingIndex, axis=1)
df.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A,Result,HxG,AxG,HPoss,HDaysLastPlayed,ADaysLastPlayed,HxGD in Last 5 Matches,HxGD in Last 5 Home Matches,HAvgPts in Last 5 Matches,HAvgPts in Last 5 Home Matches,AxGD in Last 5 Matches,AxGD in Last 5 Away Matches,AAvgPts in Last 5 Matches,AAvgPts in Last 5 Away Matches,HDef,HMid,HAtt
0,2017-08-11,arsenal,leicester,4,3,27,6,10,3,9,12,9,4,0,1,0,0,1.53,4.5,6.5,W,2.5,1.5,68.0,5,10,,,,,,,,,82,83,81
1,2017-08-26,united,leicester,2,0,22,11,7,4,8,7,9,3,1,2,0,0,1.33,5.5,11.0,W,2.8,0.9,69.0,7,4,,,,,,,,,82,84,84
2,2017-09-16,huddersfield,leicester,1,1,15,10,3,1,6,6,8,4,1,1,0,0,3.5,3.4,2.25,D,0.7,1.3,59.0,5,7,,,,,,,,,71,70,70
3,2017-09-30,bournemouth,leicester,0,0,19,8,4,1,6,10,7,5,1,0,0,0,2.8,3.25,2.5,D,1.4,0.6,61.0,7,7,-6.0,,0.6,,-2.0,,0.8,,75,74,76
4,2017-10-21,swansea,leicester,1,2,19,11,4,6,6,9,7,6,0,1,0,0,2.79,3.25,2.75,L,0.8,1.9,56.0,7,5,-1.0,,0.8,,-2.0,,0.6,,75,77,76
