# Feature Engineering

In [47]:
from dataclasses import dataclass
from datetime import timedelta, datetime
import pandas as pd
import numpy as np

import sys
sys.path.append('..')

from src.features import build_features as buif

In [48]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [49]:
# Relative path to the interim football dataset (the file has no extension).
FILE_PATH = '../data/interim/eo_football_interim'
# Read the file as CSV into the working frame and preview its first rows.
football = pd.read_csv(FILE_PATH)
football.head()

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,AST,HF,AF,HC,AC,HY,AY,HR,AR,SeasonLabel
0,E0,2016-08-13,Burnley,Swansea,0,1,A,0,0,D,...,9,10,14,7,4,3,2,0,0,2016_2017
1,E0,2016-08-13,Crystal Palace,West Brom,0,1,A,0,0,D,...,3,12,15,3,6,2,2,0,0,2016_2017
2,E0,2016-08-13,Everton,Tottenham,1,1,D,1,0,H,...,4,10,14,5,6,0,0,0,0,2016_2017
3,E0,2016-08-13,Hull,Leicester,2,1,H,1,0,H,...,5,8,17,5,3,2,2,0,0,2016_2017
4,E0,2016-08-13,Man City,Sunderland,2,1,H,1,0,H,...,3,11,14,9,6,1,2,0,0,2016_2017


In [50]:
# read_csv above does not parse dates, so convert the Date column to pandas
# datetimes here before any date-based feature work.
football['Date'] = pd.to_datetime(football['Date'])
football.head()

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,AST,HF,AF,HC,AC,HY,AY,HR,AR,SeasonLabel
0,E0,2016-08-13,Burnley,Swansea,0,1,A,0,0,D,...,9,10,14,7,4,3,2,0,0,2016_2017
1,E0,2016-08-13,Crystal Palace,West Brom,0,1,A,0,0,D,...,3,12,15,3,6,2,2,0,0,2016_2017
2,E0,2016-08-13,Everton,Tottenham,1,1,D,1,0,H,...,4,10,14,5,6,0,0,0,0,2016_2017
3,E0,2016-08-13,Hull,Leicester,2,1,H,1,0,H,...,5,8,17,5,3,2,2,0,0,2016_2017
4,E0,2016-08-13,Man City,Sunderland,2,1,H,1,0,H,...,3,11,14,9,6,1,2,0,0,2016_2017


## Features to create

These are the features engineered from the dataset. Some features are generated season-wise, while others are computed over the entire dataset. The dataset covers 2016 - 2021; the 2016 season is used only as dummy (warm-up) data to engineer features for the remaining seasons, so the 2016 season rows should not be used for machine learning model creation.

**Running Average Features**
- HCLPOS, ACLPOS: Current league position for the home (away) team
- HELORT10, AELORT10: Elo rating for the home (away) team when k = 10

In [51]:
# Add the league-position features using the project's LeaguePosAdder.
# NOTE(review): this does not follow the scikit-learn fit/transform convention.
# Judging by the displayed outputs, `fit` returns the frame with HCLPOS/ACLPOS
# columns added and `transform` returns a dict of team -> PositionStat --
# confirm against src/features/build_features.py.
league_tr = buif.LeaguePosAdder()
# Rebinds `football`, so re-running this cell feeds the already-augmented frame
# back into `fit` -- TODO confirm that is safe.
football = league_tr.fit(football)
current_league_table = league_tr.transform(football)
current_league_table

{'Man City': PositionStat(goals_for=83, goals_against=32, goals_diff=51, points=86, position=1),
 'Man United': PositionStat(goals_for=73, goals_against=44, goals_diff=29, points=74, position=2),
 'Liverpool': PositionStat(goals_for=68, goals_against=42, goals_diff=26, points=69, position=3),
 'Chelsea': PositionStat(goals_for=58, goals_against=36, goals_diff=22, points=67, position=4),
 'Leicester': PositionStat(goals_for=68, goals_against=50, goals_diff=18, points=66, position=5),
 'West Ham': PositionStat(goals_for=62, goals_against=47, goals_diff=15, points=65, position=6),
 'Tottenham': PositionStat(goals_for=68, goals_against=45, goals_diff=23, points=62, position=7),
 'Arsenal': PositionStat(goals_for=55, goals_against=39, goals_diff=16, points=61, position=8),
 'Everton': PositionStat(goals_for=47, goals_against=48, goals_diff=-1, points=59, position=9),
 'Leeds': PositionStat(goals_for=62, goals_against=54, goals_diff=8, points=59, position=10),
 'Aston Villa': PositionStat(go

In [52]:
# Inspect the frame returned by league_tr.fit (HCLPOS / ACLPOS are the last columns).
football

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,AF,HC,AC,HY,AY,HR,AR,SeasonLabel,HCLPOS,ACLPOS
0,E0,2016-08-13,Burnley,Swansea,0,1,A,0,0,D,...,14,7,4,3,2,0,0,2016_2017,0,0
1,E0,2016-08-13,Crystal Palace,West Brom,0,1,A,0,0,D,...,15,3,6,2,2,0,0,2016_2017,0,0
2,E0,2016-08-13,Everton,Tottenham,1,1,D,1,0,H,...,14,5,6,0,0,0,0,2016_2017,0,0
3,E0,2016-08-13,Hull,Leicester,2,1,H,1,0,H,...,17,5,3,2,2,0,0,2016_2017,0,0
4,E0,2016-08-13,Man City,Sunderland,2,1,H,1,0,H,...,14,9,6,1,2,0,0,2016_2017,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1895,E0,2021-05-23,Liverpool,Crystal Palace,2,0,H,1,0,H,...,8,14,1,2,2,0,0,2020_2021,5,14
1896,E0,2021-05-23,Man City,Everton,5,0,H,2,0,H,...,10,7,5,2,2,0,0,2020_2021,1,9
1897,E0,2021-05-23,Sheffield United,Burnley,1,0,H,1,0,H,...,1,8,9,3,1,0,0,2020_2021,20,17
1898,E0,2021-05-23,West Ham,Southampton,3,0,H,2,0,H,...,9,2,3,0,3,0,0,2020_2021,6,15


In [53]:
# Create the home / away Elo rating columns with a placeholder value of 0;
# a later cell fills them in match by match.
football['HELORT10'] = 0
football['AELORT10'] = 0

In [54]:
# Confirm the two new Elo columns exist and are zero-initialised.
football.head()

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,AC,HY,AY,HR,AR,SeasonLabel,HCLPOS,ACLPOS,HELORT10,AELORT10
0,E0,2016-08-13,Burnley,Swansea,0,1,A,0,0,D,...,4,3,2,0,0,2016_2017,0,0,0,0
1,E0,2016-08-13,Crystal Palace,West Brom,0,1,A,0,0,D,...,6,2,2,0,0,2016_2017,0,0,0,0
2,E0,2016-08-13,Everton,Tottenham,1,1,D,1,0,H,...,6,0,0,0,0,2016_2017,0,0,0,0
3,E0,2016-08-13,Hull,Leicester,2,1,H,1,0,H,...,3,2,2,0,0,2016_2017,0,0,0,0
4,E0,2016-08-13,Man City,Sunderland,2,1,H,1,0,H,...,6,1,2,0,0,2016_2017,0,0,0,0


In [85]:
def calc_p(elort1, elort2, gf, ga, k=10):
    """Return the integer rating change for the side that scored ``gf`` goals.

    The change is ``k * g * (w - we)`` truncated towards zero by ``int``, where

    * ``g`` weights the goal difference: 1 for a margin of 0 or 1, 1.5 for a
      margin of 2, and ``(11 + margin) / 8`` otherwise;
    * ``w`` is the actual result: 1 for a win, 0 for a loss, 0.5 for a draw;
    * ``we`` is the expected result, ``1 / (10 ** (-(elort1 - elort2) / 400) + 1)``.

    Parameters
    ----------
    elort1 : number
        Pre-match rating of the side being updated.
    elort2 : number
        Pre-match rating of the opposing side.
    gf : number
        Goals scored by the side being updated.
    ga : number
        Goals conceded by the side being updated.
    k : number, default 10
        Multiplier applied to the weighted result difference.

    Returns
    -------
    int
        Points to add to ``elort1`` (negative when rating should drop).
    """
    margin = np.abs(gf - ga)

    # Goal-difference weight.
    if margin in (0, 1):
        weight = 1
    elif margin == 2:
        weight = 3 / 2
    else:
        weight = (11 + margin) / 8

    # Actual result from the point of view of the side being updated.
    actual = 1 if gf > ga else (0 if ga > gf else 0.5)

    # Expected result given the pre-match rating gap.
    rating_gap = elort1 - elort2
    expected = 1 / ((10 ** (-rating_gap / 400)) + 1)

    return int(k * (weight * (actual - expected)))

In [87]:
# Compute pre-match Elo ratings (k = 10) for every fixture.
#
# Matches are processed in the frame's existing row order, so each row receives
# the ratings both sides held *before* that match, and the ratings are then
# updated from the full-time score.
# NOTE(review): this assumes `football` is already sorted chronologically -- confirm.

# Every team starts at a rating of 0. Teams are collected from both columns so
# that a side whose only appearances are away fixtures is still registered.
unique_teams = pd.concat([football['HomeTeam'], football['AwayTeam']]).unique()
elort_dict = {team: 0 for team in unique_teams}

# Collect the pre-match ratings in plain lists and assign each column once at
# the end; writing cell by cell with `.loc` inside `iterrows` is far slower and
# depends on the index labels being unique.
HELORT10 = []
AELORT10 = []
for home_team, away_team, fthg, ftag in zip(
    football['HomeTeam'], football['AwayTeam'], football['FTHG'], football['FTAG']
):
    # Ratings going into this match.
    ht_current_elort = elort_dict[home_team]
    at_current_elort = elort_dict[away_team]

    HELORT10.append(ht_current_elort)
    AELORT10.append(at_current_elort)

    # Both updates are computed from the pre-match ratings, each from that
    # side's own point of view (ratings and goals swapped for the away side).
    ht_new_elort = ht_current_elort + calc_p(ht_current_elort, at_current_elort, fthg, ftag, k=10)
    at_new_elort = at_current_elort + calc_p(at_current_elort, ht_current_elort, ftag, fthg, k=10)

    elort_dict[home_team] = ht_new_elort
    elort_dict[away_team] = at_new_elort

# Lists are aligned positionally with the rows of `football`.
football['HELORT10'] = HELORT10
football['AELORT10'] = AELORT10

In [88]:
# Inspect the frame after the Elo pass (HELORT10 / AELORT10 are the last columns).
football

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,AC,HY,AY,HR,AR,SeasonLabel,HCLPOS,ACLPOS,HELORT10,AELORT10
0,E0,2016-08-13,Burnley,Swansea,0,1,A,0,0,D,...,4,3,2,0,0,2016_2017,0,0,0,0
1,E0,2016-08-13,Crystal Palace,West Brom,0,1,A,0,0,D,...,6,2,2,0,0,2016_2017,0,0,0,0
2,E0,2016-08-13,Everton,Tottenham,1,1,D,1,0,H,...,6,0,0,0,0,2016_2017,0,0,0,0
3,E0,2016-08-13,Hull,Leicester,2,1,H,1,0,H,...,3,2,2,0,0,2016_2017,0,0,0,0
4,E0,2016-08-13,Man City,Sunderland,2,1,H,1,0,H,...,6,1,2,0,0,2016_2017,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1895,E0,2021-05-23,Liverpool,Crystal Palace,2,0,H,1,0,H,...,1,2,2,0,0,2020_2021,5,14,214,-35
1896,E0,2021-05-23,Man City,Everton,5,0,H,2,0,H,...,5,2,2,0,0,2020_2021,1,9,254,35
1897,E0,2021-05-23,Sheffield United,Burnley,1,0,H,1,0,H,...,9,3,1,0,0,2020_2021,20,17,-74,-18
1898,E0,2021-05-23,West Ham,Southampton,3,0,H,2,0,H,...,3,0,3,0,0,2020_2021,6,15,36,-25


In [89]:
# Show only the 2020_2021 season; ratings are not reset between seasons, so its
# first rows already carry non-zero Elo values.
football[football['SeasonLabel'] == '2020_2021']

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,AC,HY,AY,HR,AR,SeasonLabel,HCLPOS,ACLPOS,HELORT10,AELORT10
1520,E0,2020-09-12,Fulham,Arsenal,0,3,A,0,1,A,...,3,2,2,0,0,2020_2021,0,0,-87,97
1521,E0,2020-09-12,Crystal Palace,Southampton,1,0,H,1,0,H,...,3,2,1,0,0,2020_2021,3,13,-22,-13
1522,E0,2020-09-12,Liverpool,Leeds,4,3,H,3,2,H,...,0,1,0,0,0,2020_2021,5,12,257,0
1523,E0,2020-09-12,West Ham,Newcastle,0,2,A,0,0,D,...,7,2,2,0,0,2020_2021,7,14,-20,-17
1524,E0,2020-09-13,West Brom,Leicester,0,3,A,0,0,D,...,5,1,1,0,0,2020_2021,9,16,-69,48
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1895,E0,2021-05-23,Liverpool,Crystal Palace,2,0,H,1,0,H,...,1,2,2,0,0,2020_2021,5,14,214,-35
1896,E0,2021-05-23,Man City,Everton,5,0,H,2,0,H,...,5,2,2,0,0,2020_2021,1,9,254,35
1897,E0,2021-05-23,Sheffield United,Burnley,1,0,H,1,0,H,...,9,3,1,0,0,2020_2021,20,17,-74,-18
1898,E0,2021-05-23,West Ham,Southampton,3,0,H,2,0,H,...,3,0,3,0,0,2020_2021,6,15,36,-25
