## Environment Set Up  

In [31]:
import numpy as np
import pandas as pd

## Load Data

In [32]:
keepcols = [
    "Div",
    "Date",
    "Time",
    "HomeTeam",
    "AwayTeam",
    "FTHG",
    "FTAG",
    "FTR",
    "HTHG",
    "HTAG",
    "HTR",
    "Referee",
    "HS",
    "AS",
    "HST",
    "AST",
    "HF",
    "AF",
    "HC",
    "AC",
    "HY",
    "AY",
    "HR",
    "AR",
]

results = pd.read_csv(
    "../data/E0.csv", usecols=keepcols, parse_dates=["Date"], dayfirst=True
)

results.head()


Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,E0,2021-08-13,20:00,Brentford,Arsenal,2,0,H,1,0,...,3,4,12,8,2,5,0,0,0,0
1,E0,2021-08-14,12:30,Man United,Leeds,5,1,H,1,0,...,8,3,11,9,5,4,1,2,0,0
2,E0,2021-08-14,15:00,Burnley,Brighton,1,2,A,1,0,...,3,8,10,7,7,6,2,1,0,0
3,E0,2021-08-14,15:00,Chelsea,Crystal Palace,3,0,H,2,0,...,6,1,15,11,5,2,0,0,0,0
4,E0,2021-08-14,15:00,Everton,Southampton,3,1,H,0,1,...,6,3,13,15,6,8,2,0,0,0


## Data Transformation

### Melt 

creates new dataframe (team_results) with 2 rows per match

In [33]:
## converting each matchup into 2 rows
## one where each team is 'current_team' and opponent is identfied
results["H"] = results["HomeTeam"]
results["A"] = results["AwayTeam"]
cols_to_keep = [
    "Div",
    "Date",
    "HomeTeam",
    "AwayTeam",
    "FTHG",
    "FTAG",
    "FTR",
    "HTHG",
    "HTAG",
    "HTR",
    "Referee",
]

team_results = pd.melt(
    results,
    id_vars=cols_to_keep,
    value_vars=["H", "A"],
    var_name="Home/Away",
    value_name="Team",
)

team_results["Opponent"] = np.where(
    team_results["Team"] == team_results["HomeTeam"],
    team_results["AwayTeam"],
    team_results["HomeTeam"],
)


team_results.head(20)


Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,Home/Away,Team,Opponent
0,E0,2021-08-13,Brentford,Arsenal,2,0,H,1,0,H,M Oliver,H,Brentford,Arsenal
1,E0,2021-08-14,Man United,Leeds,5,1,H,1,0,H,P Tierney,H,Man United,Leeds
2,E0,2021-08-14,Burnley,Brighton,1,2,A,1,0,H,D Coote,H,Burnley,Brighton
3,E0,2021-08-14,Chelsea,Crystal Palace,3,0,H,2,0,H,J Moss,H,Chelsea,Crystal Palace
4,E0,2021-08-14,Everton,Southampton,3,1,H,0,1,A,A Madley,H,Everton,Southampton
5,E0,2021-08-14,Leicester,Wolves,1,0,H,1,0,H,C Pawson,H,Leicester,Wolves
6,E0,2021-08-14,Watford,Aston Villa,3,2,H,2,0,H,M Dean,H,Watford,Aston Villa
7,E0,2021-08-14,Norwich,Liverpool,0,3,A,0,1,A,A Marriner,H,Norwich,Liverpool
8,E0,2021-08-15,Newcastle,West Ham,2,4,A,2,1,H,M Atkinson,H,Newcastle,West Ham
9,E0,2021-08-15,Tottenham,Man City,1,0,H,0,0,D,A Taylor,H,Tottenham,Man City


In [34]:
team_results.tail(20)

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,Home/Away,Team,Opponent
740,E0,2022-05-15,Leeds,Brighton,1,1,D,0,1,A,M Dean,A,Brighton,Leeds
741,E0,2022-05-15,Watford,Leicester,1,5,A,1,2,A,J Gillett,A,Leicester,Watford
742,E0,2022-05-15,West Ham,Man City,2,2,D,2,0,H,A Taylor,A,Man City,West Ham
743,E0,2022-05-15,Wolves,Norwich,1,1,D,0,1,A,T Harrington,A,Norwich,Wolves
744,E0,2022-05-15,Everton,Brentford,2,3,A,2,1,H,M Oliver,A,Brentford,Everton
745,E0,2022-05-16,Newcastle,Arsenal,2,0,H,0,0,D,D England,A,Arsenal,Newcastle
746,E0,2022-05-17,Southampton,Liverpool,1,2,A,1,1,D,M Atkinson,A,Liverpool,Southampton
747,E0,2022-05-19,Everton,Crystal Palace,3,2,H,0,2,A,A Taylor,A,Crystal Palace,Everton
748,E0,2022-05-19,Aston Villa,Burnley,1,1,D,0,1,A,P Tierney,A,Burnley,Aston Villa
749,E0,2022-05-19,Chelsea,Leicester,1,1,D,1,1,D,S Attwell,A,Leicester,Chelsea


## Get Points from Results

In [35]:
# Function and Dictionary

points_map = {
    'W': 3,
    'D': 1,
    'L': 0
}

def get_result(score, score_opp):
    if score == score_opp:
        return 'D'
    elif score > score_opp:
        return 'W'
    else:
        return 'L'

In [36]:
# full time goals
team_results["Goals"] = np.where(
    team_results["Team"] == team_results["HomeTeam"],
    team_results["FTHG"],
    team_results["FTAG"],
)
team_results["Goals_Opp"] = np.where(
    team_results["Team"] != team_results["HomeTeam"],
    team_results["FTHG"],
    team_results["FTAG"],
)
team_results["Result"] = np.vectorize(get_result)(
    team_results["Goals"], team_results["Goals_Opp"]
)
team_results["Points"] = team_results["Result"].map(points_map)

team_results["Score"] = team_results["Goals"].astype(str) + '-' + team_results["Goals_Opp"].astype(str) 

'''
# 1st half goals
team_results["1H_Goals"] = np.where(
    team_results["Team"] == team_results["HomeTeam"],
    team_results["HTHG"],
    team_results["HTAG"],
)
team_results["1H_Goals_Opp"] = np.where(
    team_results["Team"] != team_results["HomeTeam"],
    team_results["HTHG"],
    team_results["HTAG"],
)
team_results["1H_Result"] = np.vectorize(get_result)(
    team_results["1H_Goals"], team_results["1H_Goals_Opp"]
)
team_results["1H_Points"] = team_results["1H_Result"].map(points_map)

# 2nd half goals
team_results["2H_Goals"] = team_results["Goals"] - team_results["1H_Goals"]
team_results["2H_Goals_Opp"] = team_results["Goals_Opp"] - team_results["1H_Goals_Opp"]
team_results["2H_Result"] = np.vectorize(get_result)(
    team_results["2H_Goals"], team_results["2H_Goals_Opp"]
)
team_results["2H_Points"] = team_results["2H_Result"].map(points_map)
'''


'\n# 1st half goals\nteam_results["1H_Goals"] = np.where(\n    team_results["Team"] == team_results["HomeTeam"],\n    team_results["HTHG"],\n    team_results["HTAG"],\n)\nteam_results["1H_Goals_Opp"] = np.where(\n    team_results["Team"] != team_results["HomeTeam"],\n    team_results["HTHG"],\n    team_results["HTAG"],\n)\nteam_results["1H_Result"] = np.vectorize(get_result)(\n    team_results["1H_Goals"], team_results["1H_Goals_Opp"]\n)\nteam_results["1H_Points"] = team_results["1H_Result"].map(points_map)\n\n# 2nd half goals\nteam_results["2H_Goals"] = team_results["Goals"] - team_results["1H_Goals"]\nteam_results["2H_Goals_Opp"] = team_results["Goals_Opp"] - team_results["1H_Goals_Opp"]\nteam_results["2H_Result"] = np.vectorize(get_result)(\n    team_results["2H_Goals"], team_results["2H_Goals_Opp"]\n)\nteam_results["2H_Points"] = team_results["2H_Result"].map(points_map)\n'

In [37]:
# Drop unnecessary columns and sort by date
cols_to_drop = ['HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG', 'HTR']
team_results = (team_results
                    .drop(cols_to_drop, axis=1)
                    .sort_values(by=['Date', 'Referee']))

In [38]:
team_results.head(20)

Unnamed: 0,Div,Date,Referee,Home/Away,Team,Opponent,Goals,Goals_Opp,Result,Points,Score
0,E0,2021-08-13,M Oliver,H,Brentford,Arsenal,2,0,W,3,2-0
380,E0,2021-08-13,M Oliver,A,Arsenal,Brentford,0,2,L,0,0-2
4,E0,2021-08-14,A Madley,H,Everton,Southampton,3,1,W,3,3-1
384,E0,2021-08-14,A Madley,A,Southampton,Everton,1,3,L,0,1-3
7,E0,2021-08-14,A Marriner,H,Norwich,Liverpool,0,3,L,0,0-3
387,E0,2021-08-14,A Marriner,A,Liverpool,Norwich,3,0,W,3,3-0
5,E0,2021-08-14,C Pawson,H,Leicester,Wolves,1,0,W,3,1-0
385,E0,2021-08-14,C Pawson,A,Wolves,Leicester,0,1,L,0,0-1
2,E0,2021-08-14,D Coote,H,Burnley,Brighton,1,2,L,0,1-2
382,E0,2021-08-14,D Coote,A,Brighton,Burnley,2,1,W,3,2-1


In [41]:
team_results.to_pickle('../data/pickle/team_results.pkl')