# 1. Info.

This notebook is to create all the data enrichments.

Before running the notebook, make sure the files bundesliga.csv, la_liga.csv, ligue_1.csv, premier_league.csv, primeira_liga.csv, and serie_a.csv are available in the `../data/` folder to guarantee the notebook's correct functionality.

# 2. Data enrichments.

## 2.1. Import libraries

In [1]:
import pandas as pd
import numpy as np
from functions import get_points, away_result, home_result

## 2.2. Read the data

In [2]:
# Load the Premier League CSV into the working frame used by every later cell.
data = pd.read_csv(filepath_or_buffer='../data/premier_league.csv')

## 2.3. Teams to categorical

In [3]:
# Normalise the team names in both columns: drop apostrophes first,
# then turn spaces into underscores.
for team_column in ('HomeTeam', 'AwayTeam'):
    without_apostrophes = data[team_column].str.replace("'", "")
    data[team_column] = without_apostrophes.str.replace(" ", "_")

In [4]:
# Every distinct team that appears as home or away side.
# The names are sorted because the position of a team in this list is later used as
# its numeric code: iterating a bare set of strings yields an arbitrary order that can
# differ between kernel sessions, which would make the codes irreproducible.
# key=str keeps the sort well defined even if a non-string value slips in.
teams_list = sorted(
    set(data['HomeTeam'].unique().tolist()) | set(data['AwayTeam'].unique().tolist()),
    key=str,
)

In [5]:
# save the teams dict inside a csv file, so it can be used later in the project
# Lookup table pairing each team name with a numeric code
# (the team's position in teams_list, starting at 0).
teams_dict_df = pd.DataFrame({
    'index_team': range(len(teams_list)),
    'Team': teams_list,
})
# Saving the lookup table for later use in the project is currently disabled:
# teams_dict_df.to_csv('../data/teams_dict.csv', index=False)

In [6]:
# create the dict
# create the dict: team name -> numeric code, taken from the teams table.
# Built with zip instead of a loop so that no loop variable named `tuple`
# is left behind shadowing the builtin for the rest of the kernel session.
team_dict = dict(zip(teams_dict_df['Team'], teams_dict_df['index_team'].tolist()))

# Swap every team name for its numeric code in both team columns.
data['HomeTeam'] = data['HomeTeam'].replace(team_dict)
data['AwayTeam'] = data['AwayTeam'].replace(team_dict)

## 2.4. Adjust dtypes

In [7]:
# Cast every float64 column of the frame to integer in one pass.
float_columns = data.columns[data.dtypes == float]
data = data.astype({column_name: 'int' for column_name in float_columns})

## 2.5. Order the data to have one team's information per row

In [8]:
# Column names of the one-row-per-team table, in their final order.
team_data_columns = [
    'Date', 'Team', 'Opponent',
    'FTG_scored', 'FTG_received', 'FT_Result',
    'HTG_scored', 'HTG_received', 'HT_Result',
    'season',
    'shots', 'shots_received',
    'shots_target', 'shots_target_received',
    'fouls_commited', 'fouls_received',
    'corners', 'corners_against',
    'yellow_cards', 'yellow_cards_opponent',
    'red_cards', 'red_cards_opponent',
    'Home',
]

# One independent, initially empty list per column; filled row by row later on.
team_data = {column_name: [] for column_name in team_data_columns}

In [10]:
# Turn every match row into two rows of team_data: one from the home team's point of
# view (Home = 1) and one from the away team's point of view (Home = 0).

# Each entry is (team_data key for the row's own team, team_data key for the other
# side, home-side column of `data`, away-side column of `data`).
paired_columns = [
    ('Team', 'Opponent', 'HomeTeam', 'AwayTeam'),
    ('FTG_scored', 'FTG_received', 'FTHG', 'FTAG'),
    ('HTG_scored', 'HTG_received', 'HTHG', 'HTAG'),
    ('shots', 'shots_received', 'HS', 'AS'),
    ('shots_target', 'shots_target_received', 'HST', 'AST'),
    ('fouls_commited', 'fouls_received', 'HF', 'AF'),
    ('corners', 'corners_against', 'HC', 'AC'),
    ('yellow_cards', 'yellow_cards_opponent', 'HY', 'AY'),
    ('red_cards', 'red_cards_opponent', 'HR', 'AR'),
]

# Start from empty lists so that re-running this cell does not append every match twice.
for collected_values in team_data.values():
    collected_values.clear()

# The loop variable is deliberately not called `tuple`, which would shadow the builtin.
for match_row in data.itertuples():
    # home row first, away row second; the result codes come from the matching lookup
    for is_home, result_lookup in ((1, home_result), (0, away_result)):
        team_data['Date'].append(match_row.Date)
        team_data['season'].append(match_row.season)
        team_data['FT_Result'].append(result_lookup[match_row.FTR])
        team_data['HT_Result'].append(result_lookup[match_row.HTR])
        team_data['Home'].append(is_home)

        for own_key, other_key, home_column, away_column in paired_columns:
            home_value = getattr(match_row, home_column)
            away_value = getattr(match_row, away_column)
            if is_home:
                team_data[own_key].append(home_value)
                team_data[other_key].append(away_value)
            else:
                team_data[own_key].append(away_value)
                team_data[other_key].append(home_value)

In [11]:
# Assemble the collected per-team lists into a DataFrame, one column per key.
df = pd.DataFrame.from_dict(team_data)

## 2.5. Add goals scored, received, goal difference and points by team

In [12]:
# Parse the day/month/four-digit-year strings into datetime values.
parsed_dates = pd.to_datetime(df['Date'], format="%d/%m/%Y")
df['Date'] = parsed_dates

In [13]:
# Chronological order within each season, with a fresh 0..n-1 index.
sorted_data = df.sort_values(by=['season', 'Date'], ignore_index=True)

In [14]:
# Season-to-date totals per team, as they stand BEFORE each match is played.
#
# For every (season, team) pair the value written on a row is the sum of the statistic
# over that team's earlier rows of the same season; the first row of a team in a season
# therefore gets 0. The totals restart at every new season.
#
# The previous implementation decided "has this team been seen already?" with
# `dict_Fscored.get(team, False)`, which is also falsy when the team's running goal
# tally is 0. A team that had played but not yet scored was treated as new: its row got
# all-zero totals and its accumulated points, shots, cards, ... were thrown away.
# A grouped cumulative sum has no such special case.

# Per-match statistics to accumulate; the order fixes the order of the new columns.
stat_columns = [
    'FTG_scored', 'FTG_received',
    'HTG_scored', 'HTG_received',
    'shots', 'shots_received',
    'shots_target', 'shots_target_received',
    'fouls_commited', 'fouls_received',
    'corners', 'corners_against',
    'yellow_cards', 'yellow_cards_opponent',
    'red_cards', 'red_cards_opponent',
]

# Grouping keys: one group per team and season, rows kept in their current order.
season_team_keys = [sorted_data['season'], sorted_data['Team']]

for stat in stat_columns:
    running_total = sorted_data[stat].groupby(season_team_keys, sort=False).cumsum()
    # cumsum includes the row's own match; subtracting it leaves the pre-match total
    sorted_data[f'{stat}_Total'] = running_total - sorted_data[stat]

# Points earned in each match, as returned by get_points for the full-time result code.
match_points = pd.Series(
    [get_points(result) for result in sorted_data['FT_Result'].tolist()],
    index=sorted_data.index,
)
running_points = match_points.groupby(season_team_keys, sort=False).cumsum()
sorted_data['points'] = running_points - match_points

In [15]:
# Goal difference from the running totals: goals scored minus goals received.
goals_for = sorted_data['FTG_scored_Total']
goals_against = sorted_data['FTG_received_Total']
sorted_data['goal_difference'] = goals_for.sub(goals_against)

## 2.6. Position

Current position at the beginning of the match

In [16]:
# League position of each team on the day of each of its matches.
#
# For every match day of a season, the most recent row of every team (up to and
# including that day) is ranked by the columns below, all in descending order, and the
# 1-based rank is written to `position` on the rows played that day.
#
# Compared with the previous version, only the match days of the season being processed
# are visited (it used to loop over the dates of all seasons for every season), the
# unreachable `fixture == 1` branch is gone, and the ranks are written with one
# assignment per match day instead of one per row. The resulting values are the same.
#
# NOTE(review): a team that does not play on a given day is ranked with the totals
# stored on its latest row, which do not yet include that row's own match. Confirm
# that this one-match lag is intended.
seasons_list = sorted_data['season'].unique().tolist()
sorted_data['position'] = 0

ranking_columns = ['points', 'goal_difference', 'FTG_scored_Total', 'FTG_received_Total']

for season in seasons_list:
    season_data = sorted_data[sorted_data['season'] == season]

    # Distinct match days of this season, oldest first.
    season_dates = season_data['Date'].drop_duplicates().sort_values()

    for fixture in season_dates:
        # latest known row per team, ranked from best to worst
        past_data = season_data[season_data['Date'] <= fixture]
        standings = (
            past_data
            .drop_duplicates(subset=['Team'], keep='last')
            .sort_values(by=ranking_columns, ascending=False)
        )
        position_by_team = dict(zip(standings['Team'], range(1, len(standings) + 1)))

        # rows of sorted_data played on this match day (season_data keeps its index)
        fixture_rows = season_data.index[season_data['Date'] == fixture]
        sorted_data.loc[fixture_rows, 'position'] = (
            season_data.loc[fixture_rows, 'Team'].map(position_by_team)
        )

## 2.7. `win_rate`, `mooving_win_rate`, moving goals scored, and moving goals received

The moving rates are calculated over a moving window of 5 matches because that is the number of matches used to calculate a team's form on multiple football data sites.

In [17]:
# this new column is because when creating the win rate, the only result that should sum must be the win.
sorted_data['Win'] = sorted_data['FT_Result'].replace({2:0})

In [18]:
# Form features per team and season: win_rate, mooving_win_rate, mooving_goals_scored
# and mooving_goals_received.
#
# The numbers are the same as the ones the previous row-by-row loop produced, but the
# columns are now numeric (they used to be initialised with '' and so were object
# dtype), the always-true bounds check is gone, and no query string is rebuilt for
# every row. Seasons no longer have to be strings for the season filter to match.
#
# NOTE(review): the moving window ends at the row itself, so mooving_win_rate contains
# the result of the very match the row describes. Confirm this is intended.
# NOTE(review): the goal features average the cumulative *_Total columns rather than
# the goals of the individual matches, and the sum is always divided by the full window
# length even when fewer rows are available. Confirm this is intended.

# Number of rows (the current one included) in the moving window.
form_window = 5

# One group per team and season, rows kept in their current (chronological) order.
season_team_keys = [sorted_data['season'], sorted_data['Team']]

# 0-based number of the match within the team's season = matches already played.
matches_played = sorted_data.groupby(season_team_keys, sort=False).cumcount()


def _window_average(column_name):
    """Sum `column_name` over the team's last `form_window` rows and divide by `form_window`.

    The first row of every team in a season is set to 0, as before.
    """
    window_sum = sorted_data[column_name].groupby(season_team_keys, sort=False).transform(
        lambda values: values.rolling(form_window, min_periods=1).sum()
    )
    return (window_sum / form_window).where(matches_played != 0, 0.0)


# win_rate: wins in the earlier matches of the season divided by their count;
# 0 while the team has no earlier win (this also covers the 0 / 0 of the first match).
wins_so_far = sorted_data['Win'].groupby(season_team_keys, sort=False).cumsum()
previous_wins = wins_so_far - sorted_data['Win']
sorted_data['win_rate'] = (previous_wins / matches_played).where(previous_wins != 0, 0.0)

sorted_data['mooving_win_rate'] = _window_average('Win')
sorted_data['mooving_goals_scored'] = _window_average('FTG_scored_Total')
sorted_data['mooving_goals_received'] = _window_average('FTG_received_Total')

# index of rows to be delete: the first `form_window` matches of every team and season.
# NOTE(review): nothing in the visible cells uses this list, so these rows are exported too.
idx_delete = sorted_data.index[matches_played < form_window].tolist()


# 3. Export the enriched data

In [19]:
# Write the enriched table to disk without the row index.
sorted_data.to_csv(path_or_buf='../data/enriched_data/premier_league.csv', index=False)

End of the notebook