### Building chunk templates
#### This notebook develops the methods used to build event chunks, with a template tailored to each specific type of event.

1) First, we build the basic structure of the template, which displays the basic information that is consistent across all event chunks.

    This includes:
    - gameweek
    - date of match
    - event minute
    - player taking the event (event taker)
    - type of event (an event mapping was generated for this; it translates the raw type names into plain language)
    - place of event on pitch (x and y coordinates)
    - scoreline at the time of the event (based on the event minute)
    - team of event taker
    - opponent


In [1]:
import os
import numpy as np
import pandas as pd
import json
import ast
import traceback
import helpers.chunks_creators

from importlib import reload
from tqdm import tqdm
from datetime import datetime

reload(helpers.chunks_creators)

from helpers.chunks_creators import get_information_template, build_event_chunk, build_team_chunk, process_player
from helpers.data_handlers import basic_player_data, load_event_mapping, map_coordinates_to_18_zones
from helpers.extract_info import extract_player_rating, extract_next_event_dict, check_extra_time


In [1]:
def write_error_log(gw, teams, error, log_file_path):
    """Append a delimited error report for one match to the log file.

    Args:
        gw: Gameweek number, interpolated into the report header.
        teams: Indexable with at least two entries; ``teams[0]`` and
            ``teams[1]`` identify the two sides in the header line.
        error: The error (or message) to report; rendered via its string form.
        log_file_path: Path of the log file; opened in append mode.

    The report consists of a separator line, the header line, the output of
    ``traceback.format_exc()`` (the exception currently being handled, if any)
    and a closing separator line.
    """
    separator = "-----------------------------------------------\n"
    with open(log_file_path, 'a') as log_handle:
        log_handle.write(separator)
        header = f"Error in GW{gw} - {teams[0]} vs {teams[1]}: {error}\n"
        log_handle.writelines([header, traceback.format_exc(), separator])


def write_chunks_to_file(chunk, output_file):
    """Append one chunk, followed by a separator line, to ``output_file``.

    Args:
        chunk: Text to append (must be a ``str``; it is passed to ``write``).
        output_file: Destination path. Missing parent directories are created.
            A bare filename (no directory component) is written relative to
            the current working directory.
    """
    parent_dir = os.path.dirname(output_file)

    # ``dirname`` is '' for a bare filename and ``os.makedirs('')`` raises
    # FileNotFoundError, so only create a directory when there is one to create.
    # ``exist_ok=True`` avoids the check-then-create race.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_file, 'a') as f:
        f.write(chunk)
        f.write("-----------------------------------------------\n")

def get_league_standings(gw, input_dir):
    """Read the league standings CSV of the gameweek preceding ``gw``.

    Args:
        gw: Current gameweek number; the table for ``gw - 1`` is loaded.
        input_dir: Path prefix to which the gameweek number is appended
            directly (no separator) to form the gameweek directory.

    Returns:
        The standings as a DataFrame, or ``None`` when the file is not found.
    """
    previous_gw_dir = input_dir + f"{gw - 1}"
    standings_path = os.path.join(previous_gw_dir, "league_standings.csv")

    try:
        standings = pd.read_csv(standings_path)
    except FileNotFoundError:
        standings = None
    return standings

In [4]:
# Build the text chunks for every match of every gameweek.
#
# For each gameweek the cell reads the modified event data and the original
# player data, builds one chunk per event (plus one chunk per team and the
# player chunks), appends them to a per-match text file, and writes the chunks
# back into the event / player / league-standings CSV files.

gameweeks = 34

# directory containing all the gameweeks where event data is modified to hold league standings as well as match outcomes
input_dir = 'data/Bundesliga/modified/GW'

# root directory of the per-match chunk text files
chunk_dir = "data/event_chunks"

# load in event mapping which is a json file containing the mapping of event types to their respective event names
event_mapping = load_event_mapping('data/mapping_event.json')
event_description_mapping = load_event_mapping('data/mapping_event_description.json')
position_mapping = load_event_mapping('data/mapping_position.json')

# loop through all the gameweeks
for gw in tqdm(range(1, gameweeks + 1)):

    # directory of the modified data for this gameweek and the files it contains
    gw_dir = input_dir + str(gw)
    directory = os.listdir(gw_dir)

    # directory of the original player data for this gameweek
    player_dir = 'data/Bundesliga/original/GW' + str(gw)

    # league standings of the previous gameweek (None when the file is missing,
    # e.g. for the first gameweek); passed on to the chunk builders below
    league_standings = get_league_standings(gw, input_dir)

    # load the league standings for the current gameweek to update the chunks
    ls_for_chunk = pd.read_csv(os.path.join(gw_dir, "league_standings.csv"))

    # loop through all the files in the directory
    for fname in directory:

        if "EventData" not in fname:
            continue

        event_df = pd.read_csv(os.path.join(gw_dir, fname))
        player_df = pd.read_csv(os.path.join(player_dir, fname.replace("EventData", "PlayerData")))

        # Defined before the try block so the error handler can always report
        # something, even when the failure happens before the teams are known.
        teams = ["unknown", "unknown"]

        # list to hold all the chunks - later it will be written to a column of the event_df
        chunks = []

        try:
            basic_player_info = basic_player_data(player_df)

            # teams playing in the match
            teams = event_df['teamId'].unique()

            # Mapping team ids to team names
            team_mapping = {team_id: event_df['teamName'][event_df['teamId'] == team_id].iloc[0] for team_id in teams}

            # create the output directory (and its parents) if it does not exist
            output_dir = f"{chunk_dir}/GW{gw}/"
            os.makedirs(output_dir, exist_ok=True)

            # output file to write the chunks to
            output_file = os.path.join(output_dir, f"GW{gw}_{teams[0]}_{teams[1]}.txt")

            # initialize (truncate) the file the chunks are appended to
            with open(output_file, 'w') as f:
                f.write("-----------------------------------------------\n")

            # list to hold the teams for which the team chunk has been built in the current match
            teams_built = []

            # loop through all the events in the event_df
            for ind, event in event_df.iterrows():

                # skip the formation set and end events as they do not contain any useful information
                if event['type'] in ['FormationSet', 'End']:
                    # append a None to the chunks list so that the index of the chunks list matches the index of the event_df
                    chunks.append(None)
                    continue

                # getting the information template for the event type
                information = get_information_template()

                # check if the player is given in the data, so that we can either fill in the player information or not
                if not pd.isna(event['playerId']):
                    # get the player information
                    player = basic_player_info[basic_player_info['playerId'] == event['playerId']]
                    players_stats = ast.literal_eval(player_df[player_df['playerId'] == event['playerId']]['stats'].values[0])

                    # populate the information template with information that is specific to the player
                    information['event_taker'] = player['name'].values[0]
                    try:  # some players do not have ratings or any stats
                        information['event_taker_rating'] = extract_player_rating(players_stats['ratings'], event['minute'])
                    except Exception:
                        # best effort: a missing rating must not abort the match, but
                        # KeyboardInterrupt / SystemExit are no longer swallowed
                        information['event_taker_rating'] = None
                    information['event_taker_position'] = position_mapping[player['position'].values[0]]

                # populate the information template with information that is specific to this event
                information['game_week'] = gw
                information['event_type'] = event['type']
                information['event_success'] = "successfully" if event['outcomeType'] == "Successful" else "unsuccessfully"
                information['event_type_mapped'] = event_mapping[event['type']]
                information['event_description'] = event_description_mapping[event['type']]
                information['event_qualifiers'] = ast.literal_eval(event['simplified_qualifiers'])
                # NOTE(review): every period other than "FirstHalf" is labelled "Second Half" - confirm this is intended
                information['event_period'] = "First Half" if event['period'] == "FirstHalf" else "Second Half"
                if pd.isna(event['second']):
                    information['event_time'] = str(int(event['minute']))
                else:
                    information['event_time'] = str(int(event['minute'])) + ":" + str(int(event['second']))
                information['event_location'] = map_coordinates_to_18_zones(event['x'], event['y'])
                information['event_taker_team'] = event['teamName']

                # Only map the end location when BOTH coordinates are present; the
                # previous `and` condition also let through rows with a single NaN.
                if not (pd.isna(event['endX']) or pd.isna(event['endY'])):
                    information['event_end_location'] = map_coordinates_to_18_zones(event['endX'], event['endY'])

                # get the opponent team
                opponent_id = teams[0] if teams[0] != event['teamId'] else teams[1]
                information['opponent_team'] = team_mapping[opponent_id]
                information['league_standings'] = league_standings  # league standings for the previous gameweek to get the form of the team

                # next event in sequence (None for the last event or when it cannot be extracted)
                # `ind + 1` is used as a position: read_csv yields a default 0..n-1 index
                information['next_event'] = None
                next_pos = ind + 1
                if next_pos < len(event_df):
                    try:
                        next_event = event_df.iloc[next_pos]
                        if pd.isna(next_event['playerId']):
                            next_player_name = None
                        else:
                            next_player_name = basic_player_info[basic_player_info['playerId'] == next_event['playerId']]['name'].values[0]
                        information['next_event'] = extract_next_event_dict(next_event, next_player_name)
                    except Exception:
                        information['next_event'] = None

                # get the match score and convert it to a dictionary from string
                # (kept in a local instead of being written back into the row copy)
                match_score = ast.literal_eval(event['matchScore'])
                information['match_score'] = {
                    event['teamName']: match_score[str(event['teamId'])],
                    information['opponent_team']: match_score[str(opponent_id)],
                }

                try:
                    # calling the function to build the chunk with the populated information
                    chunk = build_event_chunk(information)
                except Exception as e:
                    write_error_log(gw, teams, e, f"data/error_log_{datetime.now().strftime('date_%Y-%m-%d')}.txt")
                    # without this reset the previous event's chunk (or an unbound
                    # name on the first event) would be recorded for this event
                    chunk = None

                # append the chunk to the chunks list (None keeps the list aligned with event_df)
                chunks.append(chunk)

                # build the team chunk if it has not been built yet
                if event['teamId'] not in teams_built:
                    team_chunk = build_team_chunk(information)
                    teams_built.append(event['teamId'])
                    write_chunks_to_file(team_chunk, output_file)
                    # Add the team chunk to the league standings
                    ls_for_chunk.loc[ls_for_chunk['teamId'] == event['teamId'], "chunks"] = str(team_chunk)

                # write the chunk to a file (nothing to write when building it failed)
                if chunk is not None:
                    write_chunks_to_file(chunk, output_file)

        except Exception as e:
            # pad so that the report never fails on a missing second team id
            log_teams = (list(teams) + ["unknown", "unknown"])[:2]
            print(f"Error in GW{gw} - {log_teams[0]} vs {log_teams[1]}: {e}")
            print(traceback.format_exc())
            write_error_log(gw, log_teams, e, f"data/error_log_{datetime.now().strftime('date_%Y-%m-%d')}.txt")
            # Skip the failed match: `chunks` no longer lines up with event_df and
            # team_mapping / output_file may not exist, so nothing can be saved.
            continue

        # Write player chunks
        player_chunks = process_player(player_df, team_mapping, position_mapping, gw)
        for player_chunk in player_chunks:
            write_chunks_to_file(player_chunk, output_file)

        # Save updated data
        event_df['chunks'] = chunks
        player_df['chunks'] = player_chunks
        event_df.to_csv(os.path.join(gw_dir, fname), index=False)
        player_df.to_csv(os.path.join(gw_dir, fname.replace("EventData", "PlayerData")), index=False)
        # NOTE(review): this writes the PREVIOUS gameweek's standings into the
        # CURRENT gameweek's file; it is overwritten by ls_for_chunk at the end of
        # the gameweek, so it looks redundant - confirm before removing.
        if league_standings is not None:
            league_standings.to_csv(os.path.join(input_dir + str(gw), "league_standings.csv"), index=False)

    # Save updated league standings
    ls_for_chunk.to_csv(os.path.join(gw_dir, "league_standings.csv"), index=False)

100%|██████████| 34/34 [20:13<00:00, 35.68s/it]
