# Workspace - Building all types of data

This notebook was utilized to generate different types of analysis as well as datasets used for preprocessing. The sections are not in any particular order and were generated at different points in the project lifespan.

The sections include:
- Simplification of 'qualifiers' column - for simpler information extraction 
- Building a csv with all event types - for analysis and data understanding
- Building a csv with all events - for analysis and data understanding
- Building a dataset with all chunks - all synthetically generated chunks were collected into a single JSON file for the RAG pipeline
- Evaluating all features in datasets - for analysis and data understanding 


In [1]:
import os
import pandas as pd 
import ast
import json
import numpy as np

from tqdm import tqdm
from helpers.data_handlers import get_ordner

### Simplification of 'qualifiers' column
Rewrites the 'qualifiers' column of every EventData file with the helper 'simplify_qualifiers'. The files are overwritten in place.

In [2]:
from helpers.data_handlers import simplify_qualifiers

# In this case the input ordner is the output ordner as well: every matching
# CSV is read, simplified and written back to the same path.
# NOTE(review): re-running this cell feeds already simplified files back into
# simplify_qualifiers - confirm the helper tolerates that.

amount_gameweeks = 34
type_data = "EventData"
input_ordner = get_ordner("modified")

# Events not to simplify, as these are already in the desired form.
# Defined once up front because the list is the same for every file.
skip_events = ['MatchWon', 'MatchLost', 'MatchDraw']

# Loop through each gameweek directory
for gameweek in tqdm(range(1, amount_gameweeks + 1)):

    # The gameweek number is appended directly to the prefix from get_ordner
    gameweek_dir = input_ordner + str(gameweek)

    for fname in os.listdir(gameweek_dir):

        # Only files whose name contains `type_data` are processed
        if type_data not in fname:
            continue

        # Join directory and file name as separate components
        file_path = os.path.join(gameweek_dir, fname)

        # Read, simplify the qualifiers, and overwrite the file
        df = pd.read_csv(file_path)
        df = simplify_qualifiers(df, skip_events)
        df.to_csv(file_path, index=False)

100%|██████████| 34/34 [00:45<00:00,  1.34s/it]


### Building a csv with all event types
Collects the 'type' of every event across all gameweeks and saves how often each event type occurs to data/Evaluation/EventTypesCount.csv

In [3]:
amount_gameweeks = 34
type_data = "EventData"
input_ordner = get_ordner("modified")
event_types_count_path = "data/Evaluation/EventTypesCount.csv"

# Flat list with the 'type' value of every event in every EventData file
all_events_type = []

# Loop through each gameweek directory
for gameweek in range(1, amount_gameweeks + 1):

    # The gameweek number is appended directly to the prefix from get_ordner
    gameweek_dir = input_ordner + str(gameweek)

    for fname in os.listdir(gameweek_dir):

        # Only files whose name contains `type_data` are processed
        if type_data not in fname:
            continue

        df = pd.read_csv(os.path.join(gameweek_dir, fname))

        # Files without a 'type' column contribute nothing
        if 'type' in df.columns:
            all_events_type.extend(df['type'].tolist())

# Kept under its own name: `all_events_df` is the full event table built in the
# "Building a csv with all events" section, and reusing that name here would
# silently replace it with a one-column frame when cells run out of order.
event_types_df = pd.DataFrame(all_events_type, columns=['type'])

# Make sure the target directory exists before writing the counts
os.makedirs(os.path.dirname(event_types_count_path), exist_ok=True)
event_types_df.value_counts().to_csv(event_types_count_path)

### Building a csv with all events

In [4]:
# True  -> rebuild data/all_events.csv from the per-gameweek EventData files
# False -> load the previously generated data/all_events.csv
regenerate = True
amount_gameweeks = 34
type_data = "EventData"
input_ordner = get_ordner("modified")
all_events_path = "data/all_events.csv"

if regenerate:
    # Collect the per-file frames and concatenate once at the end; growing a
    # DataFrame with pd.concat inside the loop re-copies all rows every time.
    event_frames = []

    # Loop through each gameweek directory
    for gameweek in tqdm(range(1, amount_gameweeks + 1)):
        # The gameweek number is appended directly to the prefix from get_ordner
        directory_path = input_ordner + str(gameweek)

        # Missing gameweek directories are reported and skipped
        if not os.path.exists(directory_path):
            print(f"Directory {directory_path} does not exist.")
            continue

        for fname in os.listdir(directory_path):
            if type_data not in fname:
                continue

            file_path = os.path.join(directory_path, fname)
            try:
                df = pd.read_csv(file_path)
            except Exception as e:
                # Best effort: an unreadable file is reported, the rest still load
                print(f"Error reading file {file_path}: {e}")
                continue

            df['gameweek'] = gameweek  # Add a column to indicate the gameweek
            event_frames.append(df)

    # pd.concat raises on an empty list, so fall back to an empty frame
    if event_frames:
        all_events_df = pd.concat(event_frames, ignore_index=True)
    else:
        all_events_df = pd.DataFrame()

    all_events_df.to_csv(all_events_path, index=False)
else:
    all_events_df = pd.read_csv(all_events_path)

100%|██████████| 34/34 [00:13<00:00,  2.52it/s]


### Building one dataset with all chunks

In [50]:
# Number of gameweeks
gameweeks = 34

# Path prefix of the gameweek directories where event data is modified to hold
# league standings as well as match outcomes; the gameweek number is appended
input_dir = 'data/Bundesliga/modified/GW'

# Single JSON file that receives every chunk
output_file = 'data/event_chunks/chunks.json'


def _json_safe(value):
    """Map missing values (None / NaN) to None so they are written as JSON null.

    json.dump would otherwise emit the bare token NaN, which is not valid JSON.
    Every other value is returned unchanged.
    """
    return None if pd.isna(value) else value


def _make_entry(metadata, content):
    """Build one chunk entry; the key order of `metadata` is preserved."""
    return {
        "metadata": {key: _json_safe(value) for key, value in metadata.items()},
        "content": _json_safe(content),
    }


def _read_csv_if_exists(path):
    """Read `path` as CSV, or return an empty DataFrame when the file is absent."""
    return pd.read_csv(path) if os.path.exists(path) else pd.DataFrame()


# List holding the final data structure
chunks = []

# Loop through all the gameweeks
for gw in tqdm(range(1, gameweeks + 1)):
    gw_dir = input_dir + str(gw)

    # Loop through all the files in the directory
    for fname in os.listdir(gw_dir):
        if "EventData" not in fname:
            continue

        # Event data plus the player file with the matching name, if present
        event_df = pd.read_csv(os.path.join(gw_dir, fname))
        player_df = _read_csv_if_exists(
            os.path.join(gw_dir, fname.replace("EventData", "PlayerData"))
        )

        # One chunk per event row; 'FormationSet' and 'End' events are skipped
        for _, row in event_df.iterrows():
            if row['type'] in ('FormationSet', 'End'):
                continue

            chunks.append(_make_entry(
                {
                    'gameweek': gw,
                    'event_id': row['eventId'],
                    'team_id': row['teamId'],
                    'player_id': row.get('playerId', None),
                    'event_type': row['type'],
                },
                row.get('chunks', None),
            ))

        # One chunk per player row
        for _, row in player_df.iterrows():
            chunks.append(_make_entry(
                {
                    'gameweek': gw,
                    'entity': 'PLAYER',
                    'team_id': row['teamId'],
                    'player_id': row['playerId'],
                    'event_type': None,
                },
                row.get('chunks', None),
            ))

    # League standings chunks follow the match chunks of the same gameweek
    league_standings = _read_csv_if_exists(os.path.join(gw_dir, "league_standings.csv"))
    for _, row in league_standings.iterrows():
        chunks.append(_make_entry(
            {
                'gameweek': gw,
                'entity': 'LEAGUE',
                'team_id': row['teamId'],
                'player_id': None,
                'event_type': None,
            },
            row.get('chunks', None),
        ))

# Save the chunks to a JSON file, creating the target directory if needed
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, 'w') as json_file:
    json.dump(chunks, json_file, indent=4)

print(f"Chunks saved to {output_file}")


100%|██████████| 34/34 [00:17<00:00,  1.89it/s]


Entries saved to data/event_chunks/chunks.json


### Evaluating all features we have
#### View the results in data/Evaluation/EventData (Text and CSV subfolders)

In [19]:
def evaluate_features(df):
    """
    Evaluates the features in a given DataFrame by examining the number of unique values and the number of missing values.

    Entries of the 'qualifiers' column whose string form is '[]' are counted as
    missing. The input DataFrame is not modified; the evaluation runs on a copy.

    Parameters:
        df: DataFrame to evaluate. A 'qualifiers' column is optional.
    Returns:
        DataFrame indexed by feature name with the columns 'Unique Values',
        'Missing Values' and 'Missing Percentage' (0-100).
    """

    # Work on a copy: callers pass filtered slices of a larger frame, and
    # writing into those would alter (or warn about altering) the caller's data.
    df = df.copy()

    # Treat empty lists in the 'qualifiers' column as missing values
    if 'qualifiers' in df.columns:
        is_empty_list = df['qualifiers'].map(lambda value: str(value) == '[]').astype(bool)
        df['qualifiers'] = df['qualifiers'].mask(is_empty_list)

    # Compute the missing-value mask once and reuse it for count and percentage
    missing = df.isnull()

    feature_evaluation = pd.DataFrame({
        'Unique Values': df.nunique(),
        'Missing Values': missing.sum(),
        'Missing Percentage': missing.mean() * 100
    })

    return feature_evaluation

In [20]:
eval_directory = "data/Evaluation/EventData"

# A column is reported as relevant when its share of missing values is below
# this percentage. The report text is built from the same value so that the
# description always matches the filter actually applied.
missing_threshold = 30

# Both output folders are created once, before any report is written
text_directory = os.path.join(eval_directory, "Text")
csv_directory = os.path.join(eval_directory, "CSV")
os.makedirs(text_directory, exist_ok=True)
os.makedirs(csv_directory, exist_ok=True)

for event in all_events_df['type'].unique():
    # Take an independent copy so the per-event frame is never a view onto all_events_df
    events_of_type = all_events_df[all_events_df['type'] == event].copy()
    evaluation = evaluate_features(events_of_type)

    relevant_columns = evaluation.index[evaluation['Missing Percentage'] < missing_threshold]

    # Human-readable report for this event type
    with open(os.path.join(text_directory, f"{event}.txt"), "w") as f:
        f.write(f"Event: {event}\n")
        f.write(f"This event occurred {len(events_of_type)} times.\n")
        f.write("\n")
        f.write(f"Relevant Columns: columns with less than {missing_threshold}% missing values:\n")
        f.write(str(relevant_columns.values))
        f.write("\n\n")
        f.write(evaluation.to_string())

    # Same evaluation as a CSV for further processing
    evaluation.to_csv(os.path.join(csv_directory, f"{event}.csv"))