# Dataframe construction

In [10]:
import pandas as pd
import os 
import numpy as np

## Finding the path of the files 

In [1]:
def Files_path(path, endswith):
    """
    Retrieving the directory of the of files from the path provided.

    Parameters:
    - path: str, path to the main directory.
    - endswith: str, name of the file at the end of it.

    Return:
    - List with directory's to specific files.
    """
    list_with_files = []
    
    for root, dirs, files in os.walk(path):
        for file in files:
            if file.endswith(endswith):
                list_with_files.append(os.path.join(root, file))  
    return list_with_files

In [2]:
def Complete_files(path, endswith1, endswith2, endswith3):
    """
    Retrieving the directory of specific files from the path provided,
    ensuring a set of three files (one for each ending) from the folder if possible,
    otherwise skipping the folder and returning an empty list.

    Parameters:
    - path: str, path to the main directory.
    - endswith1: str, first file ending to search for.
    - endswith2: str, second file ending to search for.
    - endswith3: str, third file ending to search for.

    Return:
    - List with directories to specific files, containing a set of three files
      if possible, otherwise an empty list.
    """
    list_with_files = []
    file_counts = {endswith1: 0, endswith2: 0, endswith3: 0}

    for root, dirs, files in os.walk(path):
        found_files = []  # List to store found files for current folder
        for file in files:
            if file.endswith(endswith1):
                found_files.append(os.path.join(root, file))
                file_counts[endswith1] += 1
            elif file.endswith(endswith2):
                found_files.append(os.path.join(root, file))
                file_counts[endswith2] += 1
            elif file.endswith(endswith3):
                found_files.append(os.path.join(root, file))
                file_counts[endswith3] += 1

        # Check if a complete set of three files is found
        if len(found_files) == 3:
            list_with_files.extend(found_files)  # Add all three files
        else:
            # Skip to the next iteration if not a complete set
            continue

    return list_with_files


## Reads and filters either a json- or a pandas dataframe file 

In [3]:
def filter_labels(filepath=None, labels_to_include=None, df=None):
    """
    Filters a dataframe or a JSON file to include only rows with specified labels.
    
    This function can operate in two modes:
    1. If a filepath is provided, it loads the JSON file into a DataFrame, filters it, and returns the filtered DataFrame.
    2. If a DataFrame is directly provided via the 'df' parameter, it filters this DataFrame.
    
    Parameters:
    - filepath: str, optional, path to the JSON file.
    - labels_to_include: list of str, optional,labels to be included in the output DataFrame.
    - df: pandas.DataFrame, optional, a DataFrame to be filtered.
    
    Returns:
    - A filtered DataFrame with the specified labels.
    """
    # conditions to read dataframe, and checking if provided input is sufficient. 
    if df is None:
        if filepath is None:
            raise ValueError("Either 'filepath' or 'df' must be provided.")
        df = pd.read_json(filepath)
        df = pd.json_normalize(df['annotations'])  

    # condition to extract the labels from the input.
    if labels_to_include is not None:
        df_filtered = df[df['label'].isin(labels_to_include)]
        return df_filtered
    else:
        df_filtered = df  
        return df_filtered[['label','gameTime']]


## Creating a list from the first and second half of the game

In [4]:
def create_list_from_json(file_path1, file_path2):
    """
    Creates a list from two JSON files by extracting text values from segments.
    
    Parameters:
    - file_path1: str, path to the first JSON file.
    - file_path2: str, path to the second JSON file.
    
    Returns:
    - A list with a single column 'ASR_text' containing the extracted text values.
    """
    
    # Function to process a single JSON file
    def process_file(file_path):
        text_values = []
        
        df = pd.read_json(file_path)
        
        # Extract text values from each segment and add them to the list
        for i in range(len(df)):
            text_value = df.loc[i, 'segments'][2]
            game_time_start = df.loc[i, 'segments'][0]
            game_time_end = df.loc[i, 'segments'][1]
            text_values.append([game_time_start, game_time_end,text_value])
        return text_values
    
    # Process each of the two JSON files
    ASR_list_one = process_file(file_path1)
    ASR_list_two = process_file(file_path2)
    
    return ASR_list_one, ASR_list_two

## Creating a pamdas dataframe from the games ASR list 

In [5]:
def dataframe_from_ASR_text(ASR_list):
    """
    Creates a DataFrame from a list of ASR text entries, where each entry includes start time, end time, and the text itself.
    Start Time and End Time are set as integers representing seconds.
    
    Parameters:
    - sentences_list: list of lists, each containing start time, end time, and a sentence.
    
    Returns:
    - A pandas DataFrame with columns 'Start Time', 'End Time', and 'ASR_text' for each entry.
    """
    # Initialize lists to store the start times, end times, and ASR text separately
    start_times = []
    end_times = []
    asr_texts = []
    
    # Iterate through the provided list to extract start time, end time, and text
    for entry in ASR_list:
        start_time, end_time, text = entry
        start_times.append(int(float(start_time)))  # Convert to float then to integer
        end_times.append(int(float(end_time)))      # Convert to float then to integer
        asr_texts.append(text.strip())
    
    # Create a DataFrame with the extracted information
    df = pd.DataFrame({
        'Start (sec)': start_times,
        'End (sec)': end_times,
        'ASR_text': asr_texts
    })
    
    return df


## Converting gametime from json file to seconds

In [6]:
def gametime_to_sec(game_time):
    """
    Converts game time from the format 'half - MM:SS' to total seconds.
    
    Parameters:
    - game_time: str, game time in the format 'half - MM:SS'. The first symbol is either '1' or '2' indicating the half.
    
    Returns:
    - int, total game time in seconds.
    """
    # Split the string to extract the half and the time
    half, time_str = game_time.split(' - ')
    minutes, seconds = map(int, time_str.split(':'))
    
    # Convert time to seconds
    total_seconds = minutes * 60 + seconds
    
    return total_seconds


## Exstracts ASR text with a defined standard devation based on given dataframe 

In [7]:
def asr_texts_timestamped(sec_df, ASR_df, SD=30):
    """
    For each timestamp it retrieves associated ASR texts from ASR_df
    within a 15-second range and combines them into a single text per 'sec'.
    
    Parameters:
    - sec_df: DataFrame with a 'sec' column indicating specific times in seconds.
    - ASR_df: DataFrame with 'Start (sec)', 'End (sec)', and 'ASR_text' columns.
    
    Returns:
    - DataFrame with two columns: 'sec' from sec_df and 'ASR_text'
      containing all associated texts from ASR_df within the standard deviated range.
    """
    combined_texts = []

    for sec in sec_df['sec']:
        Start = sec - SD
        End = sec + SD

        # Filter ASR_df for texts within the specified time range
        filtered_df = ASR_df.loc[(ASR_df['Start (sec)'] > Start) & (ASR_df['Start (sec)'] < End)]

        # Combine the filtered texts into a single string
        combined_text = ' '.join(filtered_df['ASR_text'])
        combined_texts.append(combined_text)

    # Create a new DataFrame with the results
    result_df = pd.DataFrame({
        'sec': sec_df['sec'],
        'ASR_text': combined_texts
    })

    return result_df

In [9]:
# path to the main folder
path = 'SN-ASR_captions_and_actions'

# finding the path for every game and Label-v2.json file.
one_ASR_list = Files_path(path, endswith = "1_half-ASR.json")
two_ASR_list = Files_path(path, endswith = "2_half-ASR.json")
label_path = Files_path(path, endswith = 'v2.json')

print('First half of the game:')
print(one_ASR_list[0])
print(len(one_ASR_list))
print()
print('Second half of the game:')
print(two_ASR_list[0])
print(len(two_ASR_list))
print()
print('Labels of the game:')
print(label_path[0])
print(len(label_path))

First half of the game:
SN-ASR_captions_and_actions/france_ligue-1/2015-2016/2015-11-07 - 19-00 Paris SG 5 - 0 Toulouse/1_half-ASR.json
515

Second half of the game:
SN-ASR_captions_and_actions/france_ligue-1/2015-2016/2015-11-07 - 19-00 Paris SG 5 - 0 Toulouse/2_half-ASR.json
515

Labels of the game:
SN-ASR_captions_and_actions/france_ligue-1/2015-2016/2015-11-07 - 19-00 Paris SG 5 - 0 Toulouse/Labels-v2.json
500


In [10]:
# Creating a dataframe from the Labels-v2.json file
label_df = filter_labels(filepath=label_path[0])
label_df.head()

Unnamed: 0,label,gameTime
0,Kick-off,1 - 00:00
1,Ball out of play,1 - 01:12
2,Throw-in,1 - 01:20
3,Ball out of play,1 - 03:30
4,Throw-in,1 - 03:40


In [11]:
# Define the labels to include
labels = ['Corner', 'Foul', 'Goal']

# Filtering the labels 
filtered_df = filter_labels(filepath=None, labels_to_include=labels, df=label_df)
filtered_df.reset_index(inplace=True)

filtered_df.head()

Unnamed: 0,index,label,gameTime
0,5,Foul,1 - 04:33
1,8,Goal,1 - 05:44
2,22,Foul,1 - 11:23
3,33,Foul,1 - 14:51
4,37,Foul,1 - 15:43


In [12]:
# Extract the half from 'gameTime' and convert to integer
filtered_df['Half'] = filtered_df['gameTime'].str.split(' - ').apply(lambda x: int(x[0]))

# Split the DataFrame based on the half
first_half_label_df = filtered_df.loc[filtered_df['Half'] == 1].copy()
second_half_label_df = filtered_df.loc[filtered_df['Half'] == 2].copy()

# Optionally, drop the 'Half' column if it's no longer needed
first_half_label_df.drop(columns=['Half'], inplace=True)
second_half_label_df.drop(columns=['Half'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['Half'] = filtered_df['gameTime'].str.split(' - ').apply(lambda x: int(x[0]))


In [13]:
first_half_label_df.head()

Unnamed: 0,index,label,gameTime
0,5,Foul,1 - 04:33
1,8,Goal,1 - 05:44
2,22,Foul,1 - 11:23
3,33,Foul,1 - 14:51
4,37,Foul,1 - 15:43


In [14]:
# resetting the index 
second_half_label_df = second_half_label_df.reset_index(drop=True)
second_half_label_df.head()

Unnamed: 0,index,label,gameTime
0,105,Corner,2 - 00:54
1,106,Foul,2 - 02:30
2,117,Foul,2 - 08:47
3,119,Foul,2 - 09:49
4,127,Foul,2 - 11:22


In [15]:
# Adding gametime converted to sec as a column
game_sec_one = [gametime_to_sec(i) for i in first_half_label_df['gameTime']]
game_sec_two = [gametime_to_sec(i) for i in second_half_label_df['gameTime']]

game_sec_one_df = pd.DataFrame(game_sec_one, columns=['sec'])
game_sec_two_df = pd.DataFrame(game_sec_two, columns=['sec'])

first_half_filtered_df = pd.concat([first_half_label_df, game_sec_one_df, ], axis=1, join="inner")
second_half_filtered_df = pd.concat([second_half_label_df, game_sec_two_df, ], axis=1, join="inner")

In [16]:
first_half_filtered_df.head()

Unnamed: 0,index,label,gameTime,sec
0,5,Foul,1 - 04:33,273
1,8,Goal,1 - 05:44,344
2,22,Foul,1 - 11:23,683
3,33,Foul,1 - 14:51,891
4,37,Foul,1 - 15:43,943


In [17]:
second_half_filtered_df.head()

Unnamed: 0,index,label,gameTime,sec
0,105,Corner,2 - 00:54,54
1,106,Foul,2 - 02:30,150
2,117,Foul,2 - 08:47,527
3,119,Foul,2 - 09:49,589
4,127,Foul,2 - 11:22,682


In [18]:
# Creating a list for both the first and second half of the game.
ASR_text_list_one, ASR_text_list_two = create_list_from_json(one_ASR_list[0], two_ASR_list[0])

# Retrieving the last timepoint for the first half
#Last_list_one = ASR_text_list_one[-1]
#Last_value = int(Last_list_one[1].rstrip('.0'))

In [19]:
# Changing the list to a pandas dataframe 
ASR_one_df = dataframe_from_ASR_text(ASR_text_list_one)
ASR_two_df = dataframe_from_ASR_text(ASR_text_list_two)

In [20]:
ASR_one_df.tail()

Unnamed: 0,Start (sec),End (sec),ASR_text
816,2682,2683,Put the ball to sleep on the line.
817,2684,2685,Ninkov played it out. Corner kick for PSG.
818,2689,2690,First corner of the match for either side.
819,2691,2692,And it's coming to 45th minute.
820,2694,2695,Which is about to expire.


In [21]:
ASR_two_df.tail()

Unnamed: 0,Start (sec),End (sec),ASR_text
686,2721,2722,From the three-time defending champions
687,2723,2724,It's a type of performance you would expect
688,2724,2725,A team that's sitting
689,2726,2727,Imperiously atop of Ligon
690,2728,2729,Against a team that's


In [22]:
# Using the provided function to retrieve and combine ASR text with the first half.
combined_asr_one = asr_texts_timestamped(first_half_filtered_df, ASR_one_df)
combined_asr_one.drop(columns=['sec'], inplace=True)
label_ASR_df_one = pd.concat([first_half_filtered_df, combined_asr_one, ], axis=1, join="inner") 
label_ASR_df_one.head()

Unnamed: 0,index,label,gameTime,sec,ASR_text
0,5,Foul,1 - 04:33,273,Hanabiik. That's a real poor clearance from Al...
1,8,Goal,1 - 05:44,344,Early warning given by the ref. Keep your hand...
2,22,Foul,1 - 11:23,683,It was 1-0 and needed an own goal by Zoumana K...
3,33,Foul,1 - 14:51,891,Out of touch for Toulouse. Quarter of an hour ...
4,37,Foul,1 - 15:43,943,Not going to fall into that trap too easily. B...


In [23]:
len(first_half_filtered_df), len(label_ASR_df_one)

(15, 15)

In [24]:
# Using the provided function to retrieve and combine ASR text with the second half.
combined_asr_two = asr_texts_timestamped(second_half_filtered_df, ASR_two_df)
combined_asr_two.drop(columns=['sec'], inplace=True)
label_ASR_df_two = pd.concat([second_half_filtered_df, combined_asr_two, ], axis=1, join="inner") 
label_ASR_df_two.head()

Unnamed: 0,index,label,gameTime,sec,ASR_text
0,105,Corner,2 - 00:54,54,the return ball for van der Veel is cut short ...
1,106,Foul,2 - 02:30,150,"Yeah, a tremendous addition Bringing the job o..."
2,117,Foul,2 - 08:47,527,"It is for Edison Cavani He's an off run, he'll..."
3,119,Foul,2 - 09:49,589,"Knowing exactly where the Z-man was lurking, f..."
4,127,Foul,2 - 11:22,682,And he's got it And he's got it And he's got i...


In [25]:
len(second_half_filtered_df), len(label_ASR_df_two)

(21, 21)

In [26]:
# Creating a complete dataframe with the combination of ASR, first, and second half.
Complete_df = pd.concat([label_ASR_df_one, label_ASR_df_two, ], axis=0, ignore_index=True) 

Complete_df.head()

Unnamed: 0,index,label,gameTime,sec,ASR_text
0,5,Foul,1 - 04:33,273,Hanabiik. That's a real poor clearance from Al...
1,8,Goal,1 - 05:44,344,Early warning given by the ref. Keep your hand...
2,22,Foul,1 - 11:23,683,It was 1-0 and needed an own goal by Zoumana K...
3,33,Foul,1 - 14:51,891,Out of touch for Toulouse. Quarter of an hour ...
4,37,Foul,1 - 15:43,943,Not going to fall into that trap too easily. B...


In [27]:
len(Complete_df)

36

In [28]:
# checking the ASR text for the first goal in the game, Goal: 1-05.44
Complete_df['ASR_text'][1]



## Implementing all the functions to create a dataframe for all games

In [14]:
def create_dataframe(path):

    all_dfs = []
    
    files = Complete_files(path, '1_half-ASR.json', '2_half-ASR.json', 'Labels-v2.json')

    for index, file_group in enumerate(zip(*[iter(files)] * 3)):  

        # divides the paths to respective parts
        half_one, label, half_two = file_group

        label_df = filter_labels(filepath=label)

        labels_include = ['Corner', 'Foul', 'Goal']

        # Filtering the labels 
        filtered_df = filter_labels(filepath=None, labels_to_include=labels_include, df=label_df)
        filtered_df.reset_index(inplace=True)
        #filtered_df = label_df

        # Extract the half from 'gameTime' and convert to integer
        filtered_df['Half'] = filtered_df['gameTime'].str.split(' - ').apply(lambda x: int(x[0]))

        # Split the DataFrame based on the half
        first_half_label_df = filtered_df[filtered_df['Half'] == 1]
        second_half_label_df = filtered_df[filtered_df['Half'] == 2]

        # Optionally, drop the 'Half' column if it's no longer needed
        first_half_label_df.drop(columns=['Half'], inplace=True)
        second_half_label_df.drop(columns=['Half'], inplace=True)

        # resetting the index 
        second_half_label_df = second_half_label_df.reset_index(drop=True)

        # Adding gamtime converted to sec as a column
        game_sec_one = [gametime_to_sec(i) for i in first_half_label_df['gameTime']]
        game_sec_two = [gametime_to_sec(i) for i in second_half_label_df['gameTime']]

        game_sec_one_df = pd.DataFrame(game_sec_one, columns=['sec'])
        game_sec_two_df = pd.DataFrame(game_sec_two, columns=['sec'])

        first_half_filtered_df = pd.concat([first_half_label_df, game_sec_one_df, ], axis=1, join="inner")
        second_half_filtered_df = pd.concat([second_half_label_df, game_sec_two_df, ], axis=1, join="inner")

        # Creating a list for both the first and second half of the game.
        ASR_text_list_one, ASR_text_list_two = create_list_from_json(half_one, half_two)
        
        # Changing the list to a pandas dataframe 
        ASR_one_df = dataframe_from_ASR_text(ASR_text_list_one)
        ASR_two_df = dataframe_from_ASR_text(ASR_text_list_two)

        # Using the provided function to retrieve and combine ASR text with the first half.
        combined_asr_one = asr_texts_timestamped(first_half_filtered_df, ASR_one_df)
        combined_asr_one.drop(columns=['sec'], inplace=True)
        label_ASR_df_one = pd.concat([first_half_filtered_df, combined_asr_one, ], axis=1, join="inner") 

        # Using the provided function to retrieve and combine ASR text with the second half.
        combined_asr_two = asr_texts_timestamped(second_half_filtered_df, ASR_two_df)
        combined_asr_two.drop(columns=['sec'], inplace=True)
        label_ASR_df_two = pd.concat([second_half_filtered_df, combined_asr_two, ], axis=1, join="inner") 

        # Creating a complete dataframe with the combination of ASR, first, and second half.
        Complete_df = pd.concat([label_ASR_df_one, label_ASR_df_two, ], axis=0, ignore_index=True) 

        # Append the complete dataframe to the list
        all_dfs.append(Complete_df)

    # Concatenate all DataFrames in the list into the Com DataFrame
    Com = pd.concat(all_dfs, ignore_index=True)
    
    return Com

        

In [15]:
# Path for the whole dataset 
path = 'SN-ASR_captions_and_actions'

# creating a dataframe for all the leagues
pd.options.mode.chained_assignment = None 
df_tot = create_dataframe(path)

In [16]:
df_tot.head()

Unnamed: 0,index,label,gameTime,sec,ASR_text
0,5,Foul,1 - 04:33,273,Box-to-box midfielder who was allowed to play ...
1,8,Goal,1 - 05:44,344,Jose Angel Di Maria over the ball. Many bodies...
2,22,Foul,1 - 11:23,683,Only won four times in Paris. This is the 28th...
3,33,Foul,1 - 14:51,891,"...to Laurent Blanc. And the latest one is, I ..."
4,37,Foul,1 - 15:43,943,Is there any hope for this Toulouse side? They...


In [1]:
df_tot.shape

In [34]:
# Saving the dataframe as a csv file 
df_tot.to_csv('SoccerNet_df.csv', index=False)  