In [3]:
%%capture
# The above commmand hides the output of the cell and should be the first line of the cell!

# Install libraries
! pip install pandas
! pip install tqdm

In [2]:
import numpy as np

#import statsbomb as sb
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
from mplsoccer.pitch import Pitch, VerticalPitch

# Import libraries that we use
from pathlib import Path
import pandas as pd
import json
from tqdm import tqdm

In [3]:
# Define the input directories
input_dir = Path("input")
data_dir = Path("data")

# You can adjust the match_number and match_id according to your assigned group match.
match_number = "26"
match_id = "133018"
match_name = match_number + "_" + match_id

# Define each data file paths
match_file = input_dir / f"{match_name}_meta.json"
event_file = input_dir / f"{match_name}_events.json"
tracking_file = input_dir / f"{match_name}_tracking.dat"


***MATCH DATA***


In [12]:
with open(match_file, encoding="utf-8") as f: # Open the event json file to read it
    match_dic = json.load(f)    
    meta_dic = {}
    meta_dic["match_id"] = match_dic["GameID"]
    meta_dic["home_team_id"] =  match_dic["HomeTeam"]["TeamID"]
    meta_dic["home_team_name"] =  match_dic["HomeTeam"]["ShortName"]
    meta_dic["away_team_id"] = match_dic["AwayTeam"]["TeamID"]
    meta_dic["away_team_name"] =  match_dic["AwayTeam"]["ShortName"]

    # save the team-player information
    lineup_dics = []
    for team in ["HomeTeam","AwayTeam"]:
        team_dic = match_dic[team]
        for player in team_dic["Players"]:
            lineup_dic = {}
            lineup_dic["team_id"] = team_dic["TeamID"]
            lineup_dic["player_id"] = player["PlayerID"]
            lineup_dic["player_first_name"] = player["FirstName"]
            lineup_dic["player_last_name"] = player["LastName"]
            lineup_dic["player_jersey_number"] = player["JerseyNo"]
            lineup_dic["player_starting_frame"] = player["StartFrameCount"]
            lineup_dic["player_ending_frame"] = player["EndFrameCount"]
            lineup_dics.append(lineup_dic)

# Transform the dictionaries into a panda dataframe
meta_df = pd.DataFrame([meta_dic])
lineup_df = pd.DataFrame(lineup_dics) 

# Save the dataframes as CSV files in "data_dir" folder without adding an extra index column providing the index=False parameter.
meta_df.to_csv(data_dir / "meta.csv", index=False)
lineup_df.to_csv(data_dir / "lineup.csv", index=False)

print(f"The meta dataframe is:")
display(meta_df.head())

print(f"The first five rows of the lineup dataframe are:")
display(lineup_df.head())

The meta dataframe is:


Unnamed: 0,match_id,home_team_id,home_team_name,away_team_id,away_team_name
0,133018,43935,Belgium,43872,Morocco


The first five rows of the lineup dataframe are:


Unnamed: 0,team_id,player_id,player_first_name,player_last_name,player_jersey_number,player_starting_frame,player_ending_frame
0,43935,290821,Axel,WITSEL,6,1440349,1612771
1,43935,401444,Youri,TIELEMANS,8,1559523,1612771
2,43935,290864,Toby,ALDERWEIRELD,2,1440349,1612771
3,43935,411443,Timothy,CASTAGNE,21,1440349,1612771
4,43935,389558,Wout,FAES,4,0,0


***EVENT DATA***

In [7]:
event_dics = [] # A list of event dictionaries that we will fill in

with open(event_file) as f: # Open the event json file to read it
    events = f.readlines() # Read all the lines at once
    print(f"Number of events in the opening match are {len(events)}.\n")
    for event in tqdm(events): # Iterate over the events one by one and convert them into a dictionary (i.e., a data structure with keys and values)
        event_dic = json.loads(event)
        event_dics.append(event_dic)

events_df = pd.DataFrame(event_dics) # Transform the list of event dictionaries into a panda dataframe
events_df.to_csv(data_dir / "events.csv", index=False) # Save the event dataframe in "data_dir" folder without adding an extra index column providing the index=False parameter.

print(f"The first five rows of the events dataframe are:")
display(events_df.head())

Number of events in the opening match are 8380.



100%|██████████| 8380/8380 [00:00<00:00, 88825.99it/s]


The first five rows of the events dataframe are:


Unnamed: 0,match_id,match_run_time_in_ms,match_run_time,match_time_in_ms,event_id,player_seq_id,event_order,half_time,side,category,...,y,y_mirrored,x_location_start,x_location_start_mirrored,x_location_end,x_location_end_mirrored,y_location_start,y_location_start_mirrored,y_location_end,y_location_end_mirrored
0,133018,0,00:00:00,0,1,1,1,1,l,in_possession,...,0.501176,0.501176,,,,,,,,
1,133018,0,00:00:00,0,2,1,2,1,l,in_possession,...,0.501176,0.501176,0.502952,0.502952,0.438857,0.438857,0.501176,0.501176,0.505441,0.505441
2,133018,570,00:00:01,570,3,1,3,1,l,in_possession,...,0.505441,0.505441,,,,,,,,
3,133018,580,00:00:01,580,4,2,1,1,r,out_of_possession,...,0.383382,0.616618,,,,,,,,
4,133018,639,00:00:01,639,5,3,1,1,l,in_possession,...,0.510294,0.510294,,,,,,,,


***TRACKING DATA***

In [8]:
tracking_dics = []

num_lines = sum(1 for line in open(tracking_file,'r'))
print(f"The tracking dataset has {num_lines} frames.")

# Define a dictionary mapping the home (1) and away (0) teams to their values in a match
team_id_mapping = {"1": meta_df.home_team_id.iloc[0], "0": meta_df.away_team_id.iloc[0], "3": "3"}

not_known_sets = set()

# The tracking file will contain both player and ball tracking data
with open(tracking_file, "r") as f:
    for line in tqdm(f, total=num_lines): # Read line by line and extract the information pieces as elaborated in the documentation.
        frame_number, objects_xy_coordinates, ball_data, _ = line.split(":")
        for object in objects_xy_coordinates.split(";"):
            if object=="":
                break
            team, not_known, jersey_number, x, y, speed = object.split(",")            
            if team not in ["1", "0", "3"]: # If the object (i.e., person) does not belong to the Home (1) or Away (0) team. Then, ignore it. You can also keep the referees (3) if you want to study them later! 
                continue

            tracking_dic = {}
            tracking_dic["frame"] = int(frame_number)
            tracking_dic["team_id"] = int(team_id_mapping[team])
            tracking_dic["player_jersey_number"] = jersey_number
            tracking_dic["x"] = float(x)
            tracking_dic["y"] = float(y)
            tracking_dic["z"] = None
            tracking_dic["speed"] = float(speed)
            tracking_dic["state"] = None
            tracking_dics.append(tracking_dic)
        
        ball_x, ball_y, ball_z, ball_speed, ball_owning_team, ball_state = ball_data.split(",")[:6]

        tracking_dic = {}
        tracking_dic["frame"] = int(frame_number)
        tracking_dic["team_id"] = -1 # Let's say -1 is the ball id
        tracking_dic["player_jersey_number"] = None
        tracking_dic["x"] = float(ball_x)
        tracking_dic["y"] = float(ball_y)
        tracking_dic["z"] = float(ball_z)
        tracking_dic["speed"] = float(ball_speed)
        tracking_dic["state"] = ball_state.replace(";","")
        tracking_dics.append(tracking_dic)

tracking_df = pd.DataFrame(tracking_dics) # Transform the list of dictionaries into a panda dataframe

tracking_df.to_csv(data_dir / "tracking.csv", index=False) # Save the dataframe in "data_dir" folder without adding an extra index column providing the index=False parameter.

print(f"The first five rows of the tracking dataframe are:")
display(tracking_df.head())

The tracking dataset has 172423 frames.


100%|██████████| 172423/172423 [00:06<00:00, 28265.47it/s]


The first five rows of the tracking dataframe are:


Unnamed: 0,frame,team_id,player_jersey_number,x,y,z,speed,state
0,1440349,43935,23,34.0,8.0,,0.61,
1,1440349,43872,7,9.0,1753.0,,0.5,
2,1440349,43872,8,584.0,-806.0,,0.06,
3,1440349,43935,21,-1848.0,1399.0,,0.29,
4,1440349,43935,16,17.0,2042.0,,1.78,


In [9]:
# Let's see a full frame (first one)
tracking_df[tracking_df.frame==1713295]

Unnamed: 0,frame,team_id,player_jersey_number,x,y,z,speed,state


<h3><b>Play Directions!</b></h3>

Let's calculate the play directions of each team per match half.    


In [10]:
with open(match_file, encoding="utf-8") as f: # Open the event json file to read it
    match_dic = json.load(f)
    home_team_id = meta_df.home_team_id.iloc[0]
    away_team_id = meta_df.away_team_id.iloc[0]

    starting_frame = tracking_df[tracking_df.frame==int(match_dic["Phase1StartFrame"])]

    # The team starting the match from the left side of the pitch is considered 'realHome_teamId' 
    # and is determined by identifying to which team the player with minimum x-coordinate belongs to.
    realHome_teamId = starting_frame[starting_frame.x==starting_frame.x.min()].team_id.iloc[0]
    realAway_teamId = home_team_id if realHome_teamId != home_team_id else away_team_id

    playdirection_dics = []
    for half_number in [1,2,3,4,5]:
        if not match_dic[f"Phase{half_number}StartFrame"]:
            continue
        for team_id in [realHome_teamId, realAway_teamId]:
            playdirection_dic = {}
            playdirection_dic["half"] = int(half_number)
            playdirection_dic["team_id"] = team_id
            playdirection_dic["start_frame"] = int(match_dic[f"Phase{half_number}StartFrame"]) # Keep the start frame number as integer
            playdirection_dic["end_frame"] = int(match_dic[f"Phase{half_number}EndFrame"]) # Keep the end frame number as integer
            if half_number % 2 == 1:
                playdirection_dic["playdirection"] = "LEFT_TO_RIGHT" if team_id == realHome_teamId else "RIGHT_TO_LEFT"
            else:
                playdirection_dic["playdirection"] = "RIGHT_TO_LEFT" if team_id == realHome_teamId else "LEFT_TO_RIGHT"

            playdirection_dics.append(playdirection_dic)

playdirection_df = pd.DataFrame(playdirection_dics) # Transform the list of dictionaries into a panda dataframe

playdirection_df.to_csv(data_dir / "playdirection.csv", index=False) # Save the dataframe in "data_dir" folder without adding an extra index column providing the index=False parameter.

print(f"The playdirection dataframe is:")
display(playdirection_df)

The playdirection dataframe is:


Unnamed: 0,half,team_id,start_frame,end_frame,playdirection
0,1,43935,1440349,1513405,LEFT_TO_RIGHT
1,1,43872,1440349,1513405,RIGHT_TO_LEFT
2,2,43935,1537414,1612771,RIGHT_TO_LEFT
3,2,43872,1537414,1612771,LEFT_TO_RIGHT
