In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import statsbombpy
import mplsoccer
import json
from pathlib import Path
import numpy as np

from auxiliar_functions import *

### Pass Risk Measure – Concept
The goal is to approximate StatsBomb’s proprietary pass difficulty metrics (e.g. xPass, OBV) with a simplified model. Pass risk is defined here as the likelihood that a pass will *fail*, given its context. The contextual factors considered are pass length, height and angle, the number of opponents bypassed, the field zone (for both origin and destination), and pressure (measured as the distance to the nearest defender).


In [None]:
# Build a per-match overview of Italy's games and of which raw data files
# (events, lineups, 360 freeze frames) are available locally for each one.
# NOTE(review): "55"/"43" are presumably the competition and season ids of
# Euro 2020 in the StatsBomb open-data layout — confirm against the data dump.
data_dir = Path("data/italy_euro2020")
matches_file = data_dir / "matches" / "55" / "43.json"

# ---- Load matches file (nested JSON is flattened into dotted column names)
matches = json.loads(matches_file.read_text(encoding="utf-8"))
df = pd.json_normalize(matches)

# Columns we’ll want to see
cols = [
    "match_id",
    "match_date",
    "competition_stage.name",
    "home_team.home_team_name",
    "away_team.away_team_name",
]
df = df[cols].rename(
    columns={
        "competition_stage.name": "stage",
        "home_team.home_team_name": "home",
        "away_team.away_team_name": "away",
    }
)

# ---- Keep only Italy's matches, ordered by date
team = "Italy"
involves_team = df["home"].eq(team) | df["away"].eq(team)
ita = df[involves_team].sort_values("match_date").reset_index(drop=True)

# Vectorized instead of a row-wise apply: the opponent is whichever side is not Italy.
team_is_home = ita["home"].eq(team)
ita["opponent"] = np.where(team_is_home, ita["away"], ita["home"])
ita["home_away"] = team_is_home.map({True: "H", False: "A"})

match_ids = ita["match_id"].tolist()

# ---- Check which data files exist for each match
# `file_exists` and `count_freeze_frames` come from auxiliar_functions.
# The loop variable is named `match_id` (not `id`) so the builtin is not shadowed.
summary_rows = []
for match_id in match_ids:
    has_events = file_exists(data_dir / "events" / f"{match_id}.json")
    has_lineups = file_exists(data_dir / "lineups" / f"{match_id}.json")
    has_frames = file_exists(data_dir / "three-sixty" / f"{match_id}.json")
    # Only count frames when the 360 file is actually present.
    frames_count = count_freeze_frames(match_id, data_dir) if has_frames else 0
    summary_rows.append({
        "match_id": match_id,
        "events.json": has_events,
        "lineups.json": has_lineups,
        "three-sixty.json": has_frames,
        "frames_count": frames_count,
    })

# Merge into the main DataFrame
availability = pd.DataFrame(summary_rows)
ita = ita.merge(availability, on="match_id")

print("Italy Euro 2020 matches data availability summary:")
display(ita)

Italy Euro 2020 matches with data availability:


Unnamed: 0,match_id,match_date,stage,home,away,opponent,home_away,events.json,lineups.json,three-sixty.json,frames_count
0,3788741,2021-06-11,Group Stage,Turkey,Italy,Turkey,A,True,True,True,3370
1,3788754,2021-06-16,Group Stage,Italy,Switzerland,Switzerland,H,True,True,True,3423
2,3788766,2021-06-20,Group Stage,Italy,Wales,Wales,H,True,True,True,2800
3,3794685,2021-06-26,Round of 16,Italy,Austria,Austria,H,True,True,True,4456
4,3795107,2021-07-02,Quarter-finals,Belgium,Italy,Belgium,A,True,True,True,2965
5,3795220,2021-07-06,Semi-finals,Italy,Spain,Spain,H,True,True,True,4247
6,3795506,2021-07-11,Final,Italy,England,England,H,True,True,True,4303


In [None]:
# Global variables
match_ids = [3788741, 3788754, 3788766, 3794679, 3794686, 3795220, 3795506] # by manual exploration
data_dir = "italy_euro2020/data"

**Preprocessing**

Data filtering and engineering steps necessary to obtain relevant features for this analysis.

In [None]:
# Wrap the configured data directory (`data_dir`) in a pathlib.Path object.
data = Path(data_dir)

italy_euro2020\data
