In [9]:
import requests
import json

BASE = "https://raw.githubusercontent.com/statsbomb/open-data/master/data"

# 1) get competitions
comp = requests.get(f"{BASE}/competitions.json").json()

# find La Liga (Spain)
la_liga = [
    c for c in comp
    if c["competition_name"] == "Premier League"
]

print(la_liga)

[{'competition_id': 2, 'season_id': 27, 'country_name': 'England', 'competition_name': 'Premier League', 'competition_gender': 'male', 'competition_youth': False, 'competition_international': False, 'season_name': '2015/2016', 'match_updated': '2025-04-23T14:36:29.347042', 'match_updated_360': '2021-06-13T16:17:31.694', 'match_available_360': None, 'match_available': '2025-04-23T14:36:29.347042'}, {'competition_id': 2, 'season_id': 44, 'country_name': 'England', 'competition_name': 'Premier League', 'competition_gender': 'male', 'competition_youth': False, 'competition_international': False, 'season_name': '2003/2004', 'match_updated': '2025-06-24T13:53:07.585114', 'match_updated_360': '2021-06-13T16:17:31.694', 'match_available_360': None, 'match_available': '2025-06-24T13:53:07.585114'}]


In [10]:
competition_id = 2
season_id = 27

matches = requests.get(
    f"{BASE}/matches/{competition_id}/{season_id}.json"
).json()

print(len(matches))
print(matches[0])


380
{'match_id': 3754058, 'match_date': '2016-01-02', 'kick_off': '16:00:00.000', 'competition': {'competition_id': 2, 'country_name': 'England', 'competition_name': 'Premier League'}, 'season': {'season_id': 27, 'season_name': '2015/2016'}, 'home_team': {'home_team_id': 22, 'home_team_name': 'Leicester City', 'home_team_gender': 'male', 'home_team_group': None, 'country': {'id': 68, 'name': 'England'}, 'managers': [{'id': 60, 'name': 'Claudio Ranieri', 'nickname': None, 'dob': '1951-10-20', 'country': {'id': 112, 'name': 'Italy'}}]}, 'away_team': {'away_team_id': 28, 'away_team_name': 'AFC Bournemouth', 'away_team_gender': 'male', 'away_team_group': None, 'country': {'id': 68, 'name': 'England'}, 'managers': [{'id': 38, 'name': 'Eddie Howe', 'nickname': None, 'dob': '1977-11-29', 'country': {'id': 68, 'name': 'England'}}]}, 'home_score': 0, 'away_score': 0, 'match_status': 'available', 'match_status_360': 'processing', 'last_updated': '2021-10-29T23:44:19.940296', 'last_updated_360': 

In [16]:
match_ids = [m["match_id"] for m in matches]

with open("file_id.txt", "w") as f:
    for mid in match_ids:
        f.write(str(mid) + "\n")

print("Wrote all real match ids to file_id.txt")

Wrote all real match ids to file_id.txt


In [17]:
import os
import requests
from pathlib import Path

BASE_URL = "https://raw.githubusercontent.com/statsbomb/open-data/master/data"

def read_ids(path):
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]

def download_file(url, save_path):
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    with open(save_path, "wb") as f:
        f.write(r.content)

In [18]:
match_ids = read_ids("file_id.txt")

# สร้างโฟลเดอร์
Path("events").mkdir(exist_ok=True)
Path("lineups").mkdir(exist_ok=True)

for mid in match_ids:
    print(f"Downloading match {mid}...")

    event_url = f"{BASE_URL}/events/{mid}.json"
    lineup_url = f"{BASE_URL}/lineups/{mid}.json"

    download_file(event_url, f"events/{mid}.json")
    download_file(lineup_url, f"lineups/{mid}.json")

print("✅ Done. All files downloaded.")

Downloading match 3754058...
Downloading match 3754245...
Downloading match 3754136...
Downloading match 3754037...
Downloading match 3754039...
Downloading match 3754041...
Downloading match 3754042...
Downloading match 3754043...
Downloading match 3754045...
Downloading match 3754048...
Downloading match 3754049...
Downloading match 3754050...
Downloading match 3754052...
Downloading match 3754053...
Downloading match 3754055...
Downloading match 3754226...
Downloading match 3754014...
Downloading match 3754091...
Downloading match 3754135...
Downloading match 3754293...
Downloading match 3754339...
Downloading match 3754146...
Downloading match 3754347...
Downloading match 3754255...
Downloading match 3754278...
Downloading match 3753979...
Downloading match 3753972...
Downloading match 3753978...
Downloading match 3753974...
Downloading match 3753993...
Downloading match 3753983...
Downloading match 3754015...
Downloading match 3754038...
Downloading match 3754001...
Downloading ma

In [None]:
!pip install pandas numpy tqdm tslearn matplotlib streamlit

In [25]:
import json
import pandas as pd
from pathlib import Path

def load_events(events_path):
    rows = []

    for file in Path(events_path).glob("*.json"):
        with open(file) as f:
            data = json.load(f)

        for e in data:
            if 'player' not in e:
                continue

            rows.append({
                "match_id": file.stem,
                "minute": e.get("minute", 0),
                "player": e["player"]["name"],
                "team": e["team"]["name"],
                "type": e["type"]["name"],
                "x": e.get("location", [None, None])[0],
                "y": e.get("location", [None, None])[1],
            })

    return pd.DataFrame(rows)


In [26]:
import pandas as pd
import numpy as np

def build_player_ts(df):
    df["minute_bin"] = df["minute"].clip(0, 89)

    def agg_minute(x):
        return pd.Series({
            "touches": len(x),
            "passes": (x["type"] == "Pass").sum(),
            "carries": (x["type"] == "Carry").sum(),
            "defensive_actions": x["type"].isin(
                ["Duel", "Ball Recovery", "Interception", "Clearance"]
            ).sum(),
            "avg_x": x["x"].mean(),
            "avg_y": x["y"].mean(),
        })

    ts = (
        df.groupby(["player", "match_id", "minute_bin"])
        .apply(agg_minute)
        .reset_index()
    )

    # Ensure every player has 90 rows (fill missing minutes)
    full = []
    for (player, match), g in ts.groupby(["player", "match_id"]):
        g = g.set_index("minute_bin").reindex(range(90), fill_value=0)
        g["player"] = player
        g["match_id"] = match
        g["minute_bin"] = g.index
        full.append(g.reset_index(drop=True))

    return pd.concat(full, ignore_index=True)


In [32]:
import numpy as np

FEATURES = [
    "touches",
    "passes",
    "carries",
    "defensive_actions",
    "avg_x",
    "avg_y",
]

def to_dtw_format(ts_df):
    players = {}

    for player, g in ts_df.groupby("player"):
        g = (
            g.groupby("minute_bin")[FEATURES]
            .mean()
            .fillna(0)
        )

        players[player] = g.values  # 90 x F

    return players

In [34]:
from tslearn.metrics import dtw
from tqdm import tqdm

def compute_similarity(players_ts, target_key):
    target_ts = players_ts[target_key]
    scores = []

    for key, ts in tqdm(players_ts.items()):
        if key == target_key:
            continue

        dist = dtw(target_ts, ts)
        scores.append((key, dist))

    scores.sort(key=lambda x: x[1])
    return scores


In [42]:
import json
import os
import pandas as pd

def load_positions(lineup_path="lineups"):
    pos = {}

    for file in os.listdir(lineup_path):
        with open(os.path.join(lineup_path, file)) as f:
            data = json.load(f)

        for team in data:
            for p in team["lineup"]:
                name = p["player_name"]

                # skip if already found
                if name in pos:
                    continue

                positions = p.get("positions", [])

                if len(positions) > 0:
                    position = positions[0]["position"]
                    pos[name] = position

    return pd.DataFrame([
        {"player": k, "position": v}
        for k, v in pos.items()
    ])

def map_position(pos):
    pos = pos.lower()

    if "goalkeeper" in pos:
        return "GK"

    if "center back" in pos or pos == "center back":
        return "CB"

    if "left back" in pos:
        return "LB"

    if "right back" in pos:
        return "RB"

    if "wing back" in pos:
        return "WB"

    if "defensive midfield" in pos:
        return "CDM"

    if "center midfield" in pos:
        return "CM"

    if "attacking midfield" in pos:
        return "CAM"

    if "left midfield" in pos:
        return "LM"

    if "right midfield" in pos:
        return "RM"

    if "left wing" in pos:
        return "LW"

    if "right wing" in pos:
        return "RW"

    if "center forward" in pos:
        return "CF"

    return "UNK"  # เผื่อกรณีแปลก ๆ

df = load_positions()
df["position"] = df["position"].apply(map_position)

df.to_csv("positions.csv", index=False)
print("positions.csv created:", df.shape)


positions.csv created: (550, 2)


In [47]:
import pandas as pd
from tslearn.metrics import dtw
from tqdm import tqdm
from itertools import combinations
from collections import defaultdict

FEATURES = [
    "touches",
    "passes",
    "carries",
    "defensive_actions",
    "avg_x",
    "avg_y"
]

ts_df = pd.read_csv("player_timeseries.csv")
players = ts_df["player"].unique()


def prepare_ts(df, player):
    p = df[df.player == player]

    full = pd.DataFrame({"minute_bin": range(1, 91)})
    p = full.merge(p, on="minute_bin", how="left")
    p[FEATURES] = p[FEATURES].fillna(0)

    return p[FEATURES].values


# ===== Cache TS =====
player_ts = {}
for p in players:
    player_ts[p] = prepare_ts(ts_df, p)


# ===== Compute DTW only once per pair =====
dist_map = defaultdict(dict)

pairs = list(combinations(players, 2))

for p1, p2 in tqdm(pairs):
    d = dtw(player_ts[p1], player_ts[p2])
    dist_map[p1][p2] = d
    dist_map[p2][p1] = d  # reuse result


# ===== Build Top3 =====
rows = []

for p in players:
    scores = list(dist_map[p].items())
    scores.sort(key=lambda x: x[1])

    for sp, _ in scores[:3]:
        rows.append({
            "player": p,
            "similar_player": sp
        })

pd.DataFrame(rows).to_csv("top3_similarity.csv", index=False)

100%|██████████| 150426/150426 [32:00<00:00, 78.33it/s]  
