# 01 - Data Collection: StatsBomb Open Data (FIFA World Cup 2018)
This notebook pulls match, event, and lineup data for the 2018 FIFA World Cup from StatsBomb Open Data.
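The results are saved as CSV files in the `data/raw/` directory (`../data/raw` relative to the working directory), which is created if it does not already exist.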

In [None]:
# Install packages if running locally
!pip install pandas requests

In [None]:
import pandas as pd
import requests
from pathlib import Path

In [None]:
# Root of the StatsBomb open-data files served from GitHub; every download URL
# in this notebook is built by appending a relative path to it.
BASE_URL = "https://raw.githubusercontent.com/statsbomb/open-data/master/data/"

# Output folder for the raw CSV files (resolved against the current working
# directory). Missing parent folders are created; an existing folder is reused.
raw_data_dir = Path("..") / "data" / "raw"
raw_data_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Download match metadata for FIFA World Cup 2018
# NOTE(review): the "43/3" path segments are presumably the StatsBomb
# competition id and season id for this tournament - confirm against the
# repository's competitions.json.
matches_url = BASE_URL + "matches/43/3.json"

# Without a timeout requests can wait forever on a stalled connection, and
# without raise_for_status() an HTTP error page would only surface later as a
# confusing JSON decoding error.
matches_response = requests.get(matches_url, timeout=30)
matches_response.raise_for_status()

# `matches` (the parsed JSON) is kept as a plain Python object because the
# later cells iterate over it to build the per-match download URLs.
matches = matches_response.json()

# Flatten nested objects into dot-separated columns and persist the raw table.
matches_df = pd.json_normalize(matches)
matches_df.to_csv(raw_data_dir / "matches_worldcup_2018.csv", index=False)

# Preview of the fixtures (displayed as the cell output).
matches_df[['match_id', 'home_team.home_team_name', 'away_team.away_team_name']].head()

In [None]:
# Download event data
# One JSON file is fetched per match; each is flattened into a DataFrame and
# tagged with its match_id so the rows remain attributable after concatenation.
event_data = []

# A single Session reuses the underlying HTTP connection across the per-match
# requests, which all go to the same host.
with requests.Session() as session:
    for match in matches:
        match_id = match['match_id']
        url = BASE_URL + f"events/{match_id}.json"
        try:
            # timeout: do not hang forever on a stalled connection.
            # raise_for_status: report HTTP errors as such instead of as a
            # JSON decoding failure.
            response = session.get(url, timeout=30)
            response.raise_for_status()
            df = pd.json_normalize(response.json())
            df['match_id'] = match_id
            event_data.append(df)
        except Exception as e:
            # Deliberately best-effort: one bad match must not abort the run.
            print(f"Failed to load events for match {match_id}: {e}")

if event_data:
    all_events_df = pd.concat(event_data, ignore_index=True)
    all_events_df.to_csv(raw_data_dir / "events_worldcup_2018.csv", index=False)
else:
    # pd.concat raises ValueError on an empty list, so handle the case where
    # every download failed (or there were no matches) explicitly.
    all_events_df = pd.DataFrame()
    print("No event data was downloaded; events_worldcup_2018.csv was not written.")

# Preview of the combined events (displayed as the cell output).
all_events_df.head()

In [None]:
# Download lineup data
# One JSON file is fetched per match. Each file is iterated as a list of team
# entries, and every player found under a team's 'lineup' key becomes one row
# tagged with the team name and the match id.
lineup_data = []

# A single Session reuses the underlying HTTP connection across the per-match
# requests, which all go to the same host.
with requests.Session() as session:
    for match in matches:
        match_id = match['match_id']
        url = BASE_URL + f"lineups/{match_id}.json"
        try:
            # timeout: do not hang forever on a stalled connection.
            # raise_for_status: report HTTP errors as such instead of as a
            # JSON decoding failure.
            response = session.get(url, timeout=30)
            response.raise_for_status()
            lineups = response.json()

            # Build the rows for this match separately so that a failure part
            # way through does not leave a partial match in lineup_data.
            # Copies are made instead of mutating the parsed JSON in place.
            match_players = [
                {**player, 'team_name': team['team_name'], 'match_id': match_id}
                for team in lineups
                for player in team.get('lineup', [])
            ]
        except Exception as e:
            # Deliberately best-effort: one bad match must not abort the run.
            print(f"Failed to load lineups for match {match_id}: {e}")
        else:
            lineup_data.extend(match_players)

if not lineup_data:
    print("No lineup data was downloaded; lineups_worldcup_2018.csv will be empty.")

# Flatten nested player fields into dot-separated columns and persist.
lineups_df = pd.json_normalize(lineup_data)
lineups_df.to_csv(raw_data_dir / "lineups_worldcup_2018.csv", index=False)

# Preview of the lineups (displayed as the cell output).
lineups_df.head()