In [1]:
!git clone https://github.com/statsbomb/open-data.git

Cloning into 'open-data'...
remote: Enumerating objects: 49950, done.[K
remote: Counting objects: 100% (3339/3339), done.[K
remote: Compressing objects: 100% (1189/1189), done.[K
remote: Total 49950 (delta 3324), reused 2150 (delta 2150), pack-reused 46611 (from 4)[K
Receiving objects: 100% (49950/49950), 6.49 GiB | 5.85 MiB/s, done.
Resolving deltas: 100% (46985/46985), done.
Updating files: 100% (7340/7340), done.


In [1]:
import pandas as pd
import json
import os
import numpy as np
from pathlib import Path

# Set up data directory
data_dir = Path('./open-data/data')
print("Available data folders:")
for folder in sorted(os.listdir(data_dir)):
    print(f"- {folder}")

Available data folders:
- competitions.json
- events
- lineups
- matches
- three-sixty


In [2]:
# Load competitions data
competitions_file = data_dir / 'competitions.json'
with open(competitions_file, 'r') as f:
    competitions = json.load(f)

competitions_df = pd.DataFrame(competitions)
print("Available competitions:")
print(competitions_df[['competition_id', 'competition_name', 'season_name']].head(10))

Available competitions:
   competition_id        competition_name season_name
0               9           1. Bundesliga   2023/2024
1               9           1. Bundesliga   2015/2016
2            1267  African Cup of Nations        2023
3              16        Champions League   2018/2019
4              16        Champions League   2017/2018
5              16        Champions League   2016/2017
6              16        Champions League   2015/2016
7              16        Champions League   2014/2015
8              16        Champions League   2013/2014
9              16        Champions League   2012/2013


In [3]:
# Load matches for a specific competition (e.g., Premier League 2019/20)
matches_file = data_dir / 'matches' / '2' / '27.json'  # Premier League 2019/20
with open(matches_file, 'r') as f:
    matches = json.load(f)

matches_df = pd.DataFrame(matches)
print(f"Number of matches: {len(matches_df)}")
print("\nMatch data columns:")
print(matches_df.columns.tolist())
print(f"\nSample match info:")
print(matches_df[['match_id', 'home_team', 'away_team', 'match_date']].head())

Number of matches: 380

Match data columns:
['match_id', 'match_date', 'kick_off', 'competition', 'season', 'home_team', 'away_team', 'home_score', 'away_score', 'match_status', 'match_status_360', 'last_updated', 'last_updated_360', 'metadata', 'match_week', 'competition_stage', 'stadium', 'referee']

Sample match info:
   match_id                                          home_team  \
0   3754058  {'home_team_id': 22, 'home_team_name': 'Leices...   
1   3754245  {'home_team_id': 27, 'home_team_name': 'West B...   
2   3754136  {'home_team_id': 37, 'home_team_name': 'Newcas...   
3   3754037  {'home_team_id': 29, 'home_team_name': 'Everto...   
4   3754039  {'home_team_id': 31, 'home_team_name': 'Crysta...   

                                           away_team  match_date  
0  {'away_team_id': 28, 'away_team_name': 'AFC Bo...  2016-01-02  
1  {'away_team_id': 41, 'away_team_name': 'Sunder...  2015-10-17  
2  {'away_team_id': 59, 'away_team_name': 'Aston ...  2015-12-19  
3  {'away_te

In [4]:
# Load lineups for a sample match to understand player data
sample_match_id = matches_df['match_id'].iloc[0]
lineups_file = data_dir / 'lineups' / f'{sample_match_id}.json'

with open(lineups_file, 'r') as f:
    lineups = json.load(f)

print(f"Teams in match {sample_match_id}:")
for team in lineups:
    print(f"\nTeam: {team['team_name']} (ID: {team['team_id']})")
    players = team['lineup']
    print(f"Number of players: {len(players)}")
    
    # Show sample player data
    sample_player = players[0]
    print(f"\nSample player data structure:")
    for key, value in sample_player.items():
        print(f"  {key}: {value}")
    break

Teams in match 3754058:

Team: AFC Bournemouth (ID: 28)
Number of players: 18

Sample player data structure:
  player_id: 3049
  player_name: Matt Ritchie
  player_nickname: None
  jersey_number: 30
  country: {'id': 201, 'name': 'Scotland'}
  cards: []
  positions: [{'position_id': 12, 'position': 'Right Midfield', 'from': '00:00', 'to': None, 'from_period': 1, 'to_period': None, 'start_reason': 'Starting XI', 'end_reason': 'Final Whistle'}]


In [5]:
# Load events data for the same match to understand available metrics
events_file = data_dir / 'events' / f'{sample_match_id}.json'

with open(events_file, 'r') as f:
    events = json.load(f)

events_df = pd.DataFrame(events)
print(f"Number of events in match: {len(events_df)}")
print(f"\nAvailable event types:")
event_types = events_df['type'].value_counts()
print(event_types.head(15))

print(f"\nSample event data columns:")
print(events_df.columns.tolist())

Number of events in match: 3576

Available event types:
type
{'id': 30, 'name': 'Pass'}             1002
{'id': 42, 'name': 'Ball Receipt*'}     924
{'id': 43, 'name': 'Carry'}             742
{'id': 17, 'name': 'Pressure'}          363
{'id': 2, 'name': 'Ball Recovery'}      102
{'id': 4, 'name': 'Duel'}                83
{'id': 9, 'name': 'Clearance'}           59
{'id': 6, 'name': 'Block'}               42
{'id': 23, 'name': 'Goal Keeper'}        35
{'id': 38, 'name': 'Miscontrol'}         34
{'id': 14, 'name': 'Dribble'}            31
{'id': 3, 'name': 'Dispossessed'}        29
{'id': 16, 'name': 'Shot'}               27
{'id': 10, 'name': 'Interception'}       25
{'id': 39, 'name': 'Dribbled Past'}      19
Name: count, dtype: int64

Sample event data columns:
['id', 'index', 'period', 'timestamp', 'minute', 'second', 'type', 'possession', 'possession_team', 'play_pattern', 'team', 'duration', 'tactics', 'related_events', 'player', 'position', 'location', 'pass', 'carry', 'under_pr

In [6]:
# Analyze player-specific metrics from events
player_events = events_df[events_df['player'].notna()].copy()
print(f"Events with player data: {len(player_events)}")

# Get unique players and their basic stats
player_stats = []
for player_name in player_events['player'].unique()[:10]:  # First 10 players
    player_data = player_events[player_events['player'] == player_name]
    
    stats = {
        'player_name': player_name,
        'total_events': len(player_data),
        'passes': len(player_data[player_data['type'] == 'Pass']),
        'shots': len(player_data[player_data['type'] == 'Shot']),
        'dribbles': len(player_data[player_data['type'] == 'Dribble']),
        'tackles': len(player_data[player_data['type'] == 'Tackle']),
        'interceptions': len(player_data[player_data['type'] == 'Interception']),
    }
    
    if len(player_data) > 0:
        stats['team'] = player_data['team'].iloc[0]
        stats['position'] = player_data['position'].iloc[0] if 'position' in player_data.columns else 'Unknown'
    
    player_stats.append(stats)

player_stats_df = pd.DataFrame(player_stats)
print("\nSample player statistics:")
print(player_stats_df)

Events with player data: 3565


TypeError: unhashable type: 'dict'

# Data Analysis Summary

Based on the exploration above, we have rich football data that includes:

## Available Data Types:
1. **Competitions & Seasons**: Different leagues and time periods
2. **Matches**: Game-level information with teams and results
3. **Lineups**: Player rosters and positions for each match
4. **Events**: Detailed play-by-play data with player actions

## Key Metrics for Player Compatibility:
- **Technical Skills**: Pass accuracy, dribble success, touch quality
- **Tactical Behavior**: Positioning, movement patterns, decision-making
- **Physical Attributes**: Distance covered, sprint frequency, duels won
- **Team Interaction**: Pass networks, assist patterns, defensive coordination