# Soccer Analytics — Data Exploration Notebook

This notebook loads the StatsBomb Parquet extracts, inspects the main tables (matches, events, lineups, three-sixty, reference), and performs basic data-quality checks and small exploratory analyses. Use the sections below to navigate the notebook.

Sections:
- Setup and imports
- Data loading and preview (matches, events, lineups, three-sixty, reference)
- Quick checks (date range, coverage, orphaned records)
- Small examples (freeze frame, shots)
- Splitting dataset for modeling

Note: run the cells from top to bottom (ideally via Restart Kernel → Run All) so that every variable a cell depends on has already been defined.

In [None]:
# Setup and imports
# This cell installs small deps (if needed), imports libraries and sets display options.
!pip install matplotlib && pip install -q pandas
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings
import seaborn as sns
from pathlib import Path

# Suppress common warnings for cleaner notebook output
warnings.filterwarnings("ignore")

# Display options for notebooks
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 120)

# Data directory
DATA_DIR = Path("..") / "data" / "Statsbomb"
print(f"DATA_DIR: {DATA_DIR}")
print('Files present:', sorted([p.name for p in DATA_DIR.iterdir() if p.is_file()]))


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip

In [90]:
# Matches table — each row describes a single match.
# Read the parquet extract if it is present; otherwise stop with a clear error.
matches_path = DATA_DIR.joinpath("matches.parquet")
if matches_path.exists():
    matches = pd.read_parquet(matches_path)
else:
    raise FileNotFoundError(f"Missing {matches_path}. Make sure data is downloaded into data/Statsbomb")

# Report the dimensions, then render the first rows as a quick sanity check
print('Matches shape:', matches.shape)
matches.head(3)

Matches shape: (3464, 36)


Unnamed: 0,match_id,match_date,match_week,match_status,match_status_360,kickoff,home_score,away_score,competition_id,competition,competition_stage,season_id,season,home_team_id,home_team,home_managers,away_team_id,away_team,away_managers,stadium_id,stadium,referee_id,referee,last_updated,last_updated_360,data_version,shot_fidelity_version,xy_fidelity_version,competition_name,gender,is_youth,is_international,country_name,season_name,match_updated,match_available_360
0,9880,2018-04-14,32,available,scheduled,16:15:00,2,1,11,La Liga,Regular Season,1,2017/2018,217,Barcelona,"[{""id"":227,""name"":""Ernesto Valverde Tejedor"",""...",207,Valencia,"[{""id"":211,""name"":""Marcelino García Toral"",""ni...",342.0,Spotify Camp Nou,2728.0,Carlos del Cerro Grande,2023-02-08T17:23:53.901920,2021-06-13T16:17:31.694,1.1.0,2,2,La Liga,male,False,False,Spain,2017/2018,2025-07-14T10:01:16.674864,
1,9912,2018-04-29,35,available,scheduled,20:45:00,2,4,11,La Liga,Regular Season,1,2017/2018,219,RC Deportivo La Coruña,"[{""id"":371,""name"":""Clarence Seedorf"",""nickname...",217,Barcelona,"[{""id"":227,""name"":""Ernesto Valverde Tejedor"",""...",4658.0,Estadio Abanca-Riazor,2602.0,Ricardo De Burgos Bengoetxea,2022-12-05T14:42:44.641092,2021-06-13T16:17:31.694,1.1.0,2,2,La Liga,male,False,False,Spain,2017/2018,2025-07-14T10:01:16.674864,
2,9924,2018-05-06,36,available,scheduled,20:45:00,2,2,11,La Liga,Regular Season,1,2017/2018,217,Barcelona,"[{""id"":227,""name"":""Ernesto Valverde Tejedor"",""...",220,Real Madrid,"[{""id"":56,""name"":""Zinédine Zidane"",""nickname"":...",342.0,Spotify Camp Nou,2608.0,Alejandro José Hernández Hernández,2022-12-01T03:25:12.063586,2021-06-13T16:17:31.694,1.1.0,2,2,La Liga,male,False,False,Spain,2017/2018,2025-07-14T10:01:16.674864,


In [121]:
# --- Matches: Enhanced Deep EDA ---
# Part 1: banner, row count, date range, temporal distribution and coverage gaps.
# NOTE: this section modifies `matches` in place — `match_date` is converted to
# datetime64 and the helper columns `year`, `month`, `day_of_week` are added.
print('\n' + '='*80)
print('DEEP DIVE: matches (Enhanced)')
print('='*80)

# 1) Basic counts and date range
n_matches = len(matches)
print(f'Total matches: {n_matches:,}')

# Date analysis — unparseable values become NaT (errors='coerce') instead of raising,
# so report how many there are rather than letting them vanish silently.
matches['match_date'] = pd.to_datetime(matches['match_date'], errors='coerce')
n_bad_dates = int(matches['match_date'].isna().sum())
if n_bad_dates:
    print(f'⚠ Matches with missing/unparseable match_date: {n_bad_dates}')

min_date = matches['match_date'].min()
max_date = matches['match_date'].max()
date_span_days = (max_date - min_date).days
print(f'Date range: {min_date} → {max_date} ({date_span_days:,} days)')

# Temporal distribution
matches['year'] = matches['match_date'].dt.year
matches['month'] = matches['match_date'].dt.month
matches['day_of_week'] = matches['match_date'].dt.day_name()

print('\nMatches by year:')
year_dist = matches['year'].value_counts().sort_index()
for yr, cnt in year_dist.items():
    # `year` is float when any date is NaT; cast so labels read 2015, not 2015.0
    print(f'  {int(yr)}: {cnt:4d} matches')

print('\nMatches by day of week:')
dow_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
dow_dist = matches['day_of_week'].value_counts().reindex(dow_order, fill_value=0)
for day, cnt in dow_dist.items():
    pct = cnt/n_matches*100
    print(f'  {day:9s}: {cnt:4d} ({pct:5.2f}%)')

# Detect gaps in coverage.
# Keep the dates as datetime64 (normalize() drops any time-of-day) instead of
# converting to Python `date` objects: the diff of the sorted unique dates is then
# a timedelta64 Series, which is what the `.dt.days` accessor requires.
match_dates = matches['match_date'].dt.normalize().value_counts().sort_index()
date_diffs = pd.Series(match_dates.index).diff().dt.days.dropna()
long_gaps = date_diffs[date_diffs > 30]
if not long_gaps.empty:
    print(f'\nLong gaps in coverage (>30 days): {len(long_gaps)} instances')
    print(f'  Longest gap: {date_diffs.max():.0f} days')

# 2) Competition Analysis
# Matches per competition; rows without a competition name are bucketed as 'MISSING'.
comp_counts = matches['competition_name'].fillna('MISSING').value_counts(dropna=False)
print('\n--- Competition Coverage ---')
print(f'Total competitions: {len(comp_counts)}')
print('\nTop 15 competitions:')
for i, (comp, cnt) in enumerate(comp_counts.iloc[:15].items(), start=1):
    pct = cnt/n_matches*100
    print(f'{i:2d}. {comp:40s}: {cnt:4d} ({pct:5.2f}%)')

# Competition-Season coverage: one row per (competition, season) with its match count
print('\n--- Competition × Season Coverage ---')
season_sizes = matches.groupby(['competition_name', 'season_name']).size()
comp_season_counts = season_sizes.reset_index(name='matches')

# Walk the five largest competitions and summarise how evenly their seasons are covered
top_comps = comp_counts.iloc[:5].index
for comp in top_comps:
    is_comp = comp_season_counts['competition_name'].eq(comp)
    comp_data = comp_season_counts.loc[is_comp]
    per_season = comp_data['matches']
    n_seasons = len(comp_data)
    avg_matches = per_season.mean()
    std_matches = per_season.std()

    print(f'\n{comp}:')
    print(f'  Seasons covered: {n_seasons}')
    print(f'  Avg matches/season: {avg_matches:.1f} (±{std_matches:.1f})')

    # The outlier check needs at least three seasons to be meaningful
    if n_seasons <= 2:
        continue

    # A season with fewer than half the first-quartile match count looks incomplete
    q1 = per_season.quantile(0.25)
    outliers = comp_data[per_season < q1 * 0.5]
    if outliers.empty:
        continue

    print(f'  ⚠ Potential incomplete seasons ({len(outliers)}):')
    for rec in outliers.iloc[:3].itertuples(index=False):
        print(f'    - {rec.season_name}: {rec.matches} matches')

# 3) Team Analysis
# Stack the home and away columns so each match contributes one appearance per side.
all_teams = pd.concat([matches['home_team'], matches['away_team']])
team_counts = all_teams.value_counts()

# value_counts() is sorted descending, so the first/last entries are the extremes
most_team, most_n = team_counts.index[0], team_counts.iloc[0]
least_team, least_n = team_counts.index[-1], team_counts.iloc[-1]

print('\n--- Team Coverage ---')
print(f'Total unique teams: {len(team_counts)}')
print(f'Most frequent team: {most_team} ({most_n} appearances)')
print(f'Least frequent team: {least_team} ({least_n} appearances)')

# Teams appearing in multiple competitions:
# build a long (competition, team) table from both sides, then count distinct
# competitions per team.
team_comp_pairs = pd.concat(
    [
        matches[['competition_name', side]].rename(columns={side: 'team'})
        for side in ('home_team', 'away_team')
    ],
    ignore_index=True,
)
comps_per_team = team_comp_pairs.groupby('team')['competition_name'].nunique()
team_comp_counts = comps_per_team.sort_values(ascending=False)

multi_comp_teams = team_comp_counts.loc[team_comp_counts.gt(1)]
print(f'\nTeams in multiple competitions: {len(multi_comp_teams)}')
if len(multi_comp_teams) > 0:
    print('Top 5 multi-competition teams:')
    for team, n_comps in multi_comp_teams.iloc[:5].items():
        print(f'  {team}: {n_comps} competitions')

# 4) Score Analysis
# Outcome split, goal statistics, scorelines, goal difference and per-competition
# home win rate. NOTE: adds the helper columns `scoreline`, `total_goals` and
# `goal_diff` to `matches` in place.
print("\n--- Match Outcomes ---")

# Only matches with both scores recorded can be classified. Comparisons against
# NaN are False, so dividing by every row would understate all three outcome
# rates whenever a score is missing.
has_score = matches['home_score'].notna() & matches['away_score'].notna()
n_scored = int(has_score.sum())
if n_scored < n_matches:
    print(f'⚠ {n_matches - n_scored} matches have a missing score and are excluded from the rates below')
score_denom = max(n_scored, 1)  # keeps the divisions defined if no match has a score

# Basic outcome distribution
home_win_mask = matches['home_score'] > matches['away_score']
home_wins = home_win_mask.sum()
away_wins = (matches['away_score'] > matches['home_score']).sum()
draws = (matches['home_score'] == matches['away_score']).sum()

print('Results:')
print(f'  Home wins: {home_wins:5d} ({home_wins/score_denom*100:5.2f}%)')
print(f'  Draws:     {draws:5d} ({draws/score_denom*100:5.2f}%)')
print(f'  Away wins: {away_wins:5d} ({away_wins/score_denom*100:5.2f}%)')

# Goal statistics (sum() skips missing scores)
total_home_goals = matches['home_score'].sum()
total_away_goals = matches['away_score'].sum()
total_goals = total_home_goals + total_away_goals

print('\nGoal statistics:')
print(f'  Total goals: {total_goals:.0f}')
print(f'  Avg goals/match: {total_goals/score_denom:.2f}')
print(f'  Avg home goals: {total_home_goals/score_denom:.2f}')
print(f'  Avg away goals: {total_away_goals/score_denom:.2f}')

# Scoreline distribution.
# Built column-wise instead of a row-wise apply(): faster, and a missing score no
# longer makes int() raise. fillna(0) is only a placeholder so the integer cast
# succeeds; .where(has_score) blanks those rows again afterwards.
home_txt = matches['home_score'].fillna(0).astype(int).astype(str)
away_txt = matches['away_score'].fillna(0).astype(int).astype(str)
matches['scoreline'] = (home_txt + '-' + away_txt).where(has_score)
scoreline_dist = matches['scoreline'].value_counts()

print('\nMost common scorelines:')
for i, (score, cnt) in enumerate(scoreline_dist.head(10).items(), 1):
    pct = cnt/score_denom*100
    print(f'{i:2d}. {score:6s}: {cnt:4d} ({pct:5.2f}%)')

# High/low scoring matches
matches['total_goals'] = matches['home_score'] + matches['away_score']
high_scoring = matches[matches['total_goals'] >= 6]
low_scoring = matches[matches['total_goals'] == 0]

print(f'\nHigh-scoring matches (6+ goals): {len(high_scoring)} ({len(high_scoring)/score_denom*100:.2f}%)')
print(f'Goalless draws (0-0): {len(low_scoring)} ({len(low_scoring)/score_denom*100:.2f}%)')

# Goal difference distribution
matches['goal_diff'] = matches['home_score'] - matches['away_score']
gd_dist = matches['goal_diff'].value_counts().sort_index()

print('\nGoal difference distribution (home - away):')
for gd in range(-5, 6):
    if gd in gd_dist.index:
        cnt = gd_dist[gd]
        pct = cnt/score_denom*100
        bar = '█' * int(pct/2)  # one block per 2 percentage points
        print(f'  {gd:+3d}: {cnt:4d} ({pct:5.2f}%) {bar}')

# Competition-specific home advantage.
# One grouped aggregation yields both the win rate (mean of the boolean mask) and
# the group size, replacing groupby().apply(lambda) plus a re-filter of `matches`
# for every printed competition.
# NOTE(review): the ranking is by win rate, so very small competitions can lead it.
print('\nHome win rate by competition (top 5):')
comp_home = home_win_mask.groupby(matches['competition_name']).agg(['mean', 'size'])
comp_home_wr = (comp_home['mean'] * 100).sort_values(ascending=False)

for comp, wr in comp_home_wr.head(5).items():
    print(f'  {comp:40s}: {wr:5.2f}% ({comp_home.loc[comp, "size"]} matches)')

# 5) Data Quality Assessment
# Missing values, duplicate keys, dtype mix and implausible scores for `matches`.
print('\n--- Data Quality ---')

# Earlier parts of this cell add helper columns to `matches`. Leave them out of the
# quality summary so it describes the loaded data, not artefacts of this analysis
# (a single unparseable date would otherwise be counted again in year / month /
# day_of_week, and the column total would be inflated).
derived_cols = {'year', 'month', 'day_of_week', 'scoreline', 'total_goals', 'goal_diff'}
raw_cols = [c for c in matches.columns if c not in derived_cols]

# Missing values
miss = matches[raw_cols].isnull().sum()
miss_pct = (miss / len(matches) * 100).round(2)
miss_df = pd.DataFrame({'count': miss, 'pct': miss_pct})
miss_df = miss_df[miss_df['count'] > 0].sort_values('count', ascending=False)

if miss_df.empty:
    print('✓ No missing values in any column')
else:
    print(f'Columns with missing values: {len(miss_df)}/{len(raw_cols)}')
    print('\nTop 10 columns by missing data:')
    for col, row in miss_df.head(10).iterrows():
        print(f'  {col:40s}: {row["count"]:6.0f} ({row["pct"]:5.2f}%)')

# Duplicates by match_id
if 'match_id' in matches.columns:
    dup_id_count = matches['match_id'].duplicated().sum()
    print(f'\nDuplicate match_ids: {dup_id_count}')
    if dup_id_count > 0:
        print('⚠ Warning: Duplicate match_ids detected!')
        # keep=False marks every member of a duplicated group, not just the repeats
        dup_examples = matches[matches['match_id'].duplicated(keep=False)].sort_values('match_id').head(3)
        print('Sample duplicates:')
        display(dup_examples[['match_id', 'home_team', 'away_team']])

# Duplicates by logical key (date, teams)
dup_logical = matches.duplicated(subset=['match_date', 'home_team', 'away_team']).sum()
print(f'Duplicate matches by (date, home_team, away_team): {dup_logical}')
if dup_logical > 0:
    print('⚠ Warning: Same teams playing on same date multiple times!')

# Data type summary (source columns only)
print('\nData type summary:')
dtype_counts = matches[raw_cols].dtypes.value_counts()
for dtype, cnt in dtype_counts.items():
    print(f'  {str(dtype):20s}: {cnt} columns')

# Unusual values check: flag any side scoring strictly more than this many goals
high_score_threshold = 10
max_home = matches['home_score'].max()
max_away = matches['away_score'].max()

if max_home > high_score_threshold or max_away > high_score_threshold:
    print('\n⚠ Unusually high scores detected:')
    print(f'  Max home score: {max_home:.0f}')
    print(f'  Max away score: {max_away:.0f}')

    extreme = matches[
        (matches['home_score'] > high_score_threshold)
        | (matches['away_score'] > high_score_threshold)
    ]
    if not extreme.empty:
        # The filter is strict (> threshold), so the label says "more than", not "10+"
        print(f'  Matches with more than {high_score_threshold} goals for one team: {len(extreme)}')
        print('  Examples:')
        display(extreme[['match_date', 'home_team', 'away_team', 'home_score', 'away_score']].head(3))

print('\n' + '='*80)
print('Matches deep EDA complete.')
print('='*80)


DEEP DIVE: matches (Enhanced)
Total matches: 3,464
Date range: 1958-06-24 00:00:00 → 2025-07-27 00:00:00 (24,505 days)

Matches by year:
  1958:    2 matches
  1962:    1 matches
  1970:    6 matches
  1971:    1 matches
  1972:    1 matches
  1973:    1 matches
  1974:    7 matches
  1977:    1 matches
  1978:    1 matches
  1979:    1 matches
  1981:    1 matches
  1983:    1 matches
  1984:    1 matches
  1986:    4 matches
  1989:    3 matches
  1990:    1 matches
  1997:    1 matches
  1999:    1 matches
  2003:   19 matches
  2004:   25 matches
  2005:   13 matches
  2006:   16 matches
  2007:   33 matches
  2008:   27 matches
  2009:   31 matches
  2010:   37 matches
  2011:   36 matches
  2012:   39 matches
  2013:   27 matches
  2014:   37 matches
  2015:  893 matches
  2016:  967 matches
  2017:   39 matches
  2018:  194 matches
  2019:  190 matches
  2020:  120 matches
  2021:  204 matches
  2022:  193 matches
  2023:  105 matches
  2024:  153 matches
  2025:   31 matches



Unnamed: 0,match_date,home_team,away_team,home_score,away_score
2031,2019-12-01,Arsenal WFC,Bristol City WFC,11,1
3068,2019-06-11,United States Women's,Thailand Women's,13,0



Matches deep EDA complete.
