In [1]:
import os
from pathlib import Path

import pandas as pd

# Resolve the project root as the parent of the current working directory
# (the notebook is expected to be run from a folder one level below the root),
# then derive the raw-data folder from it.
project_root = Path(os.pardir).resolve()
data_dir = project_root.joinpath('data', 'raw')

print(f"Project root: {project_root}")
print(f"Data dir: {data_dir}")

Project root: /Users/vishalsingh/Projects/Football-Fantasy-Manager
Data dir: /Users/vishalsingh/Projects/Football-Fantasy-Manager/data/raw


In [2]:
# Read the fixtures table from the Parquet file written by the data ingestion
# pipeline and preview it.
# NOTE(review): the file is labelled 2024-25, but the recorded output shows
# kickoff_time values from 2025-08 to 2026-05 -- confirm the season label.

fixtures_path = data_dir.joinpath('fixtures_2024-25.parquet')
print(fixtures_path)

if not fixtures_path.exists():
    # Nothing is loaded in this case, so fixtures_df is left undefined.
    print(f"Fixtures file not found at {fixtures_path}")
else:
    fixtures_df = pd.read_parquet(fixtures_path)
    # First rows, followed by summary statistics for every column (all dtypes).
    display(fixtures_df.head(), fixtures_df.describe(include='all'))

/Users/vishalsingh/Projects/Football-Fantasy-Manager/data/raw/fixtures_2024-25.parquet


Unnamed: 0,match_id,season,gameweek,team_id,opponent_team_id,was_home,fixture_difficulty,kickoff_time,finished,team_a_score,team_h_score,team_name,opponent_team_name,days_rest,is_double_gw
0,9,2024-25,1,1,14,False,3,2025-08-17 15:30:00+00:00,True,1.0,0.0,Arsenal,Man Utd,0.0,False
1,11,2024-25,2,1,11,True,2,2025-08-23 16:30:00+00:00,True,0.0,5.0,Arsenal,Leeds,6.041667,False
2,25,2024-25,3,1,12,False,4,2025-08-31 15:30:00+00:00,True,0.0,1.0,Arsenal,Liverpool,7.958333,False
3,31,2024-25,4,1,16,True,2,2025-09-13 11:30:00+00:00,True,0.0,3.0,Arsenal,Nott'm Forest,12.833333,False
4,41,2024-25,5,1,13,True,4,2025-09-21 15:30:00+00:00,True,1.0,1.0,Arsenal,Man City,8.166667,False


Unnamed: 0,match_id,season,gameweek,team_id,opponent_team_id,was_home,fixture_difficulty,kickoff_time,finished,team_a_score,team_h_score,team_name,opponent_team_name,days_rest,is_double_gw
count,760.0,760,760.0,760.0,760.0,760,760.0,760,760,440.0,440.0,760,760,760.0,760
unique,,1,,,,2,,,2,,,20,20,,1
top,,2024-25,,,,False,,,True,,,Arsenal,Man Utd,,False
freq,,760,,,,380,,,440,,,38,38,,760
mean,190.5,,19.5,10.5,10.5,,2.95,2026-01-04 20:00:40.263158016+00:00,,1.186364,1.554545,,,7.384265,
min,1.0,,1.0,1.0,1.0,,2.0,2025-08-15 19:00:00+00:00,,0.0,0.0,,,0.0,
25%,95.75,,10.0,5.75,5.75,,2.0,2025-11-01 16:52:30+00:00,,0.0,1.0,,,5.848958,
50%,190.5,,19.5,10.5,10.5,,3.0,2026-01-02 16:15:00+00:00,,1.0,1.5,,,7.0,
75%,285.25,,29.0,15.25,15.25,,3.0,2026-03-04 20:00:00+00:00,,2.0,2.0,,,8.0,
max,380.0,,38.0,20.0,20.0,,5.0,2026-05-24 15:00:00+00:00,,5.0,5.0,,,20.958333,


In [3]:
# Quick fixture-level exploration: column names, gameweek coverage, and the
# number of fixtures at each difficulty rating per team.

if 'fixtures_df' in globals():
    print("Columns:", fixtures_df.columns.tolist())

    # .tolist() converts NumPy scalars to plain Python numbers so the list is
    # printed as [1, 2, ...] rather than [np.int64(1), np.int64(2), ...].
    print("\nGameweeks present:", sorted(fixtures_df['gameweek'].dropna().unique().tolist()))

    # Distribution of fixture difficulty for each team:
    # rows = team_name, columns = fixture_difficulty, values = fixture counts
    # (difficulty levels a team never faces are filled with 0).
    difficulty_by_team = (
        fixtures_df
        .groupby(['team_name'])['fixture_difficulty']
        .value_counts()
        .unstack(fill_value=0)
        .sort_index()
    )
    display(difficulty_by_team.head())
else:
    # Make the skip visible instead of silently producing no output.
    print("fixtures_df is not defined; run the fixtures loading cell first.")

Columns: ['match_id', 'season', 'gameweek', 'team_id', 'opponent_team_id', 'was_home', 'fixture_difficulty', 'kickoff_time', 'finished', 'team_a_score', 'team_h_score', 'team_name', 'opponent_team_name', 'days_rest', 'is_double_gw']

Gameweeks present: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14), np.int64(15), np.int64(16), np.int64(17), np.int64(18), np.int64(19), np.int64(20), np.int64(21), np.int64(22), np.int64(23), np.int64(24), np.int64(25), np.int64(26), np.int64(27), np.int64(28), np.int64(29), np.int64(30), np.int64(31), np.int64(32), np.int64(33), np.int64(34), np.int64(35), np.int64(36), np.int64(37), np.int64(38)]


fixture_difficulty,2,3,4,5
team_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Arsenal,11,21,6,0
Aston Villa,11,19,7,1
Bournemouth,11,20,6,1
Brentford,11,19,7,1
Brighton,11,19,7,1


In [4]:
# Optional: inspect the raw JSON files for deeper inspection.
# Both files are checked for presence and size; only the fixtures file is
# loaded into a DataFrame.

bootstrap_path = data_dir / 'bootstrap-static_2024-25.json'
fixtures_json_path = data_dir / 'fixtures_2024-25.json'

for path in (bootstrap_path, fixtures_json_path):
    print("\n", path)
    if path.exists():
        print("Exists. File size (bytes):", path.stat().st_size)
    else:
        print("Not found.")

# Load the fixtures JSON explicitly rather than as a side effect of the loop,
# so df_sample does not depend on iteration order and is always (re)assigned:
# a DataFrame when the file exists, None otherwise (never a stale value from
# an earlier run).
df_sample = (
    pd.read_json(fixtures_json_path, orient='records')
    if fixtures_json_path.exists()
    else None
)


 /Users/vishalsingh/Projects/Football-Fantasy-Manager/data/raw/bootstrap-static_2024-25.json
Exists. File size (bytes): 2521535

 /Users/vishalsingh/Projects/Football-Fantasy-Manager/data/raw/fixtures_2024-25.json
Exists. File size (bytes): 1542958


In [5]:
# Summarise the fixtures JSON frame: dimensions and column index on separate
# lines, then the first rows as the cell's rich output.
print(df_sample.shape, df_sample.columns, sep='\n')
df_sample.head()

(380, 17)
Index(['code', 'event', 'finished', 'finished_provisional', 'id',
       'kickoff_time', 'minutes', 'provisional_start_time', 'started',
       'team_a', 'team_a_score', 'team_h', 'team_h_score', 'stats',
       'team_h_difficulty', 'team_a_difficulty', 'pulse_id'],
      dtype='object')


Unnamed: 0,code,event,finished,finished_provisional,id,kickoff_time,minutes,provisional_start_time,started,team_a,team_a_score,team_h,team_h_score,stats,team_h_difficulty,team_a_difficulty,pulse_id
0,2561895,1,True,True,1,2025-08-15 19:00:00+00:00,90,False,True,4,2.0,12,4.0,"[{'identifier': 'goals_scored', 'a': [{'value'...",3,4,124791
1,2561896,1,True,True,2,2025-08-16 11:30:00+00:00,90,False,True,15,0.0,2,0.0,"[{'identifier': 'goals_scored', 'a': [], 'h': ...",3,3,124792
2,2561897,1,True,True,3,2025-08-16 14:00:00+00:00,90,False,True,10,1.0,6,1.0,"[{'identifier': 'goals_scored', 'a': [{'value'...",2,3,124793
3,2561900,1,True,True,6,2025-08-16 14:00:00+00:00,90,False,True,3,0.0,18,3.0,"[{'identifier': 'goals_scored', 'a': [], 'h': ...",2,3,124796
4,2561899,1,True,True,5,2025-08-16 14:00:00+00:00,90,False,True,19,0.0,17,3.0,"[{'identifier': 'goals_scored', 'a': [], 'h': ...",2,3,124795
