In [4]:
# IMPORT NECESSARY LIBRARIES
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from clean_data import clean_match_data
from google.colab import drive
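# TODO: decide whether scipy is needed and import it here if so
# TODO: decide whether scikit-learn is needed and import it here if so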

In [2]:
# MOUNT GOOGLE DRIVE
# Attach Google Drive at /content/drive; the dataset paths used further down
# live underneath this mount point.
drive.mount("/content/drive")

Mounted at /content/drive


# 1. Data Cleaning

In [14]:
# BASE DATASET PATH
# Folder on the mounted Google Drive that holds the raw match CSV files;
# it is the directory handed to clean_match_data() further down.
DATA_DIR: Path = Path("/content/drive/MyDrive/Datasets/big-five-football-xg-data/")

# LOAD & COMBINE ALL CSV FILES
def load_raw_data(csv_dir):
    """Read every ``*.csv`` file directly inside ``csv_dir`` into one DataFrame.

    Parameters
    ----------
    csv_dir : str or Path
        Directory searched (non-recursively) for files matching ``*.csv``.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all files, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``csv_dir`` contains no ``*.csv`` files.
    """
    csv_dir = Path(csv_dir)
    # glob() yields paths in an arbitrary, filesystem-dependent order; sorting
    # makes the row order of the combined frame reproducible between runs.
    csv_files = sorted(csv_dir.glob("*.csv"))
    if not csv_files:
        # pd.concat([]) would also raise ValueError, but with a message that
        # says nothing about the directory being the problem.
        raise ValueError(f"No CSV files found in {csv_dir}")
    frames = [pd.read_csv(csv_file) for csv_file in csv_files]
    return pd.concat(frames, ignore_index=True)

# STANDARDIZE COLUMN NAMES
def standardize_columns(df):
    """Normalise the column labels of ``df`` in place and return ``df``.

    Each label has surrounding whitespace stripped, is lower-cased, and has
    every remaining space replaced with an underscore.
    """
    trimmed = df.columns.str.strip()
    lowered = trimmed.str.lower()
    df.columns = lowered.str.replace(" ", "_")
    return df

# PARSE DATETIME SAFELY
def parse_datetime(df):
    """Build a combined ``datetime`` column from ``date`` and ``time``.

    ``df`` is modified in place and returned:

    * ``date`` is converted with ``pd.to_datetime``; unparseable values
      become ``NaT``.
    * ``time`` is cast to string and left-padded with zeros to five
      characters (so ``"9:30"`` becomes ``"09:30"``).
    * ``datetime`` is parsed from ``"<date> <time>"``; anything that does not
      parse becomes ``NaT``.
    """
    parsed_dates = pd.to_datetime(df["date"], errors="coerce")
    df["date"] = parsed_dates

    padded_times = df["time"].astype(str).str.zfill(5)
    df["time"] = padded_times

    stamp_text = df["date"].astype(str) + " " + df["time"]
    df["datetime"] = pd.to_datetime(stamp_text, errors="coerce")
    return df

# CLEAN xG COLUMNS
def clean_xg(df, min_xg=0, max_xg=6):
    """Coerce the xG columns to numeric and blank out implausible values.

    ``df`` is modified in place and returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``xg_home`` and ``xg_away`` columns.
    min_xg, max_xg : float, optional
        Inclusive sanity bounds. Values below ``min_xg`` or above ``max_xg``
        are replaced with NaN. The defaults (0 and 6) match the limits that
        were previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with both xG columns numeric.
    """
    for col in ("xg_home", "xg_away"):
        # Non-numeric entries become NaN instead of raising.
        values = pd.to_numeric(df[col], errors="coerce")
        # Comparisons with NaN are False, so existing NaNs stay untouched.
        out_of_range = (values < min_xg) | (values > max_xg)
        df[col] = values.mask(out_of_range)
    return df

# PARSE SCORE INTO GOALS
def parse_score(df):
    """Split the ``score`` column into numeric ``home_goals`` / ``away_goals``.

    ``df`` is modified in place and returned. ``score`` is cast to string and
    en/em dashes are replaced with a plain hyphen before splitting on ``-``.
    The first piece becomes ``home_goals`` and the second ``away_goals``;
    pieces that are missing or not numeric become NaN.
    """
    df["score"] = (
        df["score"]
        .astype(str)
        .str.replace("–", "-", regex=False)
        .str.replace("—", "-", regex=False)
    )
    # split(expand=True) only creates as many columns as the widest row needs,
    # so a frame with no "-" anywhere (or no rows at all) would lack column 1
    # and raise KeyError. Reindexing guarantees both columns exist.
    goals = df["score"].str.split("-", expand=True).reindex(columns=[0, 1])
    df["home_goals"] = pd.to_numeric(goals[0], errors="coerce")
    df["away_goals"] = pd.to_numeric(goals[1], errors="coerce")
    return df

# CLEAN TEAM NAMES
def clean_team_names(df):
    """Strip surrounding whitespace from the ``home`` and ``away`` team names.

    ``df`` is modified in place and returned. Missing names stay missing.
    """
    for col in ("home", "away"):
        names = df[col]
        # astype(str) alone would turn NaN/None into the literal strings
        # "nan"/"None", which a later notna() check could never catch.
        # where() puts the missing values back after stripping.
        df[col] = names.astype(str).str.strip().where(names.notna())
    return df

# REMOVE DUPLICATES
def remove_duplicates(df):
    """Return ``df`` without repeated matches.

    Two rows count as the same match when their ``datetime``, ``home`` and
    ``away`` values are all equal; the first occurrence is kept. The input
    frame is not modified.
    """
    match_key = ["datetime", "home", "away"]
    return df.drop_duplicates(subset=match_key)

# DROP INDEX COLUMNS
def drop_index_columns(df):
    """Return a copy of ``df`` without columns whose name starts with ``unnamed``."""
    leftover = []
    for name in df.columns:
        if name.startswith("unnamed"):
            leftover.append(name)
    return df.drop(columns=leftover)

# LOGICAL VALIDATION
def validate_rows(df):
    """Keep only rows where every required field is present.

    A row survives when ``datetime``, ``home``, ``away``, ``xg_home``,
    ``xg_away``, ``home_goals`` and ``away_goals`` are all non-null. The
    result is an independent copy; the input frame is left untouched.
    """
    is_complete = df["datetime"].notna()
    for required in ("home", "away", "xg_home", "xg_away", "home_goals", "away_goals"):
        is_complete = is_complete & df[required].notna()
    return df.loc[is_complete].copy()

# FINAL CLEANING PIPELINE
def clean_match_data(csv_dir):
    """Run the full cleaning pipeline on the raw match CSVs in ``csv_dir``.

    Steps, in order: load and combine the CSV files, standardise column
    names, parse the kickoff datetime, clean the xG columns, parse the score
    into goals, clean team names, drop duplicate matches, drop leftover index
    columns, and discard rows with missing required fields.

    Parameters
    ----------
    csv_dir : str or Path
        Directory containing the raw ``*.csv`` files.

    Returns
    -------
    pandas.DataFrame
        Cleaned matches in chronological order with a fresh 0..n-1 index.

    Notes
    -----
    This definition shadows the ``clean_match_data`` name imported from
    ``clean_data`` in the notebook's import cell.
    """
    df = load_raw_data(csv_dir)
    df = standardize_columns(df)
    df = parse_datetime(df)
    df = clean_xg(df)
    df = parse_score(df)
    df = clean_team_names(df)
    df = remove_duplicates(df)
    df = drop_index_columns(df)
    df = validate_rows(df)
    # Sort chronologically (CRITICAL). Many matches share a kickoff time, so
    # home/away are used as tie-breakers; with duplicates already removed on
    # (datetime, home, away) this makes the row order fully deterministic
    # instead of depending on the order the files were read in.
    df = df.sort_values(["datetime", "home", "away"]).reset_index(drop=True)
    return df

In [15]:
# SAVE CLEANED DATA
df_clean = clean_match_data(DATA_DIR)

# Derive the output folder from DATA_DIR rather than repeating the absolute
# path, so the two locations cannot drift apart. The raw-data loader globs
# "*.csv" non-recursively, so files in this subfolder are not read back in
# as raw input.
OUTPUT_DIR = DATA_DIR / "processed"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

clean_path = OUTPUT_DIR / "matches_cleaned.csv"
df_clean.to_csv(clean_path, index=False)

print(f"Saved cleaned data to: {clean_path}")
print(f"Rows: {len(df_clean)}")

Saved cleaned data to: /content/drive/MyDrive/Datasets/big-five-football-xg-data/processed/matches_cleaned.csv
Rows: 9034


# 2. Feature Engineering

In [17]:
# LOAD CLEANED DATA
# Re-read the cleaned matches from disk, asking pandas to parse the
# "datetime" column as datetimes while loading.
df_clean = pd.read_csv(clean_path, parse_dates=["datetime"])
