In [18]:
# Script to combine multiple seasons df into one df

import pandas as pd
import os
from glob import glob

# Step 1: Point to the folder with all your CSVs
data_folder = "../data"
# Build the output path from data_folder so the write and the status message
# can never disagree. Its name does not match "*_full.csv", so re-running this
# cell does not feed the combined file back into itself.
combined_path = os.path.join(data_folder, "all_seasons_combined.csv")
csv_files = sorted(glob(os.path.join(data_folder, "*_full.csv")))

# Fail early with an actionable message: pd.concat() on an empty list only
# reports "No objects to concatenate", which hides the real cause.
if not csv_files:
    raise FileNotFoundError(
        f"No '*_full.csv' files found in {data_folder!r}; nothing to combine."
    )

# Step 2: Combine all CSVs
all_data = []
for file in csv_files:
    df = pd.read_csv(file)
    df['source_file'] = os.path.basename(file)  # optional: track file of origin
    all_data.append(df)

# ignore_index=True renumbers rows 0..n-1 across the stacked files.
combined_df = pd.concat(all_data, ignore_index=True)

# Step 3: Save the combined file
combined_df.to_csv(combined_path, index=False)

# Step 4: Show basic info
print(f"✅ Combined {len(csv_files)} files")
print(f"📊 Total matches: {len(combined_df)}")
print(f"📄 Saved as: {combined_path}")

✅ Combined 21 files
📊 Total matches: 7902
📄 Saved as: ../data/all_seasons_combined.csv


In [19]:
# script to clean the data


# Load the full dataset
df = pd.read_csv("../data/all_seasons_combined.csv")

# ✅ 1. Drop 'date' and 'source_file'
# errors='ignore' keeps this step working when either column is already absent.
df = df.drop(columns=["date", "source_file"], errors="ignore")

# ✅ 2. Convert odds to float (values that cannot be parsed become NaN)
for col in ["h_odd", "d_odd", "a_odd"]:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# ✅ 3. Extract starting year of season
# NOTE(review): relies on 'season' being read as text whose first four
# characters are the starting year — confirm against the raw CSVs.
df['season'] = df['season'].str[:4]

# ✅ 4. Split match_name
# n=1 splits on the first " vs " only, so the result has at most two columns
# and the two-column assignment cannot fail on an extra separator.
df[['home_team', 'away_team']] = df['match_name'].str.split(" vs ", n=1, expand=True)

# ✅ 5. Split result into home/away score
# Scores that are not plain integers (missing or malformed results) become
# <NA> instead of aborting the whole cell; get_winner() labels such rows
# "UNKNOWN". This mirrors the errors="coerce" handling used for the odds.
score_parts = df['result'].str.split("-", n=1, expand=True)
for part_idx, score_col in enumerate(['home_score', 'away_score']):
    df[score_col] = pd.to_numeric(score_parts[part_idx], errors="coerce").astype("Int64")

# ✅ 6. Determine winner
def get_winner(row):
    """Label the outcome of one match from its two score fields.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexable by
            'home_score' and 'away_score'.

    Returns:
        "UNKNOWN" if either score is null, otherwise "HOME_TEAM",
        "AWAY_TEAM" or "DRAW" according to which score is larger.
    """
    # Lazy generator: the away score is only looked up when the home score
    # is present, exactly like a short-circuiting `or`.
    if any(pd.isnull(row[side]) for side in ('home_score', 'away_score')):
        return "UNKNOWN"

    home_goals = row['home_score']
    away_goals = row['away_score']
    if home_goals > away_goals:
        return "HOME_TEAM"
    if home_goals < away_goals:
        return "AWAY_TEAM"
    return "DRAW"

# Label every row; get_winner returns "UNKNOWN" when a score is missing,
# so the new column never contains nulls.
df['winner'] = df.apply(get_winner, axis=1)

# ✅ 7. Save cleaned version
# One path variable feeds both the write and the message: the message used
# to report "data/..." while the file was written to "../data/...".
cleaned_path = "../data/all_seasons_cleaned.csv"
df.to_csv(cleaned_path, index=False)
print(f"✅ Cleaned dataset saved as: {cleaned_path}")


✅ Cleaned dataset saved as: data/all_seasons_cleaned.csv


In [20]:
# script for more cleaning and feature engineering of data

# Reload the cleaned dataset saved by the previous step and leave out the
# 'result' column (no longer needed). errors='ignore' makes the drop a
# no-op when the column is already absent.
df = pd.read_csv("../data/all_seasons_cleaned.csv").drop(
    columns=['result'],
    errors='ignore',
)

# ✅ Convert American odds to Decimal odds
def american_to_decimal(odd):
    """Convert an American (moneyline) odd to decimal odds.

    Positive odds map to ``1 + odd / 100`` and negative odds to
    ``1 + 100 / abs(odd)``; the result is rounded to two decimals.

    Args:
        odd: A number or numeric string (e.g. ``150``, ``-200``, ``"+120"``).

    Returns:
        The decimal odd as a float, or None when ``odd`` is null, cannot be
        converted to a float, or is zero.
    """
    if pd.isnull(odd):
        return None
    try:
        value = float(odd)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are treated as "no odd". A bare `except:`
        # would also swallow KeyboardInterrupt / SystemExit.
        return None
    if value == 0:
        # Zero would divide by zero in the negative branch below; report it
        # as missing explicitly rather than through a swallowed exception.
        return None
    if value > 0:
        return round(1 + value / 100, 2)
    return round(1 + 100 / abs(value), 2)

# Convert each odds column element-wise; american_to_decimal returns None
# for values it cannot convert.
for col in ['h_odd', 'd_odd', 'a_odd']:
    df[col] = df[col].apply(american_to_decimal)

# Save updated DataFrame
# One path variable feeds both the write and the status message.
ready_path = "../data/all_seasons_ready.csv"
df.to_csv(ready_path, index=False)

print("✅ Cleaned odds and removed result column.")
print(f"📄 Saved as: {ready_path}")


✅ Cleaned odds and removed result column.
📄 Saved as: ../data/all_seasons_ready.csv
