In [1]:
# Script to combine multiple seasons df into one df

import pandas as pd
import os
from glob import glob

# Step 1: Point to the folder with all your CSVs
data_folder = "../data"
csv_files = sorted(glob(os.path.join(data_folder, "*_full.csv")))

# Fail early with an actionable message. Without this guard pd.concat()
# raises a bare "No objects to concatenate" ValueError on an empty list,
# which says nothing about where the files were expected to be.
if not csv_files:
    raise ValueError(f"No '*_full.csv' files found in {data_folder!r}")

# Step 2: Read every season file and remember which file each row came from
all_data = []
for file in csv_files:
    df = pd.read_csv(file)
    df['source_file'] = os.path.basename(file)  # optional: track file of origin
    all_data.append(df)

# ignore_index=True renumbers rows 0..n-1 across all files
combined_df = pd.concat(all_data, ignore_index=True)

# Step 3: Save the combined file next to the inputs.
# The output name does not end in "_full.csv", so a re-run will not pick it
# up as an input.
combined_path = os.path.join(data_folder, "all_seasons_combined.csv")
combined_df.to_csv(combined_path, index=False)

# Step 4: Show basic info (the path is taken from the variable used to write)
print(f"✅ Combined {len(csv_files)} files")
print(f"📊 Total matches: {len(combined_df)}")
print(f"📄 Saved as: {combined_path}")

✅ Combined 21 files
📊 Total matches: 7902
📄 Saved as: ../data/all_seasons_combined.csv


In [2]:
# script to clean the data


# Read back the combined dataset and discard the bookkeeping columns in one
# go; errors='ignore' means a column that is already absent is skipped.
df = pd.read_csv("../data/all_seasons_combined.csv").drop(
    columns=["date", "source_file"], errors='ignore'
)

# Force the three odds columns to numeric; unparseable entries become NaN
for col in ("h_odd", "d_odd", "a_odd"):
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Keep only the first four characters of the season label
df['season'] = df['season'].str.slice(stop=4)

# Break "<home> vs <away>" into one column per team
df[['home_team', 'away_team']] = df['match_name'].str.split(" vs ", expand=True)

# Break "<home>-<away>" into nullable integer score columns
df[['home_score', 'away_score']] = (
    df['result']
    .str.split("-", expand=True)
    .astype("Int64")
)

# ✅ 6. Determine winner
def get_winner(row):
    """Classify one match row by comparing its two score fields.

    Args:
        row: Mapping-like object exposing 'home_score' and 'away_score'.

    Returns:
        "UNKNOWN" when either score is null, otherwise "HOME_TEAM",
        "AWAY_TEAM" or "DRAW" depending on which score is larger.
    """
    home_goals = row['home_score']
    if pd.isnull(home_goals):
        return "UNKNOWN"
    away_goals = row['away_score']
    if pd.isnull(away_goals):
        return "UNKNOWN"

    if home_goals > away_goals:
        return "HOME_TEAM"
    if home_goals < away_goals:
        return "AWAY_TEAM"
    return "DRAW"

# Label every match row with its outcome (row-wise, via get_winner)
df['winner'] = df.apply(get_winner, axis=1)

# ✅ 7. Save cleaned version
# The path is held in one variable so the log line always reports the
# location that was actually written (the old message dropped the "../").
cleaned_path = "../data/all_seasons_cleaned.csv"
df.to_csv(cleaned_path, index=False)
print(f"✅ Cleaned dataset saved as: {cleaned_path}")


✅ Cleaned dataset saved as: data/all_seasons_cleaned.csv


In [3]:
# script for more cleaning and feature engineering of data

# Reload the cleaned dataset written by the previous step and discard the
# raw 'result' column (skipped silently if it is already gone).
df = pd.read_csv("../data/all_seasons_cleaned.csv").drop(
    columns=['result'], errors='ignore'
)

# ✅ Convert American odds to Decimal odds
def american_to_decimal(odd):
    """Convert American (moneyline) odds to decimal odds, rounded to 2 places.

    Positive odds map to ``1 + odd / 100``; negative odds map to
    ``1 + 100 / abs(odd)``.

    Args:
        odd: The American odds value; anything ``float()`` accepts.

    Returns:
        The decimal odds as a float, or None when the input is missing,
        cannot be converted to a number, or is zero (zero has no decimal
        equivalent).
    """
    if pd.isnull(odd):
        return None
    try:
        odd = float(odd)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are treated as "no odds"; anything else
        # (e.g. KeyboardInterrupt) must propagate instead of being swallowed.
        return None
    # float("nan") passes the first null check as a string, so re-check here;
    # zero would otherwise divide by zero in the negative-odds branch.
    if pd.isnull(odd) or odd == 0:
        return None
    if odd > 0:
        return round(1 + odd / 100, 2)
    return round(1 + 100 / abs(odd), 2)

# Rewrite each odds column element by element with american_to_decimal
for col in ('h_odd', 'd_odd', 'a_odd'):
    df[col] = df[col].map(american_to_decimal)

# Persist the converted frame for the feature-engineering step
df.to_csv("../data/all_seasons_ready.csv", index=False)

print("✅ Cleaned odds and removed result column.")
print("📄 Saved as: ../data/all_seasons_ready.csv")


✅ Cleaned odds and removed result column.
📄 Saved as: ../data/all_seasons_ready.csv


In [None]:
import pandas as pd
import numpy as np

# Load the dataset written by the odds-conversion step. The read is
# unconditional: any `df` already in the kernel is replaced.
df = pd.read_csv("../data/all_seasons_ready.csv")  # relative path; assumes the working directory is the notebooks folder
print("✅ Dataset loaded with shape:", df.shape)

# Basic helper to calculate points from a match
def get_points(home_goals, away_goals):
    """Return the ``(home_points, away_points)`` pair for one match.

    The side with more goals gets 3 points and the other 0; any other case
    (neither comparison holds) awards 1 point to each side.
    """
    if home_goals > away_goals:
        return 3, 0
    if home_goals < away_goals:
        return 0, 3
    return 1, 1

# Helper to update form streaks (win/draw/loss)
def update_streak(result, streaks):
    """Advance one streak counter and reset the other two.

    Args:
        result: "W" or "D"; any other value is treated as a loss.
        streaks: Indexable ``(win, draw, loss)`` streak counts.

    Returns:
        A new ``(win, draw, loss)`` tuple in which the counter matching
        ``result`` is one higher than before and the others are zero.
    """
    # Position of the counter that continues: 0 = win, 1 = draw, 2 = loss
    slot = 0 if result == "W" else 1 if result == "D" else 2
    updated = [0, 0, 0]
    updated[slot] = streaks[slot] + 1
    return tuple(updated)


In [None]:
# Running per-team totals, keyed by team name
team_stats = dict()

# Reserved for each team's most recent match date (dates are not tracked yet)
team_last_match = dict()

# Outcome of the most recent meeting, keyed by the (home, away) pair
last_meeting = dict()

# Collects one feature dict per match
engineered_rows = list()


In [None]:
def _new_team_record():
    """Return a fresh, zeroed running-stats record for one team."""
    return {
        'points': 0, 'goals_for': 0, 'goals_against': 0,
        'wins': 0, 'draws': 0, 'losses': 0,
        'win_streak': 0, 'draw_streak': 0, 'loss_streak': 0,
        'matches_played': 0,
        'points_list': [],
        'goals_list': [],
        'goals_against_list': [],
    }


def _record_result(stats, points, scored, conceded):
    """Fold one finished match into a team's running totals and streaks.

    `points` is the 3/1/0 value from get_points; `scored`/`conceded` are the
    goals from this team's point of view.
    """
    stats['matches_played'] += 1
    stats['points'] += points
    stats['goals_for'] += scored
    stats['goals_against'] += conceded

    if points == 3:
        stats['wins'] += 1
        outcome = "W"
    elif points == 1:
        stats['draws'] += 1
        outcome = "D"
    else:
        stats['losses'] += 1
        outcome = "L"

    stats['win_streak'], stats['draw_streak'], stats['loss_streak'] = update_streak(
        outcome,
        (stats['win_streak'], stats['draw_streak'], stats['loss_streak']),
    )


# Start from a clean slate so re-running this cell does not append duplicate
# rows or double-count stats left over from a previous run. clear() keeps the
# same container objects that were created in the setup cell.
team_stats.clear()
last_meeting.clear()
engineered_rows.clear()

# NOTE(review): totals accumulate across the whole frame and are never reset
# between seasons; rows are assumed to be in chronological order — confirm.
# NOTE(review): last_meeting is keyed by the ordered (home, away) pair, so the
# reverse fixture is tracked separately — confirm that this is intended.
for idx, row in df.iterrows():
    home = row['home_team']
    away = row['away_team']

    # Make sure both teams have a running-stats record
    for team in (home, away):
        if team not in team_stats:
            team_stats[team] = _new_team_record()

    # Stats as they stood BEFORE this match (these become the features)
    ht_stats = team_stats[home]
    at_stats = team_stats[away]

    # A row with a missing score has no result. get_points() would score it
    # as a 1-1 draw (both NaN comparisons are False) and adding NaN goals
    # would turn the running totals into NaN for every later match.
    has_result = pd.notna(row['home_score']) and pd.notna(row['away_score'])
    if has_result:
        h_points, a_points = get_points(row['home_score'], row['away_score'])
    else:
        h_points = a_points = np.nan

    # Outcome of the previous meeting for this (home, away) pairing
    ls_winner = last_meeting.get((home, away), "UNKNOWN")

    engineered_rows.append({
        'season': row['season'],
        'match_name': row['match_name'],
        'home_team': home,
        'away_team': away,
        'winner': row['winner'],
        'home_score': row['home_score'],
        'away_score': row['away_score'],
        'h_odd': row['h_odd'],
        'd_odd': row['d_odd'],
        'a_odd': row['a_odd'],

        # Home team stats
        'h_match_points': h_points,
        'ht_points': ht_stats['points'],
        'ht_goals': ht_stats['goals_for'],
        'ht_goals_sf': ht_stats['goals_against'],
        'ht_wins': ht_stats['wins'],
        'ht_draws': ht_stats['draws'],
        'ht_losses': ht_stats['losses'],
        'ht_win_streak': ht_stats['win_streak'],
        'ht_draw_streak': ht_stats['draw_streak'],
        'ht_loss_streak': ht_stats['loss_streak'],

        # Away team stats
        'a_match_points': a_points,
        'at_points': at_stats['points'],
        'at_goals': at_stats['goals_for'],
        'at_goals_sf': at_stats['goals_against'],
        'at_wins': at_stats['wins'],
        'at_draws': at_stats['draws'],
        'at_losses': at_stats['losses'],
        'at_win_streak': at_stats['win_streak'],
        'at_draw_streak': at_stats['draw_streak'],
        'at_loss_streak': at_stats['loss_streak'],

        # Last meeting winner
        'ls_winner': ls_winner,
    })

    # Nothing to learn from a match without a result: leave totals, streaks
    # and the last-meeting record exactly as they were.
    if not has_result:
        continue

    # Update team stats after match
    _record_result(ht_stats, h_points, row['home_score'], row['away_score'])
    _record_result(at_stats, a_points, row['away_score'], row['home_score'])

    # Update last meeting
    if row['winner'] == "HOME_TEAM":
        last_meeting[(home, away)] = home
    elif row['winner'] == "AWAY_TEAM":
        last_meeting[(home, away)] = away
    else:
        last_meeting[(home, away)] = "DRAW"
