# FDR Imputation Model - Comprehensive Comparison

This notebook compares two approaches to predicting Fixture Difficulty Ratings:
1. **Baseline**: Overall rolling 5-game averages
2. **Home/Away Split**: Separate 6-game rolling averages for home vs away form (FPL official approach)

## FPL Official Approach
> "The FDR is based on a complex algorithm developed by FPL experts. A set of formulas process key Opta data variables, along with **each team's home and away form for the past six matches**, to generate a rank for the perceived difficulty of each Gameweek opponent."

## Goal
Improve exact accuracy (the predicted FDR equals the official FDR) while maintaining high within-1 accuracy (the prediction is off by at most one rating level).

## Setup

In [18]:
import pandas as pd
import numpy as np
import json
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, mean_absolute_error, confusion_matrix, f1_score
import mord
import joblib
import warnings
warnings.filterwarnings('ignore')

print("✓ Libraries imported")

✓ Libraries imported


## 1. Load Data

In [19]:
# Load FBREF team data
# NOTE(review): presumably one row per team per match (each fixture appears
# once for each side) -- confirm against team_finaldat.csv.
team_df = pd.read_csv('team_finaldat.csv')
# Parse dates so that .month/.year access and date comparisons work later on
team_df['Date'] = pd.to_datetime(team_df['Date'])

def get_fpl_season(date):
    """Return the season label (e.g. '2018-19') that a match date belongs to.

    A season is taken to start in August: dates from August onwards belong to
    the season starting that calendar year, earlier months to the season that
    started the previous year. `date` only needs `.year` and `.month`.
    """
    start_year = date.year if date.month >= 8 else date.year - 1
    # The label is "<start year>-<last two digits of the following year>"
    return f'{start_year}-{str(start_year + 1)[-2:]}'

# Tag every match row with its season label (e.g. '2018-19')
team_df['Season'] = team_df['Date'].apply(get_fpl_season)

# Translate team names used in team_finaldat.csv into the names used by the
# FPL data. Names missing from this dict are passed through unchanged by
# .replace() below.
# NOTE(review): the training-data builders match Team_FPL against the names
# read from the FPL teams.csv / raw.json files, so every value here must equal
# the FPL spelling exactly -- verify entries such as 'Nottingham' and
# 'Sheffield' against the FPL data; a mismatch silently drops those fixtures.
team_name_mapping = {
    'Manchester City': 'Man City',
    'Manchester Utd': 'Man Utd',
    'Tottenham': 'Spurs',
    'Nott\'ham Forest': 'Nottingham',
    'Sheffield Utd': 'Sheffield',
    'Newcastle Utd': 'Newcastle',
    'Leicester City': 'Leicester',
    'West Brom': 'West Brom',
    'Cardiff City': 'Cardiff',
    'Huddersfield': 'Huddersfield',
    'Swansea City': 'Swansea',
    'Stoke City': 'Stoke'
}

# FPL-style names for both the team and its opponent
team_df['Team_FPL'] = team_df['Team'].replace(team_name_mapping)
team_df['Opponent_FPL'] = team_df['Opponent'].replace(team_name_mapping)

print(f"✓ Loaded {len(team_df)} matches from {team_df['Season'].nunique()} seasons")

✓ Loaded 5320 matches from 7 seasons


In [20]:
# Load FPL fixtures with FDR labels
fpl_data_path = Path('Fantasy-Premier-League/data')
all_fixtures = []        # one fixtures DataFrame per season
all_teams_mapping = {}   # season name -> {FPL team id: team name}

for season_folder in sorted(fpl_data_path.iterdir()):
    # Only season directories from 2018-19 onwards are used
    if not (season_folder.is_dir() and season_folder.name >= '2018-19'):
        continue

    fixtures_file = season_folder / 'fixtures.csv'
    if fixtures_file.exists():
        df = pd.read_csv(fixtures_file)
        df['Season'] = season_folder.name
        all_fixtures.append(df)

    # Team names: prefer teams.csv, otherwise fall back to the 'teams' list
    # inside raw.json
    teams_file = season_folder / 'teams.csv'
    raw_file = season_folder / 'raw.json'
    if teams_file.exists():
        teams = pd.read_csv(teams_file)
        all_teams_mapping[season_folder.name] = dict(zip(teams['id'], teams['name']))
    elif raw_file.exists():
        with open(raw_file, 'r') as f:
            data = json.load(f)
        if 'teams' in data:
            all_teams_mapping[season_folder.name] = {t['id']: t['name'] for t in data['teams']}

fixtures_df = pd.concat(all_fixtures, ignore_index=True)
print(f"✓ Loaded {len(fixtures_df)} fixtures with FDR labels")

✓ Loaded 2660 fixtures with FDR labels


## 2A. Baseline Approach: Overall Rolling Metrics (5 games)

Standard rolling averages without home/away distinction.

In [21]:
def calculate_overall_rolling_metrics(df, window=5):
    """Add venue-agnostic rolling form columns for every team.

    For each team, with rows ordered by date, the mean over the previous
    `window` matches is computed for xG, xGA, goals scored/conceded,
    possession and league points (W=3, D=1, L=0). The current match is
    excluded through a one-row shift, so a team's first match has NaN form;
    fewer than `window` prior matches are averaged as-is (min_periods=1).

    Returns a new DataFrame with a fresh integer index, ordered by team and
    then date. The input frame is not modified.
    """
    def prior_mean(series):
        # shift(1) keeps the current match out of its own window
        return series.shift(1).rolling(window=window, min_periods=1).mean()

    # output column -> source column
    stat_sources = {
        'rolling_xG': 'xG',
        'rolling_xGA': 'xGA',
        'rolling_goals_scored': 'Goals Scored',
        'rolling_goals_conceded': 'Goals Conceded',
        'rolling_possession': 'Possession',
    }
    points_for_result = {'W': 3, 'D': 1, 'L': 0}

    ordered = df.sort_values(['Team', 'Date'])
    per_team = []
    for team in ordered['Team'].unique():
        block = ordered[ordered['Team'] == team].copy()
        for target, source in stat_sources.items():
            block[target] = prior_mean(block[source])
        block['Points'] = block['Result'].map(points_for_result)
        block['rolling_points'] = prior_mean(block['Points'])
        per_team.append(block)

    return pd.concat(per_team, ignore_index=True)

# Baseline features: 5-match rolling form computed over all of a team's
# matches, regardless of venue
team_df_baseline = calculate_overall_rolling_metrics(team_df.copy(), window=5)
print(f"✓ Calculated overall rolling metrics (5-game window)")

✓ Calculated overall rolling metrics (5-game window)


## 2B. Home/Away Approach: Separate Rolling Metrics (6 games)

Separate rolling averages for home and away matches, matching FPL's official approach.

In [22]:
def calculate_home_away_rolling_metrics(df, window=6):
    """Add separate home-form and away-form rolling columns for every team.

    For each team, with rows ordered by date, the home matches and the away
    matches are treated as two independent sequences. Within each sequence
    the mean of the previous `window` matches (current match excluded,
    min_periods=1) is computed for xG, xGA, goals scored/conceded, points
    (W=3, D=1, L=0) and possession, giving `home_rolling_*` and
    `away_rolling_*` columns.

    Each value is then forward-filled along the team's timeline so that every
    row carries the latest known home form and away form, whatever its own
    venue. Rows before a team's second match at a venue stay NaN for that
    venue's columns.

    Returns a new DataFrame with a fresh integer index, ordered by team and
    then date. The input frame is not modified.
    """
    # column suffix -> source column
    stat_sources = {
        'xG': 'xG',
        'xGA': 'xGA',
        'goals_scored': 'Goals Scored',
        'goals_conceded': 'Goals Conceded',
        'points': 'Points',
        'possession': 'Possession',
    }
    venue_prefixes = (('home', 'Home'), ('away', 'Away'))

    ordered = df.sort_values(['Team', 'Date'])
    per_team = []

    for team in ordered['Team'].unique():
        block = ordered[ordered['Team'] == team].copy()
        block['Points'] = block['Result'].map({'W': 3, 'D': 1, 'L': 0})

        for prefix, venue in venue_prefixes:
            # Start every form column as NaN; only rows at this venue get values
            for suffix in stat_sources:
                block[f'{prefix}_rolling_{suffix}'] = np.nan

            venue_rows = block.index[block['Venue'] == venue]
            if len(venue_rows) == 0:
                continue

            for suffix, source in stat_sources.items():
                # Assignment aligns on index labels, so values land on the
                # matching venue rows only
                block.loc[venue_rows, f'{prefix}_rolling_{suffix}'] = (
                    block.loc[venue_rows, source]
                    .shift(1)
                    .rolling(window, min_periods=1)
                    .mean()
                )

        # Forward fill to make both forms available on any match row
        for prefix, _ in venue_prefixes:
            for suffix in stat_sources:
                column = f'{prefix}_rolling_{suffix}'
                block[column] = block[column].ffill()

        per_team.append(block)

    return pd.concat(per_team, ignore_index=True)

# Home/away features: 6-match rolling form computed separately over each
# team's home matches and away matches
team_df_homeaway = calculate_home_away_rolling_metrics(team_df.copy(), window=6)
print(f"✓ Calculated home/away rolling metrics (6-game window)")

✓ Calculated home/away rolling metrics (6-game window)


## 3. Create Training Datasets

Generate features for both approaches.

In [23]:
def create_training_data_baseline(fixtures_df, team_df_rolling, all_teams_mapping):
    """Build labelled training rows from overall (venue-agnostic) rolling form.

    Each fixture yields up to two rows, one per side:
    - Venue 'Home': label is team_h_difficulty, features describe the away team
    - Venue 'Away': label is team_a_difficulty, features describe the home team

    The opponent's features are taken from its latest row in `team_df_rolling`
    for the same season dated on or before kickoff. Fixtures are skipped when
    the season has no team mapping, a team id cannot be resolved, or the
    kickoff time is missing. A side is skipped when the opponent has no
    matching row or its rolling_xG is NaN. Rows containing any NaN are dropped
    from the result.
    """
    metric_names = ('xG', 'xGA', 'goals_scored', 'goals_conceded', 'points', 'possession')

    def latest_form(team_name, season, cutoff):
        # Most recent row for this team in this season, up to the kickoff
        selector = (
            (team_df_rolling['Team_FPL'] == team_name) &
            (team_df_rolling['Season'] == season) &
            (team_df_rolling['Date'] <= cutoff)
        )
        return team_df_rolling[selector].sort_values('Date').tail(1)

    rows = []
    for _, fixture in fixtures_df.iterrows():
        season = fixture['Season']
        gameweek = fixture['event']

        if season not in all_teams_mapping:
            continue

        id_to_name = all_teams_mapping[season]
        home_name = id_to_name.get(fixture['team_h'])
        away_name = id_to_name.get(fixture['team_a'])
        if not (home_name and away_name):
            continue

        kickoff = pd.to_datetime(fixture.get('kickoff_time', None))
        if pd.isna(kickoff):
            continue
        # Drop the timezone so the value compares against the naive Date column
        kickoff = kickoff.tz_localize(None)

        # (venue of the labelled side, opponent faced, column holding the label)
        sides = (
            ('Home', away_name, 'team_h_difficulty'),
            ('Away', home_name, 'team_a_difficulty'),
        )
        for venue, opponent, label_col in sides:
            form = latest_form(opponent, season, kickoff)
            if len(form) == 0 or pd.isna(form['rolling_xG'].values[0]):
                continue

            record = {
                'Season': season, 'Gameweek': gameweek, 'Opponent': opponent, 'Venue': venue,
                'FDR': fixture[label_col],
            }
            for metric in metric_names:
                record[f'opp_rolling_{metric}'] = form[f'rolling_{metric}'].values[0]
            rows.append(record)

    return pd.DataFrame(rows).dropna()


def create_training_data_homeaway(fixtures_df, team_df_rolling, all_teams_mapping):
    """Build labelled training rows from venue-specific rolling form.

    Each fixture yields up to two rows, one per side:
    - Venue 'Home': label is team_h_difficulty, features are the away team's
      AWAY form (`away_rolling_*`)
    - Venue 'Away': label is team_a_difficulty, features are the home team's
      HOME form (`home_rolling_*`)

    The opponent's features are taken from its latest row in `team_df_rolling`
    for the same season dated on or before kickoff. Fixtures are skipped when
    the season has no team mapping, a team id cannot be resolved, or the
    kickoff time is missing. A side is skipped when the opponent has no
    matching row or the relevant rolling xG is NaN. Rows containing any NaN
    are dropped from the result.
    """
    metric_names = ('xG', 'xGA', 'goals_scored', 'goals_conceded', 'points', 'possession')

    def latest_form(team_name, season, cutoff):
        # Most recent row for this team in this season, up to the kickoff
        selector = (
            (team_df_rolling['Team_FPL'] == team_name) &
            (team_df_rolling['Season'] == season) &
            (team_df_rolling['Date'] <= cutoff)
        )
        return team_df_rolling[selector].sort_values('Date').tail(1)

    rows = []
    for _, fixture in fixtures_df.iterrows():
        season = fixture['Season']
        gameweek = fixture['event']

        if season not in all_teams_mapping:
            continue

        id_to_name = all_teams_mapping[season]
        home_name = id_to_name.get(fixture['team_h'])
        away_name = id_to_name.get(fixture['team_a'])
        if not (home_name and away_name):
            continue

        kickoff = pd.to_datetime(fixture.get('kickoff_time', None))
        if pd.isna(kickoff):
            continue
        # Drop the timezone so the value compares against the naive Date column
        kickoff = kickoff.tz_localize(None)

        # (venue of the labelled side, opponent faced, opponent's form to use,
        #  column holding the label)
        sides = (
            ('Home', away_name, 'away', 'team_h_difficulty'),
            ('Away', home_name, 'home', 'team_a_difficulty'),
        )
        for venue, opponent, form_prefix, label_col in sides:
            form = latest_form(opponent, season, kickoff)
            if len(form) == 0 or pd.isna(form[f'{form_prefix}_rolling_xG'].values[0]):
                continue

            record = {
                'Season': season, 'Gameweek': gameweek, 'Opponent': opponent, 'Venue': venue,
                'FDR': fixture[label_col],
            }
            for metric in metric_names:
                record[f'opp_rolling_{metric}'] = form[f'{form_prefix}_rolling_{metric}'].values[0]
            rows.append(record)

    return pd.DataFrame(rows).dropna()

# Build one training set per approach from the same labelled fixtures; the
# sample counts can differ because each approach skips rows whose own form
# features are missing
train_baseline = create_training_data_baseline(fixtures_df, team_df_baseline, all_teams_mapping)
train_homeaway = create_training_data_homeaway(fixtures_df, team_df_homeaway, all_teams_mapping)

print(f"✓ Baseline training data: {len(train_baseline)} samples")
print(f"✓ Home/Away training data: {len(train_homeaway)} samples")

✓ Baseline training data: 4137 samples
✓ Home/Away training data: 4132 samples


## 4. Cross-Validation Comparison

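Compare both approaches using expanding-window temporal cross-validation: each fold trains on all earlier seasons (at least two) and tests on the next season.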

In [24]:
def evaluate_model(train_df, model_name='LogisticSE', approach_name='Baseline'):
    """Run expanding-window temporal CV for one ordinal model and summarise it.

    Seasons are sorted; each fold trains on the first i seasons (i >= 2) and
    tests on season i+1, so a model is never evaluated on a season earlier
    than its training data. Features are standardised with a scaler fitted on
    the training part of each fold only.

    Parameters
    ----------
    train_df : DataFrame with 'Season', 'Venue', 'FDR' and the six
        'opp_rolling_*' feature columns. It is not modified.
    model_name : 'LogisticAT', 'LogisticIT' or 'LogisticSE'. Any other value
        falls back to LogisticSE.
    approach_name : label copied into the result for reporting.

    Returns
    -------
    dict with 'Approach', 'Model', mean 'Exact Accuracy', 'Accuracy Std'
    (standard deviation across folds), mean 'MAE' and mean 'Within-1'
    (share of predictions at most one rating level off).

    Raises
    ------
    ValueError if `train_df` holds fewer than three seasons, because no
    train/test fold can be formed.
    """
    model_classes = {
        'LogisticAT': mord.LogisticAT,
        'LogisticIT': mord.LogisticIT,
        'LogisticSE': mord.LogisticSE,
    }
    model_cls = model_classes.get(model_name, mord.LogisticSE)

    # assign() returns a new frame, so the caller's DataFrame is left untouched
    train_df = train_df.assign(is_away=(train_df['Venue'] == 'Away').astype(int))

    feature_cols = ['opp_rolling_xG', 'opp_rolling_xGA', 'opp_rolling_goals_scored',
                    'opp_rolling_goals_conceded', 'opp_rolling_points', 'opp_rolling_possession', 'is_away']

    X = train_df[feature_cols]
    y = train_df['FDR'].astype(int)

    seasons = sorted(train_df['Season'].unique())
    cv_splits = [(seasons[:i], seasons[i]) for i in range(2, len(seasons))]
    if not cv_splits:
        raise ValueError(
            f"Temporal CV needs at least 3 seasons (2 to train, 1 to test); got {len(seasons)}"
        )

    fold_results = []

    for train_seasons, test_season in cv_splits:
        train_mask = train_df['Season'].isin(train_seasons)
        test_mask = train_df['Season'] == test_season

        X_train, X_test = X[train_mask], X[test_mask]
        y_train, y_test = y[train_mask], y[test_mask]

        # Fit the scaler on the training seasons only to avoid leaking
        # test-season statistics into the features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        model = model_cls(alpha=1.0)
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)

        fold_results.append({
            'accuracy': accuracy_score(y_test, y_pred),
            'mae': mean_absolute_error(y_test, y_pred),
            'within_1': np.mean(np.abs(y_test - y_pred) <= 1)
        })

    results_df = pd.DataFrame(fold_results)

    return {
        'Approach': approach_name,
        'Model': model_name,
        'Exact Accuracy': results_df['accuracy'].mean(),
        'Accuracy Std': results_df['accuracy'].std(),
        'MAE': results_df['mae'].mean(),
        'Within-1': results_df['within_1'].mean()
    }

print("Running cross-validation...\n")
results = []

# (training set, report label) for the two feature approaches being compared
approaches = [
    (train_baseline, 'Baseline (5-game overall)'),
    (train_homeaway, 'Home/Away (6-game split)'),
]

for model_name in ['LogisticAT', 'LogisticIT', 'LogisticSE']:
    print(f"Testing {model_name}...")
    for approach_df, approach_label in approaches:
        # Pass a copy so evaluation cannot alter the shared training frames
        results.append(evaluate_model(approach_df.copy(), model_name, approach_label))

results_df = pd.DataFrame(results)
print("\n" + "="*80)
print("CROSS-VALIDATION RESULTS")
print("="*80)
print(results_df.to_string(index=False))

Running cross-validation...

Testing LogisticAT...
Testing LogisticIT...
Testing LogisticSE...

CROSS-VALIDATION RESULTS
                 Approach      Model  Exact Accuracy  Accuracy Std      MAE  Within-1
Baseline (5-game overall) LogisticAT        0.546566      0.045041 0.494898  0.958536
 Home/Away (6-game split) LogisticAT        0.595064      0.036254 0.429847  0.975088
Baseline (5-game overall) LogisticIT        0.562142      0.040133 0.501500  0.936358
 Home/Away (6-game split) LogisticIT        0.602301      0.041676 0.443397  0.954302
Baseline (5-game overall) LogisticSE        0.534957      0.044276 0.497364  0.967679
 Home/Away (6-game split) LogisticSE        0.577229      0.037506 0.439488  0.983283


## 5. Analysis: Which Approach Wins?

Compare exact accuracy improvements.

In [25]:
print("\n" + "="*80)
print("PERFORMANCE COMPARISON")
print("="*80)

# Find best models from each approach (highest mean exact accuracy)
is_baseline = results_df['Approach'].str.contains('Baseline')
is_homeaway = results_df['Approach'].str.contains('Home/Away')
baseline_best = results_df[is_baseline].sort_values('Exact Accuracy', ascending=False).iloc[0]
homeaway_best = results_df[is_homeaway].sort_values('Exact Accuracy', ascending=False).iloc[0]

for approach_label, best_row in (('Baseline', baseline_best), ('Home/Away', homeaway_best)):
    print(f"\nBest {approach_label} Model: {best_row['Model']}")
    print(f"  Exact Accuracy: {best_row['Exact Accuracy']:.1%}")
    print(f"  Within-1 Acc:   {best_row['Within-1']:.1%}")
    print(f"  MAE:            {best_row['MAE']:.3f}")

# Gap in exact accuracy, in percentage points (positive favours home/away)
improvement = (homeaway_best['Exact Accuracy'] - baseline_best['Exact Accuracy']) * 100

print("\n" + "="*80)
if improvement > 0:
    print(f"✅ WINNER: Home/Away Split")
    print(f"   Improvement: +{improvement:.1f} percentage points in exact accuracy")
elif improvement < 0:
    print(f"✅ WINNER: Baseline (Overall Rolling)")
    print(f"   Home/Away approach is {abs(improvement):.1f}pp worse")
else:
    print(f"⚖️  TIE: Both approaches perform equally")
print("="*80)


PERFORMANCE COMPARISON

Best Baseline Model: LogisticIT
  Exact Accuracy: 56.2%
  Within-1 Acc:   93.6%
  MAE:            0.501

Best Home/Away Model: LogisticIT
  Exact Accuracy: 60.2%
  Within-1 Acc:   95.4%
  MAE:            0.443

✅ WINNER: Home/Away Split
   Improvement: +4.0 percentage points in exact accuracy


## 6. Train Final Model (Best Approach)

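Retrain the winning approach's best model on all labelled seasons and save the model, scaler, and metadata; the next section uses them to generate the 2017-18 predictions.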

In [26]:
# Determine winner (a tie in exact accuracy goes to the home/away approach)
use_homeaway = homeaway_best['Exact Accuracy'] >= baseline_best['Exact Accuracy']
best_approach = 'homeaway' if use_homeaway else 'baseline'
best_train_df = (train_homeaway if use_homeaway else train_baseline).copy()
best_model_name = (homeaway_best if use_homeaway else baseline_best)['Model']
if use_homeaway:
    print(f"✓ Using Home/Away approach with {best_model_name}")
else:
    print(f"✓ Using Baseline approach with {best_model_name}")

# Train final model on every labelled season
best_train_df['is_away'] = (best_train_df['Venue'] == 'Away').astype(int)
feature_cols = ['opp_rolling_xG', 'opp_rolling_xGA', 'opp_rolling_goals_scored',
                'opp_rolling_goals_conceded', 'opp_rolling_points', 'opp_rolling_possession', 'is_away']

X = best_train_df[feature_cols]
y = best_train_df['FDR'].astype(int)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Any name other than LogisticAT / LogisticIT resolves to LogisticSE
final_model_classes = {'LogisticAT': mord.LogisticAT, 'LogisticIT': mord.LogisticIT}
final_model = final_model_classes.get(best_model_name, mord.LogisticSE)(alpha=1.0)

final_model.fit(X_scaled, y)
print(f"✓ Trained final model on {len(X)} samples")

# Save the model, the fitted scaler, and the metadata describing them
joblib.dump(final_model, 'fdr_model.pkl')
joblib.dump(scaler, 'fdr_scaler.pkl')
joblib.dump({'model_type': best_model_name, 'approach': best_approach, 'feature_cols': feature_cols}, 'fdr_model_meta.pkl')
print("✓ Saved model files")

✓ Using Home/Away approach with LogisticIT
✓ Trained final model on 4132 samples
✓ Saved model files


## 7. Generate 2017-18 Predictions

Use the best model to predict FDR for 2017-18.

In [27]:
# Calculate rolling metrics for 2017-18 using winning approach
team_df_1718 = team_df[team_df['Season'] == '2017-18'].copy()

if best_approach == 'homeaway':
    team_df_1718_rolling = calculate_home_away_rolling_metrics(team_df_1718, window=6)
    use_context_aware = True
else:
    team_df_1718_rolling = calculate_overall_rolling_metrics(team_df_1718, window=5)
    use_context_aware = False

print(f"✓ Calculated rolling metrics for 2017-18 ({best_approach} approach)")

# Generate predictions: one per team per match, describing the opponent faced
metric_suffixes = ['xG', 'xGA', 'goals_scored', 'goals_conceded', 'points', 'possession']
all_predictions = []

for _, match in team_df_1718_rolling.iterrows():
    team = match['Team_FPL']
    # Translate the opponent to its FPL name; unmapped names pass through
    opponent = match['Opponent']
    if opponent in team_name_mapping:
        opponent = team_name_mapping[opponent]

    venue = match['Venue']
    date = match['Date']
    matchweek = int(match['Matchweek'])

    # Opponent's latest row up to this match date. Rolling values exclude the
    # row's own match, so this uses only results from before the fixture.
    opponent_data = team_df_1718_rolling[
        (team_df_1718_rolling['Team_FPL'] == opponent) &
        (team_df_1718_rolling['Date'] <= date)
    ].sort_values('Date').tail(1)

    if len(opponent_data) == 0:
        continue

    if use_context_aware:
        # Playing at home means the opponent is away, so use its away form
        # (and the reverse when playing away)
        source_prefix = ('away' if venue == 'Home' else 'home') + '_rolling_'
    else:
        source_prefix = 'rolling_'

    features = {
        f'opp_rolling_{suffix}': opponent_data[f'{source_prefix}{suffix}'].values[0]
        for suffix in metric_suffixes
    }
    features['is_away'] = 1 if venue == 'Away' else 0

    # Training rows with ANY missing feature were dropped (dropna), so skip
    # such rows here too instead of feeding NaN to the scaler and model
    if any(pd.isna(value) for value in features.values()):
        continue

    # Pin the column order to the one the scaler and model were fitted with
    X_pred = pd.DataFrame([features], columns=feature_cols)
    X_pred_scaled = scaler.transform(X_pred)
    fdr_pred = final_model.predict(X_pred_scaled)[0]

    all_predictions.append({
        'Matchweek': matchweek, 'Team': team, 'Opponent': opponent,
        'Venue': venue, 'predicted_FDR': int(fdr_pred)
    })

predictions_df = pd.DataFrame(all_predictions)

# Create fixtures format: one row per fixture, built from the home side's
# prediction and paired with the away side's prediction for the same match
fixtures_output = []
for matchweek in sorted(predictions_df['Matchweek'].unique()):
    mw_data = predictions_df[predictions_df['Matchweek'] == matchweek]
    processed = set()

    for _, row in mw_data.iterrows():
        if row['Venue'] == 'Home':
            key = (matchweek, row['Team'], row['Opponent'])
            if key not in processed:
                away_fdr = mw_data[
                    (mw_data['Team'] == row['Opponent']) &
                    (mw_data['Opponent'] == row['Team']) &
                    (mw_data['Venue'] == 'Away')
                ]['predicted_FDR'].values

                # NOTE(review): when the away side has no prediction, the home
                # side's FDR is reused as a placeholder -- confirm this is the
                # intended fallback.
                fixtures_output.append({
                    'event': matchweek, 'team_h': row['Team'], 'team_a': row['Opponent'],
                    'team_h_difficulty': row['predicted_FDR'],
                    'team_a_difficulty': away_fdr[0] if len(away_fdr) > 0 else row['predicted_FDR']
                })
                processed.add(key)

fixtures_final = pd.DataFrame(fixtures_output).sort_values('event')
fixtures_final.to_csv('fixtures_2017-18_predicted.csv', index=False)

print(f"✓ Generated {len(fixtures_final)} fixtures for 2017-18")
print("✓ Saved to: fixtures_2017-18_predicted.csv")

✓ Calculated rolling metrics for 2017-18 (homeaway approach)
✓ Generated 360 fixtures for 2017-18
✓ Saved to: fixtures_2017-18_predicted.csv


## Summary

### Final Results
See cross-validation results above for performance comparison.

### Key Takeaways
- Compared overall rolling averages vs home/away-specific form
- Used temporal cross-validation to avoid data leakage
- Selected best approach based on exact accuracy
- Generated predictions for 2017-18 using winning model