In [2]:
import numpy as np
import pandas as pd
import json
from pathlib import Path
import os

In [None]:
class statsbombloader:
    def __init__(self, data_path):
        self.data_path = Path(data_path)
        self.events_path = self.data_path / "events"
        self.lineups_path = self.data_path / "lineups"
        self.matches_path = self.data_path / "matches"
        self.three_sixty_path = self.data_path / "three-sixty"


    def load_matches(self, competition_id = None, season_id = None):
        matches_list = []

        for comp_folder in self.matches_path.iterdir():
            if comp_folder.is_dir():
                current_comp_id = int(comp_folder.name)
                if competition_id and current_comp_id !=competition_id:
                    continue
                for season_file in comp_folder.glob("*.json"):
                    current_season_id = int(season_file.stem)
                    if season_id and current_season_id !=season_id:
                        continue
                        
                    with open(season_file, 'r', encoding = 'utf-8') as f:
                        season_matches = json.load(f)
                        matches_list.extend(season_matches)
        return pd.json_normalize(matches_list)
    

    def get_available_matches(self):
        if not self.events_path.exists():
            return []
        return [int(f.stem) for f in self.events_path.glob("*.json")]
    

    def load_events(self, match_id):
        events_file = self.events_path / f"{match_id}.json"
        if not events_file.exists():
            raise FileNotFoundError(f"events file not found for match {match_id}")
        
        with open(events_file, 'r', encoding = 'utf-8') as f:
            events_data = json.load(f)

        events_df = pd.json_normalize(events_data)

        if 'location' in events_df.columns:
            events_df['location_x'] = events_df['location'].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 2 else None)
            events_df['location_y'] = events_df['location'].apply(lambda x: x[1] if isinstance(x, list) and len(x) > 2 else None)


        return events_df
    

    def load_lineups(self, match_id):
        lineups_file = self.lineups_path / f"{match_id}.json"
        if not lineups_file.exists():
            raise FileNotFoundError(f"lineups not found for match: {match_id}")
        
        with open(lineups_file, 'r', encoding = 'utf-8') as f:
            lineups_data = json.load(f)

        all_players = []
        for team in lineups_data:
            team_id = team['team_id']
            team_name = team['team_name']

            for player in team['lineup']:
                player_info = player.copy()
                player_info['team_d'] = team_id
                player_info['team_name'] = team_name
                all_players.append(player_info)

        return pd.json_normalize(all_players)
    

    

    def load_match_sample(self, num_matches = 5):
        available_matches = self.get_available_matches()[:num_matches]

        sample_data = {}
        for match_id in available_matches:
            try:
                sample_data[match_id] = {
                    'events': self.load_events(match_id),
                    'lineups': self.load_lineups(match_id)
                }
                print(f"loaded match: {match_id} {len(sample_data['match_id']['events'])} events")
            except Exception as e:
                print|(f"error loading match {match_id} {e}")
        return sample_data


        
        


In [19]:
import json
import pandas as pd
from pathlib import Path

class statsbombloader:
    def __init__(self, data_path):
        self.data_path = Path(data_path)
        self.events_path = self.data_path / "events"
        self.lineups_path = self.data_path / "lineups"
        self.matches_path = self.data_path / "matches"
        self.three_sixty_path = self.data_path / "three-sixty"

    def load_matches(self, competition_id=None, season_id=None):
        matches_list = []
        for comp_folder in self.matches_path.iterdir():
            if comp_folder.is_dir():
                current_comp_id = int(comp_folder.name)
                if competition_id and current_comp_id != competition_id:
                    continue
                for season_file in comp_folder.glob("*.json"):
                    current_season_id = int(season_file.stem)
                    if season_id and current_season_id != season_id:
                        continue
                    with open(season_file, "r", encoding="utf-8") as f:
                        season_matches = json.load(f)
                        matches_list.extend(season_matches)
        return pd.json_normalize(matches_list)

    def get_available_matches(self):
        if not self.events_path.exists():
            return []
        return [int(f.stem) for f in self.events_path.glob("*.json")]

    def load_events(self, match_id):
        events_file = self.events_path / f"{match_id}.json"
        if not events_file.exists():
            raise FileNotFoundError(f"Events file not found for match {match_id}")
        with open(events_file, "r", encoding="utf-8") as f:
            events_data = json.load(f)
        events_df = pd.json_normalize(events_data)
        if "location" in events_df.columns:
            events_df["location_x"] = events_df["location"].apply(
                lambda x: x[0] if isinstance(x, list) and len(x) >= 2 else None
            )
            events_df["location_y"] = events_df["location"].apply(
                lambda x: x[1] if isinstance(x, list) and len(x) >= 2 else None
            )
        return events_df

    def load_lineups(self, match_id):
        lineups_file = self.lineups_path / f"{match_id}.json"
        if not lineups_file.exists():
            raise FileNotFoundError(f"Lineups not found for match {match_id}")
        with open(lineups_file, "r", encoding="utf-8") as f:
            lineups_data = json.load(f)
        all_players = []
        for team in lineups_data:
            team_id = team["team_id"]
            team_name = team["team_name"]
            for player in team["lineup"]:
                player_info = player.copy()
                player_info["team_id"] = team_id
                player_info["team_name"] = team_name
                all_players.append(player_info)
        return pd.json_normalize(all_players)

    def load_three_sixty(self, match_id):
        ts_file = self.three_sixty_path / f"{match_id}.json"
        if not ts_file.exists():
            return None  # not all matches have 360 data
        with open(ts_file, "r", encoding="utf-8") as f:
            ts_data = json.load(f)
        return pd.json_normalize(ts_data)

    def load_match_sample(self, num_matches=100):
        available_matches = self.get_available_matches()[:num_matches]
        sample_data = {}
        for match_id in available_matches:
            try:
                sample_data[match_id] = {
                    "events": self.load_events(match_id),
                    "lineups": self.load_lineups(match_id),
                    "three_sixty": self.load_three_sixty(match_id),
                }
                print(
                    f"Loaded match {match_id}: "
                    f"{len(sample_data[match_id]['events'])} events, "
                    f"{len(sample_data[match_id]['lineups'])} players, "
                    f"{'with 360' if sample_data[match_id]['three_sixty'] is not None else 'no 360'}"
                )
            except Exception as e:
                print(f"Error loading match {match_id}: {e}")
        return sample_data


In [20]:
loader = statsbombloader(r"C:\Users\RahulP\personal_data\open-data-master\open-data-master\data")
matches_df = loader.load_match_sample(100)
print(f"Total matches: {len(matches_df)}")
#print(matches_df.head())
available_matches = loader.get_available_matches()

Loaded match 15946: 3762 events, 36 players, no 360
Loaded match 15956: 3342 events, 36 players, no 360
Loaded match 15973: 3440 events, 36 players, no 360
Loaded match 15978: 3905 events, 36 players, no 360
Loaded match 15986: 3910 events, 36 players, no 360
Loaded match 15998: 3947 events, 36 players, no 360
Loaded match 16010: 3591 events, 36 players, no 360
Loaded match 16023: 4541 events, 36 players, no 360
Loaded match 16029: 3542 events, 36 players, no 360
Loaded match 16056: 3388 events, 36 players, no 360
Loaded match 16073: 4096 events, 36 players, no 360
Loaded match 16079: 3916 events, 36 players, no 360
Loaded match 16086: 3908 events, 36 players, no 360
Loaded match 16095: 3376 events, 36 players, no 360
Loaded match 16109: 3818 events, 36 players, no 360
Loaded match 16120: 3668 events, 36 players, no 360
Loaded match 16131: 3791 events, 36 players, no 360
Loaded match 16136: 3877 events, 36 players, no 360
Loaded match 16149: 3668 events, 36 players, no 360
Loaded match

In [21]:
if available_matches:
    sample_match = available_matches[0]
    print(f"loading events for match {sample_match}")

    events_df = loader.load_events(sample_match)
    print(f"events_df shape {events_df.shape}")
    print(f"evvents_df columns: {events_df.columns.to_list()}")


    lineups_df = loader.load_lineups(sample_match)
    print(f"lineups_df shape: {lineups_df.shape}")
    print(f"lineups_df columns: {lineups_df.columns.to_list()}")


    print(events_df[['minute', 'second', 'type.name', 'player.name', 'location_x', 'location_y']].head(10))


loading events for match 15946
events_df shape (3762, 121)
evvents_df columns: ['id', 'index', 'period', 'timestamp', 'minute', 'second', 'possession', 'duration', 'type.id', 'type.name', 'possession_team.id', 'possession_team.name', 'play_pattern.id', 'play_pattern.name', 'team.id', 'team.name', 'tactics.formation', 'tactics.lineup', 'related_events', 'location', 'player.id', 'player.name', 'position.id', 'position.name', 'pass.recipient.id', 'pass.recipient.name', 'pass.length', 'pass.angle', 'pass.height.id', 'pass.height.name', 'pass.end_location', 'pass.body_part.id', 'pass.body_part.name', 'pass.type.id', 'pass.type.name', 'carry.end_location', 'pass.switch', 'pass.outcome.id', 'pass.outcome.name', 'ball_receipt.outcome.id', 'ball_receipt.outcome.name', 'under_pressure', 'duel.type.id', 'duel.type.name', 'pass.aerial_won', 'counterpress', 'interception.outcome.id', 'interception.outcome.name', 'off_camera', 'ball_recovery.recovery_failure', 'pass.assisted_shot_id', 'pass.shot_ass

In [22]:
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, mean_squared_error
import warnings
warnings.filterwarnings('ignore')



In [23]:
shots = events_df[events_df['type.name'] == 'Shot'].copy()
print(f"Found {len(shots)} shots")



shots['is_goal'] = shots['shot.outcome.name'].apply(lambda x: 1 if x == 'Goal' else 0)
shots = shots.dropna(subset = ['location_x','location_y'])

features_df = shots.copy()

goal_x, goal_y = 120,40
features_df['distance_to_goal'] = np.sqrt(
    (features_df['location_x'] - goal_x)**2 +
    (features_df['location_y'] - goal_y)**2
)

features_df['angle_to_goal'] = np.arctan2(
    np.abs(features_df['location_y'] - goal_y),
    goal_x - features_df['location_x']
) * 180 / np.pi

features_df['in_penalty_area'] = (
    (features_df['location_x'] >=114) &
    (features_df['location_y'] >=30) &
    (features_df['location_y'] <=50)
).astype(int)

Found 28 shots


In [66]:
eventsdf_cols = events_df.columns.to_list()

In [67]:
eventsdf_cols

['id',
 'index',
 'period',
 'timestamp',
 'minute',
 'second',
 'possession',
 'duration',
 'type.id',
 'type.name',
 'possession_team.id',
 'possession_team.name',
 'play_pattern.id',
 'play_pattern.name',
 'team.id',
 'team.name',
 'tactics.formation',
 'tactics.lineup',
 'related_events',
 'location',
 'player.id',
 'player.name',
 'position.id',
 'position.name',
 'pass.recipient.id',
 'pass.recipient.name',
 'pass.length',
 'pass.angle',
 'pass.height.id',
 'pass.height.name',
 'pass.end_location',
 'pass.body_part.id',
 'pass.body_part.name',
 'pass.type.id',
 'pass.type.name',
 'carry.end_location',
 'pass.switch',
 'pass.outcome.id',
 'pass.outcome.name',
 'ball_receipt.outcome.id',
 'ball_receipt.outcome.name',
 'under_pressure',
 'duel.type.id',
 'duel.type.name',
 'pass.aerial_won',
 'counterpress',
 'interception.outcome.id',
 'interception.outcome.name',
 'off_camera',
 'ball_recovery.recovery_failure',
 'pass.assisted_shot_id',
 'pass.shot_assist',
 'shot.statsbomb_xg',


In [68]:
class SoccerDataPreprocessor:

    def __init__(self, loader):
        self.loader = loader
        self.scaler = StandardScaler()
        self.label_encoders = {}


    def preprocess_events_for_xg(self, events_df):
        print("🎯 Step 1: Filtering shots for xG modeling...")

        shots = events_df[events_df['type.name'] == 'Shot'].copy()
        print(f"Found {len(shots)} shots")

        if len(shots) ==0:
            return pd.DataFrame()
        
        shots['is_goal'] = shots['shot.outcome.name'].apply(lambda x: 1 if x == 'Goal' else 0)
        shots = shots.dropna(subset = ['location_x','location_y'])


        return shots
    
    def engineer_xg_features(self, shots_df):
        if len(shots_df) ==0:
            return pd.DataFrame
        
        features_df = shots_df.copy()

        goal_x, goal_y = 120,40
        features_df['distance_to_goal'] = np.sqrt(
            (features_df['location_x'] - goal_x)**2 +
            (features_df['location_y'] - goal_y)**2
        )

        features_df['angle_to_goal'] = np.arctan2(
            np.abs(features_df['location_y'] - goal_y),
            goal_x - features_df['location_x']
        ) * 180 / np.pi

        features_df['in_penalty_area'] = (
            (features_df['location_x'] >=114) &
            (features_df['location_y'] >=30) &
            (features_df['location_y'] <=50)
        ).astype(int)

        features_df['is_header'] = features_df['shot.body_part.name'].apply(
            lambda x: 1 if x == 'Head' else 0
        )

        features_df['is_weak_foot'] = features_df['shot.body_part.name'].apply(
            lambda x: 1 if x == 'Left Foot' else 0
        )

        features_df['is_counter_attack'] = (
            features_df.get('under_pressure', 0).fillna(0).astype(int)
        )
        features_df['minutes_scaled'] = features_df['minute'] / 90

        features_df['in_six_yard_box'] = (
            (features_df['location_x'].between(114, 120)) &  # 6-yard box x range
            (features_df['location_y'].between(34, 46))      # 6-yard box y range
            ).astype(int)


        shot_techniques = features_df['shot.technique.name'].fillna('Normal')
        features_df['is_volley'] = (shot_techniques == 'Volley').astype(int)
        features_df['is_half_volley'] = (shot_techniques == 'Half Volley').astype(int)

        return features_df
    


    def preprocess_match_prediction_features(self, events_df, lineups_df):
        print("🏆 Step 3: Creating match prediction features...")

        match_features = {}

        for team_name in events_df['team.name'].unique():
            if pd.isna(team_name):
                continue


            team_events = events_df[events_df['team.name'] == team_name]
            prefix = 'home' if team_name == events_df['team.name'].iloc[0] else 'away'

            match_features[f"{prefix}_possession_pct"] = len(team_events) / len(events_df)
            match_features[f"{prefix}_passes"] = len(team_events[team_events['type.name']== 'Pass'] )
            match_features[f"{prefix}_pass_accuracy"] = self._calculate_pass_accuracy(team_events)


            team_shots = team_events[team_events['type.name'] == 'Shot']
            match_features[f"{prefix}_shots"] = len(team_shots)
            match_features[f"{prefix}_shots_on_target"] = len(
                team_shots[team_shots['shot.outcome.name'].isin(['Goal','Saved'])]
            )

            match_features[f"{prefix}_tackles"] = len(team_events[team_events['type.name'] == 'Tackle'])
            match_features[f"{prefix}_interceptions"] = len(team_events[team_events['type.name'] == 'Interception'])

            match_features[f"{prefix}_corners"] = len(team_events[team_events['type.name'] == 'Corners'])
            match_features[f"{prefix}_free_kicks"] = len(team_events[team_events['type.name']== 'Free Kick'])
            match_features[f"{prefix}_goals"] = team_shots['shot.outcome.name'].eq('Goal').sum()


        return pd.DataFrame([match_features])
    
    

    def _calculate_pass_accuracy(self, team_events):
        passes = team_events[team_events['type.name'] == 'Pass']
        if len(passes) == 0:
            return 0.0
        
        successful = passes['pass.outcome.name'].isna().sum()
        return successful / len(passes)


    def create_player_performance_features(self, events_df, lineups_df):
        """
        STEP 4: PLAYER-LEVEL PERFORMANCE FEATURES
        """
        print("👤 Step 4: Creating player performance features...")
        
        player_stats = []
        
        for _, player in lineups_df.iterrows():
            player_id = player['player_id']
            player_events = events_df[events_df['player.id'] == player_id]
            
            stats = {
                'player_id': player_id,
                'player_name': player['player_name'],
                'team_name': player['team_name'],
                'position': player['positions'],
                'minutes_played': self._calculate_minutes_played(player_events),
                'touches': len(player_events),
                'passes_attempted': len(player_events[player_events['type.name'] == 'Pass']),
                'passes_completed': len(player_events[
                    (player_events['type.name'] == 'Pass') & 
                    (player_events['pass.outcome.name'].isna())
                ]),
                'shots_taken': len(player_events[player_events['type.name'] == 'Shot']),
                'goals_scored': len(player_events[
                    (player_events['type.name'] == 'Shot') & 
                    (player_events['shot.outcome.name'] == 'Goal')
                ]),
                'tackles': len(player_events[player_events['type.name'] == 'Tackle']),
                'interceptions': len(player_events[player_events['type.name'] == 'Interception']),
            }
            
            # Calculate derived metrics
            if stats['passes_attempted'] > 0:
                stats['pass_accuracy'] = stats['passes_completed'] / stats['passes_attempted']
            else:
                stats['pass_accuracy'] = 0.0
                
            player_stats.append(stats)
        
        return pd.DataFrame(player_stats)
    

    def _calculate_minutes_played(self, player_events):
        """Estimate minutes played based on events"""
        if len(player_events) == 0:
            return 0
        return min(player_events['minute'].max(), 90)
    

    def create_time_series_features(self, events_df, window_minutes=15):
        """
        STEP 5: TIME-BASED FEATURES FOR MOMENTUM ANALYSIS
        """
        print(f"⏱️ Step 5: Creating {window_minutes}-minute rolling features...")
        
        # Sort events by time
        events_sorted = events_df.sort_values(['period', 'minute', 'second']).copy()
        events_sorted['game_minute'] = events_sorted['period'] * 45 + events_sorted['minute']
        
        # Create rolling features
        rolling_features = []
        
        for team in events_df['team.name'].unique():
            if pd.isna(team):
                continue
                
            team_events = events_sorted[events_sorted['team.name'] == team]
            
            # Rolling metrics every 5 minutes
            for minute in range(window_minutes, 95, 5):
                window_events = team_events[
                    (team_events['game_minute'] >= minute - window_minutes) & 
                    (team_events['game_minute'] < minute)
                ]
                
                feature = {
                    'team': team,
                    'minute': minute,
                    'rolling_shots': len(window_events[window_events['type.name'] == 'Shot']),
                    'rolling_passes': len(window_events[window_events['type.name'] == 'Pass']),
                    'rolling_tackles': len(window_events[window_events['type.name'] == 'Tackle']),
                    'rolling_possession_events': len(window_events),
                }
                
                rolling_features.append(feature)
        
        return pd.DataFrame(rolling_features)
    

    def prepare_ml_datasets(self, events_df, lineups_df):
        """
        STEP 6: PREPARE FINAL ML-READY DATASETS
        """
        print("🤖 Step 6: Preparing ML-ready datasets...")
        
        # 1. xG Dataset
        shots = self.preprocess_events_for_xg(events_df)
        if len(shots) > 0:
            xg_features = self.engineer_xg_features(shots)
            xg_dataset = self._finalize_xg_dataset(xg_features)
        else:
            xg_dataset = pd.DataFrame()
        
        # 2. Match Prediction Dataset
        match_features = self.preprocess_match_prediction_features(events_df, lineups_df)
        
        # 3. Player Performance Dataset
        player_features = self.create_player_performance_features(events_df, lineups_df)
        
        # 4. Time Series Dataset
        time_features = self.create_time_series_features(events_df)

        return {
            'xg_dataset': xg_dataset,
            'match_dataset': match_features,
            'player_dataset': player_features,
            'time_series_dataset': time_features
        }
    

    def _finalize_xg_dataset(self, features_df):
        """Final preprocessing for xG model"""
        if len(features_df) == 0:
            return pd.DataFrame()
        
        # Select features for ML
        feature_columns = [
            'distance_to_goal', 'angle_to_goal', 'in_penalty_area', 'in_six_yard_box',
            'is_header', 'is_weak_foot', 'is_counter_attack', 'minutes_scaled',
            'is_volley', 'is_half_volley'
        ]
        
        target = 'is_goal'
        
        # Create final dataset
        final_df = features_df[feature_columns + [target]].copy()
        
        # Handle missing values
        final_df = final_df.fillna(0)
        
        # Scale numerical features
        numerical_features = ['distance_to_goal', 'angle_to_goal', 'minutes_scaled']
        final_df[numerical_features] = self.scaler.fit_transform(final_df[numerical_features])
        
        return final_df
    













        
        
        
        

    


        


        


In [85]:
class SoccerMLModels:
    """Collection of ML models for soccer analytics"""
    
    def __init__(self):
        self.models = {}
        self.model_performance = {}
    
    def train_xg_model(self, xg_dataset):
        """
        MODEL 1: EXPECTED GOALS (xG) PREDICTION
        Binary Classification Problem
        """
        print("⚽ Training xG Model...")
        
        if len(xg_dataset) < 10:
            print("Not enough shot data for xG model")
            return None
        
        # Prepare data
        X = xg_dataset.drop('is_goal', axis=1)
        y = xg_dataset['is_goal']
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        
        # Train Random Forest (best for xG)
        rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
        rf_model.fit(X_train, y_train)
        
        # Evaluate
        y_pred = rf_model.predict(X_test)
        proba = rf_model.predict_proba(X_test)
        if proba.shape[1] == 2:
            y_pred_proba = proba[:, 1]  # xG values
        else:
            y_pred_proba = np.zeros(proba.shape[0])

        
        print("xG Model Performance:")
        print(classification_report(y_test, y_pred))
        
        self.models['xg_model'] = rf_model
        return rf_model
    
    def train_match_outcome_model(self, match_dataset, outcomes=None):
        """
        MODEL 2: MATCH OUTCOME PREDICTION
        Multi-class Classification (Win/Draw/Loss)
        """
        print("🏆 Training Match Outcome Model...")
        
        if outcomes is None:
            # For demo purposes, create dummy outcomes
            outcomes = np.random.choice(['Win', 'Draw', 'Loss'], len(match_dataset))
        
        # Prepare features
        X = match_dataset.select_dtypes(include=[np.number])
        X = X.fillna(0)
        
        # Encode labels
        le = LabelEncoder()
        y = le.fit_transform(outcomes)
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        
        # Train Gradient Boosting
        gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
        gb_model.fit(X_train, y_train)
        
        y_pred = gb_model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        
        print(f"Match Outcome Model MSE: {mse:.4f}")
        
        self.models['match_outcome_model'] = gb_model
        return gb_model
    
    def train_player_rating_model(self, player_dataset):
        """
        MODEL 3: PLAYER PERFORMANCE RATING
        Regression Problem
        """
        print("👤 Training Player Rating Model...")
        
        if len(player_dataset) == 0:
            return None
        
        # Create composite performance score
        numerical_cols = player_dataset.select_dtypes(include=[np.number]).columns
        performance_features = [
            'touches', 'pass_accuracy', 'shots_taken', 'goals_scored', 
            'tackles', 'interceptions'
        ]
        
        # Filter existing columns
        available_features = [col for col in performance_features if col in numerical_cols]
        
        if len(available_features) < 3:
            print("Not enough performance features available")
            return None
        
        X = player_dataset[available_features].fillna(0)
        
        # Create synthetic rating (in real scenario, you'd have actual ratings)
        y = (X.sum(axis=1) + np.random.normal(0, 1, len(X))) / len(available_features)
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        
        # Train model
        lr_model = LogisticRegression(random_state=42)
        lr_model.fit(X_train, y_train.round().astype(int))  # Convert to classification
        
        accuracy = lr_model.score(X_test, y_test.round().astype(int))
        print(f"Player Rating Model Accuracy: {accuracy:.4f}")
        
        self.models['player_rating_model'] = lr_model
        return lr_model

In [None]:
preprocessor = SoccerDataPreprocessor(loader)
all_xg = []
all_match = []
all_player = []
all_time = []
for match_id, match_data in matches_df.items():
    print(f"\n📊 Processing match {match_id}...")

    events_df = match_data["events"]
    lineups_df = match_data["lineups"]

    if events_df.empty or lineups_df.empty:
        print(f"⚠️ Skipping match {match_id} (no events or lineups)")
        continue

    
    datasets = preprocessor.prepare_ml_datasets(events_df, lineups_df)

    if len(datasets['xg_dataset']) > 0:
        all_xg.append(datasets['xg_dataset'])
    
    # Collect match dataset
    if len(datasets['match_dataset']) > 0:
        all_match.append(datasets['match_dataset'])
    
    # Collect player-level dataset
    if len(datasets['player_dataset']) > 0:
        all_player.append(datasets['player_dataset'])
    
    # Collect time series dataset
    if len(datasets['time_series_dataset']) > 0:
        all_time.append(datasets['time_series_dataset'])

    print(f"✅ Done with match {match_id}")

xg_dataset_combined = pd.concat(all_xg, ignore_index=True) if all_xg else pd.DataFrame()
match_dataset_combined = pd.concat(all_match, ignore_index=True) if all_match else pd.DataFrame()
player_dataset_combined = pd.concat(all_player, ignore_index=True) if all_player else pd.DataFrame()
time_series_dataset_combined = pd.concat(all_time, ignore_index=True) if all_time else pd.DataFrame()


ml_models = SoccerMLModels()

    # Train models
print("\n🤖 TRAINING ML MODELS")
print("=" * 40)

print(datasets['xg_dataset']['is_goal'].value_counts())


# 1. xG Model
if len(datasets['xg_dataset']) > 0:
    ml_models.train_xg_model(xg_dataset_combined)



ml_models.train_match_outcome_model(match_dataset_combined)
    
    # 3. Player Rating Model
    #ml_models.train_player_rating_model(datasets['player_dataset'])
    
    #print("\n✅ PIPELINE COMPLETE!")


📊 Processing match 15946...
🤖 Step 6: Preparing ML-ready datasets...
🎯 Step 1: Filtering shots for xG modeling...
Found 28 shots
🏆 Step 3: Creating match prediction features...
👤 Step 4: Creating player performance features...
⏱️ Step 5: Creating 15-minute rolling features...
✅ Done with match 15946

📊 Processing match 15956...
🤖 Step 6: Preparing ML-ready datasets...
🎯 Step 1: Filtering shots for xG modeling...
Found 18 shots
🏆 Step 3: Creating match prediction features...
👤 Step 4: Creating player performance features...
⏱️ Step 5: Creating 15-minute rolling features...
✅ Done with match 15956

📊 Processing match 15973...
🤖 Step 6: Preparing ML-ready datasets...
🎯 Step 1: Filtering shots for xG modeling...
Found 37 shots
🏆 Step 3: Creating match prediction features...
👤 Step 4: Creating player performance features...
⏱️ Step 5: Creating 15-minute rolling features...
✅ Done with match 15973

📊 Processing match 15978...
🤖 Step 6: Preparing ML-ready datasets...
🎯 Step 1: Filtering shot

In [92]:
ml_models.train_player_rating_model(player_dataset_combined)

👤 Training Player Rating Model...
Player Rating Model Accuracy: 0.2597


In [101]:
datasets['player_dataset']

Unnamed: 0,player_id,player_name,team_name,position,minutes_played,touches,passes_attempted,passes_completed,shots_taken,goals_scored,tackles,interceptions,pass_accuracy
0,10193,Chloe Arthur,Birmingham City WFC,"[{'position_id': 19, 'position': 'Center Attac...",90,227,71,59,2,0,0,1,0.830986
1,15557,Emma Follis,Birmingham City WFC,"[{'position_id': 17, 'position': 'Right Wing',...",85,140,33,28,2,0,0,0,0.848485
2,15560,Ann-Katrin Berger,Birmingham City WFC,[],0,0,0,0,0,0,0,0,0.0
3,15562,Lucy Staniforth,Birmingham City WFC,"[{'position_id': 11, 'position': 'Left Defensi...",64,194,55,38,3,1,0,0,0.690909
4,15563,Charlie Wellings,Birmingham City WFC,"[{'position_id': 11, 'position': 'Left Defensi...",90,53,7,7,4,0,0,0,1.0
5,15564,Lucy Quinn,Birmingham City WFC,"[{'position_id': 23, 'position': 'Center Forwa...",90,191,42,31,5,0,0,0,0.738095
6,15567,Paige Williams,Birmingham City WFC,"[{'position_id': 6, 'position': 'Left Back', '...",90,194,67,55,1,0,0,0,0.820896
7,19499,Shania Hayles,Birmingham City WFC,"[{'position_id': 17, 'position': 'Right Wing',...",90,5,0,0,0,0,0,0,0.0
8,19500,Sarah Emma Mayling,Birmingham City WFC,"[{'position_id': 10, 'position': 'Center Defen...",90,23,6,6,0,0,0,0,1.0
9,19501,Hayley Ladd,Birmingham City WFC,"[{'position_id': 9, 'position': 'Right Defensi...",80,183,51,45,0,0,0,0,0.882353
