In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Arc
import os
import requests
from datetime import datetime
from requests.auth import HTTPBasicAuth

class ChanceCreationAnalyzer:
    """
    A class to analyze chance creation patterns using StatsBomb data
    """
    def __init__(self):
        """
        Initialize the model with StatsBomb API credentials from environment variables
        """
        # Get credentials from environment variables
        self.api_username = os.environ.get('SB_USERNAME')
        self.api_token = os.environ.get('SB_PASSWORD')
        self.base_url = "https://data.statsbomb.com/api/v8"
        
        if self.api_username and self.api_token:
            # Create authentication for direct requests
            self.auth = HTTPBasicAuth(self.api_username, self.api_token)
            print(f"Using credentials from environment variables (username: {self.api_username})")
        else:
            self.auth = None
            print("No credentials found in environment variables. Set SB_USERNAME and SB_PASSWORD.")
        
        # Storage for processed data
        self.events_df = None
        self.shots_df = None
        self.chances = {}
        self.match_ids = []
        self.match_data = {}
        
        # Chance classification results
        self.chance_types = {
            'cross': [],
            'cutback': [],
            'layoff': [],
            'throughball': [],
            'transition': [],
            'high_turnover': [],
            'mid_turnover': [],
            'set_piece-corner-direct': [],  
            'set_piece-corner-indirect': [], 
            'set_piece-corner-short': [],    
            'set_piece-corner-second-phase': [], 
            'set_piece-freekick-direct': [], 
            'set_piece-freekick-indirect': [], 
            'set_piece-penalty': [],
            'set_piece-throw_in': [],
            'set_piece-other': [],
            'dds': [],  # Direct Dribble & Shoot (carries >= 5 yards)
            'other': []
        }
    
    def _api_request(self, endpoint, params=None, timeout=30):
        """
        Make a GET request to the StatsBomb API

        Args:
            endpoint: API endpoint (without base URL)
            params: Optional query parameters
            timeout: Seconds to wait for the server before giving up. Without
                a timeout ``requests`` can block forever on a stalled
                connection.

        Returns:
            The decoded JSON body on HTTP 200, or None when the request
            fails, returns another status code, or the body is not valid JSON
        """
        url = f"{self.base_url}/{endpoint}"

        print(f"Making API request to: {url}")

        try:
            response = requests.get(url, params=params, auth=self.auth, timeout=timeout)
        except requests.exceptions.RequestException as e:
            print(f"API request failed: {e}")
            return None

        # Anything other than 200 is reported and treated as "no data"
        if response.status_code != 200:
            print(f"API request failed: {response.status_code}")
            print(f"Response content: {response.text[:100]}...")
            return None

        print(f"API request successful: {response.status_code}")

        try:
            return response.json()
        except ValueError as e:
            # JSON decoding errors are ValueError subclasses, so this also
            # covers a 200 response whose body is not JSON.
            print(f"API request failed: {e}")
            return None
    
    def get_competitions(self):
        """Return the API response for the ``competitions`` endpoint."""
        print("Fetching competitions from API...")
        competitions = self._api_request('competitions')
        return competitions

    def get_matches(self, competition_id, season_id):
        """
        Request the matches of one competition season.

        Args:
            competition_id: Competition ID
            season_id: Season ID

        Returns:
            Whatever ``_api_request`` returns for the matches endpoint
            (the decoded JSON response, or None on failure)
        """
        matches_endpoint = f'competitions/{competition_id}/seasons/{season_id}/matches'
        return self._api_request(matches_endpoint)
    
    def get_events(self, match_id):
        """Request the events of a single match via ``_api_request``."""
        events_endpoint = f'events/{match_id}'
        return self._api_request(events_endpoint)
    
    def load_competition_data(self, competition_id, season_id):
        """
        Load data for an entire competition using direct API calls
        
        Args:
            competition_id: ID of the competition
            season_id: ID of the season
        """
        if not self.auth:
            print("Error: API credentials are required for accessing competition data")
            return self
            
        # Get matches for the competition
        matches = self.get_matches(competition_id, season_id)
        
        if not matches:
            print("No matches found for the selected competition and season")
            return self
            
        print(f"Found {len(matches)} matches")
        
        # Get current date
        current_date = datetime.now().strftime('%Y-%m-%d')
        
        # Store match info and get match IDs
        self.match_ids = []
        self.match_data = {}
        
        for match in matches:
            if 'match_id' not in match:
                print("Warning: Match has no ID, skipping")
                continue
                
            match_id = match['match_id']
            
            # Get match date if available
            match_date = match.get('match_date', current_date)
            if isinstance(match_date, datetime):
                match_date = match_date.strftime('%Y-%m-%d')
            
            # Only include matches that happened before today
            if match_date < current_date:
                self.match_ids.append(match_id)
                
                # Store match information
                self.match_data[match_id] = {
                    'match_id': match_id,
                    'home_team_id': match.get('home_team', {}).get('id'),
                    'home_team_name': match.get('home_team', {}).get('name', 'Unknown'),
                    'away_team_id': match.get('away_team', {}).get('id'),
                    'away_team_name': match.get('away_team', {}).get('name', 'Unknown'),
                    'match_date': match_date
                }
        
        print(f"Processing {len(self.match_ids)} played matches")
        
        # Load events for all matches
        self._load_events_for_matches()
        
        return self
    
    def _load_events_for_matches(self):
        """
        Load event data for all matches in self.match_ids
        """
        if not self.match_ids:
            print("No match IDs to load")
            return
            
        #print(f"Loading events for {len(self.match_ids)} matches...")
        
        all_events = []
        
        for match_id in self.match_ids:
            # Get events for this match
            events = self.get_events(match_id)
            
            if events:
                # Add match_id to each event
                for event in events:
                    event['match_id'] = match_id
                
                # Add events to our list
                all_events.extend(events)
                #print(f"Added {len(events)} events from match {match_id}")
            else:
                print(f"No events found for match {match_id}")
                
        # Check if we got any events
        if not all_events:
            print("No events were retrieved from any match")
            self.events_df = pd.DataFrame()
            return
            
        # Convert list of events to DataFrame
        print(f"Converting {len(all_events)} events to DataFrame...")
        self.events_df = pd.json_normalize(all_events)
        
        # Preprocess the data
        self._preprocess_events()
    
    def _preprocess_events(self):
        """
        Clean and preprocess the events data
        """
        if self.events_df is None or self.events_df.empty:
            print("No events data to preprocess")
            return
            
        print("Preprocessing events data...")
        
        # Extract location coordinates if in list format
        if 'location' in self.events_df.columns:
            try:
                # Split location into x, y coordinates
                self.events_df['location_x'] = self.events_df['location'].apply(
                    lambda x: x[0] if isinstance(x, list) and len(x) > 0 else np.nan
                )
                self.events_df['location_y'] = self.events_df['location'].apply(
                    lambda x: x[1] if isinstance(x, list) and len(x) > 1 else np.nan
                )
                print("Extracted x, y coordinates from location field")
            except Exception as e:
                print(f"Error extracting locations: {e}")
        
        # Extract shot coordinates
        if 'shot' in self.events_df.columns:
            try:
                # Create a temporary DataFrame with the shot information unpacked
                shot_df = pd.json_normalize(self.events_df['shot'].dropna())
                
                # Check which columns are available in the shot data
                if not shot_df.empty:
                    # Map shot DataFrame back to main DataFrame for relevant columns
                    shot_cols = ['statsbomb_xg', 'outcome.name', 'technique.name', 'body_part.name']
                    shot_cols = [col for col in shot_cols if col in shot_df.columns]
                    
                    for col in shot_cols:
                        col_name = col.replace('.', '_')
                        self.events_df.loc[self.events_df['shot'].notna(), f'shot_{col_name}'] = shot_df[col].values
                        
                    print("Extracted shot data into separate columns")
            except Exception as e:
                print(f"Error extracting shot data: {e}")
        
        # Extract end location for carries
        if 'carry' in self.events_df.columns:
            try:
                # Get carries that are not null
                carries = self.events_df['carry'].dropna()
                if not carries.empty:
                    carry_df = pd.json_normalize(carries)
                    
                    if 'end_location' in carry_df.columns:
                        # Extract end_location coordinates
                        self.events_df.loc[self.events_df['carry'].notna(), 'end_location_x'] = carry_df['end_location'].apply(
                            lambda x: x[0] if isinstance(x, list) and len(x) > 0 else np.nan
                        ).values
                        
                        self.events_df.loc[self.events_df['carry'].notna(), 'end_location_y'] = carry_df['end_location'].apply(
                            lambda x: x[1] if isinstance(x, list) and len(x) > 1 else np.nan
                        ).values
                        
                        print("Extracted carry end locations")
            except Exception as e:
                print(f"Error extracting carry end locations: {e}")
        
        # Calculate carry distances and filter short carries
        if 'type.name' in self.events_df.columns:
            type_col = 'type.name'
        else:
            type_col = 'event_type_name' if 'event_type_name' in self.events_df.columns else None
        
        if type_col and 'location_x' in self.events_df.columns and 'end_location_x' in self.events_df.columns:
            try:
                # Identify carries
                carries_mask = self.events_df[type_col] == 'Carry'
                
                # Calculate distance for each carry
                self.events_df.loc[carries_mask, 'carry_distance'] = np.sqrt(
                    (self.events_df.loc[carries_mask, 'end_location_x'] - self.events_df.loc[carries_mask, 'location_x'])**2 +
                    (self.events_df.loc[carries_mask, 'end_location_y'] - self.events_df.loc[carries_mask, 'location_y'])**2
                )
                
                # Identify short carries (< 5 yards)
                short_carries_mask = (carries_mask) & (self.events_df['carry_distance'] < 5.0)
                
                # Filter out short carries
                carries_count_before = carries_mask.sum()
                self.events_df = self.events_df[~short_carries_mask]
                carries_removed = carries_count_before - carries_mask.sum()
                
                print(f"Removed {carries_removed} carries under 5 yards")
                
            except Exception as e:
                print(f"Error calculating carry distances: {e}")
                
        # Standardize column names
        self._standardize_column_names()
        
        print(f"Preprocessing complete. Final dataset: {len(self.events_df)} events")
    
    def _standardize_column_names(self):
        """
        Standardize column names for consistent access
        """
        # Map of expected column names to standardized ones
        column_mapping = {
            'type.name': 'event_type_name',
            'pass.type.name': 'pass_type_name',
            'pass.outcome.name': 'pass_outcome_name',
            'pass.through_ball': 'through_ball',
            'shot.outcome.name': 'shot_outcome_name',
            'shot.technique.name': 'technique_name',
            'shot.body_part.name': 'body_part_name',
            'shot.statsbomb_xg': 'statsbomb_xg',
            'play_pattern.name': 'play_pattern_name',
            'player.id': 'player_id',
            'player.name': 'player_name',
            'team.id': 'team_id',
            'team.name': 'team_name',
            'possession_team.id': 'possession_team_id',
            'possession_team.name': 'possession_team_name'
        }
        
        # Apply mapping for columns that exist
        for api_col, our_col in column_mapping.items():
            if api_col in self.events_df.columns and our_col not in self.events_df.columns:
                self.events_df[our_col] = self.events_df[api_col]
    
    def extract_shots(self):
        """
        Extract all shots from the events data
        """
        if self.events_df is None or self.events_df.empty:
            print("No events data to extract shots from")
            self.shots_df = pd.DataFrame()
            return self
        
        # Identify shots based on event type
        if 'event_type_name' in self.events_df.columns:
            shots_mask = self.events_df['event_type_name'] == 'Shot'
        elif 'type.name' in self.events_df.columns:
            shots_mask = self.events_df['type.name'] == 'Shot'
        else:
            print("Shot event type column not found")
            self.shots_df = pd.DataFrame()
            return self
        
        self.shots_df = self.events_df[shots_mask].copy()
        print(f"Extracted {len(self.shots_df)} shots")
        
        # Summarize shots by match
        if not self.shots_df.empty and 'match_id' in self.shots_df.columns:
            shots_by_match = self.shots_df.groupby('match_id').size()
            for match_id, count in shots_by_match.items():
                print(f"Match {match_id}: {count} shots")
        
        return self
    
    def create_possession_chains(self):
        """
        Create possession chains to identify build-up patterns
        """
        if self.events_df is None or self.events_df.empty:
            print("No events data to create possession chains from")
            return self
        
        # Clear existing possession chains
        self.possession_chains = {}
        
        # Process each match separately
        for match_id in self.match_ids:
            match_events = self.events_df[self.events_df['match_id'] == match_id].copy()
            
            # Get possession column
            possession_col = 'possession'
            if possession_col not in match_events.columns:
                print(f"Warning: No possession column found for match {match_id}")
                continue
            
            # Process each possession
            for possession_id in match_events[possession_col].unique():
                if pd.isna(possession_id):
                    continue
                
                possession_events = match_events[match_events[possession_col] == possession_id]
                
                # Convert to list of dictionaries and sort by index
                possession_list = possession_events.to_dict('records')
                index_col = 'index'
                
                if index_col not in possession_events.columns:
                    print(f"Warning: No index column found for match {match_id}")
                    continue
                
                possession_list = sorted(possession_list, key=lambda x: x[index_col])
                
                # Store with string key (match_id_possession_id)
                key = f"{match_id}_{possession_id}"
                self.possession_chains[key] = possession_list
        
        print(f"Created {len(self.possession_chains)} possession chains")
        return self
    
    def identify_chances(self):
        """
        Identify all chance creation events
        A chance is defined as a shot or a pass that leads directly to a shot
        """
        if self.shots_df is None or self.shots_df.empty:
            print("No shots data to identify chances from")
            return self
        
        if not hasattr(self, 'possession_chains') or not self.possession_chains:
            print("No possession chains available. Run create_possession_chains() first.")
            return self
        
        # Process each match separately
        for match_id in self.match_ids:
            match_shots = self.shots_df[self.shots_df['match_id'] == match_id].copy()
            
            # For each shot, find its buildup
            for _, shot in match_shots.iterrows():
                shot_id = shot.get('id')
                if shot_id is None:
                    continue
                
                possession_id = shot.get('possession')
                if possession_id is None:
                    continue
                
                shot_index = shot.get('index')
                if shot_index is None:
                    continue
                
                # Try to find the possession chain
                key = f"{match_id}_{possession_id}"
                if key in self.possession_chains:
                    possession_events = self.possession_chains[key]
                    
                    # Filter events that occurred before the shot
                    buildup_events = []
                    for event in possession_events:
                        if event.get('index') is not None and event.get('index') < shot_index:
                            buildup_events.append(event)
                    
                    # Store the chance with its buildup
                    self.chances[shot_id] = {
                        'shot': shot,
                        'buildup': buildup_events,
                        'possession_id': possession_id,
                        'match_id': match_id,
                        'type': None  # Will be classified later
                    }
                else:
                    # Still create a chance entry but with empty buildup
                    self.chances[shot_id] = {
                        'shot': shot,
                        'buildup': [],
                        'possession_id': possession_id,
                        'match_id': match_id,
                        'type': None
                    }
        
        print(f"Identified {len(self.chances)} chances")
        return self
    
    def run_pipeline(self, competition_id=None, season_id=None, match_ids=None, 
                    output_file=None, plot_file=None):
        """
        Run the complete analysis pipeline
        
        Args:
            competition_id: Competition ID
            season_id: Season ID (required if competition_id is provided)
            match_ids: List of specific match IDs to analyze (optional)
            output_file: CSV output file path (optional)
            plot_file: Plot output file path (optional)
        """
        print("===== STARTING CHANCE CREATION ANALYSIS PIPELINE =====")
        
        # Step 1: Load data
        print("\n===== LOADING DATA =====")
        if match_ids:
            self.load_matches(match_ids)
        elif competition_id and season_id:
            self.load_competition_data(competition_id, season_id)
        else:
            print("Error: Either match_ids or both competition_id and season_id must be provided")
            return self
        
        # Check if we have events data
        if self.events_df is None or self.events_df.empty:
            print("Error: No events data loaded. Pipeline cannot continue.")
            return self
        
        # Step 2: Extract shots
        print("\n===== EXTRACTING SHOTS =====")
        self.extract_shots()
        
        # Check if we have shots
        if self.shots_df is None or self.shots_df.empty:
            print("Error: No shots found. Pipeline cannot continue.")
            return self
        
        # Step 3: Create possession chains
        print("\n===== CREATING POSSESSION CHAINS =====")
        self.create_possession_chains()
        
        # Step 4: Identify chances
        print("\n===== IDENTIFYING CHANCES =====")
        self.identify_chances()
        
        # Step 5: Classify chances (would need to implement these)
        print("\n===== CLASSIFYING CHANCES =====")
        self.classify_chances()
        
        # Step 6: Export results (if output file specified)
        if output_file:
             print(f"\n===== EXPORTING RESULTS TO {output_file} =====")
             self.export_chances_to_csv(output_file)
        
        # Step 7: Plot chance locations (if plot file specified)
        if plot_file:
             print(f"\n===== GENERATING PLOT TO {plot_file} =====")
             self.plot_chance_locations(plot_file)
        
        print("\n===== PIPELINE COMPLETE =====")
        return self


# Example usage
if __name__ == "__main__":
    # The constructor reads SB_USERNAME and SB_PASSWORD from the environment
    analyzer = ChanceCreationAnalyzer()

    # Analyze season 315 of competition 44
    analyzer.run_pipeline(season_id=315, competition_id=44)

    print("\n===== ANALYSIS COMPLETE =====")

Using credentials from environment variables (username: rramineni@lagalaxy.com)
===== STARTING CHANCE CREATION ANALYSIS PIPELINE =====

===== LOADING DATA =====
Making API request to: https://data.statsbomb.com/api/v8/competitions/44/seasons/315/matches
API request successful: 200
Found 510 matches
Processing 134 played matches
Making API request to: https://data.statsbomb.com/api/v8/events/3981171
API request successful: 200
Making API request to: https://data.statsbomb.com/api/v8/events/3981263
API request successful: 200
Making API request to: https://data.statsbomb.com/api/v8/events/3981269
API request successful: 200
Making API request to: https://data.statsbomb.com/api/v8/events/3981271
API request successful: 200
Making API request to: https://data.statsbomb.com/api/v8/events/3981266
API request successful: 200
Making API request to: https://data.statsbomb.com/api/v8/events/3981272
API request successful: 200
Making API request to: https://data.statsbomb.com/api/v8/events/398126

AttributeError: 'ChanceCreationAnalyzer' object has no attribute 'classify_chances'