In [45]:
%pip install kagglehub

Defaulting to user installation because normal site-packages is not writeable
Collecting kagglehub
  Downloading kagglehub-0.3.4-py3-none-any.whl.metadata (22 kB)
Downloading kagglehub-0.3.4-py3-none-any.whl (43 kB)
Installing collected packages: kagglehub
Successfully installed kagglehub-0.3.4
Note: you may need to restart the kernel to use updated packages.


In [4]:
import kagglehub
import pandas as pd
import numpy as np
from datetime import datetime

# Directory that receives the generated player_database.csv.
# Written as a raw string so the Windows backslashes stay literal: in a normal
# string literal "\P" and "\M" are invalid escape sequences and Python emits a
# SyntaxWarning for them. The resulting string value is unchanged.
save_dir = r"D:\Projects\Machine Learning Applications for Soccer"

# Download the dataset; the returned value is used below as the directory
# that contains players.csv, appearances.csv and clubs.csv.
path = kagglehub.dataset_download("davidcariboo/player-scores")

def _career_phase(age):
    """Return the career-phase label for an age in years ('unknown' if missing)."""
    if pd.isna(age):
        return 'unknown'
    if age <= 21:
        return 'breakthrough'
    if age <= 25:
        return 'development'
    if age <= 29:
        return 'peak'
    return 'twilight'


def _aggregate_appearances(appearances):
    """Sum each player's counting stats and count their appearance rows."""
    return appearances.groupby('player_id').agg({
        'goals': 'sum',
        'assists': 'sum',
        'minutes_played': 'sum',
        'yellow_cards': 'sum',
        'red_cards': 'sum',
        'game_id': 'count'  # number of appearances
    }).reset_index()


def create_comprehensive_dataset(data_dir=None, output_dir=None, reference_date=None):
    """Build one row per player from players, appearances and clubs, and save it.

    Reads ``players.csv``, ``appearances.csv`` and ``clubs.csv`` from
    ``data_dir``, derives age / career phase, career totals, per-game rates and
    totals for the most recent calendar year present in the appearances, joins
    the current club's details, renames the columns to their final names and
    writes ``player_database.csv`` into ``output_dir``.

    Args:
        data_dir: Directory holding the three CSV files. Defaults to the
            notebook-level ``path`` variable.
        output_dir: Directory for ``player_database.csv``; created if it does
            not exist. Defaults to the notebook-level ``save_dir`` variable.
        reference_date: Date used to compute ages. Defaults to
            ``datetime.now()``; pass a fixed date for reproducible output.

    Returns:
        pandas.DataFrame: the combined table with the renamed columns. Columns
        listed in the mapping but absent from the inputs are silently omitted.
    """
    import os  # local import so this cell does not depend on another cell's imports

    # Fall back to the notebook globals only when no explicit value is given,
    # so existing no-argument calls behave exactly as before.
    data_dir = path if data_dir is None else data_dir
    output_dir = save_dir if output_dir is None else output_dir
    if reference_date is None:
        reference_date = datetime.now()

    # Load all required datasets
    players = pd.read_csv(os.path.join(data_dir, 'players.csv'))
    appearances = pd.read_csv(os.path.join(data_dir, 'appearances.csv'))
    clubs = pd.read_csv(os.path.join(data_dir, 'clubs.csv'))

    # Player information: age in (fractional) years and the derived career phase.
    players['date_of_birth'] = pd.to_datetime(players['date_of_birth'])
    elapsed = pd.Timestamp(reference_date) - players['date_of_birth']
    players['age'] = elapsed.dt.days / 365.25
    players['career_phase'] = players['age'].apply(_career_phase)

    # Career totals and per-game rates.
    performance = _aggregate_appearances(appearances).rename(
        columns={'game_id': 'total_appearances'}
    )
    games = performance['total_appearances']
    performance['goals_per_game'] = performance['goals'] / games
    performance['assists_per_game'] = performance['assists'] / games
    performance['minutes_per_game'] = performance['minutes_played'] / games

    # "Recent" means the latest calendar year that occurs in the appearance
    # dates (not a football season spanning two years).
    season = pd.to_datetime(appearances['date']).dt.year
    recent_performance = (
        _aggregate_appearances(appearances[season == season.max()])
        .add_prefix('recent_')
        .rename(columns={'recent_player_id': 'player_id'})
    )

    # Combine all information. Both players and clubs carry a 'name' column,
    # so the club merge produces name_x (player) and name_y (club).
    club_columns = ['club_id', 'name', 'domestic_competition_id', 'squad_size', 'average_age',
                    'total_market_value', 'stadium_name', 'coach_name']
    comprehensive_df = (
        players
        .merge(performance, on='player_id', how='left')
        .merge(recent_performance, on='player_id', how='left')
        .merge(clubs[club_columns], left_on='current_club_id', right_on='club_id', how='left')
        .rename(columns={'name_y': 'club_name', 'name_x': 'player_name'})
    )

    # Select and rename final columns
    final_columns = {
        # Player Information
        'player_id': 'ID',
        'player_name': 'Name',
        'age': 'Age',
        'career_phase': 'Career_Phase',
        'position': 'Position',
        'sub_position': 'Sub_Position',
        'foot': 'Preferred_Foot',
        'height_in_cm': 'Height',
        'country_of_citizenship': 'Nationality',
        'date_of_birth': 'Birth_Date',

        # Contract Information
        'contract_expiration_date': 'Contract_Expiry',
        'market_value_in_eur': 'Current_Value',
        'highest_market_value_in_eur': 'Peak_Value',

        # Club Information
        'club_name': 'Current_Club',
        'domestic_competition_id': 'League_ID',
        'squad_size': 'Squad_Size',
        'average_age': 'Team_Average_Age',
        'total_market_value': 'Team_Total_Value',
        'coach_name': 'Coach',

        # Career Performance
        'total_appearances': 'Career_Games',
        'minutes_played': 'Career_Minutes',
        'goals': 'Career_Goals',
        'assists': 'Career_Assists',
        'yellow_cards': 'Career_Yellows',
        'red_cards': 'Career_Reds',
        'goals_per_game': 'Goals_Per_Game',
        'assists_per_game': 'Assists_Per_Game',
        'minutes_per_game': 'Minutes_Per_Game',

        # Recent Performance
        'recent_goals': 'Recent_Goals',
        'recent_assists': 'Recent_Assists',
        'recent_minutes_played': 'Recent_Minutes',
        'recent_yellow_cards': 'Recent_Yellows',
        'recent_red_cards': 'Recent_Reds',
        'recent_game_id': 'Recent_Games'
    }

    # Select only columns that exist in the dataframe
    existing_columns = [col for col in final_columns if col in comprehensive_df.columns]
    comprehensive_df = comprehensive_df[existing_columns].rename(columns=final_columns)

    # Save the dataset; create the target directory first so a missing folder
    # does not discard the work done above.
    os.makedirs(output_dir, exist_ok=True)
    comprehensive_df.to_csv(os.path.join(output_dir, 'player_database.csv'), index=False)

    return comprehensive_df

# Build the player table once, then print a plain-text sanity report.
df = create_comprehensive_dataset()

# Each report section is a heading followed by its value on the next line;
# sep="\n" yields the same text as two consecutive print() calls.
print("\nDataset Overview:", f"Total players: {len(df)}", sep="\n")
print("\nCareer Phase Distribution:", df['Career_Phase'].value_counts(), sep="\n")
print("\nPosition Distribution:", df['Position'].value_counts(), sep="\n")
print("\nAverage Values by Career Phase:", df.groupby('Career_Phase')['Current_Value'].mean(), sep="\n")

  save_dir = "D:\Projects\Machine Learning Applications for Soccer"



Dataset Overview:
Total players: 32417

Career Phase Distribution:
Career_Phase
twilight        16640
peak             7123
development      6146
breakthrough     2461
unknown            47
Name: count, dtype: int64

Position Distribution:
Position
Defender      10328
Midfield       9324
Attack         8863
Goalkeeper     3718
Missing         184
Name: count, dtype: int64

Average Values by Career Phase:
Career_Phase
breakthrough    2.091112e+06
development     2.761342e+06
peak            2.650386e+06
twilight        7.320232e+05
unknown         2.284375e+05
Name: Current_Value, dtype: float64


In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import logging
from time import sleep
from random import uniform

class SoccerDataScraper:
    """Add Transfermarkt URLs and injury history to the player database CSV.

    The database is loaded into ``self.df`` on construction, updated in place
    by worker threads, and written back to ``self.db_path`` at checkpoints and
    when processing ends. ``fbref_url`` is added as a column but this class
    never fills it with a value.
    """

    # Columns this scraper adds to the database when they are missing.
    SCRAPED_COLUMNS = ('transfermarkt_url', 'fbref_url', 'injury_history')

    def __init__(self, save_dir="D:/Projects/Machine Learning Applications for Soccer"):
        """Configure logging and the HTTP session, then load the database.

        Args:
            save_dir: Directory containing ``player_database.csv``. The default
                is the location the original notebook used.

        Raises:
            Exception: whatever ``load_database`` raises when the CSV cannot
                be read or rewritten.
        """
        # Initialize logging
        logging.basicConfig(
            filename='scraper.log',
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )

        # Basic setup
        self.save_dir = save_dir
        self.base_transfermarkt_url = "https://www.transfermarkt.com"
        self.base_fbref_url = "https://fbref.com"

        # Initialize session for better performance
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

        # Load database
        self.db_path = os.path.join(self.save_dir, 'player_database.csv')
        self.load_database()

    def _save_database(self):
        """Write the in-memory DataFrame back to the CSV at ``self.db_path``."""
        self.df.to_csv(self.db_path, index=False, encoding='utf-8')

    def load_database(self):
        """Load the CSV into ``self.df`` and make sure the scraped columns exist.

        The file is rewritten immediately so the new columns are persisted.

        Raises:
            Exception: any read/write error is logged and re-raised.
        """
        try:
            self.df = pd.read_csv(self.db_path, encoding='utf-8')

            for col in self.SCRAPED_COLUMNS:
                if col not in self.df.columns:
                    self.df[col] = None
                # A column that is still completely empty is parsed from CSV as
                # float64. Force object dtype so URL / injury strings can be
                # stored later without a dtype upcast.
                self.df[col] = self.df[col].astype(object)

            # Save the updated structure
            self._save_database()
            logging.info(f"Database loaded successfully with {len(self.df)} players")

        except Exception as e:
            logging.error(f"Error loading database: {str(e)}")
            raise

    def make_request(self, url, retries=3, delay=1, timeout=10):
        """GET ``url`` through the shared session, retrying on request errors.

        Args:
            url: Address to fetch.
            retries: Maximum number of attempts.
            delay: Upper bound (seconds) of the random pause after a success.
            timeout: Seconds to wait for the server before the attempt counts
                as failed. Without it a stalled connection would block the
                calling worker thread indefinitely.

        Returns:
            The response object, or ``None`` when ``retries`` is not positive.

        Raises:
            requests.RequestException: when the last attempt fails.
        """
        for attempt in range(retries):
            try:
                response = self.session.get(url, timeout=timeout)
                response.raise_for_status()
                sleep(uniform(0.5, delay))  # Random delay between requests
                return response
            except requests.RequestException as e:
                if attempt == retries - 1:
                    logging.error(f"Failed to fetch {url} after {retries} attempts: {str(e)}")
                    raise
                sleep(uniform(1, 3))  # Longer delay between retries
        return None

    def find_player_urls(self, player_name):
        """Search Transfermarkt for ``player_name`` and return the first hit.

        Returns:
            dict with keys ``transfermarkt_url`` and ``fbref_url`` (the latter
            is always ``None``), or ``None`` when the name is unusable, the
            request yields no usable response, or an error occurs.
        """
        from urllib.parse import quote_plus  # stdlib; imported locally to keep the cell's import list unchanged

        try:
            # Missing names arrive as NaN (a float) from pandas; skip them
            # explicitly instead of failing on a string method.
            if not isinstance(player_name, str) or not player_name.strip():
                logging.warning(f"Skipping URL lookup for invalid player name: {player_name!r}")
                return None

            # quote_plus turns spaces into '+' and escapes characters such as
            # '&', '#' or apostrophes that would otherwise corrupt the query.
            search_name = quote_plus(player_name)
            search_url = f"{self.base_transfermarkt_url}/schnellsuche/ergebnis/schnellsuche?query={search_name}"

            response = self.make_request(search_url)
            if not response:
                return None

            soup = BeautifulSoup(response.text, 'html.parser')
            results_table = soup.find('table', class_='items')

            urls = {'transfermarkt_url': None, 'fbref_url': None}

            # NOTE(review): relies on the result link carrying the CSS class
            # 'spielprofil_tooltip' - confirm against the current page markup.
            if results_table and (link := results_table.find('a', class_='spielprofil_tooltip')):
                urls['transfermarkt_url'] = self.base_transfermarkt_url + link['href']

            return urls

        except Exception as e:
            logging.error(f"Error finding URLs for {player_name}: {str(e)}")
            return None

    def get_injury_history(self, url):
        """Scrape the injury table reachable from a player's Transfermarkt URL.

        Returns:
            list of dicts with keys ``season``, ``injury_type`` and
            ``days_missed`` (all stripped strings); an empty list when there is
            no table, no usable response, or an error occurs.
        """
        try:
            # NOTE(review): appends '/verletzungen' to the stored profile URL -
            # confirm this matches the site's actual injury-page address scheme.
            response = self.make_request(url + '/verletzungen')
            if not response:
                return []

            soup = BeautifulSoup(response.text, 'html.parser')
            injury_table = soup.find('table', class_='items')

            injuries = []
            if injury_table:
                # Skip the first row (treated as the header) and any row with
                # fewer than five cells.
                for row in injury_table.find_all('tr')[1:]:
                    cols = row.find_all('td')
                    if len(cols) >= 5:
                        injuries.append({
                            'season': cols[0].text.strip(),
                            'injury_type': cols[1].text.strip(),
                            'days_missed': cols[4].text.strip()
                        })

            return injuries

        except Exception as e:
            logging.error(f"Error scraping injury data from {url}: {str(e)}")
            return []

    def process_player(self, args):
        """Fill in the URL and injury columns for one player row.

        Args:
            args: ``(player, index)`` where ``player`` is the row (its
                ``'Name'`` is used) and ``index`` is its label in ``self.df``.

        Returns:
            ``(index, True)`` on completion, ``(index, False)`` if an error was
            logged. Values already present are not fetched again.
        """
        player, index = args
        try:
            # Find URLs if not already present
            if pd.isna(self.df.at[index, 'transfermarkt_url']):
                urls = self.find_player_urls(player['Name'])
                if urls:
                    self.df.at[index, 'transfermarkt_url'] = urls['transfermarkt_url']
                    self.df.at[index, 'fbref_url'] = urls['fbref_url']

            # Get injury history if URL exists
            if pd.notna(self.df.at[index, 'transfermarkt_url']) and pd.isna(self.df.at[index, 'injury_history']):
                injuries = self.get_injury_history(self.df.at[index, 'transfermarkt_url'])
                # Stored as the list's Python repr so it fits in one CSV cell.
                self.df.at[index, 'injury_history'] = str(injuries)

            return index, True

        except Exception as e:
            logging.error(f"Error processing player {player['Name']}: {str(e)}")
            return index, False

    def enhance_player_database(self, max_workers=5):
        """Process every row with a thread pool, checkpointing every 100 rows.

        Args:
            max_workers: Number of worker threads.

        Returns:
            The updated DataFrame (``self.df``).

        Raises:
            KeyboardInterrupt, Exception: re-raised after queued work has been
            cancelled and the current state has been written to disk.
        """
        try:
            total_players = len(self.df)
            processed_count = 0

            # Create tasks for parallel processing
            tasks = [(row, idx) for idx, row in self.df.iterrows()]

            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                futures = []
                try:
                    for task in tasks:
                        futures.append(executor.submit(self.process_player, task))

                    # Process results with progress bar
                    with tqdm(total=total_players, desc="Processing players") as pbar:
                        for future in as_completed(futures):
                            future.result()  # re-raises anything process_player did not handle
                            processed_count += 1
                            pbar.update(1)

                            # Save checkpoint every 100 players
                            if processed_count % 100 == 0:
                                self._save_database()
                                logging.info(f"Checkpoint saved at {processed_count} players")
                except BaseException:
                    # Leaving the executor's "with" block waits for every
                    # queued task. Cancel what has not started so an interrupt
                    # or error only waits for the requests already in flight.
                    for pending in futures:
                        pending.cancel()
                    raise

            # Final save
            self._save_database()
            logging.info("Database enhancement completed successfully")

            return self.df

        except KeyboardInterrupt:
            # KeyboardInterrupt is not an Exception subclass; handle it here so
            # a user interrupt still persists the rows processed so far.
            logging.warning("Database enhancement interrupted; saving progress")
            self._save_database()
            raise
        except Exception as e:
            logging.error(f"Error in database enhancement: {str(e)}")
            self._save_database()
            raise

def main():
    """Run the scraper over the whole database and print a short summary.

    A user interrupt or any other error is reported on stdout and in the log
    instead of propagating.
    """
    try:
        result = SoccerDataScraper().enhance_player_database()

        # One print call; joining with newlines gives the same text as
        # printing each line separately.
        report_lines = [
            "\nEnhanced Dataset Overview:",
            f"Total players: {len(result)}",
            "\nColumns added:",
            "- Player URLs",
            "- Injury history",
        ]
        print("\n".join(report_lines))

    except KeyboardInterrupt:
        print("\nProcess interrupted by user. Progress has been saved.")
        logging.warning("Process interrupted by user")
    except Exception as exc:
        detail = str(exc)
        print(f"\nAn error occurred: {detail}")
        print("Progress has been saved up to the last successful update.")
        logging.error(f"Main process error: {detail}")


if __name__ == "__main__":
    main()

Processing players: 100%|██████████████████████████████████████████████████████| 32417/32417 [2:56:35<00:00,  3.06it/s]



Enhanced Dataset Overview:
Total players: 32417

Columns added:
- Player URLs
- Injury history
