In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import logging
import re
from urllib.parse import urljoin

# Set up logging: INFO and above is written both to a log file and to the
# console stream.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # Explicit UTF-8 so that scraped names containing non-ASCII
        # characters are logged correctly even where the platform default
        # encoding is not UTF-8.
        logging.FileHandler("iplt20_scraper.log", encoding="utf-8"),
        logging.StreamHandler(),
    ],
)
# Module-level logger used by the scraper class and main().
logger = logging.getLogger(__name__)

class IPLT20Scraper:
    """Scraper that collects IPL team and squad listings from IPLT20.com.

    Typical use is ``scrape_all_squads()`` followed by ``save_data()``.
    Every page request is preceded by a short randomised sleep to reduce
    the chance of being rate limited.
    """

    def __init__(self):
        self.base_url = "https://www.iplt20.com"
        self.teams_url = "https://www.iplt20.com/teams"
        self.session = self._create_session()
        # Team dicts ('name', 'code', 'url') filled in by get_teams().
        self.teams = []
        # Player dicts filled in by scrape_all_squads().
        self.players = []

    def _create_session(self):
        """Create a session with browser-like headers"""
        session = requests.Session()

        # Use realistic browser headers
        session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Referer': 'https://www.iplt20.com/',
        })

        return session

    def _fetch_soup(self, url, base_delay, jitter):
        """Sleep, GET ``url`` through the session and return the parsed HTML.

        The sleep lasts ``base_delay + random() * jitter`` seconds.

        Raises:
            requests.exceptions.RequestException: on connection problems,
                timeouts (30 s) or a non-success HTTP status.
        """
        time.sleep(base_delay + random.random() * jitter)
        response = self.session.get(url, timeout=30)
        response.raise_for_status()
        return BeautifulSoup(response.content, 'html.parser')

    def _absolute_url(self, url):
        """Return ``url`` resolved against ``base_url`` unless it is empty or
        already starts with ``http``."""
        if url and not url.startswith('http'):
            return urljoin(self.base_url, url)
        return url

    @staticmethod
    def _guess_team_code(logo_url, team_name):
        """Derive a team code from the logo URL or, failing that, the name.

        The logo URL is searched for an upper-case run directly in front of
        ``outline`` (e.g. ``/CSKoutline``).  If that yields nothing and the
        name has more than one word, the upper-cased initials of the words
        are used.  Returns ``None`` when neither approach produces a code.
        """
        if logo_url and 'outline' in logo_url:
            code_match = re.search(r'/([A-Z]+)outline', logo_url)
            if code_match:
                return code_match.group(1)

        if team_name:
            name_parts = team_name.split()
            if len(name_parts) > 1:
                return ''.join(part[0].upper() for part in name_parts)

        return None

    def _parse_team(self, element):
        """Build a team dict from one team link, or ``None`` to skip it.

        An element is skipped when it has no ``href`` or when no team name
        can be found (``data-team_name`` attribute, then an inner ``h3``).
        """
        href = element.get('href')
        if not href:
            return None

        team_name = element.get('data-team_name')
        if not team_name:
            heading = element.select_one('h3')
            if heading:
                team_name = heading.text.strip()
        if not team_name:
            return None

        logo_img = element.select_one('img')
        logo_url = logo_img.get('src') if logo_img else None

        team_code = self._guess_team_code(logo_url, team_name)
        if not team_code:
            # Last resort: the code shown in the hover overlay.
            # NOTE(review): this is only reached when the initials fallback
            # also fails (single-word names) -- confirm whether the overlay
            # text should take precedence over guessed initials.
            code_span = element.select_one('.vn-team-logo-onhover span')
            if code_span:
                team_code = code_span.text.strip()

        return {
            'name': team_name,
            'code': team_code,
            'url': urljoin(self.base_url, href),
        }

    def get_teams(self):
        """Extract team information from the main teams page.

        Returns:
            A list of dicts with keys ``name``, ``code`` and ``url``.  The
            list is also stored on ``self.teams``.  If the page request
            fails, an empty list is returned and ``self.teams`` is left
            unchanged.
        """
        logger.info(f"Fetching teams from {self.teams_url}")

        try:
            soup = self._fetch_soup(self.teams_url, 1, 1)
        except requests.exceptions.RequestException as e:
            logger.error(f"Failed to retrieve teams page: {str(e)}")
            return []

        # Primary selector first, then progressively looser fallbacks.
        team_elements = soup.select('a[onclick="click_teams1(this)"][data-team_name]')
        if not team_elements:
            logger.warning("Could not find team elements with primary selector")
            team_elements = (
                soup.select('a[data-team_name]')
                or soup.select('a[href*="/teams/"]')
            )

        logger.info(f"Found {len(team_elements)} potential team elements")

        teams = []
        for element in team_elements:
            # Deliberately broad: one malformed element must not abort the
            # whole extraction.
            try:
                team = self._parse_team(element)
            except Exception as e:
                logger.error(f"Error processing team element: {str(e)}")
                continue

            if team is None:
                continue

            teams.append(team)
            logger.info(f"Extracted team: {team['name']} ({team['code']})")

        logger.info(f"Successfully extracted {len(teams)} teams")
        self.teams = teams
        return teams

    def _player_record(self, name, role, team_name, team_code, image_url):
        """Build the player dict shared by both extraction strategies."""
        return {
            'Name': name,
            'Role': role,
            'Team': team_name,
            'Team_Code': team_code,
            'Image_URL': self._absolute_url(image_url),
        }

    @staticmethod
    def _find_player_elements(soup):
        """Select candidate player elements, loosening the selector step by
        step until something matches (or every selector has been tried)."""
        player_elements = soup.select(
            'a[onclick="click_teams(this)"][data-event_context="player"][data-team_name]'
        )
        if player_elements:
            return player_elements

        logger.warning("Could not find player elements with primary selector")
        return (
            soup.select('a[data-player_name]')
            or soup.select('.ih-p-img')
            or soup.select('a[href*="squad-details"]')
        )

    def _parse_player(self, element, team_name, team_code):
        """Build a player dict from a player card, or ``None`` if the card
        carries no player name."""
        # The '.ih-p-img' fallback selects an inner element; climb to the
        # enclosing link when there is one.
        if element.name != 'a':
            anchor = element.find_parent('a')
            if anchor is not None:
                element = anchor

        player_name = element.get('data-player_name')
        if not player_name:
            name_elem = element.select_one('.ih-p-name h2') or element.select_one('h2')
            if name_elem:
                player_name = name_elem.text.strip()
        if not player_name:
            return None

        role_elem = (
            element.select_one('span.d-block.w-100.text-center')
            or element.select_one('.ih-p-img span')
        )
        player_role = role_elem.text.strip() if role_elem else "Unknown"

        img_elem = element.select_one('.ih-p-img img') or element.select_one('img')
        # Try both src and data-src attributes
        player_img = (img_elem.get('src') or img_elem.get('data-src')) if img_elem else None

        return self._player_record(player_name, player_role, team_name, team_code, player_img)

    def _parse_headshot(self, img, team_name, team_code):
        """Build a player dict starting from a headshot ``<img>``.

        Returns ``None`` when the image has no suitable container or the
        container has no ``h2`` holding the name.
        """
        container = img.find_parent('div', class_='ih-p-img') or img.find_parent('a')
        if not container:
            return None

        name_elem = container.select_one('h2')
        if not name_elem:
            return None

        role_elem = container.select_one('span.d-block')
        player_role = role_elem.text.strip() if role_elem else "Unknown"
        player_img = img.get('src') or img.get('data-src')

        return self._player_record(
            name_elem.text.strip(), player_role, team_name, team_code, player_img
        )

    def get_squad(self, team):
        """Extract squad information for a specific team.

        Args:
            team: dict with keys ``name``, ``url`` and ``code`` (as produced
                by ``get_teams``).

        Returns:
            A list of dicts with keys ``Name``, ``Role``, ``Team``,
            ``Team_Code`` and ``Image_URL``; an empty list if the page
            request fails.
        """
        team_name = team['name']
        team_url = team['url']
        team_code = team['code']

        logger.info(f"Fetching squad for {team_name} from {team_url}")

        try:
            soup = self._fetch_soup(team_url, 2, 2)
        except requests.exceptions.RequestException as e:
            logger.error(f"Failed to retrieve squad for {team_name}: {str(e)}")
            return []

        player_elements = self._find_player_elements(soup)
        logger.info(f"Found {len(player_elements)} potential player elements")

        players = []
        for element in player_elements:
            # Deliberately broad: a single broken card must not lose the
            # rest of the squad.
            try:
                player = self._parse_player(element, team_name, team_code)
            except Exception as e:
                logger.error(f"Error extracting player data: {str(e)}")
                continue

            if player is None:
                continue

            players.append(player)
            logger.info(f"Extracted player: {player['Name']} ({player['Role']})")

        # If we couldn't find any players, start from the headshot images
        # and work outwards to the surrounding card instead.
        if not players:
            logger.warning("Standard approach failed, trying alternative method")

            for img in soup.select('img[src*="IPLHeadshot"], img[data-src*="IPLHeadshot"]'):
                try:
                    player = self._parse_headshot(img, team_name, team_code)
                except Exception as e:
                    logger.error(f"Error in alternative extraction: {str(e)}")
                    continue

                if player is None:
                    continue

                players.append(player)
                logger.info(f"Extracted player (alt method): {player['Name']} ({player['Role']})")

        logger.info(f"Extracted {len(players)} players for {team_name}")
        return players

    def scrape_all_squads(self):
        """Scrape all team squads and store them on ``self.players``.

        Teams already held in ``self.teams`` (from an earlier ``get_teams``
        call) are reused; otherwise the teams page is fetched first.

        Returns:
            ``True`` if at least one player was collected, ``False`` if no
            teams or no players were found.
        """
        # Avoid requesting the teams page a second time when the caller has
        # already fetched it.
        teams = self.teams or self.get_teams()

        if not teams:
            logger.error("No teams found!")
            return False

        all_players = []
        last_index = len(teams) - 1

        for index, team in enumerate(teams):
            players = self.get_squad(team)
            all_players.extend(players)

            logger.info(f"Added {len(players)} players from {team['name']}")

            # Longer pause between teams; pointless after the final one.
            if index < last_index:
                time.sleep(3 + random.random() * 3)

        self.players = all_players
        logger.info(f"Total players collected: {len(all_players)}")

        return len(all_players) > 0

    def save_data(self, filename="IPL_Squads_List.csv"):
        """Save the collected player data to a CSV file.

        Args:
            filename: destination path; defaults to ``IPL_Squads_List.csv``.

        Returns:
            ``True`` after writing the file, ``False`` if there is no player
            data to write.
        """
        if not self.players:
            logger.warning("No player data to save!")
            return False

        players_df = pd.DataFrame(self.players)
        players_df.to_csv(filename, index=False)
        logger.info(f"Player data saved to {filename}")

        return True

def main():
    """Run the scraper end to end: collect every squad, then write the CSV.

    The teams page is fetched once, by ``scrape_all_squads()``; calling
    ``get_teams()`` here first only triggered a second, redundant request.
    """
    logger.info("Starting IPL squad scraper for IPLT20.com")

    scraper = IPLT20Scraper()

    if scraper.scrape_all_squads():
        # save_data() reports whether anything was actually written.
        if scraper.save_data():
            logger.info("Scraping completed successfully")
    elif scraper.teams:
        # Teams were found but no players could be extracted from them.
        logger.error("Failed to scrape squad data!")
    else:
        logger.error("Failed to retrieve team data!")

    logger.info("IPL scraper finished")


if __name__ == "__main__":
    main()

2025-05-06 15:14:48,306 - INFO - Starting IPL squad scraper for IPLT20.com
2025-05-06 15:14:48,306 - INFO - Fetching teams from https://www.iplt20.com/teams
2025-05-06 15:14:50,610 - INFO - Found 10 potential team elements
2025-05-06 15:14:50,610 - INFO - Extracted team: Chennai Super Kings (CSK)
2025-05-06 15:14:50,610 - INFO - Extracted team: Delhi Capitals (DC)
2025-05-06 15:14:50,610 - INFO - Extracted team: Gujarat Titans (GT)
2025-05-06 15:14:50,610 - INFO - Extracted team: Kolkata Knight Riders (KKR)
2025-05-06 15:14:50,610 - INFO - Extracted team: Lucknow Super Giants (LSG)
2025-05-06 15:14:50,610 - INFO - Extracted team: Mumbai Indians (MI)
2025-05-06 15:14:50,610 - INFO - Extracted team: Punjab Kings (PBKS)
2025-05-06 15:14:50,610 - INFO - Extracted team: Rajasthan Royals (RR)
2025-05-06 15:14:50,626 - INFO - Extracted team: Royal Challengers Bengaluru (RCB)
2025-05-06 15:14:50,628 - INFO - Extracted team: Sunrisers Hyderabad (SRH)
2025-05-06 15:14:50,632 - INFO - Successfull