In [1]:
pip install beautifulsoup4 requests pandas

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

def scrape_scorecard(match_url):
    """
    Scrape cricket match scorecard information from a given URL
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    response = requests.get(match_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    match_data = {
        'match_status': None,
        'match_winning_team': None,
        'match_tie_breaker': None,
        'match_toss': None,
        'umpires': None,
        'match_referee': None,
        'third_umpires': None,
        'match_datetime': None,
        'team1_name': None,
        'team2_name': None,
        'team1_score': None,
        'team1_wickets': None,
        'team2_score': None,
        'team2_wickets': None,
        'team1_captain': None,
        'team1_players': None,
        'team1_bench': None,
        'team1_support_staff': None,
        'team2_captain': None,
        'team2_players': None,
        'team2_bench': None,
        'team2_support_staff': None
    }
    
    try:
        status = soup.find('div', class_='status-text').text.strip().lower()
        if 'abandoned' in status:
            match_data['match_status'] = 'abandoned'
        elif 'rescheduled' in status:
            match_data['match_status'] = 'rescheduled'
        elif 'tied' in status:
            match_data['match_status'] = 'tied'
        else:
            match_data['match_status'] = 'completed'
        if match_data['match_status'] == 'completed' or match_data['match_status'] == 'tied':
            result = soup.find('div', class_='status-text').text.strip()
            match_data['match_winning_team'] = result.split(' won ')[0] if ' won ' in result else None
            if match_data['match_status'] == 'tied':
                match_data['match_tie_breaker'] = result.split('tied, ')[-1] if 'tied, ' in result else None
        toss_info = soup.find('div', class_='match-info-item', string=lambda x: x and 'toss' in x.lower())
        if toss_info:
            match_data['match_toss'] = toss_info.text.strip().replace('Toss', '').strip()
        match_info_items = soup.find_all('div', class_='match-info-item')
        for item in match_info_items:
            text = item.text.strip()
            if 'Umpires' in text:
                match_data['umpires'] = text.replace('Umpires', '').strip()
            elif 'Match Referee' in text:
                match_data['match_referee'] = text.replace('Match Referee', '').strip()
            elif 'Third Umpire' in text:
                match_data['third_umpires'] = text.replace('Third Umpire', '').strip()
        date_time = soup.find('div', class_='match-info-time').text.strip()
        match_data['match_datetime'] = datetime.strptime(date_time, '%b %d, %Y, %H:%M %Z')
        teams = soup.find_all('div', class_='team')
        if len(teams) >= 2:
            match_data['team1_name'] = teams[0].find('div', class_='name').text.strip()
            match_data['team2_name'] = teams[1].find('div', class_='name').text.strip()
            team1_score = teams[0].find('div', class_='score-detail')
            if team1_score:
                score_parts = team1_score.text.strip().split('/')
                match_data['team1_score'] = score_parts[0]
                if len(score_parts) > 1:
                    match_data['team1_wickets'] = score_parts[1].split()[0]
            team2_score = teams[1].find('div', class_='score-detail')
            if team2_score:
                score_parts = team2_score.text.strip().split('/')
                match_data['team2_score'] = score_parts[0]
                if len(score_parts) > 1:
                    match_data['team2_wickets'] = score_parts[1].split()[0]
        squads = soup.find_all('div', class_='squad-players')
        if len(squads) >= 2:
            team1_players = [p.text.strip() for p in squads[0].find_all('div', class_='player-name')]
            match_data['team1_players'] = ', '.join(team1_players)
            team2_players = [p.text.strip() for p in squads[1].find_all('div', class_='player-name')]
            match_data['team2_players'] = ', '.join(team2_players)
            if team1_players:
                match_data['team1_captain'] = team1_players[0]
            if team2_players:
                match_data['team2_captain'] = team2_players[0]      
    except Exception as e:
        print(f"Error scraping data: {e}")   
    return match_data
if __name__ == "__main__":
    match_url = "https://www.espncricinfo.com/series/ipl-2023-1345038/chennai-super-kings-vs-gujarat-titans-final-1370353/full-scorecard"
    match_data = scrape_scorecard(match_url)
    df = pd.DataFrame([match_data])
    print(df)

Error scraping data: 'NoneType' object has no attribute 'text'
  match_status match_winning_team match_tie_breaker match_toss umpires  \
0         None               None              None       None    None   

  match_referee third_umpires match_datetime team1_name team2_name  ...  \
0          None          None           None       None       None  ...   

  team2_score team2_wickets team1_captain team1_players team1_bench  \
0        None          None          None          None        None   

  team1_support_staff team2_captain team2_players team2_bench  \
0                None          None          None        None   

  team2_support_staff  
0                None  

[1 rows x 22 columns]
