In [2]:

pip install requests beautifulsoup4 pandas

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: C:\Users\sarum\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [3]:
import re
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup

def extract_ipl_match_data(timeout=10):
    """
    Extract IPL match data from Cricbuzz.

    Downloads the IPL 2024 series "matches" page and scrapes one record per
    match card found in the returned HTML.

    Args:
        timeout: Seconds to wait for the server before the request is
            abandoned. Without a timeout, requests may block indefinitely
            on an unresponsive server.

    Returns:
        A list of dictionaries, one per match card that yielded at least one
        field. Each key is present only when the corresponding element was
        found in the card: 'title', 'match_url', 'series_info', 'venue',
        'date_time', 'result'. An empty list is returned when the request
        fails (the error is printed, not raised).
    """
    site_root = "https://www.cricbuzz.com"
    base_url = site_root + "/cricket-series/7607/indian-premier-league-2024/matches"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(base_url, headers=headers, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        match_cards = soup.find_all('div', class_='cb-col-75 cb-col')
        matches = []

        for card in match_cards:
            match_info = {}

            # Extract match title and link
            title_tag = card.find('a', class_='text-hvr-underline')
            if title_tag:
                match_info['title'] = title_tag.text.strip()
                # An anchor without an href would raise KeyError on
                # subscript access; keep the title and skip the URL instead.
                href = title_tag.get('href')
                if href:
                    match_info['match_url'] = site_root + href

            # Extract match number and series
            series_info = card.find('div', class_='text-gray')
            if series_info:
                match_info['series_info'] = series_info.text.strip()

            # Extract match location and time
            location_time = card.find('div', class_='text-gray cb-font-12')
            if location_time:
                # The text is split on the bullet character; the first two
                # non-empty pieces are taken as venue and date/time.
                parts = [part.strip() for part in location_time.text.split('•') if part.strip()]
                if len(parts) >= 2:
                    match_info['venue'] = parts[0]
                    match_info['date_time'] = parts[1]

            # Extract match result if available
            result_tag = card.find('div', class_='cb-scr-wll-chvrn cb-lv-scrs-col')
            if result_tag:
                match_info['result'] = result_tag.text.strip()

            # Cards from which nothing could be scraped are dropped.
            if match_info:
                matches.append(match_info)

        return matches

    except requests.RequestException as e:
        # Covers connection errors, timeouts and the HTTPError raised by
        # raise_for_status(); callers receive an empty list.
        print(f"Error fetching data: {e}")
        return []

def transform_match_data(matches):
    """
    Transform raw match data into a structured format.

    Args:
        matches: Iterable of dictionaries as produced by the extraction step.
            All keys are optional: 'title', 'date_time', 'series_info',
            'venue', 'result', 'match_url'.

    Returns:
        A list of dictionaries with the keys 'match_title', 'team1', 'team2',
        'series', 'venue', 'date', 'time', 'result' and 'match_url'.
        - 'date' / 'time' are formatted as YYYY-MM-DD / HH:MM when the input
          matches '%b %d, %Y, %I:%M %p'; otherwise 'date' carries the raw
          input string and 'time' is empty.
        - 'team1' / 'team2' are the two sides of the first standalone "vs"
          (or "vs.") in the title, or empty strings when there is none.
        - 'result' defaults to 'Upcoming' when no result was scraped.
    """
    # Split on "vs" only as a whole word so team names that merely contain
    # the letters "vs" are not cut apart; an optional trailing dot is consumed.
    team_separator = re.compile(r'\bvs\b\.?')

    transformed = []

    for match in matches:
        # Parse date and time
        date_str = match.get('date_time', '')
        try:
            match_date = datetime.strptime(date_str, '%b %d, %Y, %I:%M %p')
            formatted_date = match_date.strftime('%Y-%m-%d')
            formatted_time = match_date.strftime('%H:%M')
        except ValueError:
            # Unparseable (or missing) date: keep the raw text so no
            # information is lost, and leave the time blank.
            formatted_date = date_str
            formatted_time = ''

        # Extract teams from title
        title = match.get('title', '')
        teams = [team.strip() for team in team_separator.split(title, maxsplit=1)]
        if len(teams) < 2:
            # No separator found: leave both team fields empty.
            teams = []
        # NOTE(review): any text following the second team name in the title
        # (e.g. a match-number suffix, if the site includes one) stays in
        # team2 - confirm against real titles before relying on it.

        # Create transformed record
        record = {
            'match_title': title,
            'team1': teams[0] if len(teams) > 0 else '',
            'team2': teams[1] if len(teams) > 1 else '',
            'series': match.get('series_info', ''),
            'venue': match.get('venue', ''),
            'date': formatted_date,
            'time': formatted_time,
            'result': match.get('result', 'Upcoming'),
            'match_url': match.get('match_url', '')
        }
        transformed.append(record)

    return transformed

def load_to_csv(data, filename='ipl_matches.csv'):
    """
    Write the transformed records to a CSV file.

    Args:
        data: Records accepted by the pandas DataFrame constructor
            (the pipeline passes a list of dictionaries).
        filename: Destination path of the CSV file.

    The DataFrame index is not written. A confirmation message is printed
    once the file has been saved.
    """
    # Build the frame and write it in one step; index=False omits the row index.
    pd.DataFrame(data).to_csv(filename, index=False)
    print(f"Data successfully saved to {filename}")

def main():
    """
    Run the extract -> transform -> load pipeline end to end.

    Progress is reported with print statements. When extraction yields no
    matches the pipeline stops before the load step, so an existing CSV is
    not overwritten with an empty file and no success message is printed.
    """
    print("Starting IPL Match Data ETL Pipeline...")

    # Extract data
    print("Extracting match data from Cricbuzz...")
    raw_matches = extract_ipl_match_data()
    print(f"Found {len(raw_matches)} matches")

    # The extractor returns an empty list both when the request fails and
    # when no match cards are found; either way there is nothing to load.
    if not raw_matches:
        print("No match data extracted; skipping transform and load.")
        return

    # Transform data
    print("Transforming data...")
    transformed_data = transform_match_data(raw_matches)

    # Load data
    print("Loading data to CSV...")
    load_to_csv(transformed_data)

    print("ETL Pipeline completed successfully!")

# Entry point: run the pipeline when this code is executed directly (as a
# script or a notebook cell), but not when it is imported as a module.
if __name__ == "__main__":
    main()

Starting IPL Match Data ETL Pipeline...
Extracting match data from Cricbuzz...
Found 74 matches
Transforming data...
Loading data to CSV...
Data successfully saved to ipl_matches.csv
ETL Pipeline completed successfully!
