In [8]:
import import_ipynb
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from fuzzywuzzy import fuzz
import re
from bs4 import BeautifulSoup
import time

# Load Dataframe:

In [9]:
from P01_Pre_Processing import matches
from P01_Pre_Processing import deliveries
from P03_Imputation import unmatched_all_matches

# Scraping:

In [10]:
def init_driver():
    """Create a Chrome WebDriver.

    The value returned by ``ChromeDriverManager().install()`` is handed to a
    Selenium ``Service``, which is then used to start ``webdriver.Chrome``.
    The caller owns the returned driver and is responsible for quitting it.
    """
    chromedriver_location = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(chromedriver_location))

In [11]:
def get_match_url(season: int, team1: str, team2: str, date: str):
    """
    Build the ESPN Cricinfo fixtures-and-results URL for an IPL season.

    The URL depends only on ``season``. ``team1``, ``team2`` and ``date`` are
    echoed in the progress message so the log shows which match is being
    looked up; they do not influence the returned URL.

    Parameters
    ----------
    season : int
        IPL season year (2008-2024). A numeric string such as ``"2008"`` is
        accepted as well.
    team1, team2 : str
        Team names, used for logging only.
    date : str
        Match date, used for logging only.

    Returns
    -------
    str
        URL of the season's ``match-schedule-fixtures-and-results`` page.

    Raises
    ------
    ValueError
        If ``season`` has no entry in the season table below.
    """
    # Season → (year label used in the URL slug, ESPN Cricinfo series ID).
    # One table instead of two parallel dicts, so a season can never have an
    # ID without a label. Older seasons use a year-range label (e.g. 2007-08).
    season_info = {
        2008: ("2007-08", 313494),
        2009: ("2008-09", 374163),
        2010: ("2009-10", 418064),
        2011: ("2011", 466304),
        2012: ("2012", 520932),
        2013: ("2013", 586733),
        2014: ("2014", 695871),
        2015: ("2015", 791129),
        2016: ("2016", 968923),
        2017: ("2017", 1078425),
        2018: ("2018", 1131611),
        2019: ("2019", 1165643),
        2020: ("2020", 1210595),
        2021: ("2021", 1249214),
        2022: ("2022", 1298423),
        2023: ("2023", 1336037),
        2024: ("2024", 1415619),
    }

    # Seasons read from text/CSV arrive as strings; normalise "2008" -> 2008.
    season_key = season
    if isinstance(season, str):
        try:
            season_key = int(season.strip())
        except ValueError:
            pass  # not numeric: handled by the unknown-season check below

    # Validate season
    if season_key not in season_info:
        raise ValueError(f"❌ Unknown IPL season: {season}. Please update mappings.")

    label, series_id = season_info[season_key]

    # Construct fixtures page URL
    fixtures_url = (
        "https://www.espncricinfo.com/series/"
        f"indian-premier-league-{label}-{series_id}/match-schedule-fixtures-and-results"
    )

    print(f"🔎 Searching match: {season} — {team1} vs {team2} ({date})")
    print(f"📜 Fixtures URL: {fixtures_url}\n")

    return fixtures_url


In [12]:
# Smoke-check the URL builder with one concrete fixture.
url = get_match_url(
    season=2008,
    team1="Delhi Capitals",
    team2="Kolkata Knight Riders",
    date="2008-05-22",
)
print(url)

üîé Searching match: 2008 ‚Äî Delhi Capitals vs Kolkata Knight Riders (2008-05-22)
üìú Fixtures URL: https://www.espncricinfo.com/series/indian-premier-league-2007-08-313494/match-schedule-fixtures-and-results

https://www.espncricinfo.com/series/indian-premier-league-2007-08-313494/match-schedule-fixtures-and-results


In [13]:
# Display the frame imported from P03_Imputation; presumably the fixtures that
# could not be paired with delivery data (verify against P03_Imputation).
unmatched_all_matches

Unnamed: 0,Season,Match_No,Date,match_key,_merge
46,2008,Match 47,2008-05-22,"(Delhi Capitals, Kolkata Knight Riders)",left_only
65,2009,Match 7,2009-04-21,"(Mumbai Indians, Rajasthan Royals)",left_only
71,2009,Match 13,2009-04-25,"(Chennai Super Kings, Kolkata Knight Riders)",left_only
197,2011,Match 20,2011-04-19,"(Rajasthan Royals, Royal Challengers Bangalore)",left_only
283,2012,Match 32,2012-04-24,"(Kolkata Knight Riders, Sunrisers Hyderabad)",left_only
285,2012,Match 34,2012-04-25,"(Chennai Super Kings, Royal Challengers Bangal...",left_only
488,2015,Match 25,2015-04-26,"(Kolkata Knight Riders, Rajasthan Royals)",left_only
612,2017,Match 29,2017-04-25,"(Royal Challengers Bangalore, Sunrisers Hydera...",left_only
1094,2024,Match 63,2024-05-13,"(Gujarat Titans, Kolkata Knight Riders)",left_only
1097,2024,Match 66,2024-05-16,"(Gujarat Titans, Sunrisers Hyderabad)",left_only


In [14]:
def find_match_link_by_number(driver, fixtures_url, match_no, wait_seconds=3):
    """
    Locate a match on the Cricinfo fixtures page by its match number and
    return the match URL.

    Parameters
    ----------
    driver
        Selenium WebDriver used to load and query the page.
    fixtures_url : str
        Season fixtures page to open.
    match_no : str
        Label of the wanted match, e.g. ``'Match 47'``.
    wait_seconds : float, optional
        Pause after loading the page before it is queried (default 3).

    Returns
    -------
    str or None
        ``href`` of the first link inside the first card whose text names the
        requested match, or ``None`` when no such card/link exists.

    Notes
    -----
    The label is matched as a whole number, so ``'Match 7'`` no longer matches
    cards for ``'Match 70'``..``'Match 79'``. Both ``'Match 7'`` and the ordinal
    form ``'7th Match'`` are accepted.
    NOTE(review): Cricinfo cards appear to use the ordinal form; confirm
    against the live page, together with the card CSS selector.
    """
    driver.get(fixtures_url)
    time.sleep(wait_seconds)  # crude pause to let the page render

    label = (match_no or "").strip()
    if not label:
        # An empty label would match every card; treat it as "not found".
        print(f"❌ {match_no} not found on page.")
        return None

    numbered = re.fullmatch(r"match\s*0*(\d+)", label, flags=re.IGNORECASE)
    if numbered:
        number = numbered.group(1)
        # \b on both sides keeps "7" from matching inside "70" or "17".
        pattern = (
            rf"\bmatch\s+0*{number}\b"
            rf"|\b0*{number}(?:st|nd|rd|th)\s+match\b"
        )
    else:
        # Non-numbered labels (e.g. "Final") are matched as a whole phrase.
        pattern = rf"(?<!\w){re.escape(label)}(?!\w)"
    label_re = re.compile(pattern, flags=re.IGNORECASE)

    # Named `cards` so the notebook-level `matches` DataFrame is not shadowed.
    cards = driver.find_elements(By.CSS_SELECTOR, "div.ds-px-4.ds-py-3")

    for card in cards:
        try:
            if not label_re.search(card.text):
                continue
            anchors = card.find_elements(By.TAG_NAME, "a")
            link = anchors[0].get_attribute("href") if anchors else None
        except Exception as e:
            # Best effort: a broken card must not abort the whole search.
            print(f"⚠️ Link extraction failed for {match_no}: {e}")
            continue

        if link:
            print(f"✅ Found {match_no}: {link}")
            return link
        print(f"⚠️ Link extraction failed for {match_no}: matching card has no link")

    print(f"❌ {match_no} not found on page.")
    return None


In [15]:
def get_match_details(driver, match_url, wait_seconds=3):
    """
    Scrape basic match details from a specific match page.

    Parameters
    ----------
    driver
        Selenium WebDriver used to load and query the page.
    match_url : str
        URL of the match page.
    wait_seconds : float, optional
        Pause after loading the page before it is queried (default 3).

    Returns
    -------
    dict
        Up to three keys, each the text of the first element matching its
        selector: ``"Match"`` (page ``<h1>`` title), ``"Venue"`` and
        ``"Result"``. A key is omitted when its element is not found.
        NOTE(review): ``"Venue"`` is simply the first ``.ds-text-tight-s``
        element; confirm that it really is the venue line.

    Each field is looked up independently, so one missing element no longer
    prevents the remaining fields from being collected.
    """
    driver.get(match_url)
    time.sleep(wait_seconds)  # crude pause to let the page render

    field_selectors = (
        ("Match", "h1.ds-text-title-lg"),
        ("Venue", ".ds-text-tight-s"),
        ("Result", ".ds-text-tight-m.ds-font-regular"),
    )

    data = {}
    for field, selector in field_selectors:
        try:
            found = driver.find_elements(By.CSS_SELECTOR, selector)
            if found:
                data[field] = found[0].text
            else:
                print(f"⚠️ Error extracting details: no element for {field} ({selector})")
        except Exception as e:
            # Best effort: report and carry on with the other fields.
            print("⚠️ Error extracting details:", e)

    return data

In [16]:
def get_ball_by_ball(driver, match_url, wait_seconds=3):
    """
    Open a match's live-score page, switch to the "Commentary" tab and
    collect the commentary rows currently present in the page.

    Parameters
    ----------
    driver
        Selenium WebDriver used to load and query the page.
    match_url : str
        Match URL; ``/live-cricket-score`` is appended unless the URL already
        ends with it (a trailing slash is ignored).
    wait_seconds : float, optional
        Pause after each navigation/click before querying (default 3).

    Returns
    -------
    list of dict
        One ``{"Over": ..., "Commentary": ...}`` dict per row that contains an
        over label; ``[]`` when the Commentary link cannot be found/clicked.
        NOTE(review): only rows present in the DOM at query time are read;
        whether that covers every delivery of the match is not verified here.
    """
    # Strip the slash BEFORE testing the suffix; otherwise
    # ".../live-cricket-score/" gets the segment appended a second time.
    match_url = match_url.rstrip("/")
    if not match_url.endswith("live-cricket-score"):
        match_url += "/live-cricket-score"

    driver.get(match_url)
    time.sleep(wait_seconds)

    # Open "Commentary" tab
    try:
        comm_link = driver.find_element(By.XPATH, "//a[contains(text(),'Commentary')]")
        comm_link.click()
        time.sleep(wait_seconds)
    except Exception:
        print("⚠️ Couldn't find commentary tab.")
        return []

    balls = []
    # Named `rows` so the notebook-level `deliveries` DataFrame is not shadowed.
    rows = driver.find_elements(By.CSS_SELECTOR, "div.ds-flex.ds-items-center.ds-min-w-0.ds-mb-1")
    for row in rows:
        try:
            over = row.find_element(By.CSS_SELECTOR, "div.ds-text-tight-xs").text
            balls.append({"Over": over, "Commentary": row.text})
        except Exception:
            # Rows without an over label are skipped on purpose.
            continue

    print(f"🎯 Extracted {len(balls)} deliveries.")
    return balls

In [17]:
def fetch_match_data_by_number(season, match_no, team1, team2, date):
    """
    Fetch match data (summary + ball-by-ball) using the match number.

    Parameters
    ----------
    season, team1, team2, date
        Forwarded to ``get_match_url`` to build the season fixtures URL.
    match_no : str
        Match label (e.g. ``'Match 47'``) forwarded to
        ``find_match_link_by_number``.

    Returns
    -------
    dict or None
        ``{"details": <get_match_details result>, "balls": <get_ball_by_ball
        result>}``, or ``None`` when the match link is not found.

    Raises
    ------
    ValueError
        Propagated from ``get_match_url`` for an unknown season.

    A new browser is started for each call and is always closed again, even
    when one of the scraping steps raises.
    """
    # Resolve the URL first: an unknown season raises ValueError, and that
    # must not happen while a freshly started browser is left open.
    fixtures_url = get_match_url(season, team1, team2, date)

    driver = init_driver()
    try:
        match_url = find_match_link_by_number(driver, fixtures_url, match_no)
        if not match_url:
            return None

        match_details = get_match_details(driver, match_url)
        ball_by_ball = get_ball_by_ball(driver, match_url)
        return {"details": match_details, "balls": ball_by_ball}
    finally:
        # Runs on success, on "not found" and on any exception.
        driver.quit()

In [19]:
# Fixtures to scrape, as (season, match number, team1, team2, date).
# These are three of the rows listed in `unmatched_all_matches`.
unmatched_matches = [
    (2008, "Match 47", "Delhi Capitals", "Kolkata Knight Riders", "2008-05-22"),
    (2009, "Match 7", "Mumbai Indians", "Rajasthan Royals", "2009-04-21"),
    (2024, "Match 63", "Gujarat Titans", "Kolkata Knight Riders", "2024-05-13"),
]

# Keep every successful scrape, keyed by (season, match_no). Without this only
# the last iteration's `result` survives the loop and the rest is lost.
scraped_matches = {}

for season, match_no, team1, team2, date in unmatched_matches:
    print(f"\n{'='*60}\nFetching {match_no} ({season}) — {team1} vs {team2}")
    result = fetch_match_data_by_number(season, match_no, team1, team2, date)
    if result:
        scraped_matches[(season, match_no)] = result
        print(result["details"])
        print(f"Ball count: {len(result['balls'])}")



Fetching Match 47 (2008) ‚Äî Delhi Capitals vs Kolkata Knight Riders
üîé Searching match: 2008 ‚Äî Delhi Capitals vs Kolkata Knight Riders (2008-05-22)
üìú Fixtures URL: https://www.espncricinfo.com/series/indian-premier-league-2007-08-313494/match-schedule-fixtures-and-results

‚ùå Match 47 not found on page.

Fetching Match 7 (2009) ‚Äî Mumbai Indians vs Rajasthan Royals
üîé Searching match: 2009 ‚Äî Mumbai Indians vs Rajasthan Royals (2009-04-21)
üìú Fixtures URL: https://www.espncricinfo.com/series/indian-premier-league-2008-09-374163/match-schedule-fixtures-and-results

‚ùå Match 7 not found on page.

Fetching Match 63 (2024) ‚Äî Gujarat Titans vs Kolkata Knight Riders
üîé Searching match: 2024 ‚Äî Gujarat Titans vs Kolkata Knight Riders (2024-05-13)
üìú Fixtures URL: https://www.espncricinfo.com/series/indian-premier-league-2024-1415619/match-schedule-fixtures-and-results

‚ùå Match 63 not found on page.
