# Scraping Time for Every Match:

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import re
from bs4 import BeautifulSoup as bs
import pandas as pd

In [2]:
# Upper-case month abbreviation -> zero-padded month number.
_MONTH_NUMBERS = {
    'JAN': '01', 'FEB': '02', 'MAR': '03', 'APR': '04', 'MAY': '05', 'JUN': '06',
    'JUL': '07', 'AUG': '08', 'SEP': '09', 'OCT': '10', 'NOV': '11', 'DEC': '12',
}

# Clock time such as "7:30 pm" or "3:30PM"; compiled once instead of per card.
_TIME_PATTERN = re.compile(r'(\d{1,2}:\d{2}\s*[ap]m)', re.IGNORECASE)


def _parse_match_datetime(datetime_text, season):
    """Split a card's date/time text into ``(match_date, match_time)`` strings.

    The text is split on commas: the first field is looked up as a month
    abbreviation, the first all-digit token of the second field is the day,
    and the third field (if any) is searched for a clock time.

    Args:
        datetime_text: Raw text of the card's date/time element ("" if absent).
        season: Year used as the year component of the date.

    Returns:
        ``(match_date, match_time)`` where ``match_date`` is ``YYYY-MM-DD`` and
        ``match_time`` is the matched time text. Either is ``""`` when it
        cannot be parsed.
    """
    collapsed = re.sub(r'\s+', ' ', datetime_text)
    parts = [part.strip() for part in collapsed.split(',')]
    if len(parts) < 2:
        return "", ""

    # An unknown month yields an empty date rather than silently defaulting
    # to January, which would record a plausible-looking but wrong date.
    month_num = _MONTH_NUMBERS.get(parts[0].upper())
    day_num = next((token for token in parts[1].split() if token.isdigit()), None)
    if month_num and day_num:
        match_date = f"{season}-{month_num}-{day_num.zfill(2)}"
    else:
        match_date = ""

    time_match = _TIME_PATTERN.search(parts[2]) if len(parts) > 2 else None
    match_time = time_match.group(1) if time_match else ""
    return match_date, match_time


def scrape_time(season, driver):
    """Scrape match number, teams, date and time for one IPL season.

    Loads the season's results page in ``driver``, waits up to 20 seconds for
    a result list item to appear, then parses the rendered page source.

    Args:
        season: Season year; used in the URL, as the year of each match date,
            and as the value of the ``Season`` column.
        driver: Selenium WebDriver used to load the page.

    Returns:
        A DataFrame with columns ``Match``, ``Team 1``, ``Team 2``, ``Date``,
        ``Time`` and ``Season``, in the reverse of the order the cards appear
        on the page. An empty DataFrame is returned if the wait fails or no
        card could be parsed. Cards that raise while being parsed are logged
        and skipped.
    """
    url = f'https://www.iplt20.com/matches/results/{season}'
    print(f"Fetching IPL {season} results from {url} ...")

    driver.get(url)
    try:
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "li[ng-repeat*='resultList']"))
        )
    except Exception as e:
        # Best effort: any failure while waiting is reported and the season
        # is treated as having no data.
        print(f"⚠️ Timeout waiting for matches to load for season {season}: {e}")
        return pd.DataFrame()

    soup = bs(driver.page_source, 'html.parser')

    match_cards = soup.find_all("li", attrs={"ng-repeat": "list in resultList | orderBy:'-timestamp'"})
    print(f"✅ Found {len(match_cards)} match cards")

    matches_data = []

    for i, card in enumerate(match_cards, 1):
        try:
            order_element = card.find('span', class_='vn-matchOrder')
            match_number = order_element.get_text(strip=True) if order_element else f"Match {i}"

            team_names = [t.get_text(strip=True) for t in card.find_all('h3') if t.get_text(strip=True)]
            # NOTE(review): index 1 is skipped — presumably a non-team <h3>
            # sits between the two team names; verify against the page markup.
            team1, team2 = team_names[0], team_names[2]

            datetime_element = card.find('div', class_='vn-matchDateTime')
            datetime_text = datetime_element.get_text(strip=True) if datetime_element else ""
            match_date, match_time = _parse_match_datetime(datetime_text, season)

            matches_data.append({
                "Match": match_number,
                "Team 1": team1,
                "Team 2": team2,
                "Date": match_date,
                "Time": match_time,
                "Season": season
            })
        except Exception as e:
            # One malformed card must not abort the whole season.
            print(f"⚠️ Error processing match {i}: {e}")

    df = pd.DataFrame(matches_data)
    if not df.empty:
        # The cards come from a list ordered by '-timestamp'; flip that order.
        df = df.iloc[::-1].reset_index(drop=True)
    return df

In [3]:
def scrape_all_seasons(start_year=2008, end_year=2024, output_csv='../data/scrapped/all_matches.csv'):
    """Scrape every season in ``[start_year, end_year]`` and save one combined CSV.

    A single headless Chrome driver is created, reused for every season and
    always quit afterwards. A season that raises or returns no rows is logged
    and skipped; the remaining seasons are still processed.

    Args:
        start_year: First season to scrape (inclusive).
        end_year: Last season to scrape (inclusive).
        output_csv: Path of the CSV file to write. Missing parent directories
            are created.

    Returns:
        None. The combined data is written to ``output_csv``; nothing is
        written when no season produced data.
    """
    # Local import so this function does not depend on the file's import cell.
    from pathlib import Path

    # DataFrame.to_csv does not create missing directories, so make sure the
    # destination exists *before* scraping rather than failing at the very end.
    Path(output_csv).parent.mkdir(parents=True, exist_ok=True)

    all_data = []

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

    # Initialize driver once and reuse it for every season.
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    try:
        for season in range(start_year, end_year + 1):
            try:
                df = scrape_time(season, driver)
                if not df.empty:
                    all_data.append(df)
                    print(f"✅ Season {season} added ({len(df)} matches)")
                else:
                    print(f"⚠️ No data found for {season}")
            except Exception as e:
                # Best effort: one failing season must not stop the others.
                print(f"❌ Error scraping season {season}: {e}")
    finally:
        # Always release the browser, even if scraping was interrupted.
        driver.quit()

    if all_data:
        final_df = pd.concat(all_data, ignore_index=True)
        final_df.to_csv(output_csv, index=False)
        print(f"\n🎉 All IPL match data saved to '{output_csv}' ({len(final_df)} total matches)")
    else:
        print("No data collected!")

In [4]:
# Run the scrape for seasons 2008 through 2024 (inclusive); the combined CSV is
# written to the function's default output_csv path.
scrape_all_seasons(start_year=2008, end_year=2024)

Fetching IPL 2008 results from https://www.iplt20.com/matches/results/2008 ...
✅ Found 59 match cards
✅ Season 2008 added (59 matches)
Fetching IPL 2009 results from https://www.iplt20.com/matches/results/2009 ...
✅ Found 59 match cards
✅ Season 2009 added (59 matches)
Fetching IPL 2010 results from https://www.iplt20.com/matches/results/2010 ...
✅ Found 60 match cards
✅ Season 2010 added (60 matches)
Fetching IPL 2011 results from https://www.iplt20.com/matches/results/2011 ...
✅ Found 74 match cards
✅ Season 2011 added (74 matches)
Fetching IPL 2012 results from https://www.iplt20.com/matches/results/2012 ...
✅ Found 75 match cards
✅ Season 2012 added (75 matches)
Fetching IPL 2013 results from https://www.iplt20.com/matches/results/2013 ...
✅ Found 76 match cards
✅ Season 2013 added (76 matches)
Fetching IPL 2014 results from https://www.iplt20.com/matches/results/2014 ...
✅ Found 60 match cards
✅ Season 2014 added (60 matches)
Fetching IPL 2015 results f