What we need is to get the unique combo of days, or I suppose we don[…]

In [12]:
import os
import re
import time
import pandas as pd
from datetime import datetime, timedelta
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Lookup table from the team label found in the scraped "Team" column to the
# abbreviation stored alongside it. One entry per line so additions and
# renames show up as single-line diffs.
team_name_to_abbr = {
    "Chi Cubs": "CHC",
    "NY Yankees": "NYY",
    "San Diego": "SDP",
    "St. Louis": "STL",
    "LA Dodgers": "LAD",
    "Boston": "BOS",
    "Miami": "MIA",
    "Detroit": "DET",
    "Sacramento": "OAK",
    "NY Mets": "NYM",
    "Philadelphia": "PHI",
    "Cincinnati": "CIN",
    "Seattle": "SEA",
    "Arizona": "ARI",
    "Tampa Bay": "TBR",
    "Milwaukee": "MIL",
    "Atlanta": "ATL",
    "Toronto": "TOR",
    "Houston": "HOU",
    "Washington": "WSN",
    "Minnesota": "MIN",
    "Cleveland": "CLE",
    "Kansas City": "KCR",
    "SF Giants": "SFG",
    "Baltimore": "BAL",
    "Texas": "TEX",
    "Pittsburgh": "PIT",
    "Chi Sox": "CHW",
    "LA Angels": "LAA",
    "Colorado": "COL",
}

# Directory that receives the per-date CSV files. Created up front;
# exist_ok=True makes re-running the cell harmless when it already exists.
output_folder = "batting_data"
os.makedirs(name=output_folder, exist_ok=True)

# Launch Chrome through Selenium with no visible window, GPU use disabled,
# and an explicit desktop-Chrome user-agent string.
options = Options()
for chrome_flag in (
    "--headless",
    "--disable-gpu",
    (
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    ),
):
    options.add_argument(chrome_flag)
driver = webdriver.Chrome(options=options)

# Scrape window: May 20 through August 20, 2022, advanced one day at a time.
# Both endpoints are scraped (the loop below compares with <=).
start_date = datetime(2022, 5, 20)
end_date = datetime(2022, 8, 20)
delta = timedelta(days=1)

# Walk the date range one day at a time, load that day's batting-average page,
# pull (team, batting average) pairs out of the HTML, and write one CSV per
# date into output_folder. A failure on one date is reported and skipped so
# the remaining dates are still attempted.

# Matches one table row: captures the link text of the left-aligned team cell
# and the data-sort value of the first right-aligned cell that follows it.
# Compiled once here because it is identical for every date.
pattern = re.compile(
    r'<td class="text-left nowrap" data-sort="[^"]+">\s*<a[^>]*>([^<]+)</a>.*?'
    r'<td class="text-right" data-sort="([\d.]+)">',
    re.DOTALL
)

current_date = start_date
try:
    while current_date <= end_date:
        date_str = current_date.strftime("%Y-%m-%d")
        print(f"Processing {date_str}...")

        url = f"https://www.teamrankings.com/mlb/stat/batting-average?date={date_str}"
        try:
            driver.get(url)
            # Fixed pause before reading the page source; presumably gives the
            # page time to finish rendering.
            time.sleep(3)

            html = driver.page_source

            # Extract team and first batting average value
            matches = pattern.findall(html)

            if matches:
                df = pd.DataFrame(matches, columns=["Team", "Batting Average"])
                df["Batting Average"] = df["Batting Average"].astype(float)
                df["Date"] = date_str
                df["Team Abbreviation"] = df["Team"].map(team_name_to_abbr)

                # Series.map leaves NaN for labels missing from the mapping;
                # surface them instead of silently writing blank abbreviations.
                unmapped = sorted(
                    df.loc[df["Team Abbreviation"].isna(), "Team"].unique()
                )
                if unmapped:
                    print(f"⚠️ Unmapped team names on {date_str}: {', '.join(unmapped)}")

                # Save to CSV
                df.to_csv(os.path.join(output_folder, f"{date_str}.csv"), index=False)
            else:
                print(f"⚠️ No data found for {date_str}")

        except Exception as e:
            # Best-effort per date: report the problem and move on.
            print(f"❌ Error on {date_str}: {e}")

        current_date += delta
finally:
    # Always shut the browser down, even if the run is interrupted (e.g.
    # Ctrl-C), so no headless Chrome process is left behind.
    driver.quit()

print("✅ Done scraping all dates.")


Processing 2022-05-20...
Processing 2022-05-21...
Processing 2022-05-22...
Processing 2022-05-23...
Processing 2022-05-24...
Processing 2022-05-25...
Processing 2022-05-26...
Processing 2022-05-27...
Processing 2022-05-28...
Processing 2022-05-29...
Processing 2022-05-30...
Processing 2022-05-31...
Processing 2022-06-01...
Processing 2022-06-02...
Processing 2022-06-03...
Processing 2022-06-04...
Processing 2022-06-05...
Processing 2022-06-06...
Processing 2022-06-07...
Processing 2022-06-08...
Processing 2022-06-09...
Processing 2022-06-10...
Processing 2022-06-11...
Processing 2022-06-12...
Processing 2022-06-13...
Processing 2022-06-14...
Processing 2022-06-15...
Processing 2022-06-16...
Processing 2022-06-17...
Processing 2022-06-18...
Processing 2022-06-19...
Processing 2022-06-20...
Processing 2022-06-21...
Processing 2022-06-22...
Processing 2022-06-23...
Processing 2022-06-24...
Processing 2022-06-25...
Processing 2022-06-26...
Processing 2022-06-27...
Processing 2022-06-28...
