# Pull pitchers from MLB

In [1]:
# Add these imports if they are missing in your first cell
# Standard library imports
import asyncio
import os
from io import StringIO
import traceback # For detailed error reporting

# Third-party imports
import nest_asyncio
import pandas as pd
from bs4 import BeautifulSoup
from scrapling import StealthyFetcher # Make sure this is imported

## Scrape function

In [2]:
import re # Make sure re is imported, add 'import re' to your imports cell if needed

async def scrape_probable_pitchers(url="https://www.mlb.com/probable-pitchers"):
    """
    Scrapes probable pitcher names, IDs, opponents, and home/away status
    from the MLB probable pitchers page.
    Returns a list of dictionaries.
    """
    print(f"Attempting to fetch: {url}")
    results = []
    try:
        fetcher = StealthyFetcher()
        page = await fetcher.async_fetch(url)

        if not page or not page.html_content:
             print(f"Failed to fetch content from {url}. Status: {page.status_code if page else 'N/A'}")
             return []

        print("Successfully fetched page content.")
        soup = BeautifulSoup(page.html_content, 'html.parser')

        game_matchups = soup.select('div.probable-pitchers__matchup')

        if not game_matchups:
            print("Could not find game matchup containers using selector 'div.probable-pitchers__matchup'.")
            return []

        print(f"Found {len(game_matchups)} game matchups.")
        processed_count = 0

        for i, game in enumerate(game_matchups):
            try:
                away_team_element = game.select_one('span.probable-pitchers__team-name--away')
                home_team_element = game.select_one('span.probable-pitchers__team-name--home')
                pitcher_name_links = game.select('div.probable-pitchers__pitchers div.probable-pitchers__pitcher-name a')

                missing = []
                if not away_team_element: missing.append("Away Team")
                if not home_team_element: missing.append("Home Team")
                if len(pitcher_name_links) < 2:
                    if len(pitcher_name_links) == 1: missing.append("Second Pitcher Link")
                    else: missing.append("Both Pitcher Links")

                if missing:
                    print(f"Skipping matchup {i+1}: Couldn't find -> {', '.join(missing)}")
                    continue

                away_team = away_team_element.get_text(strip=True)
                home_team = home_team_element.get_text(strip=True)

                # --- Extract Pitcher 1 (Away) Info ---
                pitcher1_link = pitcher_name_links[0]
                pitcher1_name = pitcher1_link.get_text(strip=True)
                pitcher1_href = pitcher1_link.get('href', '')
                # Extract ID from href (e.g., /player/carlos-carrasco-471911 -> carlos-carrasco-471911)
                pitcher1_id_match = re.search(r'/player/([^/]+)$', pitcher1_href)
                pitcher1_id = pitcher1_id_match.group(1) if pitcher1_id_match else None

                # --- Extract Pitcher 2 (Home) Info ---
                pitcher2_link = pitcher_name_links[1]
                pitcher2_name = pitcher2_link.get_text(strip=True)
                pitcher2_href = pitcher2_link.get('href', '')
                pitcher2_id_match = re.search(r'/player/([^/]+)$', pitcher2_href)
                pitcher2_id = pitcher2_id_match.group(1) if pitcher2_id_match else None


                # Assign opponents and home/away status
                if pitcher1_name and pitcher1_name != "TBD" and pitcher1_id:
                    results.append({
                        'name': pitcher1_name,
                        'player_id': pitcher1_id, # Add player ID
                        'opponent': home_team,
                        'at_home': 0
                    })

                if pitcher2_name and pitcher2_name != "TBD" and pitcher2_id:
                     results.append({
                         'name': pitcher2_name,
                         'player_id': pitcher2_id, # Add player ID
                         'opponent': away_team,
                         'at_home': 1
                     })

                processed_count += 1

            except Exception as e:
                print(f"Error processing matchup {i+1}: {e}")
                continue

        print(f"Finished processing matchups. Successfully extracted {len(results)} pitcher entries from {processed_count} fully processed matchups.")
        return results

    except ImportError:
        print("Required libraries (BeautifulSoup, scrapling, re) not found.")
        return []
    except Exception as e:
        print(f"An overall error occurred during scraping: {e}")
        print(traceback.format_exc())
        return []

## Run function

In [3]:
async def main_mlb():
    print("\nScraping MLB probable pitchers, IDs, opponents, and home/away status...")
    pitcher_data = await scrape_probable_pitchers() # Function now returns list of dicts with player_id
    if pitcher_data:
        print("\n--- Probable Pitchers Found ---")
        printed_names = set()
        for i, data in enumerate(pitcher_data):
             home_away_str = "Home" if data.get('at_home', 0) == 1 else "Away"
             player_id_str = data.get('player_id', 'N/A') # Get player ID, default to N/A if missing

             if data['name'] not in printed_names:
                 # Include player ID in the printout
                 print(f"{i+1}. {data['name']} ({player_id_str}) (vs {data['opponent']}, {home_away_str})")
                 printed_names.add(data['name'])
             else:
                  print(f"   (Duplicate entry skipped: {data['name']})")


        # --- Save to CSV ---
        try:
            if 'pd' in globals() or 'pd' in locals():
                df_pitchers = pd.DataFrame(pitcher_data)

                # Ensure columns exist
                expected_cols = ['name', 'player_id', 'opponent', 'at_home']
                for col in expected_cols:
                     if col not in df_pitchers.columns: df_pitchers[col] = []


                # Remove duplicates based on name
                df_pitchers = df_pitchers.drop_duplicates(subset=['name'], keep='first')

                output_filename = 'probable_pitchers_with_opponents.csv'
                # Specify column order including 'player_id'
                df_pitchers.to_csv(output_filename, index=False, columns=expected_cols)
                print(f"\nSuccessfully saved list to {output_filename}")
            else:
                print("\nWarning: pandas (pd) not imported. Cannot save to CSV.")
        except Exception as e:
             print(f"\nError saving to CSV: {e}")
             traceback.print_exc()
        # --------------------

    else:
        print("\nNo probable pitchers found or an error occurred during scraping.")

# --- Run the main_mlb async function (code to run it remains the same) ---
try:
    loop = asyncio.get_running_loop()
    print("Asyncio loop already running. Scheduling main_mlb task.")
    asyncio.ensure_future(main_mlb())
except RuntimeError:
    print("No asyncio loop running. Starting one with asyncio.run().")
    asyncio.run(main_mlb())

print("\nMLB scraping process initiated.")

Asyncio loop already running. Scheduling main_mlb task.

MLB scraping process initiated.



Scraping MLB probable pitchers, IDs, opponents, and home/away status...
Attempting to fetch: https://www.mlb.com/probable-pitchers


[2025-04-08 11:21:23] INFO: Fetched (200) <GET https://www.mlb.com/probable-pitchers> (referer: https://www.google.com/search?q=mlb)


Successfully fetched page content.
Found 15 game matchups.
Skipping matchup 13: Couldn't find -> Second Pitcher Link
Finished processing matchups. Successfully extracted 28 pitcher entries from 14 fully processed matchups.

--- Probable Pitchers Found ---
1. Carlos Carrasco (carlos-carrasco-471911) (vs Tigers, Away)
2. Tarik Skubal (tarik-skubal-669373) (vs Yankees, Home)
3. Shane Smith (shane-smith-681343) (vs Guardians, Away)
4. Ben Lively (ben-lively-594902) (vs White Sox, Home)
5. Connor Gillispie (connor-gillispie-687362) (vs Mets, Away)
6. Clay Holmes (clay-holmes-605280) (vs Marlins, Home)
7. Sonny Gray (sonny-gray-543243) (vs Pirates, Away)
8. Paul Skenes (paul-skenes-694973) (vs Cardinals, Home)
9. Easton Lucas (easton-lucas-687922) (vs Red Sox, Away)
10. Garrett Crochet (garrett-crochet-676979) (vs Blue Jays, Home)
11. Justin Wrobleski (justin-wrobleski-680736) (vs Nationals, Away)
12. Brad Lord (brad-lord-695418) (vs Dodgers, Home)
13. Kyle Hendricks (kyle-hendricks-543294) 