In [1]:
# Standard library imports
import asyncio
import os
import sqlite3
from io import StringIO

# Third-party imports
import nest_asyncio
import pandas as pd
from bs4 import BeautifulSoup
from scrapling import StealthyFetcher

# Scrape one year

In [2]:
async def scrape_year(player_id, year):
    """Fetch one season of pitching game logs for a player from Baseball Reference.

    Args:
        player_id: Identifier substituted into the ``id=`` query parameter.
        year: Season substituted into the ``year=`` query parameter.

    Returns:
        A ``(df, player_name)`` tuple. ``df`` is the DataFrame parsed from the
        ``pitching_gamelogs`` table, tagged with ``year``, ``player_id`` and
        ``name`` columns when it has rows, or an empty list when the table is
        not on the page. ``player_name`` is the first two whitespace-separated
        words of the page title, or ``None`` when the page has no title.
    """
    url = f"https://www.baseball-reference.com/players/gl.fcgi?id={player_id}&t=p&year={year}"
    response = await StealthyFetcher().async_fetch(url)
    soup = BeautifulSoup(response.html_content, 'html.parser')

    # The name is taken from the leading two words of the <title> text.
    title_tag = soup.find('title')
    player_name = ' '.join(title_tag.text.split()[:2]) if title_tag else None

    table_tag = soup.find('table', id='pitching_gamelogs')
    if not table_tag:
        # Callers detect "no table" by checking for a list.
        return [], player_name

    df = pd.read_html(StringIO(str(table_tag)))[0]
    if not df.empty:
        df['year'] = year
        df['player_id'] = player_id
        df['name'] = player_name
    return df, player_name

# Scrape all years for one player

In [3]:
async def player_scrape(player_id, years=None):
    """Scrape and combine pitching game logs for a player across several seasons.

    Args:
        player_id: Identifier passed through to ``scrape_year``.
        years: Iterable of seasons to scrape. Defaults to 2021-2024 when
            omitted, which matches the previous hard-coded behaviour.

    Returns:
        A single DataFrame with every non-empty season concatenated
        (index reset), or an empty DataFrame when no season produced rows.
    """
    if years is None:
        years = [2021, 2022, 2023, 2024]

    # All seasons are requested concurrently; gather preserves input order.
    results = await asyncio.gather(*(scrape_year(player_id, year) for year in years))

    # scrape_year returns a list when the table is missing, so skip those
    # as well as DataFrames that parsed but contain no rows.
    dfs = [df for df, _ in results if not isinstance(df, list) and not df.empty]
    return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()

## Clean data

In [4]:
def cleanse_pitcher_game_logs(df):
    """Normalise a scraped pitching game-log frame for storage.

    Steps, in order:
      1. Drop columns that are not stored.
      2. Rename the remaining source headers to snake_case names.
      3. Replace ``road_indicator`` with ``road_*`` dummy columns.
      4. Remove rows whose ``team_id`` is missing or equals the literal
         header text ``"Tm"``.
      5. Move ``player_id``, ``name`` and ``year`` to the front.

    Every step is skipped when the column it needs is absent, so an empty
    frame (or one missing some columns) is returned cleaned as far as
    possible instead of raising ``KeyError``.

    Args:
        df: DataFrame of raw game logs.

    Returns:
        A new, cleaned DataFrame. The input is not modified.
    """
    rename_map = {
        'Rk': 'season_game_num',
        'Gcar': 'career_game_num',
        'Gtm': 'team_game_num',
        'Date': 'date',
        'Tm': 'team_id',
        'Unnamed: 5': 'road_indicator',
        'Opp': 'opp_id',
        # 'Rslt' and 'Inngs' are dropped below before the rename is applied,
        # so these two entries currently have no effect.
        'Rslt': 'game_result',
        'Inngs': 'innings',
        'DR': 'days_rest',
        'IP': 'ip',
        'H': 'h',
        'R': 'r',
        'ER': 'er',
        'BB': 'bb',
        'SO': 'so',
        'HR': 'hr',
        'HBP': 'hbp',
        'ERA': 'era',
        'FIP': 'fip',
        'BF': 'batters_faced',
        'Pit': 'pitches',
        'Str': 'strikes_total',
        'StL': 'strikes_looking',
        'StS': 'strikes_swinging',
        'GB': 'ground_balls',
        'FB': 'flyballs',
        'LD': 'line_drives',
        'PU': 'pop_ups',
        'GSc': 'game_score',
        'SB': 'sb',
        'CS': 'cs',
        'PO': 'pickoffs',
        'AB': 'ab',
        '2B': '2b',
        '3B': '3b',
        'IBB': 'ibb',
        'GDP': 'gidp',
        'SF': 'sf',
        'ROE': 'roe',
        'aLI': 'avg_leverage_index',
        'WPA': 'win_prob_added',
        'acLI': 'adjusted_cli_avg',
        'cWPA': 'champ_win_prob_added',
        'RE24': 'base_out_run_saved'
    }
    unwanted = ['Dec', 'IR', 'IS', 'Unk', 'DFS(DK)', 'DFS(FD)',
                'Entered', 'Exited', 'Rslt', 'Inngs']

    # Drop unneeded columns and apply the column mapping.
    df = df.drop(columns=unwanted, errors='ignore').rename(columns=rename_map)

    # Convert road_indicator to dummy variables (only when it was scraped).
    if 'road_indicator' in df.columns:
        df = pd.concat(
            [df.drop('road_indicator', axis=1),
             pd.get_dummies(df['road_indicator'], prefix='road')],
            axis=1,
        )

    # Filter out rows with no team or with the repeated header text.
    if 'team_id' in df.columns:
        is_real_row = df['team_id'].notna() & (df['team_id'] != "Tm")
        df = df[is_real_row]

    # Put the identifying columns first, keeping only those that exist.
    id_columns = ['player_id', 'name', 'year']
    leading = [col for col in id_columns if col in df.columns]
    trailing = [col for col in df.columns if col not in id_columns]
    return df[leading + trailing]

# Store Data

In [5]:
def save_pitcher_logs(df, db_path="baseball.db"):
    """Append cleaned game logs and player info to a SQLite database.

    Rows are appended to the ``pitching_gamelogs`` table, and the distinct
    ``(player_id, name)`` pairs are appended to ``scraped_players``.

    Args:
        df: DataFrame of game logs; must contain ``player_id`` and ``name``.
        db_path: Path of the SQLite file. Missing parent directories are
            created.

    Returns:
        None. Prints a completion line, or a notice when there is nothing
        to save.
    """
    # Nothing to write: avoid creating empty tables and the IndexError that
    # `df['name'].iloc[0]` would raise on an empty frame.
    if df is None or df.empty:
        print("No pitcher game logs to save")
        return

    db_path = os.path.abspath(db_path)

    # Create the directory if it doesn't exist
    os.makedirs(os.path.dirname(db_path) or '.', exist_ok=True)

    conn = sqlite3.connect(db_path)
    try:
        # Save game logs
        df.to_sql("pitching_gamelogs", conn, if_exists='append', index=False)

        # Save player info
        player_info = df[['player_id', 'name']].drop_duplicates()
        player_info.to_sql("scraped_players", conn, if_exists='append', index=False)
    finally:
        # Always release the connection, even when a write fails.
        conn.close()

    print(f"{df['name'].iloc[0]} finished")


# Put it all together


In [6]:
async def scrape_all_players(filepath):
    """Scrape, clean and store game logs for every player listed in a CSV.

    Players are processed one at a time. A player whose scrape (or cleaning)
    yields no rows is reported and skipped so the rest of the batch still runs.

    Args:
        filepath: Path to a CSV file with a ``player_id`` column.
    """
    player_ids = pd.read_csv(filepath)["player_id"]
    for player_id in player_ids:
        print(player_id)
        raw_logs = await player_scrape(player_id)
        if raw_logs.empty:
            print(f"No game logs found for {player_id}; skipping")
            continue

        clean_logs = cleanse_pitcher_game_logs(raw_logs)
        if clean_logs.empty:
            print(f"No usable game logs for {player_id} after cleaning; skipping")
            continue

        save_pitcher_logs(clean_logs)


# Run it on a CSV file

In [None]:
# Jupyter already runs an event loop; nest_asyncio.apply() is called first so
# that the asyncio.run() call below can be used from inside the notebook.
nest_asyncio.apply()

# Path to the CSV of players to scrape. scrape_all_players reads its
# "player_id" column, so the file must contain one.
csv_filepath = "test_ids.csv"  # Adjust if it's in a different directory

# Drive the whole scrape -> clean -> save pipeline to completion.
asyncio.run(scrape_all_players(csv_filepath))


# Get probable pitchers

In [3]:
# Add these imports if they are missing in your first cell
# Standard library imports
import asyncio
import os
from io import StringIO
import traceback # For detailed error reporting

# Third-party imports
import nest_asyncio
import pandas as pd
from bs4 import BeautifulSoup
from scrapling import StealthyFetcher # Make sure this is imported

In [16]:
async def scrape_probable_pitchers(url="https://www.mlb.com/probable-pitchers"):
    """
    Fetch the probable-pitchers page and build one record per named starter.

    Each matchup container is expected to hold an away team name, a home team
    name and at least two pitcher-name links; the first link is treated as the
    away pitcher and the second as the home pitcher. Matchups missing any of
    those pieces are reported and skipped, and pitchers whose name is empty
    or "TBD" are left out.

    Args:
        url: Page to fetch.

    Returns:
        A list of dicts with keys 'name', 'opponent' and 'at_home' (0 for the
        away pitcher, 1 for the home pitcher). An empty list is returned when
        the page has no content, no matchup containers are found, or an
        error is raised while scraping.
    """
    print(f"Attempting to fetch: {url}")
    results = []
    try:
        page = await StealthyFetcher().async_fetch(url)

        if not (page and page.html_content):
            print(f"Failed to fetch content from {url}. Status: {page.status_code if page else 'N/A'}")
            return []

        print("Successfully fetched page content.")
        soup = BeautifulSoup(page.html_content, 'html.parser')

        # One container per game.
        game_matchups = soup.select('div.probable-pitchers__matchup')
        if not game_matchups:
            print("Could not find game matchup containers using selector 'div.probable-pitchers__matchup'.")
            return []

        print(f"Found {len(game_matchups)} game matchups.")
        processed_count = 0

        for number, matchup in enumerate(game_matchups, start=1):
            try:
                away_tag = matchup.select_one('span.probable-pitchers__team-name--away')
                home_tag = matchup.select_one('span.probable-pitchers__team-name--home')
                # Links holding the pitcher names inside the pitchers section.
                name_links = matchup.select('div.probable-pitchers__pitchers div.probable-pitchers__pitcher-name a')

                # Collect a label for every piece that could not be located.
                not_found = []
                if not away_tag:
                    not_found.append("Away Team")
                if not home_tag:
                    not_found.append("Home Team")
                link_total = len(name_links)
                if link_total == 1:
                    not_found.append("Second Pitcher")  # one found, other missing
                elif link_total == 0:
                    not_found.append("Both Pitchers")  # none found

                if not_found:
                    print(f"Skipping matchup {number}: Couldn't find -> {', '.join(not_found)}")
                    continue

                away_team = away_tag.get_text(strip=True)
                home_team = home_tag.get_text(strip=True)

                # (pitcher name, opponent, at_home) — away starter first.
                starters = (
                    (name_links[0].get_text(strip=True), home_team, 0),
                    (name_links[1].get_text(strip=True), away_team, 1),
                )
                for pitcher_name, opponent, at_home in starters:
                    if pitcher_name and pitcher_name != "TBD":
                        results.append({'name': pitcher_name, 'opponent': opponent, 'at_home': at_home})

                processed_count += 1

            except Exception as e:
                # A failure in one matchup should not stop the others.
                print(f"Error processing matchup {number}: {e}")
                continue

        print(f"Finished processing matchups. Successfully extracted {len(results)} pitcher entries from {processed_count} fully processed matchups.")
        return results

    except ImportError:
        print("Required libraries (BeautifulSoup, scrapling) not found. Please install them using uv: `uv pip install beautifulsoup4 scrapling`")
        return []
    except Exception as e:
        print(f"An overall error occurred during scraping: {e}")
        print(traceback.format_exc())
        return []

In [17]:
async def main_mlb(output_filename='probable_pitchers_with_opponents.csv'):
    """Scrape probable pitchers, print them, and save them to a CSV file.

    Each scraped entry is printed with its 1-based position in the scraped
    list; a repeated name is reported as a duplicate rather than printed
    again. The CSV keeps the first entry per name and has the columns
    ``name``, ``opponent`` and ``at_home`` in that order.

    Args:
        output_filename: Destination CSV path. Defaults to the filename the
            notebook has always used.

    Returns:
        None. Errors while writing the CSV are printed, not raised.
    """
    print("\nScraping MLB probable pitchers, opponents, and home/away status...")
    pitcher_data = await scrape_probable_pitchers()
    if not pitcher_data:
        print("\nNo probable pitchers found or an error occurred during scraping.")
        return

    print("\n--- Probable Pitchers Found ---")
    printed_names = set()
    for i, data in enumerate(pitcher_data):
        home_away_str = "Home" if data.get('at_home', 0) == 1 else "Away"
        if data['name'] not in printed_names:
            print(f"{i+1}. {data['name']} (vs {data['opponent']}, {home_away_str})")
            printed_names.add(data['name'])
        else:
            print(f"   (Duplicate entry skipped: {data['name']})")

    # --- Save to CSV ---
    csv_columns = ['name', 'opponent', 'at_home']
    try:
        # Passing `columns` guarantees all three columns exist and fixes
        # their order, so no per-column backfill is needed afterwards.
        df_pitchers = pd.DataFrame(pitcher_data, columns=csv_columns)

        # Keep only the first entry for each pitcher name.
        df_pitchers = df_pitchers.drop_duplicates(subset=['name'], keep='first')

        df_pitchers.to_csv(output_filename, index=False)
        print(f"\nSuccessfully saved list to {output_filename}")
    except Exception as e:
        print(f"\nError saving to CSV: {e}")
        traceback.print_exc()

# --- Run the main_mlb async function ---
# Inside a notebook an event loop is already running, so the coroutine is
# scheduled on it; otherwise a fresh loop is started with asyncio.run().
try:
    loop = asyncio.get_running_loop()
except RuntimeError:
    # Only get_running_loop() is guarded, so a RuntimeError raised by the
    # scraping code itself is not mistaken for "no loop running".
    print("No asyncio loop running. Starting one with asyncio.run().")
    mlb_task = None
    asyncio.run(main_mlb())
else:
    print("Asyncio loop already running. Scheduling main_mlb task.")
    # Hold a reference to the task: the event loop keeps only weak
    # references, so an unreferenced task can be garbage-collected mid-run.
    mlb_task = asyncio.ensure_future(main_mlb())

print("\nMLB scraping process initiated.")

Asyncio loop already running. Scheduling main_mlb task.

MLB scraping process initiated.



Scraping MLB probable pitchers, opponents, and home/away status...
Attempting to fetch: https://www.mlb.com/probable-pitchers


[2025-04-08 11:14:07] INFO: Fetched (200) <GET https://www.mlb.com/probable-pitchers> (referer: https://www.google.com/search?q=mlb)


Successfully fetched page content.
Found 15 game matchups.
Skipping matchup 13: Couldn't find -> Second Pitcher
Finished processing matchups. Successfully extracted 28 pitcher entries from 14 fully processed matchups.

--- Probable Pitchers Found ---
1. Carlos Carrasco (vs Tigers, Away)
2. Tarik Skubal (vs Yankees, Home)
3. Shane Smith (vs Guardians, Away)
4. Ben Lively (vs White Sox, Home)
5. Connor Gillispie (vs Mets, Away)
6. Clay Holmes (vs Marlins, Home)
7. Sonny Gray (vs Pirates, Away)
8. Paul Skenes (vs Cardinals, Home)
9. Easton Lucas (vs Red Sox, Away)
10. Garrett Crochet (vs Blue Jays, Home)
11. Justin Wrobleski (vs Nationals, Away)
12. Brad Lord (vs Dodgers, Home)
13. Kyle Hendricks (vs Rays, Away)
14. Shane Baz (vs Angels, Home)
15. Zack Wheeler (vs Braves, Away)
16. Chris Sale (vs Phillies, Home)
17. Pablo López (vs Royals, Away)
18. Cole Ragans (vs Twins, Home)
19. Patrick Corbin (vs Cubs, Away)
20. Jameson Taillon (vs Rangers, Home)
21. Freddy Peralta (vs Rockies, Away)
