In [3]:
import os
import time
from urllib.parse import quote_plus

import pandas as pd
import requests
import sqlalchemy
from dotenv import load_dotenv

# Load KEY=VALUE pairs from a local .env file into the process environment
# so the os.getenv() calls below can see them.
load_dotenv()

DB_USER = os.getenv("DB_USER")
DB_PASS = os.getenv("DB_PASS")
DB_HOST = os.getenv("DB_HOST")
DB_NAME = os.getenv("DB_NAME")
RAWG_API_KEY = os.getenv("RAWG_API_KEY")

# Every setting is required; an unset or empty value is treated as missing.
if not all([DB_USER, DB_PASS, DB_HOST, DB_NAME, RAWG_API_KEY]):
    print("Error: One or more environment variables are missing.")
    print("Please check your .env file.")
    # Stop with a non-zero status so callers (shells, schedulers) can tell
    # the run failed; the bare exit() helper reports success (status 0).
    raise SystemExit(1)

# The user name and password are percent-encoded because characters such as
# '@', ':', '/' or '#' would otherwise be parsed as URL delimiters and
# produce a connection string pointing at the wrong host or database.
DB_CONNECTION_STRING = (
    f'mysql+mysqlconnector://{quote_plus(DB_USER)}:{quote_plus(DB_PASS)}'
    f'@{DB_HOST}/{DB_NAME}'
)
RAWG_API_URL_BASE = "https://api.rawg.io/api/publishers"


In [4]:
def _lookup_founding_year(publisher_name, timeout):
    """Query RAWG for *publisher_name* and return ``(matched, founding_year)``.

    The lookup is two requests: a search on the publishers endpoint, then a
    detail request for the first search hit.

    Args:
        publisher_name: Name sent as the ``search`` query parameter.
        timeout: Seconds passed to ``requests.get`` as ``timeout``.

    Returns:
        ``(False, None)`` when the search returns no results;
        ``(True, year)`` when the detail payload has a truthy ``year_start``;
        ``(True, None)`` when the publisher was found but has no usable
        ``year_start``.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status from either request.
        ValueError: When a response body is not JSON or does not have the
            expected object shape.
    """
    search_response = requests.get(
        RAWG_API_URL_BASE,
        params={'key': RAWG_API_KEY, 'search': publisher_name},
        timeout=timeout,
    )
    search_response.raise_for_status()
    search_data = search_response.json()
    if not isinstance(search_data, dict):
        raise ValueError("search response is not a JSON object")

    results = search_data.get('results') or []
    if not results:
        return False, None

    # Only the first search hit is considered; it is not checked against the
    # requested name.
    try:
        publisher_api_id = results[0]['id']
    except (KeyError, IndexError, TypeError) as exc:
        raise ValueError("first search result has no 'id' field") from exc

    detail_response = requests.get(
        f"{RAWG_API_URL_BASE}/{publisher_api_id}",
        params={'key': RAWG_API_KEY},
        timeout=timeout,
    )
    detail_response.raise_for_status()
    detail_data = detail_response.json()
    if not isinstance(detail_data, dict):
        raise ValueError("detail response is not a JSON object")

    # A missing key and a falsy value (None, 0, "") are both treated as
    # "no founding year available".
    return True, detail_data.get('year_start') or None


def enrich_publisher_data(request_timeout=10, request_delay=1):
    """
    Connects to the MySQL DB, reads Publishers whose founding_year is NULL,
    calls the RAWG API (first to search, then to get details) to find the
    founding year, and updates the table one row at a time.

    Progress and problems are reported with print(); nothing is returned.
    A failed or malformed API response for one publisher is logged and the
    loop moves on to the next publisher. Database errors end the run and are
    reported by the outer handlers.

    NOTE(review): the run output captured with this notebook shows the
    "'year_start' key is missing" message for every publisher listed, which
    suggests the RAWG publisher detail payload may not carry that field.
    Confirm against the RAWG API documentation before relying on this job.

    Args:
        request_timeout: Seconds to wait on each RAWG request before giving
            up, so a stalled connection cannot hang the whole run.
        request_delay: Seconds to sleep after each processed publisher to
            avoid hammering the API.
    """
    engine = None
    try:
        engine = sqlalchemy.create_engine(DB_CONNECTION_STRING)

        # Open and immediately close a connection purely to fail fast on bad
        # credentials or an unreachable host.
        with engine.connect():
            print("Connection successful!")

        print("Reading list of publishers from MySQL database...")
        query = "SELECT publisher_id, publisher_name FROM Publishers WHERE founding_year IS NULL"
        df_publishers = pd.read_sql(query, engine)

        if df_publishers.empty:
            print("All publishers are already enriched. No work to do.")
            return

        print(f"Found {len(df_publishers)} publishers to enrich...")

        # Built once; only the bound parameters change per publisher.
        update_statement = sqlalchemy.text(
            "UPDATE Publishers SET founding_year = :year WHERE publisher_id = :id"
        )
        publishers = df_publishers[['publisher_id', 'publisher_name']].itertuples(
            index=False, name=None
        )

        with engine.connect() as conn:
            for publisher_id, publisher_name in publishers:
                if publisher_name == 'Unknown':
                    print("Skipping 'Unknown' publisher...")
                    continue

                try:
                    matched, founding_year = _lookup_founding_year(
                        publisher_name, request_timeout
                    )
                except requests.RequestException as e:
                    print(f"WARN: API request failed for '{publisher_name}'. Error: {e}")
                except ValueError as e:
                    # Non-JSON body or unexpected payload shape: skip this
                    # publisher instead of aborting the remaining ones.
                    print(f"WARN: Unexpected API response for '{publisher_name}'. Error: {e}")
                else:
                    if not matched:
                        print(f"INFO: No API results found for '{publisher_name}' (ID: {publisher_id})")
                    elif founding_year:
                        # Each update is committed in its own transaction so
                        # progress survives a later failure.
                        with conn.begin():
                            conn.execute(update_statement, {"year": founding_year, "id": publisher_id})

                        print(f"SUCCESS: Updated '{publisher_name}' (ID: {publisher_id}) with founding year: {founding_year}")
                    else:
                        print(f"INFO: Found publisher '{publisher_name}', but 'year_start' key is missing on its detail page.")

                # Be kind to the API and wait
                time.sleep(request_delay)

        print("\n--- API Enrichment Complete! ---")

    except sqlalchemy.exc.OperationalError as e:
        # OperationalError covers more than bad credentials (unreachable
        # host, unknown database, dropped connection), so show the real cause.
        print(f"Error: Could not connect to the MySQL database. Details: {e}")
        print("Please double-check your DB_USER, DB_PASS, DB_HOST and DB_NAME in the .env file.")
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        # Release pooled connections whether the run succeeded or failed.
        if engine is not None:
            engine.dispose()

# --- Main execution ---
# Run the enrichment when this file is executed as the main module; the
# guard keeps a plain import of the module from starting it.
if __name__ == "__main__":
    enrich_publisher_data()



Connection successful!
Reading list of publishers from MySQL database...
Found 581 publishers to enrich...
INFO: Found publisher 'Nintendo', but 'year_start' key is missing on its detail page.
INFO: Found publisher 'Microsoft Game Studios', but 'year_start' key is missing on its detail page.
INFO: Found publisher 'Take-Two Interactive', but 'year_start' key is missing on its detail page.
INFO: Found publisher 'Sony Computer Entertainment', but 'year_start' key is missing on its detail page.
INFO: Found publisher 'Activision', but 'year_start' key is missing on its detail page.
INFO: Found publisher 'Ubisoft', but 'year_start' key is missing on its detail page.
INFO: Found publisher 'Bethesda Softworks', but 'year_start' key is missing on its detail page.
INFO: Found publisher 'Electronic Arts', but 'year_start' key is missing on its detail page.
INFO: Found publisher 'Sega', but 'year_start' key is missing on its detail page.
INFO: Found publisher 'SquareSoft', but 'year_start' key is 