In [1]:
import pandas as pd
import numpy as np
import os

# --- Configuration ---
# Path of the raw input CSV read by transform_and_export(). It is a bare
# file name, so it is resolved relative to the current working directory.
RAW_CSV_FILE = 'Video_Games.csv'

# --- Output Files ---
# Destination paths (also relative to the current working directory) for the
# three CSVs written by transform_and_export().
PUBLISHERS_CSV = 'publishers.csv'   # publisher_id / publisher_name lookup table
DEVELOPERS_CSV = 'developers.csv'   # developer_id / developer_name lookup table
GAMES_CSV = 'games_final.csv'       # games table carrying fk_publisher_id / fk_developer_id

# Raw column name -> exported 'Games' column name.  The insertion order of this
# mapping is also the column order of the exported games CSV, so the rename
# and the final column selection cannot drift apart.
_GAMES_COLUMN_MAP = {
    'Name': 'game_name',
    'Platform': 'platform',
    'Year_of_Release': 'year_of_release',
    'Genre': 'genre',
    'NA_Sales': 'na_sales',
    'EU_Sales': 'eu_sales',
    'JP_Sales': 'jp_sales',
    'Other_Sales': 'other_sales',
    'Global_Sales': 'global_sales',
    'Critic_Score': 'critic_score',
    'Critic_Count': 'critic_count',
    'User_Score': 'user_score',
    'User_Count': 'user_count',
    'Rating': 'rating',
    'publisher_id': 'fk_publisher_id',  # added by the publisher merge
    'developer_id': 'fk_developer_id',  # added by the developer merge
}


def _clean_raw_games(df):
    """Normalise the raw games frame in place and return it."""
    # errors='coerce' turns every non-numeric token (including the 'tbd'
    # placeholder) into NaN, so no separate replace pass is needed.
    df['User_Score'] = pd.to_numeric(df['User_Score'], errors='coerce')

    # Nullable integer dtype: years are written as 2006 rather than 2006.0
    # while missing years stay empty.
    df['Year_of_Release'] = df['Year_of_Release'].astype('Int64')

    # Every row needs a developer/publisher name so that it can be matched
    # against the dimension tables built from these columns.
    df['Developer'] = df['Developer'].fillna('Unknown')
    df['Publisher'] = df['Publisher'].fillna('Unknown')
    return df


def _build_dimension(names, id_column, name_column):
    """Return a lookup table of the distinct *names* with 1-based ids.

    Ids follow the order of first appearance of each name in *names*.
    """
    dimension = pd.DataFrame(names.unique(), columns=[name_column])
    dimension.insert(0, id_column, dimension.index + 1)
    return dimension


def _select_games_columns(df):
    """Rename the merged frame's columns and keep only the 'Games' columns."""
    renamed = df.rename(columns=_GAMES_COLUMN_MAP)
    return renamed[list(_GAMES_COLUMN_MAP.values())]


def transform_and_export() -> None:
    """
    Read the raw CSV, clean it with pandas, and export three relational CSVs.

    Reads RAW_CSV_FILE (its first column is consumed as the row index and is
    not exported), then writes:

    * DEVELOPERS_CSV -- developer_id, developer_name
    * PUBLISHERS_CSV -- publisher_id, publisher_name
    * GAMES_CSV      -- the games rows with fk_publisher_id / fk_developer_id

    This script does NOT require sqlalchemy or sqlite3.

    Errors are not re-raised: a missing file or any other failure is reported
    with a message on stdout and the function returns normally.
    """
    try:
        # --- Step 1: Read and clean the raw data ---
        print(f"Reading data from '{RAW_CSV_FILE}'...")
        df = pd.read_csv(RAW_CSV_FILE, index_col=0)

        print("Cleaning data (from 2.2_cleaning_and_data_manipulation.ipynb)...")
        df = _clean_raw_games(df)

        # --- Step 2: Build the dimension tables ---
        print("Creating 'Developers' dimension...")
        unique_developers = _build_dimension(
            df['Developer'], 'developer_id', 'developer_name'
        )

        print("Creating 'Publishers' dimension...")
        unique_publishers = _build_dimension(
            df['Publisher'], 'publisher_id', 'publisher_name'
        )

        # --- Step 3: Attach the surrogate ids to every game row ---
        print("Mapping foreign keys using pd.merge()...")
        df = pd.merge(df, unique_developers, left_on='Developer',
                      right_on='developer_name', how='left')
        df = pd.merge(df, unique_publishers, left_on='Publisher',
                      right_on='publisher_name', how='left')

        # --- Step 4: Shape the final games table ---
        print("Preparing final 'Games' table...")
        df_games_final = _select_games_columns(df)

        # --- Step 5: Export all three CSVs ---
        print("Exporting clean data to CSV files...")
        unique_developers.to_csv(DEVELOPERS_CSV, index=False)
        unique_publishers.to_csv(PUBLISHERS_CSV, index=False)
        df_games_final.to_csv(GAMES_CSV, index=False)

        print("\n--- Data Transformation and Export Complete! ---")
        print("You now have three clean CSV files ready for database import:")
        print(f"1. {DEVELOPERS_CSV}")
        print(f"2. {PUBLISHERS_CSV}")
        print(f"3. {GAMES_CSV}")

    except FileNotFoundError as e:
        print(f"Error: Could not find file. Make sure '{e.filename}' is in the same folder.")
    except Exception as e:
        # Top-level boundary of the script: report the failure instead of
        # surfacing a traceback.
        print(f"An error occurred: {e}")

# --- Main execution ---
# Run the export only when this file is executed as the main program; merely
# importing it as a module does not trigger any file I/O.
if __name__ == "__main__":
    transform_and_export()


Reading data from 'Video_Games.csv'...
Cleaning data (from 2.2_cleaning_and_data_manipulation.ipynb)...
Creating 'Developers' dimension...
Creating 'Publishers' dimension...
Mapping foreign keys using pd.merge()...
Preparing final 'Games' table...
Exporting clean data to CSV files...

--- Data Transformation and Export Complete! ---
You now have three clean CSV files ready for database import:
1. developers.csv
2. publishers.csv
3. games_final.csv
