<a href="https://colab.research.google.com/github/ranwiththecode/high-fantasy-data-analysis/blob/main/metadata_addition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import os
from google.cloud import bigquery

def summarize_preprocessing(cleaned_file, original_file, summary_file,
                             project_id="stable-course-461105-k9",
                             dataset="google_imports",
                             table="pubs_2011_2015"):
    """
    Summarize preprocessing results for one book and append BigQuery metadata.

    The book title is derived from the cleaned file's base name
    (``the_hobbit_clean.csv`` -> ``The Hobbit``). Row counts of the cleaned and
    original CSVs are compared, author/protagonist gender is looked up in
    BigQuery by lower-cased title, and a one-row summary is appended to
    ``summary_file`` (the file is created if it does not exist).

    The metadata lookup is best-effort: if the BigQuery client cannot be
    created, the query fails, or no row matches, both gender fields are
    recorded as ``"Unknown"`` and the summary is still written.

    Args:
        cleaned_file (str): Path to the cleaned review CSV file.
        original_file (str): Path to the original review CSV file.
        summary_file (str): Path to the summary CSV to update or create.
        project_id (str): GCP project ID.
        dataset (str): BigQuery dataset name.
        table (str): BigQuery table name.

    Returns:
        None. If either input CSV cannot be read, an error is printed and
        nothing is written.
    """

    # Extract book title from cleaned filename
    book_title = os.path.basename(cleaned_file).replace("_clean.csv", "").replace("_", " ").title()

    # Load cleaned and original data
    try:
        cleaned_df = pd.read_csv(cleaned_file)
        original_df = pd.read_csv(original_file)
    except Exception as e:
        print(f"Error reading files for {book_title}: {e}")
        return

    filtered_count = len(cleaned_df)
    original_count = len(original_df)
    discarded_count = original_count - filtered_count

    # A header-only original file has zero rows; report 0% discarded instead
    # of raising ZeroDivisionError and losing the whole summary row.
    discarded_ratio = discarded_count / original_count if original_count else 0.0

    # Query metadata from BigQuery. The title is passed as a query parameter,
    # only the table path is interpolated into the SQL text.
    query = f"""
    SELECT `Author Pronouns` AS author_gender, Protagonist AS protagonist_gender
    FROM `{project_id}.{dataset}.{table}`
    WHERE LOWER(Name) = @book_title
    LIMIT 1
    """

    try:
        # Client and job-config construction live inside the try so that a
        # setup failure (e.g. missing credentials) degrades to "Unknown"
        # exactly like a failed query does, rather than aborting the summary.
        client = bigquery.Client(project=project_id)
        job_config = bigquery.QueryJobConfig(
            query_parameters=[
                bigquery.ScalarQueryParameter("book_title", "STRING", book_title.lower())
            ]
        )
        results = client.query(query, job_config=job_config).result()
        row = next(results, None)
    except Exception as e:
        print(f"BigQuery error for {book_title}: {e}")
        row = None

    if row is not None:
        author_gender = row.author_gender
        protagonist_gender = row.protagonist_gender
    else:
        author_gender = "Unknown"
        protagonist_gender = "Unknown"

    # Build summary row
    summary_data = {
        "Book Title": book_title,
        "Original Reviews": original_count,
        "Filtered Reviews": filtered_count,
        "Discarded Reviews": discarded_count,
        "Discarded %": f"{discarded_ratio:.2%}",
        "Author Gender": author_gender,
        "Protagonist Gender": protagonist_gender
    }

    summary_df = pd.DataFrame([summary_data])

    # Append or create summary file
    if os.path.exists(summary_file):
        existing_df = pd.read_csv(summary_file)
        updated_df = pd.concat([existing_df, summary_df], ignore_index=True)
    else:
        updated_df = summary_df

    updated_df.to_csv(summary_file, index=False)
    print(f"✅ Summary updated for: {book_title}")
