# Extract PDF Singleton from Yearbook

In [1]:
import os
from PyPDF2 import PdfReader, PdfWriter
from pathlib import Path
from hidden_debt_gsf.config import SRC, BLD_data

def extract_page(input_folder, output_folder, year, page):
    """
    Extracts a single specified page from a Yearbook PDF and saves it as a new PDF.
    Parameters:
      input_folder (str): Folder containing the PDF.
      output_folder (str): Folder where the extracted page PDF will be saved.
      year (str): Year identifier used in file names.
      page (int): The page number to extract (1-indexed).
    """
    output_folder = os.path.join(output_folder, f"Yearbook_{year}")
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    input_pdf = os.path.join(input_folder, f"Yearbook_{year}.pdf")
    reader = PdfReader(input_pdf)
    total_pages = len(reader.pages)
    
    if page < 1 or page > total_pages:
        raise ValueError(f"Page {page} is out of range. Total pages: {total_pages}")
    
    writer = PdfWriter()
    writer.add_page(reader.pages[page - 1])  # Adjust for 0-indexing
    output_filename = os.path.join(output_folder, f"page_{page}_{year}.pdf")

    with open(output_filename, "wb") as output_pdf:
        writer.write(output_pdf)

    print(f"Saved: {output_filename}")
    return output_filename

# Use Google API

In [2]:
from typing import Optional
from google.api_core.client_options import ClientOptions
from google.cloud import documentai  # type: ignore

def process_document_sample(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str,
    mime_type: str,
    field_mask: Optional[str] = None,
    processor_version_id: Optional[str] = None,
) -> None:
    # You must set the `api_endpoint` if you use a location other than "us".
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")

    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    if processor_version_id:
        # The full resource name of the processor version, e.g.:
        # `projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}`
        name = client.processor_version_path(
            project_id, location, processor_id, processor_version_id
        )
    else:
        # The full resource name of the processor, e.g.:
        # `projects/{project_id}/locations/{location}/processors/{processor_id}`
        name = client.processor_path(project_id, location, processor_id)

    # Read the file into memory
    with open(file_path, "rb") as image:
        image_content = image.read()

    # Load binary data
    raw_document = documentai.RawDocument(content=image_content, mime_type=mime_type)

    # For more information: https://cloud.google.com/document-ai/docs/reference/rest/v1/ProcessOptions
    # Optional: Additional configurations for processing.
    process_options = documentai.ProcessOptions(
        # Process only specific pages
        individual_page_selector=documentai.ProcessOptions.IndividualPageSelector(
            pages=[1]
        )
    )

    # Configure the process request
    request = documentai.ProcessRequest(
        name=name,
        raw_document=raw_document,
        field_mask=field_mask,
        process_options=process_options,
    )

    result = client.process_document(request=request)

    # For a full list of `Document` object attributes, reference this page:
    # https://cloud.google.com/document-ai/docs/reference/rest/v1/Document
    document = result.document
    
    return document

# Workflow before 2003

## Clean and format Data

In [None]:
import csv
from collections import defaultdict

def convert_page_to_csv_before_2003(document, output_csv="output.csv"):
    """
    Converts a Document AI page response to a CSV file with columns:
    Country, Country_Code, Currency, Entity, Year, Value_type, and Value.
    
    Structural break indicators are processed so that if any value in a given year starts with "X",
    the leading "X" or "1" is dropped from all values of that year. Commas within the values are also removed.
    The function adapts to splitting property values on newline or whitespace.
    """
    # Extract common fields.
    country = None
    country_code = None
    currency = None
    year_list = []
    
    for ent in document.entities:
        if ent.type_ == "Country":
            country = ent.mention_text.split(",")[0].strip()
        elif ent.type_ == "Country_Code":
            country_code = ent.mention_text.strip()
        elif ent.type_ == "Currency":
            currency = " ".join(ent.mention_text.split())
        elif ent.type_ == "Year_Column":
            year_list = [y.strip() for y in ent.mention_text.splitlines()]
    
    # List to hold rows.
    rows = []
    
    # Process table entities with value properties.
    for ent in document.entities:
        if ent.type_ in ["Table_D", "Table_F"]:
            table_entity = None
            for prop in ent.properties:
                if prop.type_ == "Entity":
                    table_entity = prop.mention_text.strip()
                    break
            
            for prop in ent.properties:
                if prop.type_ in ["Domestic_Financing", "Financing", "Financing_Abroad", 
                                  "Domestic_Debt", "Total_Debt", "Foreign_Debt"]:
                    # Split based on the delimiter present.
                    if "\n" in prop.mention_text:
                        values = [v.strip() for v in prop.mention_text.splitlines()]
                    else:
                        values = [v.strip() for v in prop.mention_text.split()]
                    if len(year_list) == len(values):
                        for i, value in enumerate(values):
                            rows.append([country, country_code, currency, table_entity, 
                                         year_list[i], prop.type_, value])
                    else:
                        print(f"Skipping mapping for {prop.type_} as lengths differ: {len(year_list)} != {len(values)}")
    
    # Group rows by year to correct structural break indicators.
    rows_by_year = defaultdict(list)
    for row in rows:
        rows_by_year[row[4]].append(row)
    
    for year, group in rows_by_year.items():
        if any(r[6].startswith("X") for r in group):
            for r in group:
                if r[6].startswith("X"):
                    r[6] = r[6][1:].strip()
                elif r[6].startswith("1"):
                    r[6] = r[6][1:].strip()
        elif any(r[6].startswith("I") for r in group):
            for r in group:
                if r[6].startswith("I"):
                    r[6] = r[6][1:].strip()
                elif r[6].startswith("1"):
                    r[6] = r[6][1:].strip()
        elif any(r[6].startswith("+") for r in group):
            for r in group:
                if r[6].startswith("+"):
                    r[6] = r[6][1:].strip()


    # Remove commas from the value field.
    for row in rows:
        row[6] = row[6].replace(",", "")
    
    # Write to CSV.
    with open(output_csv, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Country", "Country_Code", "Currency", "Entity", "Year", "Value_type", "Value"])
        writer.writerows(rows)
    
    print(f"CSV file '{output_csv}' has been created.")


## Combine functions

In [None]:
import os
from hidden_debt_gsf.config import SRC, BLD_data

def process_yearbook_page_before_2003(year, page):
    """
    Combines page extraction and Document AI processing. It extracts a specified page from the Yearbook PDF,
    calls the Document AI API to process it, and then converts the resulting document to CSV.
    
    Parameters:
      year (str): The year of the Yearbook.
      page (int): The page number to process (1-indexed).
    """
    # Extract the page and get its PDF file path.
    input_pdf_path = extract_page(str(SRC / "data" / "PDF_raw"), str(BLD_data / "PDF_Singletons"), year, page)
    
    # Process the extracted page using Document AI.
    document = process_document_sample(
        project_id="483427254986",
        location="eu",
        processor_id="acd06e8d1af69f94",
        file_path=input_pdf_path,
        mime_type="application/pdf",
    )
    
    # Define output folder for CSV.
    output_folder = BLD_data / "Document_AI" / f"{year}"
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    page_name = f"page_{page}_{year}.pdf"
    output_csv = output_folder / f"{page_name}.csv"
    
    convert_page_to_csv_before_2003(document, output_csv=output_csv)
    print(f"Processed page {page} for year {year}. CSV saved as {output_csv}.")

In [None]:
# for page in range(32, 426):
for page in range(122, 123):
    process_yearbook_page_before_2003("1997", page)

Saved: /Users/torbenhaferkamp/Desktop/IfW_Kiel/hidden_debt_gsf/bld/data/PDF_Singletons/Yearbook_1997/page_122_1997.pdf
CSV file '/Users/torbenhaferkamp/Desktop/IfW_Kiel/hidden_debt_gsf/bld/data/Document_AI/1997/page_122_1997.pdf.csv' has been created.
Processed page 122 for year 1997. CSV saved as /Users/torbenhaferkamp/Desktop/IfW_Kiel/hidden_debt_gsf/bld/data/Document_AI/1997/page_122_1997.pdf.csv.


In [None]:
#for page in range(34, 485):
#    process_yearbook_page_before_2003("2001", page)

Saved: /Users/torbenhaferkamp/Desktop/IfW_Kiel/hidden_debt_gsf/bld/data/PDF_Singletons/Yearbook_2001/page_34_2001.pdf
CSV file '/Users/torbenhaferkamp/Desktop/IfW_Kiel/hidden_debt_gsf/bld/data/Document_AI/2001/page_34_2001.pdf.csv' has been created.
Processed page 34 for year 2001. CSV saved as /Users/torbenhaferkamp/Desktop/IfW_Kiel/hidden_debt_gsf/bld/data/Document_AI/2001/page_34_2001.pdf.csv.
Saved: /Users/torbenhaferkamp/Desktop/IfW_Kiel/hidden_debt_gsf/bld/data/PDF_Singletons/Yearbook_2001/page_35_2001.pdf
CSV file '/Users/torbenhaferkamp/Desktop/IfW_Kiel/hidden_debt_gsf/bld/data/Document_AI/2001/page_35_2001.pdf.csv' has been created.
Processed page 35 for year 2001. CSV saved as /Users/torbenhaferkamp/Desktop/IfW_Kiel/hidden_debt_gsf/bld/data/Document_AI/2001/page_35_2001.pdf.csv.
Saved: /Users/torbenhaferkamp/Desktop/IfW_Kiel/hidden_debt_gsf/bld/data/PDF_Singletons/Yearbook_2001/page_36_2001.pdf
CSV file '/Users/torbenhaferkamp/Desktop/IfW_Kiel/hidden_debt_gsf/bld/data/Documen

In [None]:
#for page in range(34, 491):
#    process_yearbook_page_before_2003("2002", page)

Saved: /Users/torbenhaferkamp/Desktop/IfW_Kiel/hidden_debt_gsf/bld/data/PDF_Singletons/Yearbook_2002/page_34_2002.pdf
CSV file '/Users/torbenhaferkamp/Desktop/IfW_Kiel/hidden_debt_gsf/bld/data/Document_AI/2002/page_34_2002.pdf.csv' has been created.
Processed page 34 for year 2002. CSV saved as /Users/torbenhaferkamp/Desktop/IfW_Kiel/hidden_debt_gsf/bld/data/Document_AI/2002/page_34_2002.pdf.csv.
Saved: /Users/torbenhaferkamp/Desktop/IfW_Kiel/hidden_debt_gsf/bld/data/PDF_Singletons/Yearbook_2002/page_35_2002.pdf
CSV file '/Users/torbenhaferkamp/Desktop/IfW_Kiel/hidden_debt_gsf/bld/data/Document_AI/2002/page_35_2002.pdf.csv' has been created.
Processed page 35 for year 2002. CSV saved as /Users/torbenhaferkamp/Desktop/IfW_Kiel/hidden_debt_gsf/bld/data/Document_AI/2002/page_35_2002.pdf.csv.
Saved: /Users/torbenhaferkamp/Desktop/IfW_Kiel/hidden_debt_gsf/bld/data/PDF_Singletons/Yearbook_2002/page_36_2002.pdf
CSV file '/Users/torbenhaferkamp/Desktop/IfW_Kiel/hidden_debt_gsf/bld/data/Documen

## CSV- Singeltons to .dta

In [5]:
import pandas as pd
import os
from pathlib import Path
from hidden_debt_gsf.config import SRC, BLD_data

def load_all_csvs_to_dataframe(year):
    """
    Loads all CSV files from the processed yearbook pages into a single Pandas DataFrame,
    and adds a 'Vintage' column.

    Parameters:
      year (str): The year of the Yearbook.

    Returns:
      pd.DataFrame: A DataFrame containing the merged data from all CSV files with 'Vintage_Year' column.
    """
    # Define the folder where CSV files are stored.
    csv_folder = BLD_data / "Document_AI" / f"{year}"
    
    # Get a list of all CSV files in the folder.
    csv_files = list(csv_folder.glob("*.csv"))

    # Check if there are CSV files to process.
    if not csv_files:
        print(f"No CSV files found for year {year}.")
        return pd.DataFrame()

    # Load all CSVs into a DataFrame list.
    dataframes = []
    for file in csv_files:
        df = pd.read_csv(file)
        dataframes.append(df)
    
    # Concatenate all DataFrames into one.
    combined_df = pd.concat(dataframes, ignore_index=True)
    combined_df["Vintage"] = year

    print(f"Loaded {len(csv_files)} CSV files into a DataFrame.")
    return combined_df


In [2]:
def add_residence_name_column(df):
    """
    Adds a 'Residence_Name' column to the DataFrame based on the 'Value_type' column.
    
    - 'total' for 'Financing' and 'Total_Debt'
    - 'domestic' for 'Domestic_Debt' and 'Domestic_Financing'
    - 'foreign' for 'Foreign_Debt' and 'Financing_Abroad'
    
    Parameters:
      df (pd.DataFrame): The DataFrame containing the 'Value_type' column.
      
    Returns:
      pd.DataFrame: Updated DataFrame with the new 'Residence_Name' column.
    """
    # Define mapping conditions
    conditions = {
        "Financing": "total",
        "Total_Debt": "total",
        "Domestic_Debt": "domestic",
        "Domestic_Financing": "domestic",
        "Foreign_Debt": "foreign",
        "Financing_Abroad": "foreign"
    }
    
    # Apply the mapping
    df["Residence_Name"] = df["Value_type"].map(conditions).fillna("Unknown")  # Default to 'Unknown' if not in mapping
    
    return df

In [24]:
import pandas as pd

def add_financial_columns(df):
    """
    Adds two new columns to the DataFrame:
    
    - 'netliab': The value from 'Value' if 'Value_type' is 'Financing', 'Domestic_Financing', or 'Financing_Abroad'.
    - 'dod' (Debt): The value from 'Value' if 'Value_type' is 'Total_Debt', 'Domestic_Debt', or 'Foreign_Debt'.
    
    Parameters:
      df (pd.DataFrame): The DataFrame containing 'Value_type' and 'Value'.
      
    Returns:
      pd.DataFrame: Updated DataFrame with the new 'NetLiab' and 'DoD' columns.
    """
    # Assign 'NetLiab' for Financing-related categories
    df["netliab"] = df.apply(lambda row: row["Value"] if row["Value_type"] in 
                             ["Financing", "Domestic_Financing", "Financing_Abroad"] else None, axis=1)
    
    # Assign 'DoD' (Debt) for debt-related categories
    df["dod"] = df.apply(lambda row: row["Value"] if row["Value_type"] in 
                         ["Total_Debt", "Domestic_Debt", "Foreign_Debt"] else None, axis=1)
    
    return df


In [25]:
import pandas as pd

def reshape_financial_data(df):
    """
    Reshapes the DataFrame such that:
    - Each unique (Country_Code, Country, Currency, Entity, Year, Vintage, Residence_Name)
      has only one row.
    - 'netliab' and 'dod' are combined into a single row per group, ensuring missing values remain NaN.
    
    Parameters:
      df (pd.DataFrame): The DataFrame to be reshaped.

    Returns:
      pd.DataFrame: Reshaped DataFrame with unique rows and summed values.
    """
    # Convert netliab and dod to numeric, handling errors (e.g., if stored as strings)
    df["netliab"] = pd.to_numeric(df["netliab"], errors="coerce")
    df["dod"] = pd.to_numeric(df["dod"], errors="coerce")

    # Define the grouping columns
    group_cols = ["Country_Code", "Country", "Currency", "Entity", "Year", "Vintage", "Residence_Name"]

    # Aggregate data, summing 'netliab' and 'dod' while preserving NaN values
    reshaped_df = df.groupby(group_cols, as_index=False).agg({
        "netliab": lambda x: x.sum(min_count=1),  # Keeps NaN if all values are NaN
        "dod": lambda x: x.sum(min_count=1)       # Keeps NaN if all values are NaN
    })

    return reshaped_df


In [33]:
import pandas as pd

def polish_dataframe(df):
    """
    Cleans and standardizes the DataFrame by:
    1. Renaming columns.
    2. Dropping invalid Year entries:
       a. If Year contains multiple years (e.g., "1997 2000").
       b. If Year ends with 'f' or 'p' (e.g., "2000p").
    3. Removing line breaks from all string columns.
    4. Replacing entire groups with NaN if 'netliab' or 'dod' have more than three decimals.

    Parameters:
      df (pd.DataFrame): The DataFrame to be cleaned.

    Returns:
      pd.DataFrame: The cleaned DataFrame.
    """
    # 1. Rename columns
    df.rename(columns={"Entity": "Sector_Name"}, inplace=True)

    # 2. Drop invalid Year entries
    df = df[df["Year"].astype(str).apply(lambda x: len(x.split()) == 1)]  # Removes "1997 2000"
    df = df[~df["Year"].astype(str).str.endswith(("f", "p"))]  # Removes years ending with 'f' or 'p'
    
    # 3. Remove line breaks from all string columns
    df = df.applymap(lambda x: x.replace("\n", " ").strip() if isinstance(x, str) else x)

    # 4. Convert netliab and dod to numeric (if not already)
    df["netliab"] = pd.to_numeric(df["netliab"], errors="coerce")
    df["dod"] = pd.to_numeric(df["dod"], errors="coerce")

    # 5. Identify groups with more than 3 decimal places
    def has_more_than_three_decimals(value):
        if pd.isna(value):  # Skip NaNs
            return False
        return abs(value * 1000) % 1 != 0  # Checks if more than 3 decimals exist

    for column in ["netliab", "dod"]:
        invalid_groups = df[df[column].apply(has_more_than_three_decimals)][["Country_Code", "Sector_Name", "Vintage", "Residence_Name"]]
        
        # Set the entire group to NaN for that column
        df.loc[df.set_index(["Country_Code", "Sector_Name", "Vintage", "Residence_Name"]).index.isin(invalid_groups.set_index(["Country_Code", "Sector_Name", "Vintage", "Residence_Name"]).index), column] = pd.NA

    return df

In [69]:
import pandas as pd

def adjust_currency(df):
    """
    Adjusts currency units based on special cases and general rules:
    
    - Special cases for Argentina, Brazil, Egypt, and El Salvador, depending on Vintage and Year.
    - Standardizes currency labels by removing extra words like 'Millions' or 'Billions'.
    - Adjusts 'netliab' and 'dod' values by adding the appropriate number of zeros.
    
    Parameters:
      df (pd.DataFrame): The DataFrame to process.
    
    Returns:
      pd.DataFrame: The updated DataFrame with adjusted currency and scaled values.
    """

    # Convert 'Year' to numeric (handling errors)
    df["Year"] = pd.to_numeric(df["Year"], errors="coerce")  # Ensures Year is an integer
    
    # Ensure netliab and dod are numeric
    df["netliab"] = pd.to_numeric(df["netliab"], errors="coerce")
    df["dod"] = pd.to_numeric(df["dod"], errors="coerce")

    # Define special cases for currency adjustments
    special_cases = [
        {"Country": "Argentina", "Vintage": "1997", "Condition": (df["Year"] <= 1988), "Multiplier": 1_000, "New_Currency": "Thousands of Pesos"},
        {"Country": "Argentina", "Vintage": "1997", "Condition": (df["Year"] >= 1989), "Multiplier": 1_000_000, "New_Currency": "Millions of Pesos"},
        {"Country": "Brazil", "Vintage": ["1997", "2001"], "Condition": (df["Year"] <= 1989), "Multiplier": 1, "New_Currency": "Reais"},
        {"Country": "Brazil", "Vintage": ["1997", "2001"], "Condition": ((df["Year"] >= 1990) & (df["Year"] <= 1992)), "Multiplier": 1_000, "New_Currency": "Thousands of Reais"},
        {"Country": "Brazil", "Vintage": ["1997", "2001"], "Condition": (df["Year"] >= 1993), "Multiplier": 1_000_000, "New_Currency": "Millions of Reais"},
        {"Country": "Egypt", "Vintage": "1997", "Condition": True, "Multiplier": 1_000_000, "New_Currency": "Millions of Pounds"},  # All Egypt entries in 1997
        {"Country": "Cyprus", "Vintage": "2001", "Condition": True, "Multiplier": 1_000_000, "New_Currency": "Millions of Pounds"}, # All Cyprus entries in 2001
        {"Country": "Jordan", "Vintage": "2002", "Condition": True, "Multiplier": 1_000_000, "New_Currency": "Millions of Dinars"}, # All Jordan entries in 2002
        {"Country": "Iceland", "Vintage": "2001", "Condition": True, "Multiplier": 1_000_000, "New_Currency": "Millions of Kronur"}, # All Iceland entries in 2001
        {"Country": "Japan", "Vintage": "2001", "Condition": True, "Multiplier": 1_000_000_000, "New_Currency": "Billions of Yen"}, # All Japan entries in 2001
        {"Country": "El Salvador", "Vintage": "2002", "Condition": (df["Year"] <= 2000), "Multiplier": 1_000_000, "New_Currency": "Millions of Colones"},
        {"Country": "El Salvador", "Vintage": "2002", "Condition": (df["Year"] >= 2001), "Multiplier": 1_000_000, "New_Currency": "Millions of Dollars"},  
    ]

    # Apply special case adjustments
    for case in special_cases:
        vintage_condition = (
            df["Vintage"].isin(case["Vintage"]) if isinstance(case["Vintage"], list) else (df["Vintage"] == case["Vintage"])
        )

        mask = (df["Country"] == case["Country"]) & vintage_condition & case["Condition"]
        
        # Apply the multiplier to netliab and dod
        df.loc[mask, ["netliab", "dod"]] *= case["Multiplier"]

        # Update the Currency column
        df.loc[mask, "Currency"] = case["New_Currency"]
    
    # Standard currency adjustments (for other countries)
    def get_multiplier(currency):
        """Determines the multiplier based on currency descriptions."""
        if isinstance(currency, str):
            if "Trillion" in currency:
                return 1_000_000_000_000
            elif "Billion" in currency:
                return 1_000_000_000
            elif "Million" in currency:
                return 1_000_000
            elif "Thousand" in currency:
                return 1_000
        return 1  # No scaling needed
    
    # Apply general currency adjustments
    df["Multiplier"] = df["Currency"].apply(get_multiplier)
    df["netliab"] *= df["Multiplier"]
    df["dod"] *= df["Multiplier"]
    
    # Extract clean currency name (removing 'Millions' and 'Billions')
    df["Currency"] = df["Currency"].str.replace(r"Thousands? of |Millions? of |Billions? of |Trillions? of ", "", regex=True).str.strip()
    
    # Drop the temporary multiplier column
    df.drop(columns=["Multiplier"], inplace=True)
    
    return df

In [None]:
import pandas as pd
import os

# Define the years to process
years = ["1997", "2001", "2002"]

# List to store DataFrames for each year
dataframes = []

for year in years:
    print(f"Processing data for {year}...")

    # Load, process, and clean the data
    df = load_all_csvs_to_dataframe(year)
    df = add_residence_name_column(df)
    df = add_financial_columns(df)
    df = reshape_financial_data(df)
    df = polish_dataframe(df)
    df = adjust_currency(df)

    # Append to the list
    dataframes.append(df)

# Concatenate all processed DataFrames
df_combined = pd.concat(dataframes, ignore_index=True)

# Define output paths
csv_output_path = BLD_data / "Document_AI" / "1997_to_2002_data.csv"
dta_output_path = BLD_data / "DTA" / "Document_AI" / "1997_to_2002_data.dta"

# Ensure directories exist
os.makedirs(csv_output_path.parent, exist_ok=True)  # Creates "Document_AI" directory if missing
os.makedirs(dta_output_path.parent, exist_ok=True)  # Creates "DTA/Document_AI" directory if missing

# Save to CSV
df_combined.to_csv(csv_output_path, index=False)
print(f"Data saved to: {csv_output_path}")

# Save to Stata (.dta)
df_combined.to_stata(dta_output_path, write_index=False, version=117)  # Ensure Stata compatibility
print(f"Data saved to: {dta_output_path}")

Processing data for 1997...
Loaded 394 CSV files into a DataFrame.
Processing data for 2001...


  df = df.applymap(lambda x: x.replace("\n", " ").strip() if isinstance(x, str) else x)


Loaded 451 CSV files into a DataFrame.
Processing data for 2002...


  df = df.applymap(lambda x: x.replace("\n", " ").strip() if isinstance(x, str) else x)


Loaded 457 CSV files into a DataFrame.
Data saved to: /Users/torbenhaferkamp/Desktop/IfW_Kiel/hidden_debt_gsf/bld/data/Document_AI/1997_to_2002_data.csv
Data saved to: /Users/torbenhaferkamp/Desktop/IfW_Kiel/hidden_debt_gsf/bld/data/DTA/Document_AI/1997_to_2002_data.dta


  df = df.applymap(lambda x: x.replace("\n", " ").strip() if isinstance(x, str) else x)


# Workflow for 2003 and 2004

In [None]:
import csv
from collections import defaultdict
import re

def convert_page_to_csv_2003_2004(document, output_csv="output.csv"):
    """
    Converts a Document AI response (new data structure) to a CSV file with columns:
    Country, Country_Code, Currency, Entity, Year, Value_type, and Value.
    
    Structural break indicators are corrected and commas removed from values.
    """
    country = None
    country_code = None
    currency_value = None
    fiscal_year = None
    reporting_base = None
    year_list = []
    year_list_gg = []
    values = []
    values_gg = []

    for ent in document.entities:
        if ent.type_ == "Country":
            parts = ent.mention_text.split(",")[0].strip().splitlines()
            if parts:
                country = parts[0].strip()
            if len(parts) > 1:
                country_code = parts[1].strip()
        elif ent.type_ == "Currency":
            # Normalize spacing and split by '/'
            text = " ".join(ent.mention_text.split())
            parts = [p.strip() for p in text.split("/")]
            if parts:
                currency_value = parts[0]
            if len(parts) > 1:
                fiscal_year = parts[1]
            if len(parts) > 2:
                reporting_base = parts[2]

    # List to hold rows.
    rows = []

    def split_text(text):
        """Removes commas, then splits text by any whitespace (including newlines).
        Then further splits tokens by detecting boundaries where a digit is immediately
        followed by a '+', '†', or '-' and another digit.
        """
        # Remove commas from the text.
        text = text.replace(",", "")
        
        # Split by any whitespace (this covers spaces, newlines, etc.)
        tokens = [v.strip() for v in re.split(r'\s+', text) if v.strip()]
        
        def further_split(token):
            # Further split tokens on boundaries: digit followed by a '+', '†', or '-' and a digit.
            return re.split(r'(?<=\d)(?=[\+\†-]\d)', token)
        
        result = []
        for token in tokens:
            result.extend(further_split(token))
        return [x for x in result if x]

    # Process table entities with value properties.
    for ent in document.entities:
        if ent.type_ == "Budgetary_Central_Govenrment":
            # Aggregate years for Budgetary Central Government.
            year_list = []
            for prop in ent.properties:
                if prop.type_ == "Years":
                    year_list.extend(split_text(prop.mention_text))
            for prop in ent.properties:
                if prop.type_ in ["Domestic_Liabilities", "Domestic_Net_Incurrence", "Foreign_Liabilities",
                                "Foreign_Net_Incurrence", "Liabilities", "Net_Incurrence_of_Liabilities"]:
                    values = split_text(prop.mention_text)
                    if len(year_list) == len(values):
                        for i, value in enumerate(values):
                            rows.append([country, country_code, currency_value, fiscal_year, reporting_base,
                                        year_list[i], prop.type_, value, ent.type_])
                                
        elif ent.type_ == "General_Government":
            # Aggregate years for General Government.
            year_list_gg = []
            for prop in ent.properties:
                if prop.type_ == "Years":
                    year_list_gg.extend(split_text(prop.mention_text))
            for prop in ent.properties:
                if prop.type_ in ["Domestic_Liabilities", "Domestic_Net_Incurrence", "Foreign_Liabilities",
                                "Foreign_Net_Incurrence", "Liabilities", "Net_Incurrence_of_Liabilities"]:
                    values_gg = split_text(prop.mention_text)
                    if len(year_list_gg) == len(values):
                        for i, value in enumerate(values):
                            rows.append([country, country_code, currency_value, fiscal_year, reporting_base,
                                        year_list_gg[i], prop.type_, value, ent.type_])

    # Correct structural break indicators.
    rows_by_year = defaultdict(list)
    for row in rows:
        rows_by_year[row[5]].append(row)

    for group in rows_by_year.values():
        if any(r[7].startswith("+") for r in group):
            for r in group:
                if r[7].startswith("+"):
                    r[7] = r[7][1:].strip()

    # Remove commas from the Value field.
    for row in rows:
        row[7] = row[7].replace(",", "")

    # Write the rows to CSV.
    with open(output_csv, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Country", "Country_Code", "Currency", "Fiscal_Year", "Reporting_Base", "Year", "Value_type", "Value", "Entity"])
        writer.writerows(rows)
    
    print(f"CSV file '{output_csv}' has been created.")
    return year_list, year_list_gg, values, values_gg


In [33]:
import os
from hidden_debt_gsf.config import SRC, BLD_data
import pandas as pd

def process_yearbook_page_2003_2004(year, page):
    """
    Combines page extraction and Document AI processing. It extracts a specified page from the Yearbook PDF,
    calls the Document AI API to process it, and then converts the resulting document to CSV.
    
    Parameters:
      year (str): The year of the Yearbook.
      page (int): The page number to process (1-indexed).
    """
    # Extract the page and get its PDF file path.
    input_pdf_path = extract_page(str(SRC / "data" / "PDF_raw"), str(BLD_data / "PDF_Singletons"), year, page)
    
    # Process the extracted page using Document AI.
    document = process_document_sample(
        project_id="483427254986",
        location="eu",
        processor_id="7aade0143389388f",
        file_path=input_pdf_path,
        mime_type="application/pdf",
    )
    
    # Define output folder for CSV.
    output_folder = BLD_data / "Document_AI" / f"{year}"
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    page_name = f"page_{page}_{year}.pdf"
    output_csv = output_folder / f"{page_name}.csv"
    json_csv = output_folder / f"{page_name}_json.csv"

    def save_entities_quick(document, output_csv="entities_output.csv"):
        df = pd.DataFrame([vars(ent) for ent in document.entities])
        df.to_csv(output_csv, index=False)
        print(f"CSV file '{output_csv}' has been created.")
    save_entities_quick(document, output_csv=json_csv)
        
    year_list, year_list_gg, values, values_gg = convert_page_to_csv_2003_2004(document, output_csv=output_csv)
    print(f"Processed page {page} for year {year}. CSV saved as {output_csv}.")
    return year_list, year_list_gg, values, values_gg

In [50]:
year_list, year_list_gg, values, values_gg = process_yearbook_page_2003_2004(2004, 362)

Saved: /Users/torbenhaferkamp/Desktop/IfW_Kiel/hidden_debt_gsf/bld/data/PDF_Singletons/Yearbook_2004/page_362_2004.pdf
CSV file '/Users/torbenhaferkamp/Desktop/IfW_Kiel/hidden_debt_gsf/bld/data/Document_AI/2004/page_362_2004.pdf_json.csv' has been created.
CSV file '/Users/torbenhaferkamp/Desktop/IfW_Kiel/hidden_debt_gsf/bld/data/Document_AI/2004/page_362_2004.pdf.csv' has been created.
Processed page 362 for year 2004. CSV saved as /Users/torbenhaferkamp/Desktop/IfW_Kiel/hidden_debt_gsf/bld/data/Document_AI/2004/page_362_2004.pdf.csv.


In [48]:
print(f'Year list BCG:\n{year_list}')
print(f'Year list GG:\n{year_list_gg}')
print(f'Values BCG:\n{values}')
print(f'Values GG:\n{values_gg}')

Year list BCG:
['2001', '2002', '2003']
Year list GG:
['2002']
Values BCG:
['28.78']
Values GG:
['588.37']
