In [2]:
import pandas as pd
from pathlib import Path

In [4]:
# Get the data path
data_path = Path.cwd().parent.resolve()/"data"/"GFSSSUC"/"GFSSSUC2019"

# Define the file name
file_name = "GFSSSUC2019_01-16-2025.csv"

# Combine the base path and file name
file_path = data_path / file_name

if file_path.exists():  # Check if the file exists
    data = pd.read_csv(file_path)
    print("File loaded successfully.")
else:
    print(f"File not found at {file_path}")


File loaded successfully.


In [5]:
print(data.head())

  Country Name  Country Code Classification Name Classification Code  \
0   Bangladesh           513  Revenue cash flows               G1|_Z   
1   Bangladesh           513  Revenue cash flows               G1|_Z   
2   Bangladesh           513  Revenue cash flows               G1|_Z   
3   Bangladesh           513  Revenue cash flows               G1|_Z   
4   Bangladesh           513  Revenue cash flows               G1|_Z   

                    Sector Name Sector Code       Unit Name   Unit Code  \
0  Budgetary central government      S1311B  Percent of GDP  XDC_R_B1GQ   
1  Budgetary central government      S1311B  Percent of GDP  XDC_R_B1GQ   
2  Budgetary central government      S1311B  Percent of GDP  XDC_R_B1GQ   
3  Budgetary central government      S1311B  Percent of GDP  XDC_R_B1GQ   
4  Budgetary central government      S1311B  Percent of GDP  XDC_R_B1GQ   

                             Attribute 1972  ...              2013  \
0                                Value  NaN  .

In [6]:
def filter_and_combine(data, column_name, keywords, output_filename):
    """
    Filters rows based on multiple keywords in a specified column and combines them into one CSV.
    
    Parameters:
        data (pd.DataFrame): The DataFrame to search within.
        column_name (str): The column to search for the keywords.
        keywords (list): A list of keywords to search for.
        output_filename (str): The filename to save the combined filtered rows.

    Returns:
        pd.DataFrame: Combined DataFrame of filtered rows.
    """
    combined_rows = pd.DataFrame()  # Initialize an empty DataFrame

    for keyword in keywords:
        # Filter rows where the keyword appears in the specified column (case-insensitive)
        filtered_rows = data[data[column_name].str.contains(keyword, case=False, na=False)]
        
        # Add a column to indicate the keyword used for filtering
        filtered_rows['Keyword'] = keyword
        
        # Append to the combined DataFrame
        combined_rows = pd.concat([combined_rows, filtered_rows], ignore_index=True)

    # Save the combined rows to a single CSV file
    combined_rows.to_csv(Path.cwd() / output_filename, index=False)
    
    # Return the combined DataFrame
    return combined_rows

# Define the DataFrame (assuming `data` is already loaded)
column_name = "Classification Name"

# List of keywords to search for
keywords = ["debt", "liabilities", "borrowing"]

# Call the function to filter and combine rows
output_filename = "filtered_rows_combined.csv"
combined_data = filter_and_combine(data, column_name, keywords, output_filename)

# Display the unique values in the "Classification Name" column
unique_classification_names = combined_data[column_name].unique()
print("Unique classification names:", unique_classification_names)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_rows['Keyword'] = keyword


Unique classification names: ['Net incurrence of liabilities']


In [8]:
def analyze_data_format(data):
    """
    Analyzes the dataset to calculate the sum of legitimate entries and covered countries
    for each combination of Classification, Sector, and Unit.
    
    Parameters:
        data (pd.DataFrame): The input dataset.

    Returns:
        pd.DataFrame: Summary DataFrame with analysis results.
    """
    # Filter rows where Attribute is "Value"
    filtered_data = data[data['Attribute'] == 'Value']

    # Select year columns (assume all columns except metadata and Attribute are years)
    year_columns = [col for col in data.columns if col.startswith('20') or col.startswith('19')]

    # Group by Classification, Sector, and Unit
    grouped = filtered_data.groupby(
        ['Classification Name', 'Sector Name', 'Unit Name']
    )

    # Calculate the required summaries
    summary = grouped.apply(
        lambda group: pd.Series({
            'Sum of Legitimate Entries': group[year_columns].apply(
                pd.to_numeric, errors='coerce'
            ).count().sum(),
            'Number of Covered Countries': group['Country Code'].nunique()
        })
    ).reset_index()

    return summary

# Assuming your dataset is loaded in `data`
summary = analyze_data_format(combined_data)

# Save the summary to a CSV file if needed
summary.to_csv(Path.cwd().parent.resolve()/"data"/"GFSSSUC"/"Summaries"/"summary_analysis_2019.csv", index=False)

  summary = grouped.apply(


In [2]:
import pandas as pd
from pathlib import Path

# Define the data directory and output directory
data_dir = Path.cwd().parent.resolve() / "data" / "GFSSSUC"
output_dir = data_dir / "Summaries"

output_dir.mkdir(parents=True, exist_ok=True)  # Ensure the output directory exists

# List of years to process
years = [2014, 2015, 2016, 2017, 2019, 2020]

# List of keywords for filtering
keywords = ["debt", "liabilities", "borrowing"]

# Chunksize for reading large files
chunksize = 10_000

def load_data(data_dir, year, chunksize):
    """Load the CSV file for a specific year in chunks."""
    file_dir = f"GFSSSUC{year}"
    file_name = f"GFSSSUC{year}_01-16-2025.csv"
    file_path = data_dir / file_dir / file_name
    if file_path.exists():
        data_chunks = pd.read_csv(file_path, chunksize=chunksize)
        return pd.concat(data_chunks, ignore_index=True)
    else:
        print(f"File not found for year {year}: {file_path}")
        return None

def filter_and_combine(data, column_name, keywords):
    """Filter rows based on multiple keywords and return combined results."""
    combined_rows = pd.DataFrame()
    for keyword in keywords:
        filtered_rows = data[data[column_name].str.contains(keyword, case=False, na=False)]
        filtered_rows['Keyword'] = keyword
        combined_rows = pd.concat([combined_rows, filtered_rows], ignore_index=True)
    return combined_rows

def analyze_data_format(data):
    """Analyze data to calculate sums of legitimate entries and covered countries."""
    filtered_data = data[data['Attribute'] == 'Value']
    year_columns = [col for col in data.columns if col.startswith(('20', '19'))]
    grouped = filtered_data.groupby(
        ['Classification Name', 'Sector Name', 'Unit Name']
    )
    summary = grouped.apply(
        lambda group: pd.Series({
            'Sum of Legitimate Entries': group[year_columns].apply(pd.to_numeric, errors='coerce').count().sum(),
            'Number of Covered Countries': group['Country Code'].nunique()
        })
    ).reset_index()
    return summary

def process_all_years(data_dir, years, keywords, output_dir, chunksize):
    """Process all years, filter and analyze data, and save summaries."""
    for year in years:
        # Load data for the year
        data = load_data(data_dir, year, chunksize)
        if data is None:
            continue  # Skip if file not found

        # Filter and combine rows based on keywords
        combined_data = filter_and_combine(data, "Classification Name", keywords)

        # Analyze data format
        summary = analyze_data_format(combined_data)

        # Save summary to a CSV file
        summary_file = output_dir / f"summary_analysis_{year}.csv"
        summary.to_csv(summary_file, index=False)
        print(f"Summary for {year} saved to {summary_file}")

# Execute the processing for all years
process_all_years(data_dir, years, keywords, output_dir, chunksize)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_rows['Keyword'] = keyword
  summary = grouped.apply(


Summary for 2014 saved to /home/torbenhaferkamp/Desktop/IfW_Kiel/GSF/hidden_debt_gsf/src/hidden_debt_gsf/data/GFSSSUC/Summaries/summary_analysis_2014.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_rows['Keyword'] = keyword
  summary = grouped.apply(


Summary for 2015 saved to /home/torbenhaferkamp/Desktop/IfW_Kiel/GSF/hidden_debt_gsf/src/hidden_debt_gsf/data/GFSSSUC/Summaries/summary_analysis_2015.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_rows['Keyword'] = keyword
  summary = grouped.apply(


Summary for 2016 saved to /home/torbenhaferkamp/Desktop/IfW_Kiel/GSF/hidden_debt_gsf/src/hidden_debt_gsf/data/GFSSSUC/Summaries/summary_analysis_2016.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_rows['Keyword'] = keyword
  summary = grouped.apply(


Summary for 2017 saved to /home/torbenhaferkamp/Desktop/IfW_Kiel/GSF/hidden_debt_gsf/src/hidden_debt_gsf/data/GFSSSUC/Summaries/summary_analysis_2017.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_rows['Keyword'] = keyword
  summary = grouped.apply(


Summary for 2019 saved to /home/torbenhaferkamp/Desktop/IfW_Kiel/GSF/hidden_debt_gsf/src/hidden_debt_gsf/data/GFSSSUC/Summaries/summary_analysis_2019.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_rows['Keyword'] = keyword


Summary for 2020 saved to /home/torbenhaferkamp/Desktop/IfW_Kiel/GSF/hidden_debt_gsf/src/hidden_debt_gsf/data/GFSSSUC/Summaries/summary_analysis_2020.csv


  summary = grouped.apply(


In [3]:
import pandas as pd
from pathlib import Path

# Define the directory containing summary files
summary_dir = output_dir

# Load all summary files into a list of DataFrames
summary_dfs = []
for year in years:
    file_path = summary_dir / f"summary_analysis_{year}.csv"
    if file_path.exists():
        summary_dfs.append(pd.read_csv(file_path))
    else:
        print(f"Summary file for {year} not found at {file_path}")

# Combine all summaries into a single DataFrame
combined_summary = pd.concat(summary_dfs, ignore_index=True)

# Group by relevant columns and calculate aggregated values
aggregated_summary = combined_summary.groupby(
    ['Classification Name', 'Sector Name', 'Unit Name'],
    as_index=False
).agg({
    'Sum of Legitimate Entries': 'sum',
    'Number of Covered Countries': 'max'
})

# Sort by the most legitimate entries
aggregated_summary = aggregated_summary.sort_values(by='Sum of Legitimate Entries', ascending=False)

# Save the aggregated summary to a CSV file
output_path = summary_dir / "aggregated_summary_2014_2020.csv"
aggregated_summary.to_csv(output_path, index=False)

print(f"Aggregated summary saved to {output_path}")


Aggregated summary saved to /home/torbenhaferkamp/Desktop/IfW_Kiel/GSF/hidden_debt_gsf/src/hidden_debt_gsf/data/GFSSSUC/Summaries/aggregated_summary_2014_2020.csv
