## Purpose of this notebook

This notebook transforms the CSV file for each input variable (GDP, health expenditure, maternal education, population density, slum population, water access) from the original wide format, where each year is a separate column, into a long format containing only the Country Name, Year, and indicator value columns.

In [8]:
from pathlib import Path

import pandas as pd

def transform_wide_to_long(input_filepath: str, output_filepath: str, value_column_name: str = "Value") -> pd.DataFrame:
    """
    Transforms a wide-format CSV (with each year as a column) into a long format
    by unpivoting year columns into 'Year' and a specified value column.

    Column layout assumed by the positional slicing below: the first column is
    the identifier that is kept (the country name in the inputs this notebook
    processes), the second to fourth columns are discarded, and every column
    from the fifth onward is unpivoted as a year column.

    Rows are dropped when either the indicator value is missing/non-numeric or
    the column header could not be parsed as a year (e.g. a trailing notes or
    unnamed column), so the result only contains valid (Year, value) pairs.

    Args:
        input_filepath (str): Path to the input CSV file.
        output_filepath (str): Path to save the transformed long-format CSV file.
                               Missing parent directories are created.
        value_column_name (str): The desired name for the new column containing the indicator values.
                                 (Default is "Value")

    Returns:
        pd.DataFrame: The transformed long-format DataFrame with "Year" as a column.
                      An empty DataFrame is returned (and nothing is written) if the
                      input file cannot be read.
    """
    try:
        # 1. Load the data
        df = pd.read_csv(input_filepath)
        print(f"Original dataframe has {len(df)} rows and {len(df.columns)} columns.")
    except FileNotFoundError:
        print(f"Error: File not found at {input_filepath}")
        return pd.DataFrame()
    except Exception as e:
        # Best-effort behaviour kept on purpose: report the problem and hand back an empty frame.
        print(f"An error occurred during file reading: {e}")
        return pd.DataFrame()

    # Identifier column to keep (id_vars): only the first column.
    id_vars = df.columns[:1].tolist()

    # Columns to unpivot (value_vars): everything from the fifth column onward.
    # Columns 2-4 are intentionally left out of both lists and therefore dropped by melt.
    value_vars = df.columns[4:].tolist()

    # 2. Perform the melt (unpivot) operation
    df_long = pd.melt(
        df,
        id_vars=id_vars,
        value_vars=value_vars,
        var_name="Year",
        value_name=value_column_name
    )

    # 3. Clean up data types and remove missing data
    # Headers that are not numeric become <NA>; Int64 is the nullable integer dtype that allows this.
    df_long['Year'] = pd.to_numeric(df_long['Year'], errors='coerce').astype('Int64')

    # Convert the value column to numeric, coercing empty/non-numeric cells to NaN
    df_long[value_column_name] = pd.to_numeric(df_long[value_column_name], errors='coerce')

    # Remove rows with no indicator value, and rows that came from a non-year column
    # (Year is <NA>), which would otherwise leak into the output with a meaningless year.
    rows_before = len(df_long)
    df_long = df_long.dropna(subset=['Year', value_column_name]).reset_index(drop=True)
    rows_after = len(df_long)

    print(f"Successfully processed {input_filepath}: {rows_before - rows_after} empty rows dropped.")

    # 4. Save the transformed DataFrame
    # to_csv does not create directories, so make sure the destination folder exists first.
    Path(output_filepath).parent.mkdir(parents=True, exist_ok=True)
    df_long.to_csv(output_filepath, index=False)
    print(f"Transformed data saved to {output_filepath}")

    return df_long

In [None]:
# Reshape the raw GDP file to long format and write it to the processed folder.
transform_wide_to_long(
    input_filepath="../../data/raw/gdp/data.csv",
    output_filepath="../../data/processed/gdp/data.csv",
    value_column_name="GDP per capita",
)

Original dataframe has 266 rows and 70 columns.
Successfully processed ../../data/raw/gdp/data.csv: 9095 empty rows dropped.
Transformed data saved to ../../data/processed/gdp/data.csv


Unnamed: 0,Country Name,Year,"GDP per capita, PPP (constant 2021 international $)"
0,Aruba,1990,34579.220640
1,Africa Eastern and Southern,1990,3385.675600
2,Africa Western and Central,1990,3324.458127
3,Angola,1990,7391.752354
4,Albania,1990,5560.857871
...,...,...,...
8456,Samoa,2024,6894.889855
8457,Kosovo,2024,16380.868590
8458,South Africa,2024,13598.788866
8459,Zambia,2024,3716.037353


In [None]:
# Reshape the raw health expenditure file to long format and write it to the processed folder.
transform_wide_to_long(
    input_filepath="../../data/raw/health_expenditure/data.csv",
    output_filepath="../../data/processed/health_expenditure/data.csv",
    value_column_name="Current health expenditure percentage",
)

Original dataframe has 266 rows and 70 columns.
Successfully processed ../../data/raw/health_expenditure/data.csv: 12093 empty rows dropped.
Transformed data saved to ../../data/processed/health_expenditure/data.csv


Unnamed: 0,Country Name,Year,Current health expenditure (% of GDP)
0,Africa Eastern and Southern,2000,5.654027
1,Africa Western and Central,2000,3.462852
2,Angola,2000,1.908599
3,Albania,2000,5.944198
4,Andorra,2000,5.952764
...,...,...,...
5458,Philippines,2023,5.106296
5459,Poland,2023,7.004516
5460,Portugal,2023,10.003311
5461,Slovenia,2023,9.413308


In [None]:
# Reshape the raw maternal education file to long format and write it to the processed folder.
transform_wide_to_long(
    input_filepath="../../data/raw/maternal_education/data.csv",
    output_filepath="../../data/processed/maternal_education/data.csv",
    value_column_name="Secondary education, pupils female percentage",
)

Original dataframe has 266 rows and 70 columns.
Successfully processed ../../data/raw/maternal_education/data.csv: 9441 empty rows dropped.
Transformed data saved to ../../data/processed/maternal_education/data.csv


Unnamed: 0,Country Name,Year,"Secondary education, pupils (% female)"
0,Afghanistan,1970,13.12944
1,Arab World,1970,34.00445
2,Argentina,1970,52.49140
3,Brazil,1970,50.46579
4,Brunei Darussalam,1970,44.75123
...,...,...,...
8110,Djibouti,2019,45.77627
8111,Ghana,2019,48.88477
8112,Kazakhstan,2019,48.95600
8113,Monaco,2019,49.76261


In [12]:
# Reshape the raw population density file to long format and write it to the processed folder.
transform_wide_to_long(
    input_filepath="../../data/raw/population_density/data.csv",
    output_filepath="../../data/processed/population_density/data.csv",
    value_column_name="Population density (people per sq. km of land area)",
)

Original dataframe has 266 rows and 70 columns.
Successfully processed ../../data/raw/population_density/data.csv: 2260 empty rows dropped.
Transformed data saved to ../../data/processed/population_density/data.csv


Unnamed: 0,Country Name,Year,Population density (people per sq. km of land area)
0,Aruba,1961,308.766667
1,Africa Eastern and Southern,1961,12.036017
2,Afghanistan,1961,14.127046
3,Africa Western and Central,1961,11.021477
4,Angola,1961,4.252493
...,...,...,...
15291,Samoa,2023,77.936331
15292,"Yemen, Rep.",2023,74.608025
15293,South Africa,2023,52.108569
15294,Zambia,2023,27.877648


In [None]:
# Reshape the raw slum population file to long format and write it to the processed folder.
transform_wide_to_long(
    input_filepath="../../data/raw/slum_population/data.csv",
    output_filepath="../../data/processed/slum_population/data.csv",
    value_column_name="Population living in slums percentage",
)

Original dataframe has 266 rows and 70 columns.
Successfully processed ../../data/raw/slum_population/data.csv: 15121 empty rows dropped.
Transformed data saved to ../../data/processed/slum_population/data.csv


Unnamed: 0,Country Name,Year,Population living in slums (% of urban population)
0,Aruba,2000,0.000000
1,Africa Eastern and Southern,2000,53.224958
2,Africa Western and Central,2000,71.264885
3,Angola,2000,19.700000
4,Albania,2000,28.100000
...,...,...,...
2430,Samoa,2022,34.585330
2431,"Yemen, Rep.",2022,44.200000
2432,South Africa,2022,24.200000
2433,Zambia,2022,48.258130


In [None]:
# Reshape the raw water access file to long format and write it to the processed folder.
transform_wide_to_long(
    input_filepath="../../data/raw/water_access/data.csv",
    output_filepath="../../data/processed/water_access/data.csv",
    value_column_name="People using safely managed drinking water services percentage",
)

Original dataframe has 266 rows and 70 columns.
Successfully processed ../../data/raw/water_access/data.csv: 13792 empty rows dropped.
Transformed data saved to ../../data/processed/water_access/data.csv


Unnamed: 0,Country Name,Year,People using safely managed drinking water services (% of population)
0,Afghanistan,2000,11.093326
1,Africa Western and Central,2000,13.499546
2,Albania,2000,49.138321
3,Andorra,2000,90.640000
4,Armenia,2000,82.350719
...,...,...,...
3759,Uzbekistan,2022,79.845298
3760,Viet Nam,2022,57.781411
3761,World,2022,72.927152
3762,Samoa,2022,62.191714
