In [2]:
import pandas as pd
import os

In [37]:
def process_worldbank_data(source_file, metadata_file, output_file_prefix, source_directory, output_directory, save_metadata=False):
    # Load the CSV file
    file_path = os.path.join(source_directory, source_file)
    data = pd.read_csv(file_path, skiprows=4)

    # Load the country metadata CSV file
    metadata_file_path = os.path.join(source_directory, metadata_file)
    metadata = pd.read_csv(metadata_file_path)

    # Drop unnecessary columns and rows with all NaN values in data
    data = data.drop(columns=['Indicator Name', 'Indicator Code', 'Unnamed: 68'])
    data = data.dropna(how='all', subset=data.columns[2:])

    # Melt the DataFrame to long format
    data_long = data.melt(id_vars=['Country Name', 'Country Code'], var_name='Year', value_name='Value')

    # Drop rows with missing values in data
    data_long = data_long.dropna()

    # Convert 'Year' column to integer
    data_long['Year'] = data_long['Year'].astype(int)

    # Select relevant columns from metadata and merge with data
    metadata = metadata[['Country Code', 'Region', 'IncomeGroup']]
    merged_data = pd.merge(data_long, metadata, on='Country Code', how='left')

    # Rename the columns in the merged data
    merged_data = merged_data.rename(columns={
        'Country Name': 'country',
        'Country Code': 'countryCode',
        'Year': 'year',
        'Value': 'value',
        'Region': 'region',
        'IncomeGroup': 'incomeGroup'
    })

    # Rename the columns in the metadata
    metadata = metadata.rename(columns={
        'Country Code': 'countryCode',
        'Region': 'region',
        'IncomeGroup': 'incomeGroup'
    })

    # Save the merged DataFrame to a Parquet file
    file_path_parquet = os.path.join(output_directory, f'{output_file_prefix}.parquet')
    file_path_csv = os.path.join(output_directory, f'{output_file_prefix}.csv')

    merged_data.to_parquet(file_path_parquet, engine='pyarrow')
    merged_data.to_csv(file_path_csv, index=False)

    # Save the metadata file only if the flag is set to True
    if save_metadata:
        processed_metadata_file_path_csv = os.path.join(output_directory, 'country_metadata.csv')
        metadata.to_csv(processed_metadata_file_path_csv, index=False)
        print(f'Metadata file saved to {processed_metadata_file_path_csv}')

    print(f'Files for {output_file_prefix} saved to {output_directory}')




In [43]:
# Example usage
source_directory = '../../data/source/worldbank/'  # Update with the actual path to your directory containing CSV files
output_directory = '../../data/processed/worldbank/'  # Update with the path where you want to save Parquet files

# Process the first file and save metadata
process_worldbank_data(
    source_file='API_NY.GDP.PCAP.PP.KD_DS2_en_csv_v2_1588678/API_NY.GDP.PCAP.PP.KD_DS2_en_csv_v2_1588678.csv',
    metadata_file='API_NY.GDP.PCAP.PP.KD_DS2_en_csv_v2_1588678/Metadata_Country_API_NY.GDP.PCAP.PP.KD_DS2_en_csv_v2_1588678.csv',
    output_file_prefix='gdp_per_capita_ppp_constant_2021_international$',
    source_directory=source_directory,
    output_directory=output_directory,
    save_metadata=True
)

# Process additional files without saving metadata again
process_worldbank_data(
    source_file='API_NY.GDP.PCAP.PP.CD_DS2_en_csv_v2_1887125/API_NY.GDP.PCAP.PP.CD_DS2_en_csv_v2_1887125.csv',
    metadata_file='API_NY.GDP.PCAP.PP.KD_DS2_en_csv_v2_1588678/Metadata_Country_API_NY.GDP.PCAP.PP.KD_DS2_en_csv_v2_1588678.csv',
    output_file_prefix='gdp_per_capita_ppp_current_international$',
    source_directory=source_directory,
    output_directory=output_directory,
    save_metadata=False
)

# Process additional files without saving metadata again
process_worldbank_data(
    source_file='API_NY.GDP.PCAP.CD_DS2_en_csv_v2_1887433/API_NY.GDP.PCAP.CD_DS2_en_csv_v2_1887433.csv',
    metadata_file='API_NY.GDP.PCAP.PP.KD_DS2_en_csv_v2_1588678/Metadata_Country_API_NY.GDP.PCAP.PP.KD_DS2_en_csv_v2_1588678.csv',
    output_file_prefix='gdp_per_capita_current_us$',
    source_directory=source_directory,
    output_directory=output_directory,
    save_metadata=False
)

process_worldbank_data(
    source_file='API_PA.NUS.PPPC.RF_DS2_en_csv_v2_1665754/API_PA.NUS.PPPC.RF_DS2_en_csv_v2_1665754.csv',
    metadata_file='API_NY.GDP.PCAP.PP.KD_DS2_en_csv_v2_1588678/Metadata_Country_API_NY.GDP.PCAP.PP.KD_DS2_en_csv_v2_1588678.csv',
    output_file_prefix='ppp_conversion_factor',
    source_directory=source_directory,
    output_directory=output_directory,
    save_metadata=False
)


process_worldbank_data(
    source_file='API_NY.GDP.MKTP.CD_DS2_en_csv_v2_2002465/API_NY.GDP.MKTP.CD_DS2_en_csv_v2_2002465.csv',
    metadata_file='API_NY.GDP.PCAP.PP.KD_DS2_en_csv_v2_1588678/Metadata_Country_API_NY.GDP.PCAP.PP.KD_DS2_en_csv_v2_1588678.csv',
    output_file_prefix='gdp_current_us$',
    source_directory=source_directory,
    output_directory=output_directory,
    save_metadata=False
)


process_worldbank_data(
    source_file='API_NY.GDP.MKTP.CN_DS2_en_csv_v2_2001080/API_NY.GDP.MKTP.CN_DS2_en_csv_v2_2001080.csv',
    metadata_file='API_NY.GDP.PCAP.PP.KD_DS2_en_csv_v2_1588678/Metadata_Country_API_NY.GDP.PCAP.PP.KD_DS2_en_csv_v2_1588678.csv',
    output_file_prefix='gdp_current_lcu',
    source_directory=source_directory,
    output_directory=output_directory,
    save_metadata=False
)


process_worldbank_data(
    source_file='API_NY.GDP.MKTP.KN_DS2_en_csv_v2_2001151/API_NY.GDP.MKTP.KN_DS2_en_csv_v2_2001151.csv',
    metadata_file='API_NY.GDP.PCAP.PP.KD_DS2_en_csv_v2_1588678/Metadata_Country_API_NY.GDP.PCAP.PP.KD_DS2_en_csv_v2_1588678.csv',
    output_file_prefix='gdp_constant_lcu',
    source_directory=source_directory,
    output_directory=output_directory,
    save_metadata=False
)




Metadata file saved to ../../data/processed/worldbank/country_metadata.csv
Files for gdp_per_capita_ppp_constant_2021_international$ saved to ../../data/processed/worldbank/
Files for gdp_per_capita_ppp_current_international$ saved to ../../data/processed/worldbank/
Files for gdp_per_capita_current_us$ saved to ../../data/processed/worldbank/
Files for ppp_conversion_factor saved to ../../data/processed/worldbank/
Files for gdp_current_us$ saved to ../../data/processed/worldbank/
Files for gdp_current_lcu saved to ../../data/processed/worldbank/
Files for gdp_constant_lcu saved to ../../data/processed/worldbank/


In [40]:
merged_data


Unnamed: 0,Country Name,Country Code,Year,GDP per Capita,Region,IncomeGroup
0,Aruba,ABW,1990,33050.644278,Latin America & Caribbean,High income
1,Africa Eastern and Southern,AFE,1990,3415.748255,,
2,Africa Western and Central,AFW,1990,3328.880333,,
3,Angola,AGO,1990,7265.348208,Sub-Saharan Africa,Lower middle income
4,Albania,ALB,1990,5377.944657,Europe & Central Asia,Upper middle income
...,...,...,...,...,...,...
8171,Samoa,WSM,2023,6021.679135,East Asia & Pacific,Lower middle income
8172,Kosovo,XKX,2023,13546.951417,Europe & Central Asia,Upper middle income
8173,South Africa,ZAF,2023,14284.339917,Sub-Saharan Africa,Upper middle income
8174,Zambia,ZMB,2023,3718.700564,Sub-Saharan Africa,Lower middle income
