In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import random

## Preprocess WEO data

In [None]:
# Load the IMF World Economic Outlook (April 2023) sheet.
# WEO marks missing observations with '--'; parse them as NaN at read time.
# (The former `.replace('--', None)` is version-dependent: pandas releases before
# 1.4 ignore the explicit None and pad-fill the cell from the preceding row.)
weo_data = pd.read_excel('WEOApr2023all.xlsx', sheet_name='WEOApr2023all', na_values=['--'])

In [None]:
# Extract a balanced country-year panel for a specific list of WEO variables.
# NOTE: this cell overwrites `weo_data` (the raw sheet) with the processed panel,
# so re-running it requires re-running the loading cell first.

START_YEAR = 1990
END_YEAR = 2019
VARIABLES = ['ISO','year','GGX_NGDP','NGDPD','LP','PCPI']

# columns kept from the wide sheet: one column per year plus the two identifiers
year_and_id_cols = list(range(1980,2021)) + ['ISO', 'WEO Subject Code']

# wide (one column per year) -> long (one row per indicator/country/year); missing cells are dropped
df = weo_data[year_and_id_cols].melt(id_vars=['WEO Subject Code','ISO'],var_name='year').dropna()
# long -> one row per country/year with one column per WEO subject code
df = df.pivot(index=['ISO','year'],columns='WEO Subject Code',values='value').reset_index()

# keep only the requested indicators and only rows where all of them are present
df = df[VARIABLES].dropna()
df = df[(df['year']>=START_YEAR)&(df['year']<=END_YEAR)]

# balanced panel: keep countries that have a row for every year in the window
rows_per_country = df.groupby('ISO').count().reset_index()[['ISO','year']]
COUNTRIES = rows_per_country[rows_per_country['year']==END_YEAR-START_YEAR+1]['ISO'].tolist()

df = df[df['ISO'].isin(COUNTRIES)]

# Convert everything except the country code to float. regex=True makes the
# replacement act on substrings, so thousands separators inside strings such as
# '1,234.5' are stripped; without it only cells exactly equal to ',' are touched.
iso_codes = df['ISO']
weo_data = df.drop(columns='ISO').replace(',', '', regex=True).astype(float)
weo_data['ISO'] = iso_codes

weo_data

## Preprocess PWT data

In [None]:
# Load the 'Data' sheet of the Penn World Table workbook.
pwt_data = pd.read_excel(
    io='pwt1001.xlsx',
    sheet_name='Data',
)

In [None]:
# Build a balanced PWT panel: complete rows for the chosen variables, restricted
# to the study window, for countries that appear in every year of that window.

START_YEAR = 1990
END_YEAR = 2019
VARIABLES = ['countrycode','country','year','emp','hc']

# complete rows only, then restrict to the (inclusive) year window
df = pwt_data[VARIABLES].dropna()
df = df[df['year'].between(START_YEAR, END_YEAR)]

# number of rows each country contributes; a full panel has one row per year
n_years = END_YEAR - START_YEAR + 1
rows_per_country = df.groupby('countrycode')['country'].count()
COUNTRIES = rows_per_country[rows_per_country == n_years].index.tolist()

df = df[df['countrycode'].isin(COUNTRIES)]

# align the key names and the year dtype with the WEO panel
pwt_data = (
    df.drop(columns='country')
      .rename(columns={'countrycode':'ISO'})
      .astype({'year': float})
)
pwt_data

## Merge datasets and reformat

In [None]:
# join the two panels on country code and year (default inner join: only
# country-years present in both sources are kept)
data = pd.merge(weo_data, pwt_data, on=['ISO','year'])
data = data.rename(columns={'NGDPD':'Y','LP':'X_population','emp':'X_employment','hc':'X_human_capital_index','PCPI':'X_inflation_rate','GGX_NGDP':'T_government_expenditure'})

# create binary intervention
# 1 if government expenditure (as % of GDP) is higher than 30%, 0 otherwise
data['T_government_expenditure'] = (data['T_government_expenditure']>30).astype(int)
data = data.dropna()

# cap the inflation rate at 1000 (to avoid extreme outliers)
data['X_inflation_rate'] = data['X_inflation_rate'].clip(upper=1000)

# the scales vary widely across variables so we rescale the data to have a common scale across variables
# (identifiers and the binary treatment are left untouched; the outcome Y is scaled too)
scaler = StandardScaler()
exclude_cols = ['ISO','year','T_government_expenditure']
data_to_scale = data.drop(columns=exclude_cols)
# Reuse the original row index: concat(axis=1) aligns on index labels, so a fresh
# RangeIndex would misalign rows (and introduce NaNs) whenever dropna() removed a row.
scaled = pd.DataFrame(
    scaler.fit_transform(data_to_scale),
    columns=data_to_scale.columns,
    index=data_to_scale.index,
)
data = pd.concat([data[exclude_cols], scaled], axis=1)

data

## Create random splits for main experiment

In [None]:
# Write three random train/val/test task splits (one task per country) to
# econ_dataset1.csv ... econ_dataset3.csv.
NUMTASKS_TRAIN = 49
NUMTASKS_VAL = 10
NUMTASKS_TEST = 10   # not read below: the test split is every country left after train and val
NUMSAMPLES_SPT = 15  # rows per country flagged task_train=1
NUMSAMPLES_QRY = 15  # not read below: all remaining rows of a country get task_train=0
SEED = 0             # fixed seed so that a re-run regenerates identical CSV files

OUTPUT_COLUMNS = ['Y','X_employment','X_human_capital_index','X_population','X_inflation_rate','T_government_expenditure','meta_train','task_train','task']

COUNTRIES = data['ISO'].unique().tolist()
rng = random.Random(SEED)  # private generator; leaves the global `random` state alone

meta_train_type = {'train':0, 'val':1, 'test':2}

for dataset in range(1,4):

    # a fresh country permutation per dataset decides the split membership
    rng.shuffle(COUNTRIES)
    train_tasks = COUNTRIES[0:NUMTASKS_TRAIN]
    val_tasks = COUNTRIES[NUMTASKS_TRAIN:NUMTASKS_TRAIN+NUMTASKS_VAL]
    test_tasks = COUNTRIES[NUMTASKS_TRAIN+NUMTASKS_VAL:]

    country_list = {'train':train_tasks, 'val':val_tasks, 'test':test_tasks}

    # task id = position in the shuffled list, so train ids come first, then val, then test
    country_map = {iso: task_id for task_id, iso in enumerate(COUNTRIES)}

    # shuffle the rows so that the first NUMSAMPLES_SPT rows of each country are a random subset
    df = data.sample(frac=1, random_state=SEED + dataset)
    parts = []
    for datatype in ['train', 'val', 'test']:
        # .copy() so the column assignments below act on an independent frame
        add_data = df[df['ISO'].isin(country_list[datatype])].copy()
        add_data['meta_train'] = meta_train_type[datatype]
        add_data['task_train'] = (add_data.groupby('ISO').cumcount() < NUMSAMPLES_SPT).astype(int)
        add_data['task'] = add_data['ISO'].map(country_map)
        parts.append(add_data[OUTPUT_COLUMNS])

    # concatenate once instead of growing a frame inside the loop
    new_df = pd.concat(parts).sort_values(by=['task','task_train'])
    new_df.to_csv(f'econ_dataset{dataset}.csv', index=False)

## Create random splits for distribution shift experiment

In [None]:
# Write three task splits for the distribution-shift experiment to
# econshift_dataset1.csv ... econshift_dataset3.csv. The test tasks are always the
# NUMTASKS_TEST countries with the highest mean X_population; train and val are
# drawn at random from the remaining countries.
NUMTASKS_TRAIN = 49
NUMTASKS_VAL = 10
NUMTASKS_TEST = 10
NUMSAMPLES_SPT = 15  # rows per country flagged task_train=1
NUMSAMPLES_QRY = 15  # not read below: all remaining rows of a country get task_train=0
SEED = 0             # fixed seed so that a re-run regenerates identical CSV files

OUTPUT_COLUMNS = ['Y','X_employment','X_human_capital_index','X_population','X_inflation_rate','T_government_expenditure','meta_train','task_train','task']

# rank countries by mean population; only that column needs to be averaged
COUNTRIES_IN_ORDER_OF_POP = data.groupby('ISO')['X_population'].mean().sort_values(ascending=False).index.tolist()
HIGHEST_POP_COUNTRIES = COUNTRIES_IN_ORDER_OF_POP[0:NUMTASKS_TEST] # highest population countries (fixed across all sets)
# Order-preserving difference: a set difference would give a hash-dependent
# starting order and make the seeded shuffle below non-reproducible across runs.
_highest_pop = set(HIGHEST_POP_COUNTRIES)
OTHER_COUNTRIES = [iso for iso in COUNTRIES_IN_ORDER_OF_POP if iso not in _highest_pop]

rng = random.Random(SEED)  # private generator; leaves the global `random` state alone

meta_train_type = {'train':0, 'val':1, 'test':2}

for dataset in range(1,4):

    # only the non-test countries are permuted between datasets
    rng.shuffle(OTHER_COUNTRIES)
    train_tasks = OTHER_COUNTRIES[0:NUMTASKS_TRAIN]
    val_tasks = OTHER_COUNTRIES[NUMTASKS_TRAIN:NUMTASKS_TRAIN+NUMTASKS_VAL]
    test_tasks = HIGHEST_POP_COUNTRIES # fixed across all sets

    country_list = {'train':train_tasks, 'val':val_tasks, 'test':test_tasks}

    # task id = position in this list, so the high-population test countries get the last ids
    COUNTRIES = OTHER_COUNTRIES + HIGHEST_POP_COUNTRIES
    country_map = {iso: task_id for task_id, iso in enumerate(COUNTRIES)}

    # shuffle the rows so that the first NUMSAMPLES_SPT rows of each country are a random subset
    df = data.sample(frac=1, random_state=SEED + dataset)
    parts = []
    for datatype in ['train', 'val', 'test']:
        # .copy() so the column assignments below act on an independent frame
        add_data = df[df['ISO'].isin(country_list[datatype])].copy()
        add_data['meta_train'] = meta_train_type[datatype]
        add_data['task_train'] = (add_data.groupby('ISO').cumcount() < NUMSAMPLES_SPT).astype(int)
        add_data['task'] = add_data['ISO'].map(country_map)
        parts.append(add_data[OUTPUT_COLUMNS])

    # concatenate once instead of growing a frame inside the loop
    new_df = pd.concat(parts).sort_values(by=['task','task_train'])
    new_df.to_csv(f'econshift_dataset{dataset}.csv', index=False)