In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

import plotly.express as px
from plotly.subplots import make_subplots

from datasets import DF, Load

In [2]:
df = Load(DF.WIID)
df = df[['ISO', 'YEAR', 'QUINTILE 1 (MEAN INCOME)', 'DECILE 1 (MEAN INCOME)', 'GDP PER CAPITA']]
df = df.rename(columns = {'ISO': 'c3', 'YEAR': 'year', 'QUINTILE 1 (MEAN INCOME)': 'qy1', 'DECILE 1 (MEAN INCOME)': 'dy1', 'GDP PER CAPITA': 'gdppc'})

df

Unnamed: 0,c3,year,qy1,dy1,gdppc
0,AFG,1975,421.975,308.64,1603.67
1,AFG,1976,434.155,317.55,1649.96
2,AFG,1977,401.545,293.70,1526.02
3,AFG,1978,420.405,307.50,1597.69
4,AFG,1979,401.940,293.99,1527.52
...,...,...,...,...,...
3583,YEM,2016,844.295,657.89,2740.12
3584,YEM,2017,782.280,609.57,2538.84
3585,YEM,2018,769.800,599.84,2498.35
3586,YEM,2019,773.195,602.49,2509.36


In [3]:
df['ln(inc)'] = np.log(df['gdppc'])
df['ln(poor_q1)'] = np.log(df['qy1'])
df['ln(poor_d1)'] = np.log(df['dy1'])

df['del_ln(inc)'] = df.groupby('c3')['ln(inc)'].diff()
df['del_ln(poor_q1)'] = df.groupby('c3')['ln(poor_q1)'].diff()
df['del_ln(poor_d1)'] = df.groupby('c3')['ln(poor_d1)'].diff()

In [4]:
df = df[df['year'] > 1979]
df = df.reset_index(drop = True)
df

Unnamed: 0,c3,year,qy1,dy1,gdppc,ln(inc),ln(poor_q1),ln(poor_d1),del_ln(inc),del_ln(poor_q1),del_ln(poor_d1)
0,AFG,1980,400.365,292.84,1521.54,7.327478,5.992377,5.679626,-0.003923,-0.003926,-0.003919
1,AFG,1981,449.480,328.76,1708.19,7.443190,6.108091,5.795328,0.115711,0.115715,0.115702
2,AFG,1982,498.985,364.97,1896.33,7.547676,6.212576,5.899815,0.104486,0.104485,0.104487
3,AFG,1983,529.240,387.10,2011.30,7.606537,6.271442,5.958683,0.058861,0.058866,0.058868
4,AFG,1984,525.310,384.23,1996.37,7.599086,6.263989,5.951241,-0.007451,-0.007453,-0.007442
...,...,...,...,...,...,...,...,...,...,...,...
3193,YEM,2016,844.295,657.89,2740.12,7.915757,6.738502,6.489038,-0.126521,-0.126518,-0.126519
3194,YEM,2017,782.280,609.57,2538.84,7.839463,6.662213,6.412754,-0.076294,-0.076289,-0.076284
3195,YEM,2018,769.800,599.84,2498.35,7.823386,6.646131,6.396663,-0.016077,-0.016082,-0.016091
3196,YEM,2019,773.195,602.49,2509.36,7.827783,6.650531,6.401071,0.004397,0.004401,0.004408


_"For each country we begin with the first available observation, and then move forward in time until we encounter the next observation subject to the constraint that at least five years separate observations, until we have exhausted the available data for that country"_ (p. 13, Dollar & Kraay, 2002)

In [5]:
def dk_filter(gp) -> pd.DataFrame:
    selected_rows = []
    last_year = None
    
    for _, row in gp.iterrows():
        if last_year is None or row['year'] >= last_year + 5:
            selected_rows.append(row)
            last_year = row['year']
    
    return pd.DataFrame(selected_rows)

In [6]:
fil = df.groupby('c3')[df.columns].apply(dk_filter).reset_index(drop=True)
fil = fil.copy()

In [7]:
print(f'{fil.shape[0]} episodes, {len(fil['c3'].unique())} countries.')

702 episodes, 78 countries.


In [8]:
lr_x = fil['ln(inc)']
lr_y_q1 = fil['ln(poor_q1)']
lr_y_d1 = fil['ln(poor_d1)']

lr_x = sm.add_constant(lr_x)

In [9]:
lr_model = sm.OLS(lr_y_q1, lr_x).fit()
print(lr_model.summary())

                            OLS Regression Results                            
Dep. Variable:            ln(poor_q1)   R-squared:                       0.915
Model:                            OLS   Adj. R-squared:                  0.915
Method:                 Least Squares   F-statistic:                     7580.
Date:                Thu, 16 Jan 2025   Prob (F-statistic):               0.00
Time:                        20:56:00   Log-Likelihood:                -477.05
No. Observations:                 702   AIC:                             958.1
Df Residuals:                     700   BIC:                             967.2
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -3.1018      0.119    -26.028      0.0

**ln(poor_q1) = 1.18 ln(inc) -3.10. R² = 0.92**

In [10]:
lr_model = sm.OLS(lr_y_d1, lr_x).fit()
print(lr_model.summary())

                            OLS Regression Results                            
Dep. Variable:            ln(poor_d1)   R-squared:                       0.869
Model:                            OLS   Adj. R-squared:                  0.869
Method:                 Least Squares   F-statistic:                     4631.
Date:                Thu, 16 Jan 2025   Prob (F-statistic):          8.06e-311
Time:                        20:56:53   Log-Likelihood:                -658.31
No. Observations:                 702   AIC:                             1321.
Df Residuals:                     700   BIC:                             1330.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -3.5739      0.154    -23.165      0.0

**ln(poor_d1) = 1.20 ln(inc) -3.57. R² = 0.87**

### Dollar-Kraay Medium-run

_"In our econometric estimation (discussed in the following subsection) we restrict the sample further to the set of 285 observations covering 92 countries for which at least two spaced observations on mean income of the poor are available, so that we can consider within-country growth in mean incomes of the poor over periods of at least five years."_ (p. 13, Dollar & Kraay, 2002)

In [11]:
fil

Unnamed: 0,c3,year,qy1,dy1,gdppc,ln(inc),ln(poor_q1),ln(poor_d1),del_ln(inc),del_ln(poor_q1),del_ln(poor_d1)
0,AFG,1980,400.365,292.84,1521.54,7.327478,5.992377,5.679626,-0.003923,-0.003926,-0.003919
1,AFG,1985,512.340,374.74,1947.10,7.574096,6.238988,5.926232,-0.024989,-0.025000,-0.025009
2,AFG,1990,378.365,276.75,1437.93,7.270960,5.935859,5.623115,-0.036698,-0.036690,-0.036681
3,AFG,1995,248.680,181.89,945.09,6.851280,5.516167,5.203402,0.390265,0.390241,0.390243
4,AFG,2000,197.380,144.37,750.13,6.620247,5.285131,4.972379,-0.031907,-0.031910,-0.031902
...,...,...,...,...,...,...,...,...,...,...,...
697,YEM,2000,1429.640,1105.22,4605.61,8.435030,7.265178,7.007800,0.031813,0.043382,0.050245
698,YEM,2005,1610.010,1284.45,4904.64,8.497937,7.383996,7.158086,0.025787,0.036725,0.042666
699,YEM,2010,1683.700,1332.22,5261.93,8.568253,7.428749,7.194602,0.046534,0.037315,0.033694
700,YEM,2015,958.165,746.62,3109.69,8.042278,6.865020,6.615556,-0.354493,-0.354494,-0.354497


In [12]:
print(f'{fil.shape[0]} episodes, {len(fil['c3'].unique())} countries.')

702 episodes, 78 countries.


In [13]:
mr_x = fil['del_ln(inc)']
mr_y_q1 = fil['del_ln(poor_q1)']
mr_y_d1 = fil['del_ln(poor_d1)']

mr_x = sm.add_constant(mr_x)

In [14]:
mr_model = sm.OLS(mr_y_q1, fil['del_ln(inc)']).fit()
print(mr_model.summary())

                                 OLS Regression Results                                
Dep. Variable:        del_ln(poor_q1)   R-squared (uncentered):                   0.525
Model:                            OLS   Adj. R-squared (uncentered):              0.524
Method:                 Least Squares   F-statistic:                              773.7
Date:                Thu, 16 Jan 2025   Prob (F-statistic):                   2.58e-115
Time:                        20:57:25   Log-Likelihood:                          917.03
No. Observations:                 702   AIC:                                     -1832.
Df Residuals:                     701   BIC:                                     -1827.
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------

**∆ln(poor_q1) = 1.02 ∆ln(inc). R² = 0.53**

In [15]:
mr_model = sm.OLS(mr_y_d1, fil['del_ln(inc)']).fit()
print(mr_model.summary())

                                 OLS Regression Results                                
Dep. Variable:        del_ln(poor_d1)   R-squared (uncentered):                   0.330
Model:                            OLS   Adj. R-squared (uncentered):              0.329
Method:                 Least Squares   F-statistic:                              345.9
Date:                Thu, 16 Jan 2025   Prob (F-statistic):                    4.65e-63
Time:                        20:57:50   Log-Likelihood:                          619.86
No. Observations:                 702   AIC:                                     -1238.
Df Residuals:                     701   BIC:                                     -1233.
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------

**∆ln(poor_d1) = 1.04  ∆ln(inc). R² = 0.33**

### Graphs

In [17]:
col_curve = '#286bbb'
col_acc = '#883039'

fc = '#595959'
fc_scnd = '#8c8c8c'
fc_title = '#262626'

In [18]:
sctr = px.scatter(
    fil, x='ln(inc)', y='ln(poor_q1)', trendline='ols', title='Relating the Income of the Poor to Average Incomes',
    labels={'ln(inc)': 'log (per capita income)', 'ln(poor_q1)': 'log (per capita income in poorest quintile)'},
)

del_sctr = px.scatter(
    fil, x='del_ln(inc)', y='del_ln(poor_q1)', trendline='ols', 
    labels={'del_ln(inc)': '∆log (per capita income)', 'del_ln(poor_q1)': '∆log (per capita income in poorest quintile)'},
)

lr_fig = make_subplots(rows=1, cols=2)

for trace in sctr.data:
    trace.update(marker=dict(color=col_curve, size=8))
    if trace.mode == 'lines':
        trace.update(line=dict(color=col_acc, width=2))
    lr_fig.add_trace(trace, row=1, col=1)

for trace in del_sctr.data:
    trace.update(marker=dict(color=col_curve, size=8))
    if trace.mode == 'lines':
        trace.update(line=dict(color=col_acc, width=2))
    lr_fig.add_trace(trace, row=1, col=2)

lr_fig.update_xaxes(title_text='log (per capita income)', row=1, col=1)
lr_fig.update_yaxes(title_text='log (per capita income in poorest quintile)', row=1, col=1)

lr_fig.update_xaxes(title_text='∆log (per capita income)', row=1, col=2)
lr_fig.update_yaxes(title_text='∆log (per capita income in poorest quintile)', row=1, col=2)

lr_fig.update_xaxes({'zerolinecolor': '#45464b', 'linewidth': 2}, row=1, col=2)
lr_fig.update_yaxes({'zerolinecolor': '#45464b', 'linewidth': 2}, row=1, col=2)

lr_fig.update_layout(
    title=dict(
        text='Relating the Income of the Poor to Average Incomes',
        font=dict(size=20),
        xanchor='center',
        yanchor='middle'
    ),
    template='seaborn',
    width=1200,
    height=600,
    font_family="Helvetica Now Text",
    font_color=fc,
    title_font_color=fc_title,
    plot_bgcolor='rgba(234, 234, 243, 0.75)',
)

lr_fig.add_annotation(
    text='Source: UNU-WIDER, World Income Inequality Database (WIID) Companion dataset. Version 15 May 2023 (1.0.1).',
    xref='paper', yref='paper', x=0, y=-0.165,
    xanchor='left', yanchor='bottom', 
    font=dict(size=10, color=fc_scnd), 
    align='left', showarrow=False
)

lr_fig.show()

In [19]:
lr_fig.write_image('../images/dollar-kraay-q1.png', scale=3, engine='kaleido')

In [20]:
sctr = px.scatter(
    fil, x='ln(inc)', y='ln(poor_d1)', trendline='ols', title='Relating the Income of the Poor to Average Incomes',
    labels={'ln(inc)': 'log (per capita income)', 'ln(poor_d1)': 'log (per capita income in poorest decile)'},
)

del_sctr = px.scatter(
    fil, x='del_ln(inc)', y='del_ln(poor_d1)', trendline='ols', 
    labels={'del_ln(inc)': '∆log (per capita income)', 'del_ln(poor_d1)': '∆log (per capita income in poorest decile)'},
)

mr_fig = make_subplots(rows=1, cols=2)

for trace in sctr.data:
    trace.update(marker=dict(color=col_curve, size=8))
    if trace.mode == 'lines':
        trace.update(line=dict(color=col_acc, width=2))
    mr_fig.add_trace(trace, row=1, col=1)

for trace in del_sctr.data:
    trace.update(marker=dict(color=col_curve, size=8))
    if trace.mode == 'lines':
        trace.update(line=dict(color=col_acc, width=2))
    mr_fig.add_trace(trace, row=1, col=2)

mr_fig.update_xaxes(title_text='log (per capita income)', row=1, col=1)
mr_fig.update_yaxes(title_text='log (per capita income in poorest decile)', row=1, col=1)

mr_fig.update_xaxes(title_text='∆log (per capita income)', row=1, col=2)
mr_fig.update_yaxes(title_text='∆log (per capita income in poorest decile)', row=1, col=2)

mr_fig.update_xaxes({'zerolinecolor': '#45464b', 'linewidth': 2}, row=1, col=2)
mr_fig.update_yaxes({'zerolinecolor': '#45464b', 'linewidth': 2}, row=1, col=2)

mr_fig.update_layout(
    title=dict(
        text='Relating the Income of the Poor to Average Incomes',
        font=dict(size=20),
        xanchor='center',
        yanchor='middle'
    ),
    template='seaborn',
    width=1200,
    height=600,
    font_family="Helvetica Now Text",
    font_color=fc,
    title_font_color=fc_title,
    plot_bgcolor='rgba(234, 234, 243, 0.75)',
)

mr_fig.add_annotation(
    text='Source: UNU-WIDER, World Income Inequality Database (WIID) Companion dataset. Version 15 May 2023 (1.0.1).',
    xref='paper', yref='paper', x=0, y=-0.165,
    xanchor='left', yanchor='bottom', 
    font=dict(size=10, color=fc_scnd), 
    align='left', showarrow=False
)

mr_fig.show()

In [21]:
mr_fig.write_image('../images/dollar-kraay-d1.png', scale=3, engine='kaleido')

### Control Variables

In [18]:
ctr_df = pd.read_csv('ctr-2023.csv')
ctr_codes = ctr_df['Country Code (alpha-3)'].to_numpy()

un = pd.read_csv('un-wiid-trunc.csv')
un = un[['ISO', 'YEAR', 'GINI INDEX']]
un = un.rename(columns = {'ISO': 'c3', 'YEAR': 'year', 'GINI INDEX': 'gini'})

weo = pd.read_excel('imf-weo.xlsx', sheet_name='Data')
weo = weo[['Economy ISO3', 'Indicator ID', 'Attribute 1'] + [str(i) for i in range(1980, 2020 + 1)]]
weo = weo.rename(columns = {'Economy ISO3': 'c3', 'Indicator ID': 'id', 'Attribute 1': 'a'})

wdi = pd.read_csv('wb-wdi-trunc.csv')
wdi = wdi[['Country Code', 'Indicator Code'] + [str(i) for i in range(1980, 2023 + 1)]]
wdi = wdi.rename(columns = {'Country Code': 'c3', 'Indicator Code': 'id'})

hdr = pd.read_csv('hdr.csv', encoding='latin-1')
hdr = hdr[['iso3'] + [f'le_{i}' for i in range(1990, 2022 + 1)] + [f'mys_{i}' for i in range(1990, 2022 + 1)]]
hdr = hdr.rename(columns = {'iso3': 'c3'})

In [19]:
# Gini
df = df.merge(un, on=['c3', 'year'])

# GDPpc at 1980
df['gdppc_init'] = df.groupby('c3')['gdppc'].transform('first')

# Life Expectancy and Mean Years of Schooling
hdr = hdr[hdr['c3'].isin(ctr_codes)]
hdr_le = hdr.filter(regex='^c3|le_').melt(id_vars='c3', var_name='year', value_name='life_expectancy')
hdr_mys = hdr.filter(regex='^c3|mys_').melt(id_vars='c3', var_name='year', value_name='mean_years_schooling')
hdr_le['year'] = hdr_le['year'].str.replace('le_', '', regex=False).astype(int)
hdr_mys['year'] = hdr_mys['year'].str.replace('mys_', '', regex=False).astype(int)
hdr_f = pd.merge(hdr_le, hdr_mys, on=['c3', 'year'], how='outer')

df = pd.merge(df, hdr_f, on=['c3', 'year'], how='outer')

# Trade as % of GDP, Government Expenditure on Education as % of GDP, Urban Population as % of total population
wdi = wdi[wdi['id'].isin(['SE.XPD.TOTL.GD.ZS', 'NE.TRD.GNFS.ZS', 'SP.URB.TOTL.IN.ZS'])]
wdi_f = wdi.melt(id_vars=['c3', 'id'], var_name='year', value_name='value')
wdi_f['year'] = wdi_f['year'].astype(int)
wdi_f = wdi_f.pivot_table(index=['c3', 'year'], columns='id', values='value').reset_index()
wdi_f = wdi_f.rename(columns = {'NE.TRD.GNFS.ZS': 'trade_pc', 'SE.XPD.TOTL.GD.ZS': 'govt_exp_educ', 'SP.URB.TOTL.IN.ZS': 'urban_pop_pct'})

df = pd.merge(df, wdi_f, on=['c3', 'year'], how='outer')

weo = weo[weo['a'] == 'Actual']
weo = weo[weo['c3'].isin(ctr_codes)]
weo = weo[weo['id'].isin(['IMF.WEO.NID_NGDP', 'IMF.WEO.PCPIEPCH'])]
weo = weo.drop(columns = ['a'])
weo_f = weo.melt(id_vars=['c3', 'id'], var_name='year', value_name='value')
weo_f['year'] = weo_f['year'].astype(int)
weo_f = weo_f.pivot_table(index=['c3', 'year'], columns='id', values='value').reset_index()
weo_f = weo_f.rename(columns = {'IMF.WEO.PCPIEPCH': 'inf_rate', 'IMF.WEO.NID_NGDP': 'inv_pc'})

df = pd.merge(df, weo_f, on=['c3', 'year'], how='outer')

In [20]:
df = df.infer_objects()
df = df.groupby('c3')[df.columns].apply(lambda group: group.interpolate(method='linear', limit_direction='both'))
df = df.reset_index(drop=True)
df

  df = df.groupby('c3')[df.columns].apply(lambda group: group.interpolate(method='linear', limit_direction='both'))
  df = df.groupby('c3')[df.columns].apply(lambda group: group.interpolate(method='linear', limit_direction='both'))
  df = df.groupby('c3')[df.columns].apply(lambda group: group.interpolate(method='linear', limit_direction='both'))
  df = df.groupby('c3')[df.columns].apply(lambda group: group.interpolate(method='linear', limit_direction='both'))
  df = df.groupby('c3')[df.columns].apply(lambda group: group.interpolate(method='linear', limit_direction='both'))
  df = df.groupby('c3')[df.columns].apply(lambda group: group.interpolate(method='linear', limit_direction='both'))
  df = df.groupby('c3')[df.columns].apply(lambda group: group.interpolate(method='linear', limit_direction='both'))
  df = df.groupby('c3')[df.columns].apply(lambda group: group.interpolate(method='linear', limit_direction='both'))
  df = df.groupby('c3')[df.columns].apply(lambda group: group.interpolat

Unnamed: 0,c3,year,qy1,dy1,gdppc,ln(inc),ln(poor_q1),ln(poor_d1),del_ln(inc),del_ln(poor_q1),del_ln(poor_d1),gini,gdppc_init,life_expectancy,mean_years_schooling,trade_pc,govt_exp_educ,urban_pop_pct,inv_pc,inf_rate
0,AFG,1980,400.365,292.84,1521.54,7.327478,5.992377,5.679626,-0.003923,-0.003926,-0.003919,44.484,1521.54,45.967,0.871962,46.709895,1.840930,15.995,27.24,6.53
1,AFG,1981,449.480,328.76,1708.19,7.443190,6.108091,5.795328,0.115711,0.115715,0.115702,44.484,1521.54,45.967,0.871962,46.709895,1.909170,16.562,27.24,6.53
2,AFG,1982,498.985,364.97,1896.33,7.547676,6.212576,5.899815,0.104486,0.104485,0.104487,44.484,1521.54,45.967,0.871962,46.709895,1.729980,17.147,27.24,6.53
3,AFG,1983,529.240,387.10,2011.30,7.606537,6.271442,5.958683,0.058861,0.058866,0.058868,44.484,1521.54,45.967,0.871962,46.709895,1.853096,17.747,27.24,6.53
4,AFG,1984,525.310,384.23,1996.37,7.599086,6.263989,5.951241,-0.007451,-0.007453,-0.007442,44.484,1521.54,45.967,0.871962,46.709895,1.976212,18.365,27.24,6.53
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3383,YEM,2019,773.195,602.49,2509.36,7.827783,6.650531,6.401071,0.004397,0.004401,0.004408,39.616,3990.99,65.092,2.687384,58.902279,5.464790,37.273,6.47,1.37
3384,YEM,2020,773.195,602.49,2509.36,7.827783,6.650531,6.401071,0.000000,0.000000,0.000000,39.616,3990.99,64.650,2.776826,58.902279,5.464790,37.908,5.61,43.23
3385,YEM,2021,773.195,602.49,2509.36,7.827783,6.650531,6.401071,0.000000,0.000000,0.000000,39.616,3990.99,63.753,2.776826,58.902279,5.464790,38.546,5.61,43.23
3386,YEM,2022,773.195,602.49,2509.36,7.827783,6.650531,6.401071,0.000000,0.000000,0.000000,39.616,3990.99,63.720,2.776826,58.902279,5.464790,39.188,5.61,43.23


In [21]:
df.to_csv('datasets/dollar-kraay-controls-dataset.csv', index = False, header = True)