In [60]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib
# Switch to the PGF backend so that savefig() can write .pgf files.
matplotlib.use("pgf")

# rc settings applied together with the PGF backend: pdflatex as the TeX
# system, serif text, LaTeX text rendering, and no font setup taken from
# matplotlib's own rc font settings.
_pgf_rc_settings = {
    "pgf.texsystem": "pdflatex",
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False,
}
matplotlib.rcParams.update(_pgf_rc_settings)
# Countries included in the analysis, kept in sorted order. Every later
# cell iterates over this list, so its order also fixes plotting order.
countries = sorted((
    'Australia', 'Austria', 'Belgium', 'Canada', 'Chile', 'Colombia',
    'Costa Rica', 'Czech Republic', 'Denmark', 'Estonia', 'Finland',
    'France', 'Germany', 'Greece', 'Hungary', 'Iceland', 'Ireland',
    'Israel', 'Italy', 'Japan', 'Korea', 'Latvia', 'Lithuania',
    'Luxembourg', 'Mexico', 'Netherlands', 'New Zealand', 'Norway',
    'Poland', 'Portugal', 'Slovakia', 'Slovenia', 'Spain', 'Sweden',
    'Switzerland', 'Turkey', 'USA', 'United Kingdom',
))

# Keys identifying each percentile group. The first three are matched
# against the 'P' column of the input table; `midp` is the key under which
# the derived middle group is stored.
top1p, top10p, bot50p = 'p99p100', 'p90p100', 'p0p50'
midp = 'mid'

# Percentile groups that are read directly from the input table.
percentiles = [top1p, top10p, bot50p]
# https://timodenk.com/blog/exporting-matplotlib-plots-to-latex/

In [61]:
# PERCENTILE ANALYSIS FIRST
# Load the cleaned long-format table. The code below reads its COUNTRY, P,
# YEAR and VALUE columns.
df = pd.read_csv('oecd-income-p-cleaned-subset.csv')

# c_df[country][percentile] holds the rows for that country and percentile
# group, indexed by YEAR. The country filter is computed once per country
# and then narrowed per percentile.
c_df = {}
for country in countries:
    country_rows = df[df['COUNTRY'] == country]
    c_df[country] = {
        p: country_rows[country_rows['P'] == p].set_index('YEAR')
        for p in percentiles
    }

# Derived middle group: what remains of the total (1.0) after removing the
# bottom 50% and top 10% shares. Unlike the other entries this is stored as
# a Series rather than a DataFrame with a 'VALUE' column, so later cells
# must special-case `midp`.
for shares in c_df.values():
    shares[midp] = 1.0 - shares[bot50p]['VALUE'] - shares[top10p]['VALUE']

In [80]:
# Figure: share of total over time, one panel per percentile group.
# Australia is drawn solid; every other country is drawn faint and dotted.
fig, ((ax1, ax3), (ax2, ax4)) = plt.subplots(2,2,figsize=(14,4*2), sharex=True)

# One fixed colour per country, reused by later cells through `c_cols`.
# The colours come from the public `axes.prop_cycle` rcParam rather than the
# private `ax._get_lines.prop_cycler`, which no longer exists in
# Matplotlib >= 3.8. The palette wraps around when there are more countries
# than colours, matching what the cycler did.
palette = plt.rcParams['axes.prop_cycle'].by_key()['color']
c_cols = {c: palette[i % len(palette)] for i, c in enumerate(countries)}

# (axes, title, accessor returning the series to plot for a country).
# The middle group is stored as a bare Series; the others need ['VALUE'].
panels = [
    (ax1, "Bottom 50%", lambda c: c_df[c][bot50p]['VALUE']),
    (ax2, "Top 50% to top 10%", lambda c: c_df[c][midp]),
    (ax3, "top 10%", lambda c: c_df[c][top10p]['VALUE']),
    (ax4, "top 1%", lambda c: c_df[c][top1p]['VALUE']),
]
for ax, title, series_for in panels:
    for c in countries:
        if c == "Australia":
            ax.plot(series_for(c), label=c, color=c_cols[c])
        else:
            ax.plot(series_for(c), alpha=0.3, linestyle='dotted', label=c, color=c_cols[c])
    ax.set_title(title)
    # Values are fractions of 1.0, so render them as whole percentages.
    ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0, decimals=0))
    ax.grid()

# Only the left column gets y labels and only the bottom row gets x labels.
ax1.set_ylabel('Share of total (%)')
ax2.set_ylabel('Share of total (%)')
ax2.set_xlabel("Year")
ax4.set_xlabel("Year")

# Single legend, attached to the bottom-right panel and placed outside it.
ax4.legend(loc=(1.05,-0.3))
fig.savefig('histogram.pgf')

In [63]:
# c_df_diff[country][percentile] is the fractional change between consecutive
# rows of each share series, with the undefined first entry dropped.
# The middle group is already a Series; every other entry is a DataFrame
# whose 'VALUE' column carries the share.
c_df_diff = {
    country: {
        pct_key: (shares if pct_key == midp else shares['VALUE']).pct_change().dropna()
        for pct_key, shares in by_percentile.items()
    }
    for country, by_percentile in c_df.items()
}

In [64]:
# Magnitude of Australia's period-to-period change in the middle group.
c_df_diff['Australia'][midp].abs()

YEAR
1996    0.001665
1997    0.007201
1998    0.004092
1999    0.012885
2000    0.008324
2001    0.014689
2002    0.009024
2003    0.004933
2004    0.005338
2005    0.002875
2006    0.016340
2007    0.011139
2008    0.019907
2009    0.025772
2010    0.000778
2011    0.007386
2012    0.011962
2013    0.007811
2014    0.002559
2015    0.000393
2016    0.006476
2017    0.019692
2018    0.011734
2019    0.001376
2020    0.000000
2021    0.000000
Name: VALUE, dtype: float64

## Updated pctdiff: year-on-year percentage change in share

In [65]:
# Figure: period-to-period percentage change of each share, one panel per
# percentile group. Australia is drawn solid; every other country is drawn
# faint and dotted.
fig, ((ax1, ax3), (ax2, ax4)) = plt.subplots(2,2,figsize=(14,4*2), sharex=True)

# (axes, key into c_df_diff[country], title)
panels = [
    (ax1, bot50p, "Bottom 50%"),
    (ax2, midp, "Top 50% to top 10%"),
    (ax3, top10p, "Top 10%"),
    (ax4, top1p, "top 1%"),
]
for ax, p, title in panels:
    for c in countries:
        # The per-country colour is passed explicitly on every panel so that a
        # country looks the same everywhere, rather than relying on each axes'
        # implicit colour cycle happening to line up with `c_cols`.
        if c == "Australia":
            ax.plot(c_df_diff[c][p], label=c, color=c_cols[c])
        else:
            ax.plot(c_df_diff[c][p], alpha=0.3, linestyle='dotted', label=c, color=c_cols[c])
    ax.set_title(title)
    # Changes are fractions (0.01 == 1%), so render them as whole percentages.
    ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0, decimals=0))
    ax.grid()

ax1.set_ylabel('Percent change of total percent share of total (%)')
ax2.set_xlabel("Year")
ax4.set_xlabel("Year")

# Single legend, attached to the bottom-right panel and placed outside it.
ax4.legend(loc=(1.05,-0.3))
# Notes kept from an earlier exploratory volatility check (presumably
# countries whose absolute change exceeded 100% -- TODO confirm which series
# and threshold produced this list):
# over 100% change
# too volatile: Costa Rica
# too volatile: Greece
# too volatile: Lithuania
# too volatile: Chile
# too volatile: Colombia
# too volatile: Ireland
# too volatile: Mexico

<matplotlib.legend.Legend at 0x16a47d430>

## Ranking of 2021 percentiles

In [66]:
def _rank_2021(series_for):
    """Rank all countries by their 2021 value, smallest first.

    `series_for(country)` must return a YEAR-indexed series. The result is a
    list of (country, rank, value, colour) tuples, with rank starting at 1 and
    colour looked up in `c_cols`.
    """
    scored = [(c, series_for(c).loc[2021]) for c in countries]
    scored.sort(key=lambda pair: pair[1])
    return [(c, pos + 1, val, c_cols[c]) for pos, (c, val) in enumerate(scored)]


bot50_2021_rank = _rank_2021(lambda c: c_df[c][bot50p]['VALUE'])

top10_2021_rank = _rank_2021(lambda c: c_df[c][top10p]['VALUE'])

# The middle group is stored as a bare Series, so there is no 'VALUE' lookup.
midp_2021_rank = _rank_2021(lambda c: c_df[c][midp])

top1_2021_rank = _rank_2021(lambda c: c_df[c][top1p]['VALUE'])


## 2021 percentile share plot

In [81]:
# Figure: 2021 share per country as bars, one panel per percentile group,
# in the ascending order stored in each *_2021_rank list.
fig, ((ax1, ax3), (ax2, ax4)) = plt.subplots(2,2,figsize=(14,4*2))

# (axes, ranking to draw, title)
ranked_panels = (
    (ax1, bot50_2021_rank, "Bottom 50%"),
    (ax2, midp_2021_rank, "Top 50% to top 10%"),
    (ax3, top10_2021_rank, "Top 10%"),
    (ax4, top1_2021_rank, "top 1%"),
)
for panel, ranking, heading in ranked_panels:
    for name, _position, share, colour in ranking:
        # Australia is drawn opaque; all other countries are faded.
        if name == 'Australia':
            panel.bar(name, share, color=colour, label=name)
        else:
            panel.bar(name, share, color=colour, alpha=0.3, label=name)
    # Country names are long, so tilt them to avoid overlap.
    for tick_label in panel.get_xticklabels():
        tick_label.set_rotation(80)
    panel.grid(axis='y')
    panel.yaxis.set_major_formatter(mtick.PercentFormatter(1.0, decimals=0))
    panel.set_title(heading)

# Left column carries the y labels, bottom row carries the x labels.
for labelled in (ax1, ax2):
    labelled.set_ylabel('Share of total (%)')
for labelled in (ax4, ax2):
    labelled.set_xlabel("Countries")

plt.legend(loc=(1.05,-0.3))
plt.subplots_adjust(
    left=0.1, bottom=0.1, right=0.9, top=1,
    wspace=0.1, hspace=0.5,
)
fig.savefig('income-fig0.pgf')

In [68]:
# Inspect the ascending 2021 top-1% ranking: (country, rank, share, colour) tuples.
top1_2021_rank

[('Netherlands', 1, 0.0692, '#8c564b'),
 ('Slovakia', 2, 0.0704, '#1f77b4'),
 ('Slovenia', 3, 0.0802, '#ff7f0e'),
 ('Belgium', 4, 0.0863, '#2ca02c'),
 ('Italy', 5, 0.0871, '#bcbd22'),
 ('Iceland', 6, 0.0878, '#8c564b'),
 ('Norway', 7, 0.0888, '#7f7f7f'),
 ('Latvia', 8, 0.0913, '#ff7f0e'),
 ('Portugal', 9, 0.0959, '#17becf'),
 ('France', 10, 0.0984, '#ff7f0e'),
 ('Czech Republic', 11, 0.1004, '#7f7f7f'),
 ('Austria', 12, 0.1009, '#ff7f0e'),
 ('Luxembourg', 13, 0.1037, '#d62728'),
 ('Sweden', 14, 0.1054, '#d62728'),
 ('Greece', 15, 0.108, '#d62728'),
 ('Finland', 16, 0.1088, '#1f77b4'),
 ('Lithuania', 17, 0.1095, '#2ca02c'),
 ('Australia', 18, 0.1128, '#1f77b4'),
 ('Switzerland', 19, 0.1146, '#9467bd'),
 ('Estonia', 20, 0.1176, '#17becf'),
 ('Ireland', 21, 0.118, '#e377c2'),
 ('New Zealand', 22, 0.1187, '#e377c2'),
 ('Hungary', 23, 0.1227, '#9467bd'),
 ('Spain', 24, 0.1238, '#2ca02c'),
 ('United Kingdom', 25, 0.1265, '#7f7f7f'),
 ('Germany', 26, 0.1277, '#2ca02c'),
 ('Japan', 27, 0.1286,

## Ranking Volatility

In [69]:
def _rank_by_variance(series_for):
    """Rank all countries by the variance of a share series, smallest first.

    `series_for(country)` must return the series whose `.var()` is used.
    The result is a list of (country, rank, variance, colour) tuples, with
    rank starting at 1 and colour looked up in `c_cols`.
    """
    scored = [(c, series_for(c).var()) for c in countries]
    scored.sort(key=lambda pair: pair[1])
    return [(c, pos + 1, var, c_cols[c]) for pos, (c, var) in enumerate(scored)]


bot50_var_rank = _rank_by_variance(lambda c: c_df[c][bot50p]['VALUE'])
# The middle group is stored as a bare Series, so there is no 'VALUE' lookup.
midp_var_rank = _rank_by_variance(lambda c: c_df[c][midp])
top10_var_rank = _rank_by_variance(lambda c: c_df[c][top10p]['VALUE'])
top1_var_rank = _rank_by_variance(lambda c: c_df[c][top1p]['VALUE'])

## Ranking Volatility plot

In [70]:
# Figure: variance of each share series per country as bars, one panel per
# percentile group, in the ascending order stored in each *_var_rank list.
fig, ((ax1, ax3), (ax2, ax4)) = plt.subplots(2,2,figsize=(14,4*2))

# (axes, ranking to draw, title)
variance_panels = (
    (ax1, bot50_var_rank, "Bottom 50%"),
    (ax2, midp_var_rank, "Top 50% to top 10%"),
    (ax3, top10_var_rank, "Top 10%"),
    (ax4, top1_var_rank, "top 1%"),
)
for panel, ranking, heading in variance_panels:
    for name, _position, variance, colour in ranking:
        # Australia is drawn opaque; all other countries are faded.
        if name == 'Australia':
            panel.bar(name, variance, color=colour, label=name)
        else:
            panel.bar(name, variance, color=colour, alpha=0.3, label=name)
    # Country names are long, so tilt them to avoid overlap.
    for tick_label in panel.get_xticklabels():
        tick_label.set_rotation(80)
    panel.grid(axis='y')
    panel.set_title(heading)

# Left column carries the y labels, bottom row carries the x labels.
for labelled in (ax1, ax2):
    labelled.set_ylabel('Variance')
for labelled in (ax4, ax2):
    labelled.set_xlabel("Countries")

plt.legend(loc=(1.05,-0.3))
plt.subplots_adjust(
    left=0.1, bottom=0.1, right=0.9, top=1,
    wspace=0.1, hspace=0.5,
)

## Linear Regressions

In [71]:
import numpy as np
from sklearn.linear_model import LinearRegression

In [72]:
# Single-series check: fit a straight line to Australia's middle-group share
# against the observation index (0, 1, 2, ...) and show the fitted slope.
aus_shares = c_df['Australia']
vals = np.asarray(aus_shares[midp].values).reshape(-1, 1)
# The index length is taken from the bottom-50% series, as a stand-in for the
# number of observations.
time = np.arange(len(aus_shares[bot50p]['VALUE'])).reshape(-1, 1)
lr = LinearRegression()
lm = lr.fit(time, vals)
lm.coef_

array([[-0.0013014]])

In [73]:
# lr_slope_dict[country][percentile] is the slope of an ordinary least-squares
# line fitted to the share series against its observation index (0, 1, 2, ...).
# The slope is therefore "change in share per row" -- this equals change per
# year only if each series has one row per consecutive year (TODO confirm).
lr_slope_dict = {}
for c, country_p in c_df.items():
    lr_slope_dict[c] = {}

    for p, country_df in country_p.items():
        # The middle group is a bare Series; the others keep the share in 'VALUE'.
        series = country_df if p == midp else country_df['VALUE']
        vals = np.asarray(series.values).reshape(-1, 1)
        # Size the time axis from this series rather than from Australia's
        # bottom-50% series, so a country with a different number of
        # observations still fits instead of raising a sample-count mismatch.
        time = np.arange(len(vals)).reshape(-1, 1)
        lm = LinearRegression().fit(time, vals)
        # coef_ has shape (1, 1) here; .item() extracts the scalar without the
        # array-to-float conversion that NumPy >= 1.25 deprecates for ndim > 0.
        lr_slope_dict[c][p] = lm.coef_.item()

## Ranked by Increasing Slope

In [74]:
def _rank_by_slope(p):
    """Rank all countries by their fitted slope for percentile key `p`, smallest first.

    Returns a list of (country, rank, coefficient, colour) tuples, with rank
    starting at 1 and colour looked up in `c_cols`.
    """
    slopes = [(c, country_p[p]) for c, country_p in lr_slope_dict.items()]
    ordered = sorted(slopes, key=lambda pair: pair[1])
    return [(c, pos, coef, c_cols[c]) for pos, (c, coef) in enumerate(ordered, start=1)]


bot50_slope_rank = _rank_by_slope(bot50p)
mid_slope_rank = _rank_by_slope(midp)
top10_slope_rank = _rank_by_slope(top10p)
top1_slope_rank = _rank_by_slope(top1p)
# https://www.geeksforgeeks.org/bar-plot-in-matplotlib/ create barplot

## Slope Bar plots

In [82]:


# Figure: fitted linear-trend coefficient per country as bars, one panel per
# percentile group, in the ascending order stored in each *_slope_rank list.
fig, ((ax1, ax3), (ax2, ax4)) = plt.subplots(2,2,figsize=(14,4*2))

# (axes, ranking to draw, title)
slope_panels = (
    (ax1, bot50_slope_rank, "Bottom 50%"),
    (ax2, mid_slope_rank, "Top 50% to top 10%"),
    (ax3, top10_slope_rank, "Top 10%"),
    (ax4, top1_slope_rank, "top 1%"),
)
for panel, ranking, heading in slope_panels:
    for name, _position, slope, colour in ranking:
        # Australia is drawn opaque; all other countries are faded.
        if name == 'Australia':
            panel.bar(name, slope, color=colour, label=name)
        else:
            panel.bar(name, slope, color=colour, alpha=0.3, label=name)
    # Country names are long, so tilt them to avoid overlap.
    for tick_label in panel.get_xticklabels():
        tick_label.set_rotation(80)
    panel.grid(axis='y')
    panel.set_title(heading)

# Left column carries the y labels, bottom row carries the x labels.
for labelled in (ax1, ax2):
    labelled.set_ylabel('Linear Trend Coefficient')
for labelled in (ax4, ax2):
    labelled.set_xlabel("Countries")

plt.legend(loc=(1.05,-0.3))
plt.subplots_adjust(
    left=0.1, bottom=0.1, right=0.9, top=1,
    wspace=0.1, hspace=0.5,
)
fig.savefig('income-OLS-fig0.pgf')

In [76]:
# Each name is a list of (country, rank, coefficient, colour) tuples.
# NOTE(review): a notebook cell renders only its final bare expression, so
# only top1_slope_rank is displayed; the three lines before it have no
# visible effect.
bot50_slope_rank
mid_slope_rank
top10_slope_rank
top1_slope_rank

[('Luxembourg', 1, -0.0023867521367521376, '#d62728'),
 ('Estonia', 2, -0.0022423687423687427, '#17becf'),
 ('Turkey', 3, -0.0009725274725274728, '#8c564b'),
 ('Israel', 4, -0.0008849816849816857, '#7f7f7f'),
 ('Austria', 5, -0.0007081196581196583, '#ff7f0e'),
 ('Finland', 6, -0.0006393162393162395, '#1f77b4'),
 ('Colombia', 7, -0.0005791819291819297, '#8c564b'),
 ('Sweden', 8, -0.0004957264957264958, '#d62728'),
 ('Greece', 9, -0.00031568986568986595, '#d62728'),
 ('France', 10, -0.00028620268620268625, '#ff7f0e'),
 ('Latvia', 11, -0.00028394383394383395, '#ff7f0e'),
 ('Czech Republic', 12, -0.00027002442002442003, '#7f7f7f'),
 ('Costa Rica', 13, -0.00019102564102564163, '#e377c2'),
 ('Norway', 14, -0.00018620268620268607, '#7f7f7f'),
 ('Iceland', 15, -0.000135042735042735, '#8c564b'),
 ('Ireland', 16, -1.556776556776572e-05, '#e377c2'),
 ('Portugal', 17, 0.00013675213675213684, '#17becf'),
 ('Belgium', 18, 0.00017307692307692312, '#2ca02c'),
 ('Switzerland', 19, 0.0002221001221001222