# 08_03: Visualizing distributions

In [1]:
import math
import collections
import dataclasses
import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as pp

In [2]:
# Read incomes.csv, then key the rows by (country, year) so that one
# country/year slice can be selected with incomes.loc[country, year].
incomes = pd.read_csv('incomes.csv')
incomes = incomes.set_index(['country', 'year'])

In [3]:
# Turn the yearly figure into a per-day one (365.25 days per year) and store
# its base-10 logarithm as a new column.
daily_income = incomes['yearly'] / 365.25
incomes['log10_daily'] = np.log10(daily_income)

In [None]:
# Box plot of the yearly incomes recorded for the USA in 1965.
usa_1965 = incomes.loc['USA', 1965]
usa_1965['yearly'].plot.box(figsize=(6,4))
pp.ylabel('$ / year');

In [None]:
# One box per (country, year) group, drawn from the 'yearly' column.
# reset_index() turns the index levels back into columns so `by` can use them.
flat_incomes = incomes.reset_index()
flat_incomes.plot.box(column='yearly', by=['country', 'year'], figsize=(6,4))
pp.ylabel('$ / year');

In [None]:
# Same grouped box plot as for 'yearly', but on the log10 daily-income column.
flat_incomes = incomes.reset_index()
flat_incomes.plot.box(column='log10_daily', by=['country', 'year'], figsize=(6,4))
# axis labels accept LaTeX math markup
pp.ylabel(r'$\mathrm{log}_{10} (\$ / \mathrm{day})$');

In [None]:
# Histogram of China's 1965 log10 daily incomes in 20 bins;
# histtype='step' draws only the outline, without the color fill.
china_1965 = incomes.loc['China', 1965]['log10_daily']
china_1965.plot.hist(histtype='step', bins=20, figsize=(6,4))
pp.xlabel(r'$\mathrm{log}_{10} (\$ / \mathrm{day})$');

In [None]:
# Histogram of China's 1965 log10 daily incomes, annotated with vertical
# lines at the mean (default line style), the median (dashed) and the
# first and third quartiles (dotted).
data = incomes.loc['China', 1965]['log10_daily']
data.plot.hist(histtype='step', bins=20, figsize=(6,4))

pp.xlabel(r'$\mathrm{log}_{10} (\$ / \mathrm{day})$')

summary_lines = (
    (data.mean(), {}),
    (data.median(), {'linestyle': '--'}),
    (data.quantile(0.25), {'linestyle': ':'}),
    (data.quantile(0.75), {'linestyle': ':'}),
)
for position, line_kwargs in summary_lines:
    pp.axvline(position, **line_kwargs)

In [None]:
# Overlay a kernel density estimate on the histogram. density=True
# normalizes the histogram so both curves share the same vertical scale.
data.plot(kind='hist', histtype='step', bins=20, density=True, figsize=(6,4))
data.plot(kind='density')  # the smoothing scale can be set with bw_method=...
pp.xlabel(r'$\mathrm{log}_{10} (\$ / \mathrm{day})$');

In [None]:
# Compare the 1965 income distributions of China and the USA on one set of
# axes; the x-range is pinned to [-1.5, 3].
china_1965_log = incomes.loc['China', 1965]['log10_daily']
usa_1965_log = incomes.loc['USA', 1965]['log10_daily']

china_1965_log.plot.hist(histtype='step', bins=20, figsize=(6,4), label='China')
usa_1965_log.plot.hist(histtype='step', bins=20, label='USA')

pp.axis(xmin=-1.5, xmax=3)
pp.xlabel(r'$\mathrm{log}_{10} (\$ / \mathrm{day})$')
pp.legend();

In [None]:
# Compare the 2015 income distributions of China and the USA on one set of
# axes; the x-range is pinned to [-1.5, 3].
china_2015_log = incomes.loc['China', 2015]['log10_daily']
usa_2015_log = incomes.loc['USA', 2015]['log10_daily']

china_2015_log.plot.hist(histtype='step', bins=20, figsize=(6,4), label='China')
usa_2015_log.plot.hist(histtype='step', bins=20, label='USA')

pp.axis(xmin=-1.5, xmax=3)
pp.xlabel(r'$\mathrm{log}_{10} (\$ / \mathrm{day})$')
pp.legend();

In [12]:
# Load gapminder.csv; later cells query it by country and year and read its
# 'population' column.
gm = pd.read_csv('gapminder.csv')

In [13]:
# Log10 daily incomes for China in 2015, kept in `data` for the weighted
# histogram cells.
data = incomes.loc["China", 2015]['log10_daily']

In [None]:
# Display the population entry that gapminder holds for China in 2015.
gm.query('country == "China" and year == 2015')['population']

In [15]:
# Give every sample in `data` an equal share of China's 2015 population, so
# that the weights add up to the whole population.
china_2015 = gm.query('country == "China" and year == 2015')
china_population = china_2015['population'].iloc[0]
weights = china_population * np.ones_like(data) / len(data)

In [None]:
# Show one weight; all entries are equal (population / number of samples).
weights[0]

In [None]:
# Population-weighted histogram of China's 2015 log10 daily incomes: each
# sample contributes its entry in `weights` instead of a count of one.
# An explicit label is passed so that pp.legend() names the country, as the
# other histogram cells do, rather than falling back to the Series name.
data.plot(kind='hist', histtype='step', bins=20, figsize=(6,4), weights=weights, label='China')
pp.xlabel(r'$\mathrm{log}_{10} (\$ / \mathrm{day})$')
pp.legend();

In [None]:
# Population-weighted 2015 histograms for China and the USA on shared axes.
# The loop variable must stay named `country`: the query string refers to it
# through @country.
for country in ('China', 'USA'):
    data = incomes.loc[country, 2015]['log10_daily']

    # every sample carries an equal share of that country's 2015 population
    country_2015 = gm.query('country == @country and year == 2015')
    population = country_2015['population'].iloc[0]
    weights = np.ones_like(data) * population / len(data)

    data.plot.hist(histtype='step', bins=20, figsize=(6,4), weights=weights, label=country)

pp.xlabel(r'$\mathrm{log}_{10} (\$ / \mathrm{day})$')
pp.legend();

In [19]:
import seaborn

# pandas raises FutureWarning for methods that are scheduled to change;
# hide those warnings so they do not clutter the cell output
import warnings
warnings.simplefilter('ignore', category=FutureWarning)

In [20]:
# Select the rows of every country whose year level is 2015, then move the
# (country, year) index levels back into ordinary columns.
rows_2015 = pd.IndexSlice[:, 2015]
data_2015 = incomes.loc[rows_2015, :].reset_index()

In [None]:
# Per-country box plot of the 2015 log10 daily incomes.
# Other available styles: white, dark, whitegrid, darkgrid, ticks.
seaborn.set_style('ticks')
fig, ax = pp.subplots(figsize=(6,4))
seaborn.boxplot(data=data_2015, x='country', y='log10_daily', ax=ax);

In [None]:
# Density-normalized histograms of the 2015 log10 daily incomes, colored by
# country, each with a kernel density estimate drawn on top.
seaborn.set_style('dark')
fig, ax = pp.subplots(figsize=(6,4))
seaborn.histplot(data=data_2015, x='log10_daily', hue='country', stat='density', kde=True, ax=ax);