In [7]:
# import packages

import pandas as pd
import numpy as np
import altair as alt

In [2]:
# Import data to local notebook

from google.colab import files
uploaded = files.upload()

Saving data_primary.csv to data_primary.csv


In [3]:
df = pd.read_csv('data_primary.csv')
perc = 75.0 # Here N is 75
min_count =  int(((100-perc)/100)*df.shape[1] + 1)
df = df.dropna(axis=0, thresh=min_count)
df.tail()

Unnamed: 0,year,inflation,interest,earn_all,weekly_all,earn_men,weekly_men,earn_women,weekly_wo,earn_white,...,earn_assoc,weekly_assoc,earn_bach,weekly_bach,earn_bachp,weekly_bachp,earn_adv,weekly_adv,stock_dija,stock_sp
37,2017,0.0213,0.01,0.0337,860,0.0284,941,0.028,770,0.0325,...,0.0244,798,0.0147,1173.0,0.0159,1279,0.018,1470.0,0.2508,0.1942
38,2018,0.0244,0.0183,0.0302,886,0.034,973,0.0247,789,0.0292,...,0.0351,826,0.0213,1198.0,0.0352,1324,0.0347,1521.0,-0.0563,-0.0624
39,2019,0.0181,0.0216,0.035,917,0.0349,1007,0.0406,821,0.0317,...,0.0363,856,0.0417,1248.0,0.0325,1367,0.0302,1567.0,0.2234,0.2888
40,2020,0.0123,0.0038,0.0731,984,0.0745,1082,0.0853,891,0.0614,...,0.0549,903,0.0457,1305.0,0.0395,1421,0.0364,1624.0,0.0725,0.1626
41,2021,0.047,0.0008,0.0142,998,0.0139,1097,0.0236,912,0.015,...,0.0244,925,0.0222,1334.0,0.0218,1452,0.0209,1658.0,0.1873,0.2689


In [8]:
# Stack dataframes to get earnings data by race

def earning_race_function(df = df, column1 = "year", column2 = "earn_all", column3 = 'weekly_all',  Race = "All"): 
    '''take in data for specific columns and return earnings by Race and Year'''
    
    df_new = pd.DataFrame(df, columns = [column1, column2, column3, 'inflation'])
    df_new.rename(columns = {column2 : "Earning growth", column3 : "Weekly earning"}, inplace = True)
    df_new['race'] = Race
    return df_new

# Setup datasets for each race
df_1 = earning_race_function(column2 = "earn_all", column3 = 'weekly_all', Race = "All")
df_2 = earning_race_function(column2 = "earn_white", column3 = 'weekly_wh', Race = "White")
df_3 = earning_race_function(column2 = "earn_black", column3 = 'weekly_blk', Race = "Black")
df_4 = earning_race_function(column2 = "earn_asian", column3 = 'weekly_as', Race = "Asian")
df_5 = earning_race_function(column2 = "earn_hisp", column3 = 'weekly_hisp', Race = "Hispanic")

# Crate stacked dataframe with earnings by race
df_earning = df_1.append([df_2, df_3, df_4, df_5], ignore_index=True, sort = True)
# convert to date
df_earning['year'] = pd.to_datetime(df_earning['year'], format='%Y')
df_earning['year'] = pd.DatetimeIndex(df_earning['year']).year
df_earning = df_earning.sort_values(by= ['year', 'race'])

df_earning.head(5)

Unnamed: 0,Earning growth,Weekly earning,inflation,race,year
0,0.0871,262.0,0.135,All,1980
126,,,0.135,Asian,1980
84,0.0653,212.0,0.135,Black,1980
168,0.0773,209.0,0.135,Hispanic,1980
42,0.0847,269.0,0.135,White,1980


In [9]:
# Set brush
brush = alt.selection(type='interval')

points = alt.Chart(df_earning).mark_line().encode(
    x='year:O',
    y='Earning growth',
    color=alt.condition(brush, 'race:N', alt.value('lightgray'))
).properties(width = 600, title = "Wage Growth by Race and Year").add_selection(
    brush
)


In [10]:
bars = alt.Chart(df_earning).mark_bar().encode(
    y='race',
    color='race',
    x='mean(Weekly earning)'
).transform_filter(
    brush
)


In [11]:
points & bars.properties(width = 600)


In [None]:
chart = alt.Chart(df_earning).mark_point().encode(
    x=alt.X('inflation'),
    y=alt.Y('Earning growth')
).properties(width = 250, height = 250).facet(
    'race:N',
    columns = 3
)

chart

In [12]:
# control for data points by race
df_copy = pd.DataFrame(df_earning).copy()

All = df_copy[df_copy.race == 'All']
White = df_copy[df_copy.race == 'White']
Black = df_copy[df_copy.race == 'Black']
Asian = df_copy[df_copy.race == 'Asian']
Hisp = df_copy[df_copy.race == 'Hispanic']

In [13]:
scale = alt.Scale(domain=['inflation', 'Earning growth'], range=['black', 'red'])

plot1 = alt.Chart(White).transform_fold(
    ['inflation','Earning growth']
).mark_line().encode(
    x = alt.X('year:N', title = "Date", axis=alt.Axis(labels=False)),
    y = alt.Y('value:Q', title = 'Percent Change (y-o-y)'),
    color = alt.Color('key:N', scale = scale, legend = alt.Legend(title = "Metric")),
    tooltip = ["inflation", "Earning growth"]
).properties(title='Whites', width = 250, height = 250)

plot2 = alt.Chart(Black).transform_fold(
    ['inflation','Earning growth']
).mark_line().encode(
    x = alt.X('year:N', title = "Date", axis=alt.Axis(labels=False)),
    y = alt.Y('value:Q', title = 'Percent Change (y-o-y)'),
    color = alt.Color('key:N', scale = scale, legend = alt.Legend(title = "Metric")),
    tooltip = ["inflation", "Earning growth"]
).properties(title='African American', width = 250, height = 250)

plot3 = alt.Chart(Asian).transform_fold(
    ['inflation','Earning growth']
).mark_line().encode(
    x = alt.X('year:N', title = "Date", axis=alt.Axis(labels=False)),
    y = alt.Y('value:Q', title = 'Percent Change (y-o-y)'),
    color = alt.Color('key:N', scale = scale, legend = alt.Legend(title = "Metric")),
    tooltip = ["inflation", "Earning growth"]
).properties(title='Asian', width = 250, height = 250)

plot4 = alt.Chart(Hisp).transform_fold(
    ['inflation','Earning growth']
).mark_line().encode(
    x = alt.X('year:N', title = "Date", axis=alt.Axis(labels=False)),
    y = alt.Y('value:Q', title = 'Percent Change (y-o-y)'),
    color = alt.Color('key:N', scale = scale, legend = alt.Legend(title = "Metric")),
    tooltip = ["inflation", "Earning growth"]
).properties(title='Hispanic', width = 250, height = 250)



plot1 | plot2 | plot3 | plot4

In [14]:
# Stack dataframes to get earnings data by gender

def earning_gender_function(df = df, column1 = "year", column2 = "earn_all", column3 = "weekly_all", Gender = "All"): 
    '''take in data for specific columns and return earnings by Race and Year'''
    
    df_new = pd.DataFrame(df, columns = [column1, 'inflation', column2, column3])
    df_new.rename(columns = {column2 : "Earning growth", column3 : "Weekly earnings"}, inplace = True)
    df_new['gender'] = Gender
    return df_new

# Get differnt columns for the gender calcs
df_1 = earning_gender_function(column2 = "earn_all", column3 = "weekly_all", Gender = "All")
df_2 = earning_gender_function(column2 = "earn_men", column3 = "weekly_men", Gender = "Men")
df_3 = earning_gender_function(column2 = "earn_women", column3 = "weekly_wo", Gender = "Women")

# Create stacked dataframe with earnings by race
df_earning_g = df_1.append([df_2, df_3], ignore_index=True, sort = True)
# convert to date
df_earning_g['year'] = pd.to_datetime(df_earning_g['year'], format='%Y')
df_earning_g['year'] = pd.DatetimeIndex(df_earning_g['year']).year
df_earning_g = df_earning_g.sort_values(by= ['year', 'gender'])

df_earning_g.head(5)

Unnamed: 0,Earning growth,Weekly earnings,gender,inflation,year
0,0.0871,262,All,0.135,1980
42,0.0719,313,Men,0.135,1980
84,0.1044,201,Women,0.135,1980
1,0.084,284,All,0.1032,1981
43,0.0863,340,Men,0.1032,1981


In [22]:
# Set brush
brush = alt.selection(type='interval')

points = alt.Chart(df_earning_g).mark_point().encode(
    x='year:O',
    y='Earning growth',
    color=alt.condition(brush, 'gender:N', alt.value('lightgray'))
).properties(width = 600, title = "Wage Growth by Gender and Year").add_selection(
    brush
)


In [23]:
bars = alt.Chart(df_earning_g).mark_bar().encode(
    y='gender',
    color='gender',
    x='mean(Weekly earnings)'
).transform_filter(
    brush
)

points & bars.properties(width = 600)


In [30]:
# Education
def educ_earning_function(df = df, col1 = 'year', col2 = 'inflation', col3 = 'earn_nohs', col4 = 'weekly_nohs', 
                         education = "No high school"):
    '''take in data for specific columns and return earnings & inflation by Education and Year'''
    
    df_new = pd.DataFrame(df, columns = [col1, col2, col3, col4])
    df_new.rename(columns = {col3 : "wage growth", col4 : "weekly earnings"}, inplace = True)
    df_new['education'] = education
    return df_new

# Add dataframes for education
df_1 = educ_earning_function(col3 = 'earn_nohs', col4 = 'weekly_nohs', education = "No high school")
df_2 = educ_earning_function(col3 = 'earn_hs', col4 = 'weekly_hs', education = "High school")
df_3 = educ_earning_function(col3 = 'earn_assoc', col4 = 'weekly_assoc', education = "Some college")
df_4 = educ_earning_function(col3 = 'earn_bach', col4 = 'weekly_bach', education = "Bachelor only")
df_5 = educ_earning_function(col3 = 'earn_bachp', col4 = 'weekly_bachp', education = "Bachelor of higher")
df_6 = educ_earning_function(col3 = 'earn_adv', col4 = 'weekly_adv', education = "Advanced degree")

# Create stacked dataframe with earnings by race
df_earning_ed = df_1.append([df_2, df_3, df_4, df_5, df_6], ignore_index=True, sort = True)
# convert to date
df_earning_ed['year'] = pd.to_datetime(df_earning_ed['year'], format='%Y')
df_earning_ed['year'] = pd.DatetimeIndex(df_earning_ed['year']).year
df_earning_ed = df_earning_ed.sort_values(by= ['year', 'education'])

df_earning_ed.head(5)

Unnamed: 0,education,inflation,wage growth,weekly earnings,year
210,Advanced degree,0.135,,,1980
168,Bachelor of higher,0.135,0.093,376.0,1980
126,Bachelor only,0.135,,,1980
42,High school,0.135,0.0683,266.0,1980
0,No high school,0.135,0.0571,222.0,1980


In [33]:
# Set brush
brush = alt.selection(type='interval')

points = alt.Chart(df_earning_ed).mark_point().encode(
    x='year:O',
    y='wage growth',
    color=alt.condition(brush, 'education:N', alt.value('lightgray'))
).properties(width = 600, title = "Wage Growth by Education and Year").add_selection(
    brush
)

bars = alt.Chart(df_earning_ed).mark_bar().encode(
    y='education',
    color='education',
    x='mean(weekly earnings):Q'
).transform_filter(
    brush
)

points & bars.properties(width = 600)

In [35]:
df.head()

Unnamed: 0,year,inflation,interest,earn_all,weekly_all,earn_men,weekly_men,earn_women,weekly_wo,earn_white,...,earn_assoc,weekly_assoc,earn_bach,weekly_bach,earn_bachp,weekly_bachp,earn_adv,weekly_adv,stock_dija,stock_sp
0,1980,0.135,0.1336,0.0871,262,0.0719,313,0.1044,201,0.0847,...,0.078,304,,,0.093,376,,,0.1493,0.2577
1,1981,0.1032,0.1638,0.084,284,0.0863,340,0.0896,219,0.0818,...,0.0658,324,,,0.0824,407,,,-0.0923,-0.0973
2,1982,0.0616,0.1226,0.0634,302,0.0706,364,0.0913,239,0.0653,...,0.0833,351,,,0.0762,438,,,0.1961,0.1476
3,1983,0.0321,0.0909,0.0364,313,0.0412,379,0.0544,252,0.0323,...,0.0342,363,,,0.0525,461,,,0.2027,0.1727
4,1984,0.0432,0.1023,0.0415,326,0.0343,392,0.0516,265,0.05,...,0.0523,382,,,0.0542,486,,,-0.0374,0.014


In [47]:
# Education
def metrics_function(df = df, col1 = 'year', col2 = 'inflation', metric_name = "inflation"):
    '''take in data for specific columns and return earnings & inflation by Education and Year'''
    
    df_new = pd.DataFrame(df, columns = [col1, col2])
    df_new.rename(columns = {col2 : "metric"}, inplace = True)
    df_new['metric name'] = metric_name
    return df_new

df_1 = metrics_function(col2 = 'inflation', metric_name = 'inflation rate')
df_2 = metrics_function(col2 = 'interest', metric_name = 'interest rate')
df_3 = metrics_function(col2 = 'earn_all', metric_name = 'earnings growth')

# Create stacked dataframe with earnings by race
df_earning_metric = df_1.append([df_2, df_3], ignore_index=True, sort = True)
# convert to date
df_earning_metric['year'] = pd.to_datetime(df_earning_metric['year'], format='%Y')
df_earning_metric['year'] = pd.DatetimeIndex(df_earning_metric['year']).year
df_earning_metric = df_earning_metric.sort_values(by= ['year', 'metric name'])

df_earning_metric.head(5)

Unnamed: 0,metric,metric name,year
84,0.0871,earnings growth,1980
0,0.135,inflation rate,1980
42,0.1336,interest rate,1980
85,0.084,earnings growth,1981
1,0.1032,inflation rate,1981


In [54]:
# Scatter plot with Loess
highlight = alt.selection(type='single', on='mouseover',
                          fields=['metric name'], nearest=True)

base = alt.Chart(df_earning_metric).encode(
    x='year:N',
    y='metric:Q',
    color='metric name:N'
)

points = base.mark_circle().encode(
    opacity=alt.value(0)
).add_selection(
    highlight
).properties(height = 500,
    width=800, title = "Interest Rates and Earnings Growth Vs. Inflation" 
)

lines = base.mark_line().encode(
    size=alt.condition(~highlight, alt.value(1), alt.value(3))
)

points + lines

In [56]:
df_1 = metrics_function(col2 = 'inflation', metric_name = 'inflation rate')
df_2 = metrics_function(col2 = 'interest', metric_name = 'stock_dija')
df_3 = metrics_function(col2 = 'earn_all', metric_name = 'stock_sp')

# Create stacked dataframe with earnings by race
df_earning_metric2 = df_1.append([df_2, df_3], ignore_index=True, sort = True)
# convert to date
df_earning_metric2['year'] = pd.to_datetime(df_earning_metric2['year'], format='%Y')
df_earning_metric2['year'] = pd.DatetimeIndex(df_earning_metric2['year']).year
df_earning_metric2 = df_earning_metric2.sort_values(by= ['year', 'metric name'])

df_earning_metric2.head(5)


Unnamed: 0,metric,metric name,year
0,0.135,inflation rate,1980
42,0.1336,stock_dija,1980
84,0.0871,stock_sp,1980
1,0.1032,inflation rate,1981
43,0.1638,stock_dija,1981


In [58]:
# Scatter plot with Loess
highlight = alt.selection(type='single', on='mouseover',
                          fields=['metric name'], nearest=True)

base = alt.Chart(df_earning_metric2).encode(
    x='year:N',
    y='metric:Q',
    color='metric name:N'
)

points = base.mark_circle().encode(
    opacity=alt.value(0)
).add_selection(
    highlight
).properties(height = 500,
    width=800, title = "Stock Net Losses/Gains Vs. Inflation" 
)

lines = base.mark_line().encode(
    size=alt.condition(~highlight, alt.value(1), alt.value(3))
)

points + lines