In [28]:
import pandas as pd
# Predict trends using linear regression
from sklearn.linear_model import LinearRegression
import math
# Generate plots
import matplotlib.pyplot as plt



In [29]:
# Root folder of the replication project; every input and output path below
# is built from it.
# NOTE(review): absolute, machine-specific path - the notebook cannot run on
# another machine until this is edited.
proj_dir = "C:/Users/singhy/Dropbox/Labor_Market_PT/replication/empirical"

# Locations of the three input files.
quartile_csv = f"{proj_dir}/temp/educ_wage_growth_by_quartile.csv"
pooled_csv = f"{proj_dir}/temp/educ_wage_growth_pooled.csv"
cpi_csv = f"{proj_dir}/inputs/raw_data/CPI/CPIAUCSL.csv"

# Wage growth by quartile.
df = pd.read_csv(quartile_csv)

# Wage growth pooled.
df_pol = pd.read_csv(pooled_csv)

# CPI series; later cells read its 'observation_date' and 'CPIAUCSL' columns.
cpi = pd.read_csv(cpi_csv)


In [30]:
# Attach the pooled wage-growth columns to the quartile-level frame, matching
# rows on the shared 'date_monthly' key (inner join, which is the pandas default).
df = pd.merge(df, df_pol, how='inner', on='date_monthly')

In [31]:

# Parse 'date_monthly' strings such as '2016m1' into datetimes: pull out the
# year and month digits, rebuild them as a zero-padded 'YYYY-MM' string, and
# hand that to pandas.
year_month_parts = df['date_monthly'].str.extract(r'(\d{4})m(\d{1,2})')
year_month_text = year_month_parts.apply(lambda x: f"{x[0]}-{int(x[1]):02d}", axis=1)
df['date'] = pd.to_datetime(year_month_text)

# Turn the CPI level into a per-month growth factor.
price_level = pd.to_numeric(cpi['CPIAUCSL'], errors='coerce')  # unparseable entries become NaN
price_change_12m = price_level.pct_change(periods=12) * 100    # 12-month percent change
cpi = pd.DataFrame({
    'date': pd.to_datetime(cpi['observation_date']),
    # One twelfth of the 12-month rate, expressed as a gross monthly factor.
    'P_1m_change': 1 + (price_change_12m / 100) / 12,
})

# Bring the monthly price factor onto the wage frame; a left join keeps every
# wage row even when no CPI observation shares its date.
df = pd.merge(df, cpi, how='left', on='date')

# Wage-growth inputs are the columns whose names begin with 'smwg'.
wage_columns = [col for col in df.columns if col.startswith('smwg')]

# Monthly wage growth factors: one twelfth of the percent rate, as a gross factor.
df = df.assign(**{
    f'{col}_mom_grth': 1 + (df[col] / 100) / 12
    for col in wage_columns
})

# Nominal wage indices: running product of the monthly factors.
df = df.assign(**{
    f'nom_index_{col}': df[f'{col}_mom_grth'].cumprod()
    for col in wage_columns
})

# Price index: running product of the monthly price factors.
df['price_index'] = df['P_1m_change'].cumprod()

# Real wage indices: nominal index deflated by the price index.
df = df.assign(**{
    f'real_index_{col}': df[f'nom_index_{col}'] / df['price_index']
    for col in wage_columns
})

# Compact view holding only the date, the price index and the real indices.
result_cols = ['date', 'price_index']
result_cols.extend(f'real_index_{col}' for col in wage_columns)
result_df = df[result_cols]




In [32]:
# Fit a straight-line trend to each index over the 2016-2019 window and
# extend that line across every row of df.
start_date = pd.to_datetime("2016-01-01")
end_date = pd.to_datetime("2019-12-31")

# Rows whose date falls inside the trend window (both ends inclusive).
inside_window = df['date'].ge(start_date) & df['date'].le(end_date)
trend_df = df[inside_window]


def _months_since(dates, origin):
    """Return calendar months elapsed from `origin` to each date as an (n, 1) array."""
    from_years = (dates.dt.year - origin.year) * 12
    from_months = dates.dt.month - origin.month
    return (from_years + from_months).values.reshape(-1, 1)


# The regressor is a month counter that equals 0 at the earliest date in the window.
trend_origin = trend_df['date'].min()
X_trend = _months_since(trend_df['date'], trend_origin)
X_all = _months_since(df['date'], trend_origin)

# Identify real wage index columns
real_index_cols = [col for col in df.columns if col.startswith('real_index_')]
trend_targets = real_index_cols + ['price_index']

# One regression per series: fit on the window, predict for all rows, and
# store the fitted line both in `predicted` and as a 'predicted_*' column.
predicted = {}
for col in trend_targets:
    y = trend_df[col].values
    model = LinearRegression().fit(X_trend, y)
    predicted[col] = model.predict(X_all)
    df[f'predicted_{col}'] = predicted[col]

# Gap at the last row of df, computed as trend minus actual.
# NOTE(review): the post-2020 gap table built further down uses actual minus
# trend - confirm the opposite sign here is intended.
gaps = {}
for col in trend_targets:
    gaps[col] = df[f'predicted_{col}'].iloc[-1] - df[col].iloc[-1]


In [33]:
# Show the column labels currently on df (inputs plus the derived columns).
df.columns

Index(['date_monthly', 'smwg1st_Bachelors_plus', 'smwg1st_Less_than_Bachelors',
       'smwg2nd_Bachelors_plus', 'smwg2nd_Less_than_Bachelors',
       'smwg3rd_Bachelors_plus', 'smwg3rd_Less_than_Bachelors',
       'smwg4th_Bachelors_plus', 'smwg4th_Less_than_Bachelors',
       'smwgBachelors_plus', 'smwgLess_than_Bachelors', 'date', 'P_1m_change',
       'smwg1st_Bachelors_plus_mom_grth',
       'smwg1st_Less_than_Bachelors_mom_grth',
       'smwg2nd_Bachelors_plus_mom_grth',
       'smwg2nd_Less_than_Bachelors_mom_grth',
       'smwg3rd_Bachelors_plus_mom_grth',
       'smwg3rd_Less_than_Bachelors_mom_grth',
       'smwg4th_Bachelors_plus_mom_grth',
       'smwg4th_Less_than_Bachelors_mom_grth', 'smwgBachelors_plus_mom_grth',
       'smwgLess_than_Bachelors_mom_grth', 'nom_index_smwg1st_Bachelors_plus',
       'nom_index_smwg1st_Less_than_Bachelors',
       'nom_index_smwg2nd_Bachelors_plus',
       'nom_index_smwg2nd_Less_than_Bachelors',
       'nom_index_smwg3rd_Bachelors_plus',
 

In [34]:
# Output label -> (actual real-index column, fitted-trend column).
# Only the 1st quartile, the 4th quartile and the pooled series are exported.
gap_columns = {
    'Bachelor_plus_1st_Quartile': (
        'real_index_smwg1st_Bachelors_plus',
        'predicted_real_index_smwg1st_Bachelors_plus',
    ),
    'Less_Bachelor_1st_Quartile': (
        'real_index_smwg1st_Less_than_Bachelors',
        'predicted_real_index_smwg1st_Less_than_Bachelors',
    ),
    'Bachelor_plus_4th_Quartile': (
        'real_index_smwg4th_Bachelors_plus',
        'predicted_real_index_smwg4th_Bachelors_plus',
    ),
    'Less_Bachelor_4th_Quartile': (
        'real_index_smwg4th_Less_than_Bachelors',
        'predicted_real_index_smwg4th_Less_than_Bachelors',
    ),
    'Bachelor_plus_Pooled': (
        'real_index_smwgBachelors_plus',
        'predicted_real_index_smwgBachelors_plus',
    ),
    'Less_Bachelor_Pooled': (
        'real_index_smwgLess_than_Bachelors',
        'predicted_real_index_smwgLess_than_Bachelors',
    ),
}

# Keep rows dated January 2020 or later; copy so df itself is left untouched.
plot_start_date = pd.to_datetime("2020-01-01")
mask = df['date'].ge(plot_start_date)
df_filtered = df[mask].copy()

# The output table starts with just the date column.
gap_df = df_filtered.loc[:, ['date']].copy()

# One gap column per label: (actual - trend), scaled by 100.
for label, column_pair in gap_columns.items():
    actual_col, trend_col = column_pair
    deviation = df_filtered[actual_col] - df_filtered[trend_col]
    gap = deviation * 100
    # Force the first retained row to 0.
    # NOTE(review): this overwrites the first value; it does not re-base the
    # rest of the series. It lands on 2020-01 only if df is sorted by date and
    # contains that month - confirm both the intent and the assumption.
    gap.iloc[0] = 0
    gap_df[label] = gap.values

# Display the full working frame as this cell's output.
df

Unnamed: 0,date_monthly,smwg1st_Bachelors_plus,smwg1st_Less_than_Bachelors,smwg2nd_Bachelors_plus,smwg2nd_Less_than_Bachelors,smwg3rd_Bachelors_plus,smwg3rd_Less_than_Bachelors,smwg4th_Bachelors_plus,smwg4th_Less_than_Bachelors,smwgBachelors_plus,...,predicted_real_index_smwg1st_Less_than_Bachelors,predicted_real_index_smwg2nd_Bachelors_plus,predicted_real_index_smwg2nd_Less_than_Bachelors,predicted_real_index_smwg3rd_Bachelors_plus,predicted_real_index_smwg3rd_Less_than_Bachelors,predicted_real_index_smwg4th_Bachelors_plus,predicted_real_index_smwg4th_Less_than_Bachelors,predicted_real_index_smwgBachelors_plus,predicted_real_index_smwgLess_than_Bachelors,predicted_price_index
0,2016m1,3.065676,3.380805,3.996963,3.009252,3.058247,2.714681,3.010267,3.345819,3.185538,...,1.004418,1.007196,1.007961,1.004315,1.004985,1.005989,1.008601,1.005336,1.005990,0.995386
1,2016m2,3.088249,3.580347,3.790305,3.014980,3.216066,2.794684,2.959407,3.427328,3.157506,...,1.006223,1.008939,1.008785,1.005824,1.005636,1.006977,1.009305,1.006575,1.007037,0.997173
2,2016m3,2.950427,3.587298,3.661434,3.126895,3.155802,2.712042,2.886021,3.450413,3.077776,...,1.008029,1.010682,1.009610,1.007332,1.006287,1.007965,1.010009,1.007813,1.008085,0.998961
3,2016m4,2.956669,3.703196,3.693079,3.151071,3.173965,2.636835,2.838291,3.408664,3.051489,...,1.009834,1.012424,1.010435,1.008841,1.006938,1.008953,1.010713,1.009051,1.009132,1.000748
4,2016m5,2.920797,3.769597,3.937911,3.230909,3.237600,2.642914,2.898383,3.393652,3.161200,...,1.011639,1.014167,1.011260,1.010349,1.007589,1.009941,1.011416,1.010290,1.010180,1.002535
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103,2024m8,5.707670,5.063751,5.243148,4.944730,5.976493,4.816728,4.870261,4.217474,5.277520,...,1.190356,1.186687,1.092925,1.159684,1.072039,1.107742,1.081080,1.132894,1.113888,1.179479
104,2024m9,5.628385,5.033567,5.436953,4.779777,5.723972,4.858114,4.972438,4.329970,5.344888,...,1.192161,1.188429,1.093750,1.161192,1.072690,1.108729,1.081784,1.134133,1.114936,1.181266
105,2024m10,5.453327,4.703480,5.505326,4.762056,5.528466,4.775824,5.111750,4.452866,5.327370,...,1.193966,1.190172,1.094575,1.162701,1.073341,1.109717,1.082488,1.135371,1.115983,1.183054
106,2024m11,5.305830,4.430231,5.634620,4.661451,5.290781,4.449963,4.998525,4.650443,5.243234,...,1.195771,1.191914,1.095400,1.164209,1.073992,1.110705,1.083191,1.136609,1.117031,1.184841


In [35]:
# Save the gap table for plotting; the DataFrame index is not written.
plot_data_path = f"{proj_dir}/outputs/processed_data/educ_wage_plot_data.csv"
gap_df.to_csv(plot_data_path, index=False)