In [86]:
import numpy as np 
import pandas as pd 

In [87]:
data_dir = "C:/Users/singhy/Desktop/Chicago/cps_data/inflation/raw_data"
output_dir = "C:/Users/singhy/Desktop/Chicago/cps_data/inflation/output"

In [88]:
# Stock of Vacancies 
vacancy = pd.read_excel(f"{data_dir}/barnichon/CompositeHWI.xlsx")

# Stock of Employed and Unemployed Workers 
stocks = pd.read_excel(f"{data_dir}/fred_employment/employment.xls", engine = 'xlrd')

# jolts 
jolts = pd.read_csv(f"{data_dir}/JOLTS/jolts_level.csv")

# Consumer Price Index 
cpi = pd.read_excel(f"{data_dir}/CPI/CPIAUCSL.xls", engine='xlrd')

In [89]:
jolts

Unnamed: 0,DATE,JTSQUL,JTSJOL,JTSHIL,JTSLDL
0,2000-12-01,2882,5088,5426,2018
1,2001-01-01,3245,5234,5722,2220
2,2001-02-01,3053,5097,5303,1855
3,2001-03-01,3054,4762,5528,2133
4,2001-04-01,3163,4615,5204,1883
...,...,...,...,...,...
279,2024-03-01,3409,8355,5617,1601
280,2024-04-01,3452,7919,5615,1542
281,2024-05-01,3403,8230,5655,1678
282,2024-06-01,3214,7910,5248,1560


In [90]:
# Basic Processing of historical vacancies 
vacancy.columns = ['date', 'V', 'V_rate']  
vacancy = vacancy.iloc[8:].reset_index(drop=True)
vacancy = vacancy.dropna(subset=['date', 'V'])
vacancy = vacancy.drop(['V_rate'], axis = 1)
vacancy['V'] = vacancy['V'].astype(float)

# Basic Processing of stocks 
stocks.columns = ['date', 'E', 'U']
stocks = stocks.iloc[11:].reset_index(drop=True)
stocks = stocks.dropna(subset=['date', 'E', 'U'])
stocks['date'] = pd.to_datetime(stocks['date'])
stocks['U'] = stocks['U'].astype(float)


# CPI-U 
cpi = cpi.iloc[11:].reset_index(drop=True)
cpi = cpi.rename(columns={'FRED Graph Observations': 'date', 
                            'Unnamed: 1':               'P'   })

cpi['date'] = pd.to_datetime(cpi['date'])
cpi['P'] = pd.to_numeric(cpi['P'], errors='coerce')
cpi['P_12m_change'] = cpi['P'].pct_change(periods=12) * 100

# JOLTS 
jolts.columns = ['date', 'tot_quits', 'vacancy_stock', 'tot_hires', 'tot_layoffs']
jolts['date'] = pd.to_datetime(jolts['date'])

In [91]:
# Define the mapping function with increased tolerance
def map_to_month(decimal_date):
    fraction = decimal_date - int(decimal_date)
    #print("decimal_date", decimal_date)
    #print("integer decimal_date", int(decimal_date))
    #print('fraction', fraction)
    
    # Define exact mappings with slightly higher tolerance
    if np.isclose(fraction, 1, atol=0.02):
        month = 1  # January
    elif np.isclose(fraction, 0.08, atol=0.02):
        month = 2  # February
    elif np.isclose(fraction, 0.17, atol=0.02):
        month = 3  # March
    elif np.isclose(fraction, 0.25, atol=0.02):
        month = 4  # April
    elif np.isclose(fraction, 0.33, atol=0.02):
        month = 5  # May
    elif np.isclose(fraction, 0.42, atol=0.02):
        month = 6  # June
    elif np.isclose(fraction, 0.50, atol=0.02):
        month = 7  # July
    elif np.isclose(fraction, 0.58, atol=0.02):
        month = 8  # August
    elif np.isclose(fraction, 0.67, atol=0.02):
        month = 9  # September
    elif np.isclose(fraction, 0.75, atol=0.02):
        month = 10  # October
    elif np.isclose(fraction, 0.83, atol=0.02):
        month = 11  # November
    elif np.isclose(fraction, 0.92, atol=0.02):
        month = 12  # December
    else:
        raise ValueError(f"Fraction {fraction} does not match any month")
    
    return month

# Define the conversion function
def convert_to_datetime(decimal_date):
    decimal_date = float(decimal_date)
    year = int(decimal_date)
    month = map_to_month(decimal_date)
    return pd.Timestamp(year=year, month=month, day=1)

# Apply the conversion function to your dataset
vacancy['date'] = vacancy['date'].apply(convert_to_datetime)

In [92]:
temp = stocks.merge(cpi, on = ['date'], how = 'inner')

In [93]:
temp2 = temp.merge(jolts, on = ['date'], how='outer')

In [94]:
final = temp2.merge(vacancy, on = ['date'], how = 'outer')

In [95]:
final = final[(final['date'] >= '1951-01-01') & (final['date'] <= '2024-06-01')]

In [96]:
final['V'] = final['V'].fillna(final['vacancy_stock'])

In [97]:
final['date'] = pd.to_datetime(final['date'])

In [98]:
final['L'] = final['E'] + final['U']

final['U_rate'] = (final['U'] / final['L']) * 100 
final['V_rate'] = (final['V'] / final['L']) * 100


final['tightness'] = final['V'] / final['U']
final['ln_tightness'] = np.log(final['V']) - np.log(final['U'])

In [99]:
keep = ['date', 'P_12m_change', 'U_rate', 'V_rate', 'tightness', 'ln_tightness']
final = final[keep]

In [100]:
final.reset_index(drop=True)

Unnamed: 0,date,P_12m_change,U_rate,V_rate,tightness,ln_tightness
0,1951-01-01,7.954062,3.721283,4.118172,1.106654,0.101341
1,1951-02-01,9.402795,3.426786,3.71603,1.084407,0.081033
2,1951-03-01,9.475465,3.398586,4.085784,1.202201,0.184154
3,1951-04-01,9.598309,3.104776,4.048809,1.304058,0.265481
4,1951-05-01,9.339504,2.991425,4.085784,1.365832,0.311764
...,...,...,...,...,...,...
877,2024-02-01,3.165743,3.857226,5.263818,1.364664,0.310908
878,2024-03-01,3.475131,3.829179,4.976324,1.299580,0.262041
879,2024-04-01,3.357731,3.864677,4.714168,1.219809,0.198694
880,2024-05-01,3.250210,3.964062,4.906637,1.237780,0.213320


In [101]:
final.to_csv(f"{output_dir}/data/historical_data.csv", index=False)