In [230]:
import numpy as np 
import pandas as pd
import os
import re
import statsmodels.api as sm

In [231]:
def get_pi_ck(df):
    """Express every city-by-occupation cell as a share of total employment.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per ``COMZONE``; every remaining column is summed and divided,
        so it must be numeric.  An ``other`` column, when present, is
        discarded before the total is computed.

    Returns
    -------
    pandas.DataFrame
        Same layout as the input (``COMZONE`` back as a regular column, no
        ``other`` column) with each cell divided by the grand total of all
        remaining cells.  The caller's frame is not modified.
    """
    by_city = df.set_index('COMZONE')
    # 'other' is optional: some inputs do not carry it, and a missing column
    # should not abort the computation.
    by_city = by_city.drop(columns='other', errors='ignore')
    total_employment = by_city.sum().sum()
    return by_city.div(total_employment).reset_index()

def get_phi_ck(df):
    """Convert each city's row into shares of that city's row total.

    ``COMZONE`` is used as the row key; every other column is divided by the
    sum across the row.  The result has ``COMZONE`` restored as an ordinary
    column and the input frame is left unchanged.
    """
    by_city = df.set_index('COMZONE')
    row_totals = by_city.sum(axis=1)
    shares = by_city.div(row_totals, axis=0)
    return shares.reset_index()

def return_df(sub_directory, field_name):
    """Stack the yearly CSV tables of one data folder into a long DataFrame.

    Every ``*.csv`` file in ``../data/processed/<sub_directory>/`` is read.
    The year is taken from the file name: the text after the last underscore
    and before the first following dot.  It is stored as a string.

    Parameters
    ----------
    sub_directory : str
        Folder name below ``../data/processed/``.
    field_name : str
        Name of the value column in the result.  ``'pi_ck'`` and ``'phi_ck'``
        additionally run the table through ``get_pi_ck`` / ``get_phi_ck``
        before reshaping; any other name keeps the raw values.

    Returns
    -------
    pandas.DataFrame
        Columns ``COMZONE``, ``year``, ``occ`` and ``field_name``, one row per
        (file row, value column) pair, ordered by file name.

    Raises
    ------
    ValueError
        If the folder contains no CSV file.
    """
    directory = f'../data/processed/{sub_directory}/'

    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the result reproducible across machines and runs.
    csv_files = sorted(name for name in os.listdir(directory) if name.endswith('.csv'))
    if not csv_files:
        raise ValueError(f'No CSV files found in {directory}')

    dfs = []
    for file in csv_files:
        # Extract the year from the file name (assuming the year is in the file name)
        year = file.split('_')[-1].split('.')[0]

        df = pd.read_csv(os.path.join(directory, file))
        if field_name == 'pi_ck':
            df = get_pi_ck(df)
        elif field_name == 'phi_ck':
            df = get_phi_ck(df)

        df['year'] = year

        # Wide (one column per occupation) -> long (one row per occupation)
        dfs.append(df.melt(id_vars=['COMZONE', 'year'], var_name='occ', value_name=field_name))

    return pd.concat(dfs, ignore_index=True)

# Function to extract years from filenames
def extract_years_from_filenames(filenames):
    """Return the sorted, de-duplicated four-digit tokens found in file names.

    For each name only the first run of four consecutive digits is used;
    names without such a run are ignored.  Years are returned as strings.
    """
    four_digits = re.compile(r'\d{4}')
    hits = (four_digits.search(name) for name in filenames)
    return sorted({hit.group(0) for hit in hits if hit})

In [232]:
# Long table (COMZONE, year, occ, pi_ck): each city-occupation cell as a share
# of that year's total employment, with the 'other' column excluded.
pi_ckt = return_df('city_occ_employment', 'pi_ck')

In [233]:
# Build phi_ckt_ss year by year from the city-by-occupation employment tables.
# For every (city c, occupation k) the value is the sum over all OTHER cities
# of [within-city share of k] * [that city's economy-wide share summed over
# all occupations except k].
employment_dir = '../data/processed/city_occ_employment/'

# List files in the subfolder
city_occ_employment_files = os.listdir(employment_dir)

# Extract years from filenames
years = extract_years_from_filenames(city_occ_employment_files)

# Initialize a dictionary to store results
results = {}

# Iterate over each year
for year in years:
    # Read the table once; both share matrices below are derived from it.
    # get_phi_ck / get_pi_ck build new frames via set_index, so the raw frame
    # can be reused safely.
    employment = pd.read_csv(f'{employment_dir}city_occ_e_{year}.csv')

    # Within-city occupation shares, computed without the 'other' column
    phi_ck_outer = get_phi_ck(employment.drop(columns='other'))
    phi_ck_outer = phi_ck_outer.set_index('COMZONE')

    # Economy-wide cell shares (get_pi_ck removes 'other' itself)
    phi_ck_inner = get_pi_ck(employment)
    phi_ck_inner = phi_ck_inner.set_index('COMZONE')

    # Leave-one-occupation-out: column k becomes the row sum over all other columns
    temp = phi_ck_inner.copy()
    for col in phi_ck_inner.columns:
        phi_ck_inner[col] = temp.drop(columns=col).sum(axis=1)

    phi_ck = phi_ck_outer * phi_ck_inner

    # Leave-one-city-out: row c becomes the column sums over all other rows
    temp = phi_ck.copy()
    for rows in phi_ck.index:
        phi_ck.loc[rows] = temp.drop(index=rows).sum(axis=0)

    # Wide -> long, keeping COMZONE (the index) as a column
    phi_ck = phi_ck.melt(ignore_index=False).reset_index()

    # Add year column (string, as extracted from the file name)
    phi_ck['year'] = year

    # Store the result
    results[year] = phi_ck

# Concatenate all results and reset index
phi_ckt = pd.concat(results.values()).reset_index(drop=True)
phi_ckt = phi_ckt.rename(columns={'value': 'phi_ckt_ss', 'variable': 'occ'})

In [234]:
# T_st = pd.read_csv('../data/processed/tfp.csv')
# T_st = T_st.set_index('two_digit_naics')

# T_s = T_st['2005']
# L_cs0 = pd.read_csv('../data/processed/city_sec_employment/city_sec_e_2005.csv')
# L_cs0 = L_cs0.set_index('COMZONE')
# L_cs0['sector_total'] = L_cs0.sum(axis=1)
# L_cs0 = L_cs0.div(L_cs0['sector_total'], axis=0)
# L_cs0 = L_cs0.drop(columns='sector_total')
# L_cs0 = L_cs0.fillna(0)
# gamma_ks0 = pd.read_csv('../data/processed/sec_occ_wage/sec_occ_w_2005.csv')
# gamma_ks0 = gamma_ks0.set_index('INDNAICS')
# gamma_ks0 = gamma_ks0.div(gamma_ks0.sum().sum())
# gamma_ks0 = gamma_ks0.apply(lambda x: x * T_s, axis=0)
# gamma_ks0 = gamma_ks0.fillna(0)

# index = L_cs0.index
# columns = gamma_ks0.columns

# L_cs0 = L_cs0.to_numpy()
# gamma_ks0 = gamma_ks0.to_numpy()
# w_ck = L_cs0 @ gamma_ks0
# w_ck = pd.DataFrame(w_ck, index=index, columns=columns)
# w_ck = w_ck.melt(ignore_index=False).reset_index()

In [235]:
# Build w_ckt_ss year by year: (city x sector employment shares) multiplied by
# (sector x occupation table, normalised by its grand total and scaled by the
# year's column of tfp.csv).
T_st = pd.read_csv('../data/processed/tfp.csv')
T_st = T_st.set_index('two_digit_naics')

# List files in the subfolders
city_sec_employment_files = os.listdir('../data/processed/city_sec_employment/')
sec_occ_wage_files = os.listdir('../data/processed/sec_occ_wage/')

# Years available in BOTH folders.  The wage-file years are extracted once
# instead of once per candidate year.
wage_years = set(extract_years_from_filenames(sec_occ_wage_files))
years = [
    year
    for year in extract_years_from_filenames(city_sec_employment_files)
    if year in wage_years
]

# Initialize a dictionary to store results
results = {}

# Iterate over each year
for year in years:
    # Years without a column in tfp.csv are skipped
    if year not in T_st.columns:
        continue
    T_s = T_st[year]

    # Sector shares within each city: every row divided by its row total;
    # rows whose total is zero produce NaN and are set to 0.
    L_cs0 = pd.read_csv(f'../data/processed/city_sec_employment/city_sec_e_{year}.csv')
    L_cs0 = L_cs0.set_index('COMZONE')
    L_cs0['sector_total'] = L_cs0.sum(axis=1)
    L_cs0 = L_cs0.div(L_cs0['sector_total'], axis=0)
    L_cs0 = L_cs0.drop(columns='sector_total')
    L_cs0 = L_cs0.fillna(0)

    # Sector-by-occupation table: drop the optional 'other' column, normalise
    # by the grand total, then scale each row by the matching T_s entry
    # (aligned on the index labels; unmatched labels become NaN -> 0).
    gamma_ks0 = pd.read_csv(f'../data/processed/sec_occ_wage/sec_occ_w_{year}.csv')
    gamma_ks0 = gamma_ks0.drop(columns='other', errors='ignore')
    gamma_ks0 = gamma_ks0.set_index('INDNAICS')
    gamma_ks0 = gamma_ks0.div(gamma_ks0.sum().sum())
    gamma_ks0 = gamma_ks0.apply(lambda occupation: occupation * T_s, axis=0)
    gamma_ks0 = gamma_ks0.fillna(0)

    # NOTE(review): the product below is positional, not label-aligned.  It
    # assumes the sector columns of L_cs0 are in the same order as the rows of
    # gamma_ks0 (after the alignment with T_s above) -- confirm against the data.
    w_ck = L_cs0.to_numpy() @ gamma_ks0.to_numpy()
    w_ck = pd.DataFrame(w_ck, index=L_cs0.index, columns=gamma_ks0.columns)
    w_ck = w_ck.melt(ignore_index=False).reset_index()

    # Add year column (string, as extracted from the file name)
    w_ck['year'] = year

    # Store the result
    results[year] = w_ck

# Concatenate all results and reset index
w_ckt = pd.concat(results.values()).reset_index(drop=True)
w_ckt = w_ckt.rename(columns={'value': 'w_ckt_ss', 'variable': 'occ'})

In [236]:
# Combine the three long tables on their common keys (inner joins), then add
# natural logs of the share and of both instruments.
merge_keys = ['COMZONE', 'year', 'occ']
df = pi_ckt.merge(phi_ckt, on=merge_keys)
df = df.merge(w_ckt, on=merge_keys)

for log_name, level_name in (
    ('log_phi_ckt_ss', 'phi_ckt_ss'),
    ('log_w_ckt_ss', 'w_ckt_ss'),
    ('log_pi_ckt', 'pi_ck'),
):
    df[log_name] = np.log(df[level_name])

In [237]:
# Observed wages and within-city occupation shares.  These rebind the names
# w_ckt / phi_ckt, which until here held the instrument tables.
# NOTE(review): return_df(..., 'phi_ck') computes shares with the 'other'
# column still included, whereas the phi_ckt_ss instrument drops 'other'
# first -- confirm the differing denominators are intended.
w_ckt = return_df('city_occ_wage', 'w_ckt')
phi_ckt = return_df('city_occ_employment', 'phi_ck')

df = df.merge(w_ckt, on=['COMZONE', 'year', 'occ']).merge(phi_ckt, on=['COMZONE', 'year', 'occ'])
# Zero wages are replaced by a tiny positive value so the log stays finite
df['w_ckt'] = df['w_ckt'].replace(0, 1e-10)
df['log_phi_ckt'] = np.log(df['phi_ck'])
df['log_w_ckt'] = np.log(df['w_ckt'])

# np.log(0) gives -inf, which dropna() does not remove.  Convert infinite log
# values to NaN first so those rows are excluded as well and cannot reach the
# regressions.
log_columns = [name for name in df.columns if name.startswith('log_')]
df[log_columns] = df[log_columns].replace([np.inf, -np.inf], np.nan)
df = df.dropna()

In [238]:
# Manual two-stage estimation with city, occupation and year fixed effects.
# NOTE(review): the second stage is a plain OLS on fitted values, so its
# reported standard errors do not account for the first-stage estimation;
# only the point estimates are used below.
fixed_effects = 'C(COMZONE) + C(occ) + C(year)'

# First stages: regress each endogenous variable on its instrument and store
# the fitted values as '<variable>_iv'.
for endogenous, instrument in (
    ('log_w_ckt', 'log_w_ckt_ss'),
    ('log_phi_ckt', 'log_phi_ckt_ss'),
):
    first_stage = sm.OLS.from_formula(
        f'{endogenous} ~ {instrument} + {fixed_effects}', data=df
    ).fit()
    df[f'{endogenous}_iv'] = first_stage.predict()

# Second stage
model = sm.OLS.from_formula(
    f'log_pi_ckt ~ log_w_ckt_iv + log_phi_ckt_iv + {fixed_effects}', data=df
)
results = model.fit()

# Extract the coefficients
coefficients = results.params

# City fixed effects: keep the level label found between 'T.' and ']'
T_c = coefficients.filter(like='C(COMZONE)').reset_index()
T_c.columns = ['COMZONE', 'Coefficient']
T_c['COMZONE'] = T_c['COMZONE'].str.extract(r'C\(COMZONE\)\[T\.(.*)\]', expand=False)

# Occupation fixed effects, same label extraction
T_k = coefficients.filter(like='C(occ)').reset_index()
T_k.columns = ['occ', 'Coefficient']
T_k['occ'] = T_k['occ'].str.extract(r'C\(occ\)\[T\.(.*)\]', expand=False)

# Coefficients on the two instrumented regressors
params = coefficients[['log_w_ckt_iv', 'log_phi_ckt_iv']].reset_index()

In [239]:
# df.to_csv('results/data.csv', index=False)
# T_c.to_csv('results/T_c.csv', index=False)
# T_k.to_csv('results/T_k.csv', index=False)
# params.to_csv('results/params.csv', index=False)