In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import glob
import os
from linearmodels.iv import IV2SLS
from patsy.contrasts import Treatment
from statsmodels.formula.api import ols

In [2]:
# Build the long-format city x occupation x year employment panel and, from
# it, each city's share of national employment within an occupation-year.

# Sorted so the row order of `employment` does not depend on the order in
# which the filesystem happens to list the files.
files = sorted(glob.glob('../data/processed/city_occ_employment/*.csv'))
if not files:
    raise FileNotFoundError(
        "No employment CSVs found under ../data/processed/city_occ_employment/"
    )

yearly_frames = []
for file in files:
    # The four characters in front of ".csv" in the file name are the year.
    year = int(os.path.basename(file)[-8:-4])
    current = pd.read_csv(file)
    # Row total over every column after the first.
    # NOTE(review): assumes the first column is MET2013 - confirm.
    current['city_total'] = current.iloc[:, 1:].sum(axis=1)
    current = current.melt(id_vars=['MET2013'], var_name='Occupation', value_name='Employed')
    current['Year'] = year
    yearly_frames.append(current)

# One concat at the end instead of re-copying the accumulated frame on every
# iteration.
employment = pd.concat(yearly_frames, ignore_index=True)

# Employment per (Year, Occupation) summed over cities; later cells reuse
# this table, and it still contains the 'city_total' pseudo-occupation.
occ_year_total = (
    employment.groupby(['Year', 'Occupation'])['Employed']
    .sum()
    .reset_index(name='Total_Employed')
)

occ_year = pd.merge(employment, occ_year_total, on=['Year', 'Occupation'])
occ_year['shares'] = occ_year['Employed'] / occ_year['Total_Employed']

# Fixed-effect labels of the form "(<name>)_<year>".
year_label = occ_year['Year'].astype(str)
occ_year['city_year'] = "(" + occ_year['MET2013'] + ")" + '_' + year_label
occ_year['occ_year'] = "(" + occ_year['Occupation'] + ")" + '_' + year_label

occ_year = occ_year[['city_year', 'occ_year', 'shares']]
# Drop the 'city_total' pseudo-occupation; copy so that the next cell can
# assign columns without operating on a view of the merged frame.
occ_year = occ_year[~occ_year['occ_year'].str.contains('city_total')].copy()

In [3]:
# Two-way fixed-effects OLS of log employment shares on city-year and
# occupation-year dummies; the estimated effects and residuals are written
# back onto `occ_year` and saved together with the regression summary.

# NOTE(review): this overwrites 'shares' with its log, so re-running the cell
# without re-running the previous one takes the log twice. A share of exactly
# zero becomes -inf here.
occ_year['shares'] = np.log(occ_year['shares'])

formula = 'shares ~ C(city_year) + C(occ_year)'
model = smf.ols(formula=formula, data=occ_year).fit()
coefficients = model.params
city_year_coeffs = {param: coefficients[param] for param in coefficients.index if 'C(city_year)' in param}
occ_year_coeffs = {param: coefficients[param] for param in coefficients.index if 'C(occ_year)' in param}

# Parameter names look like "C(city_year)[T.<level>]". Take everything
# between the first "[T." and the last "]" so that a level which itself
# contains "T." or "]" is recovered intact.
city_year_by_level = {
    param.split('[T.', 1)[1].rsplit(']', 1)[0]: coeff
    for param, coeff in city_year_coeffs.items()
}
occ_year_by_level = {
    param.split('[T.', 1)[1].rsplit(']', 1)[0]: coeff
    for param, coeff in occ_year_coeffs.items()
}

# One vectorised lookup per column instead of one boolean mask per level.
# Rows belonging to the omitted reference level have no parameter and stay NaN.
occ_year['city_year_coeff'] = occ_year['city_year'].map(city_year_by_level)
occ_year['occ_year_coeff'] = occ_year['occ_year'].map(occ_year_by_level)
occ_year['residuals'] = model.resid

occ_year.to_csv('results/regressions/city_occ_ols.csv', index=False)

with open('results/regressions/city_occ_ols.txt', 'w') as f:
    f.write(model.summary().as_text())

In [4]:
# Build the long-format wage panel, aggregate wages and employment to the
# city-year level, regress log employment share on log wage, save the result,
# and finally reduce `city` to year-on-year log changes for the next cell.

# Sorted so the row order of `wage` does not depend on filesystem listing order.
files = sorted(glob.glob('../data/processed/city_occ_wage/*.csv'))
if not files:
    raise FileNotFoundError(
        "No wage CSVs found under ../data/processed/city_occ_wage/"
    )

wage_frames = []
for file in files:
    # The four characters in front of ".csv" in the file name are the year.
    year = int(os.path.basename(file)[-8:-4])
    current = pd.read_csv(file)
    # Row total over every column after the first.
    # NOTE(review): assumes the first column is MET2013 - confirm.
    current['city_total'] = current.iloc[:, 1:].sum(axis=1)
    current = current.melt(id_vars=['MET2013'], var_name='Occupation', value_name='Wage')
    current['Year'] = year
    wage_frames.append(current)

# One concat at the end instead of re-copying the accumulated frame each time.
wage = pd.concat(wage_frames, ignore_index=True)

# City-year totals. The 'city_total' rows are already the sum of the
# occupation rows, so they are left out here; including them counts every
# city twice. Selecting the value column explicitly also keeps the string
# 'Occupation' column out of the aggregation.
city_wage = (
    wage[wage['Occupation'] != 'city_total']
    .groupby(['MET2013', 'Year'], as_index=False)['Wage']
    .sum()
)
city = (
    employment[employment['Occupation'] != 'city_total']
    .groupby(['MET2013', 'Year'], as_index=False)['Employed']
    .sum()
)

# Each city's share of that year's total employment.
city['share'] = city['Employed'] / city.groupby('Year')['Employed'].transform('sum')
city = city.drop(columns=['Employed'])

city = pd.merge(city, city_wage, on=['MET2013', 'Year'])
city['city_year'] = "(" + city['MET2013'] + ")" + '_' + city['Year'].astype(str)

city['log_wage'] = np.log(city['Wage'])
city['log_share'] = np.log(city['share'])
city = city.drop(columns=['Wage', 'share'])

# NOTE(review): `city` holds one row per (MET2013, Year) and city_year is
# built from exactly those two columns, so C(city_year) gives every row its
# own level. The model then has more parameters than observations and no
# residual degrees of freedom, which matches the divide-by-df_resid warnings
# this cell emits. Confirm the intended specification.
formula = 'log_share ~ log_wage + C(city_year)'
model = smf.ols(formula=formula, data=city).fit()
coefficients = model.params
city_year_coeffs = {param: coefficients[param] for param in coefficients.index if 'C(city_year)' in param}

# Parameter names look like "C(city_year)[T.<level>]". Take everything
# between the first "[T." and the last "]" so that a level which itself
# contains "T." or "]" is recovered intact, then attach with one lookup.
city_year_by_level = {
    param.split('[T.', 1)[1].rsplit(']', 1)[0]: coeff
    for param, coeff in city_year_coeffs.items()
}
city['city_year_coeff'] = city['city_year'].map(city_year_by_level)

city['wage_coeff'] = coefficients['log_wage']

city.to_csv('results/regressions/city_wage_ols.csv', index=False)

with open('results/regressions/city_wage_ols.txt', 'w') as f:
    f.write(model.summary().as_text())

# Year-on-year log changes within each city. Rows are ordered by
# (MET2013, Year) because the groupby above sorts its keys and the merge
# keeps the left frame's order.
city['log_wage_change'] = city.groupby('MET2013')['log_wage'].diff()
city['log_share_change'] = city.groupby('MET2013')['log_share'].diff()
# Only rows without a computable change are dropped; keying on the change
# columns keeps the NaN coefficient of the reference level from deciding
# which rows survive.
city = city.dropna(subset=['log_wage_change', 'log_share_change'])
city = city[['city_year', 'log_wage_change', 'log_share_change']].copy()


  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return np.dot(wresid, wresid) / self.df_resid


In [5]:
# First-difference version of the city regression: change in log employment
# share on change in log wage plus city-year dummies. Coefficients and
# residuals are attached to `city` and saved with the regression summary.

# NOTE(review): `city` was built with one row per city-year in the previous
# cell, so C(city_year) assigns a separate level to every row and the fit has
# no residual degrees of freedom (see the divide-by-df_resid warnings this
# cell emits). Confirm the intended specification.
formula = 'log_share_change ~ log_wage_change + C(city_year)'

model = smf.ols(formula=formula, data=city).fit()
coefficients = model.params
city_year_coeffs = {param: coefficients[param] for param in coefficients.index if 'C(city_year)' in param}

# Parameter names look like "C(city_year)[T.<level>]". Take everything
# between the first "[T." and the last "]" so that a level which itself
# contains "T." or "]" is recovered intact.
city_year_by_level = {
    param.split('[T.', 1)[1].rsplit(']', 1)[0]: coeff
    for param, coeff in city_year_coeffs.items()
}
# One vectorised lookup instead of one boolean mask per level; the omitted
# reference level has no parameter and stays NaN.
city['city_year_coeff'] = city['city_year'].map(city_year_by_level)

city['wage_coeff'] = coefficients['log_wage_change']
city['residuals'] = model.resid

city.to_csv('results/regressions/deltas/city_wage_ols.csv', index=False)

with open('results/regressions/deltas/city_wage_ols.txt', 'w') as f:
    f.write(model.summary().as_text())

  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return np.dot(wresid, wresid) / self.df_resid


In [6]:
# Build the simulated wage-change instrument: 2010 city-by-sector shares are
# scaled by cumulative sector TFP movements, differenced over time, spread
# over occupations with 2010 sector-by-occupation shares, and summed to
# city x occupation x year. The observed log wage changes are merged in last.

# Year-on-year change in log TFP by sector for 2010-2019. The first year has
# no predecessor; its NaN (and any other NaN) is set to 0.
tfp = pd.read_csv('../data/processed/tfp.csv').set_index('two_digit_naics')
tfp = tfp.filter(items=[str(year) for year in range(2010, 2020)])
tfp = np.log(tfp).diff(axis=1).fillna(0)

city_sec = pd.read_csv('../data/processed/city_sec_wage/city_sec_w_2010.csv')
sec_occ = pd.read_csv('../data/processed/sec_occ_wage/sec_occ_w_2010.csv')

# Missing cells are filled with 0.1 before normalising each row to sum to 1.
# NOTE(review): 0.1 is presumably a small placeholder so empty cells keep a
# non-zero share - confirm the choice.
city_sec = city_sec.set_index('MET2013').fillna(0.1)
city_sec = city_sec.div(city_sec.sum(axis=1), axis=0)
sec_occ = sec_occ.set_index('INDNAICS').fillna(0.1)
sec_occ = sec_occ.div(sec_occ.sum(axis=1), axis=0)

tfp = tfp.stack().reset_index()
tfp.columns = ['sector', 'year', 'tfp_change']

city_sec = city_sec.stack().reset_index()
city_sec.columns = ['city', 'sector', 'share']

city_sec = pd.merge(city_sec, tfp, on='sector')
# Cumulative product of (1 + change in log TFP) within each city-sector.
# NOTE(review): 1 + dlog is a first-order stand-in for exp(dlog) - confirm
# this approximation is intended.
city_sec['tfp_change'] = city_sec['tfp_change'] + 1
city_sec['tfp_change'] = city_sec.groupby(['city', 'sector'])['tfp_change'].cumprod()
city_sec['share'] = city_sec['share'] * city_sec['tfp_change']
city_sec['share_change'] = city_sec.groupby(['city', 'sector'])['share'].diff()
# The first year of every city-sector has no change and is dropped here.
city_sec = city_sec.dropna().drop(columns=['share', 'tfp_change'])

sec_occ = sec_occ.stack().reset_index()
sec_occ.columns = ['sector', 'occupation', 'share']

city_sec = pd.merge(city_sec, sec_occ, on='sector')
city_sec['sim_wage_change'] = city_sec['share'] * city_sec['share_change']
# Sum over sectors. Only the numeric column is aggregated, so the string
# 'sector' column is never summed (which would concatenate its values).
city_sec = city_sec.groupby(['city', 'occupation', 'year'], as_index=False)['sim_wage_change'].sum()

# Observed year-on-year change in log wage per city-occupation.
wage = wage.rename(columns={'MET2013': 'city', 'Occupation': 'occupation', 'Year': 'year', 'Wage': 'wage'})
wage_change = wage.copy()
wage_change['wage'] = np.log(wage_change['wage'])
wage_change = wage_change.sort_values(['city', 'occupation', 'year'])
wage_change['wage_change'] = wage_change.groupby(['city', 'occupation'])['wage'].diff()
wage_change = wage_change.dropna().drop(columns=['wage'])

# TFP years come from column labels (strings); the wage panel's years are ints.
city_sec['year'] = city_sec['year'].astype(int)
city_sec = pd.merge(city_sec, wage_change, on=['city', 'occupation', 'year'])

In [7]:
# Year-on-year change in each city's log share of occupation-year employment,
# merged onto the instrument table, followed by the fixed-effect labels.
occ_year_iv = pd.merge(employment, occ_year_total, on=['Year', 'Occupation'])
occ_year_iv['shares'] = np.log(occ_year_iv['Employed'] / occ_year_iv['Total_Employed'])
occ_year_iv = (
    occ_year_iv
    .drop(columns=['Total_Employed', 'Employed'])
    .sort_values(by=['MET2013', 'Occupation', 'Year'])
)

# Rows are ordered by year within each (city, occupation) at this point, so
# diff() yields the change relative to the previous year in the panel.
share_by_cell = occ_year_iv.groupby(['Occupation', 'MET2013'])['shares']
occ_year_iv['actual_occ_change'] = share_by_cell.diff()

occ_year_iv = (
    occ_year_iv
    .dropna()
    .drop(columns=['shares'])
    .rename(columns={'Occupation': 'occupation', 'Year': 'year', 'MET2013': 'city'})
)
city_sec = pd.merge(city_sec, occ_year_iv, on=['city', 'occupation', 'year'])

# Labels of the form "(<name>)_<year>" used as categorical regressors.
year_suffix = '_' + city_sec['year'].astype(str)
city_sec['city_year'] = "(" + city_sec['city'] + ")" + year_suffix
city_sec['occ_year'] = "(" + city_sec['occupation'] + ")" + year_suffix
city_sec = city_sec.drop(columns=['city', 'occupation', 'year'])

# Disabled filters, kept for reference:
# city_sec = city_sec[city_sec['city_year'] != '(Akron, OH)_2011']
# city_sec = city_sec[city_sec['occ_year'] != '(occ2_management)_2011']

In [8]:
# Manual two-stage least squares on year-on-year changes: the simulated wage
# change instruments the observed wage change, and the fitted wage change
# then explains the change in log employment share.

# First stage. It carries the same city-year and occupation-year dummies as
# the second stage: every exogenous regressor of the second stage has to be
# in the first stage, otherwise the first-stage residual is correlated with
# those dummies and the second-stage estimate is inconsistent. This also
# matches the levels specification later in the notebook.
model = smf.ols(
    formula='wage_change ~ sim_wage_change + C(city_year) + C(occ_year)',
    data=city_sec,
).fit()
city_sec['predicted_wage'] = model.fittedvalues

with open('results/regressions/deltas/city_sec_iv_first_stage.txt', 'w') as f:
    f.write(model.summary().as_text())

# Second stage.
# NOTE(review): the standard errors in this summary treat predicted_wage as
# observed data and are not 2SLS standard errors; IV2SLS (imported at the top
# of the notebook but unused) would report corrected ones.
model = smf.ols(formula='actual_occ_change ~ predicted_wage + C(city_year) + C(occ_year)', data=city_sec).fit()
coefficients = model.params
city_year_coeffs = {param: coefficients[param] for param in coefficients.index if 'C(city_year)' in param}
occ_year_coeffs = {param: coefficients[param] for param in coefficients.index if 'C(occ_year)' in param}

# Parameter names look like "C(city_year)[T.<level>]". Take everything
# between the first "[T." and the last "]" so that a level which itself
# contains "T." or "]" is recovered intact.
city_year_by_level = {
    param.split('[T.', 1)[1].rsplit(']', 1)[0]: coeff
    for param, coeff in city_year_coeffs.items()
}
occ_year_by_level = {
    param.split('[T.', 1)[1].rsplit(']', 1)[0]: coeff
    for param, coeff in occ_year_coeffs.items()
}

# One vectorised lookup per column instead of one boolean mask per level.
# Rows belonging to the omitted reference level have no parameter and stay NaN.
city_sec['city_year_coeff'] = city_sec['city_year'].map(city_year_by_level)
city_sec['occ_year_coeff'] = city_sec['occ_year'].map(occ_year_by_level)

city_sec['iv_wage_coeff'] = coefficients['predicted_wage']

city_sec.to_csv('results/regressions/deltas/city_sec_iv.csv', index=False)

with open('results/regressions/deltas/city_sec_iv_second_stage.txt', 'w') as f:
    f.write(model.summary().as_text())

In [9]:
# Uninstrumented benchmark: the same outcome and fixed effects as the IV
# second stage, but with the observed wage change as the regressor.
ols_formula = 'actual_occ_change ~ wage_change + C(city_year) + C(occ_year)'
model = smf.ols(formula=ols_formula, data=city_sec).fit()

summary_text = model.summary().as_text()
with open('results/regressions/deltas/city_sec_ols.txt', 'w') as f:
    f.write(summary_text)

In [10]:
# Levels version of the instrument: 2010 city-by-sector shares times log TFP,
# spread over occupations with 2010 sector-by-occupation shares and summed to
# city x occupation x year; then merged with observed log wages and log
# employment shares, and labelled for the fixed-effects regressions.

# Log TFP by sector for 2010-2019.
tfp = pd.read_csv('../data/processed/tfp.csv').set_index('two_digit_naics')
tfp = tfp.filter(items=[str(year) for year in range(2010, 2020)])
tfp = np.log(tfp)

city_sec = pd.read_csv('../data/processed/city_sec_wage/city_sec_w_2010.csv')
sec_occ = pd.read_csv('../data/processed/sec_occ_wage/sec_occ_w_2010.csv')

# Missing cells are filled with 0.1 before normalising each row to sum to 1.
# NOTE(review): 0.1 is presumably a small placeholder so empty cells keep a
# non-zero share - confirm the choice.
city_sec = city_sec.set_index('MET2013').fillna(0.1)
city_sec = city_sec.div(city_sec.sum(axis=1), axis=0)
sec_occ = sec_occ.set_index('INDNAICS').fillna(0.1)
sec_occ = sec_occ.div(sec_occ.sum(axis=1), axis=0)

tfp = tfp.stack().reset_index()
tfp.columns = ['sector', 'year', 'tfp']

city_sec = city_sec.stack().reset_index()
city_sec.columns = ['city', 'sector', 'share']

city_sec = pd.merge(city_sec, tfp, on='sector')
city_sec['tfp_share'] = city_sec['share'] * city_sec['tfp']
city_sec = city_sec.drop(columns=['share', 'tfp'])

sec_occ = sec_occ.stack().reset_index()
sec_occ.columns = ['sector', 'occupation', 'sec_occ_share']

city_sec = pd.merge(city_sec, sec_occ, on='sector')
city_sec['sim_wage'] = city_sec['sec_occ_share'] * city_sec['tfp_share']
# Sum over sectors. Only the numeric column is aggregated, so the string
# 'sector' column is never summed (which would concatenate its values).
city_sec = city_sec.groupby(['city', 'occupation', 'year'], as_index=False)['sim_wage'].sum()

# Observed log wage per city-occupation-year. The rename is a no-op when an
# earlier cell has already applied it.
wage = wage.rename(columns={'MET2013': 'city', 'Occupation': 'occupation', 'Year': 'year', 'Wage': 'wage'})
wage_change = wage.copy()
wage_change['wage'] = np.log(wage_change['wage'])
wage_change = wage_change.sort_values(['city', 'occupation', 'year'])
# TFP years come from column labels (strings); the wage panel's years are ints.
city_sec['year'] = city_sec['year'].astype(int)
city_sec = pd.merge(city_sec, wage_change, on=['city', 'occupation', 'year'])

# Observed log share of each city in occupation-year employment.
occ_year_total = occ_year_total[occ_year_total['Occupation'] != 'city_total']
occ_year_iv = pd.merge(employment, occ_year_total, on=['Year', 'Occupation'])
occ_year_iv['actual_shares'] = np.log(occ_year_iv['Employed'] / occ_year_iv['Total_Employed'])
occ_year_iv = (
    occ_year_iv
    .drop(columns=['Total_Employed', 'Employed'])
    .sort_values(by=['MET2013', 'Occupation', 'Year'])
    .rename(columns={'Occupation': 'occupation', 'Year': 'year', 'MET2013': 'city'})
)
city_sec = pd.merge(city_sec, occ_year_iv, on=['city', 'occupation', 'year'])

# Labels of the form "(<name>)_<year>" used as categorical regressors.
city_sec['city_year'] = "(" + city_sec['city'] + ")" + '_' + city_sec['year'].astype(str)
city_sec['occ_year'] = "(" + city_sec['occupation'] + ")" + '_' + city_sec['year'].astype(str)
city_sec = city_sec.drop(columns=['city', 'occupation', 'year'])

In [11]:
# Manual two-stage least squares in levels, followed by the uninstrumented
# OLS benchmark with the same fixed effects.

# First stage: observed log wage on the simulated wage plus both sets of
# fixed effects.
model = smf.ols(formula='wage ~ sim_wage + C(city_year) + C(occ_year)', data=city_sec).fit()
with open('results/regressions/city_sec_iv_first_stage.txt', 'w') as f:
    f.write(model.summary().as_text())
city_sec['predicted_wage'] = model.fittedvalues

# Second stage.
# NOTE(review): the standard errors in this summary treat predicted_wage as
# observed data and are not 2SLS standard errors; IV2SLS (imported at the top
# of the notebook but unused) would report corrected ones.
model = smf.ols(formula='actual_shares ~ predicted_wage + C(city_year) + C(occ_year)', data=city_sec).fit()
coefficients = model.params
city_year_coeffs = {param: coefficients[param] for param in coefficients.index if 'C(city_year)' in param}
occ_year_coeffs = {param: coefficients[param] for param in coefficients.index if 'C(occ_year)' in param}

# Parameter names look like "C(city_year)[T.<level>]". Take everything
# between the first "[T." and the last "]" so that a level which itself
# contains "T." or "]" is recovered intact.
city_year_by_level = {
    param.split('[T.', 1)[1].rsplit(']', 1)[0]: coeff
    for param, coeff in city_year_coeffs.items()
}
occ_year_by_level = {
    param.split('[T.', 1)[1].rsplit(']', 1)[0]: coeff
    for param, coeff in occ_year_coeffs.items()
}

# One vectorised lookup per column instead of one boolean mask per level.
# Rows belonging to the omitted reference level have no parameter and stay NaN.
city_sec['city_year_coeff'] = city_sec['city_year'].map(city_year_by_level)
city_sec['occ_year_coeff'] = city_sec['occ_year'].map(occ_year_by_level)

city_sec['iv_wage_coeff'] = coefficients['predicted_wage']

city_sec.to_csv('results/regressions/city_sec_iv.csv', index=False)

with open('results/regressions/city_sec_iv_second_stage.txt', 'w') as f:
    f.write(model.summary().as_text())

# Uninstrumented benchmark with the observed log wage as the regressor.
model = smf.ols(formula='actual_shares ~ wage + C(city_year) + C(occ_year)', data=city_sec).fit()

with open('results/regressions/city_sec_ols.txt', 'w') as f:
    f.write(model.summary().as_text())