In [1]:
# table of predicted univariate coefficients for real data
import pandas as pd
import numpy as np
import torch
from data import *
import os 

In [2]:
# file paths
base_file = '/share/garg/311_data/sb2377/clean_codebase/three_year_base.csv'
coeffs_file = '/share/garg/311_data/sb2377/clean_codebase/semisynthetic/semisynthetic_coeffs.csv'
results_dir = '/share/garg/311_data/sb2377/results'

# user specified arguments
types = {'Street': 'StreetConditionDOT',
         'Park': 'MaintenanceorFacilityDPR',
         'Rodent': 'RodentDOHMH',
         'Food': 'FoodDOHMH',
         'DCWP': 'ConsumerComplaintDCWP'}
covariates = {'log(Population density)':{'job_ids': [i * 6 + 3400 for i in range(13)]},
              'log(Median income)':{'job_ids': [i * 6 + 3401 for i in range(13)]},
              'Bachelors degree population':{'job_ids': [i * 6 + 3402 for i in range(13)]},
              'White population':{'job_ids': [i * 6 + 3403 for i in range(13)]},
              'Median age':{'job_ids': [i * 6 + 3404 for i in range(13)]},
              'Households occupied by renter':{'job_ids': [i * 6 + 3405 for i in range(13)]}}
epoch = '59'

In [3]:
# load files
base_df = pd.read_csv(base_file)
coeff_df = pd.read_csv(coeffs_file)

In [4]:
# get type indices
type_df = base_df[['typeagency', 'type_idxs']].drop_duplicates()
indices = {}
for type_name, type_id in types.items():
    idx = type_df[type_df['typeagency'] == type_id]['type_idxs'].iloc[0]
    indices[type_name] = idx

In [5]:
# get predicted coefficients for all jobs
checkpoint_file = '{}/job{}/model-epoch={}.ckpt'
checkpoint_counters = {}
for c in covariates:
    checkpoint_counters[c] = 0
predictions = {}

for c in covariates:
    job_ids = covariates[c]['job_ids']
    pred_coeffs = []
    for i, job_id in enumerate(job_ids):
        if os.path.exists(checkpoint_file.format(results_dir, job_id, epoch)):
            checkpoint_counters[c] += 1

            # get predicted coefficient
            checkpoint = torch.load(checkpoint_file.format(results_dir, job_id, epoch), map_location=torch.device('cpu'))
            coeff = checkpoint['state_dict']['pt_layer']
            mean_coeff = coeff[list(indices.values())].mean(dim=0)
            pred_coeffs.append(mean_coeff)

    predictions[c] = {'coeffs':np.array(pred_coeffs),
                      'mean': np.mean(pred_coeffs, axis=0),
                      'stderr': 1.96 * np.std(pred_coeffs, axis=0) / np.sqrt(len(pred_coeffs) - 1)}

for c in covariates:
    print('{}: checkpoint files done = {}'.format(c, checkpoint_counters[c]))

log(Population density): checkpoint files done = 13
log(Median income): checkpoint files done = 13
Bachelors degree population: checkpoint files done = 13
White population: checkpoint files done = 13
Median age: checkpoint files done = 13
Households occupied by renter: checkpoint files done = 13


In [None]:
# calculate mean and 95% CI width of demographic coefficients
df = pd.DataFrame()
df['covariates'] = list(covariates.keys())

means = []
stds = []
for c in covariates:
    means.append(predictions[c]['mean'][0])
    stds.append(predictions[c]['stderr'][0])

df['mean_coeffs'] = means
df['95%_CI_width'] = stds

# calculate mean and 95% CI width of rating coefficients
rating_coeffs = []
for c in covariates:
    rating_coeffs.append(predictions[c]['mean'][1])
rating_coeffs = np.array(rating_coeffs)
df.loc[len(df)] = ['Rating', 
                   rating_coeffs.mean(),
                   rating_coeffs.std() / np.sqrt(len(rating_coeffs) - 1)]

df.sort_values(by='mean_coeffs', ascending=False)

Unnamed: 0,covariates,mean_coeffs,95%_CI_width
0,log(Population density),0.249722,0.057721
1,log(Median income),0.1728,0.019521
2,Bachelors degree population,0.15876,0.017616
5,Households occupied by renter,0.11475,0.029451
4,Median age,0.104365,0.0156
3,White population,0.092752,0.012033
6,Rating,-0.19675,0.001696
