In [1]:
# Table of predicted multivariate coefficients for real data.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch
# NOTE(review): wildcard import, presumably from a project-local `data` module;
# the names it brings into scope cannot be determined from this notebook, so
# confirm which (if any) are actually needed before narrowing it.
from data import *
import os
# Use a sans-serif font family for any matplotlib figures drawn later.
plt.rcParams['font.family'] = 'sans-serif'

In [9]:
# ---- Input / output locations ----
# Base table (read for its `typeagency` / `type_idxs` columns) and the
# semisynthetic coefficient table, plus the directory holding per-job results.
base_file = '/share/garg/311_data/sb2377/clean_codebase/three_year_base.csv'
coeffs_file = '/share/garg/311_data/sb2377/clean_codebase/semisynthetic/semisynthetic_coeffs.csv'
results_dir = '/share/garg/311_data/sb2377/results'

# ---- User-specified arguments ----
# Short display name -> value matched against the `typeagency` column.
types = dict(
    Street='StreetConditionDOT',
    Park='MaintenanceorFacilityDPR',
    Rodent='RodentDOHMH',
    Food='FoodDOHMH',
    DCWP='ConsumerComplaintDCWP',
)

# Row labels for the coefficient table.
# NOTE(review): assumed to be in the same order as the coefficient columns of
# the trained model -- confirm against the training code.
covariates = [
    'log(Population density)',
    'log(Median income)',
    'Bachelors degree population',
    'White population',
    'Median age',
    'Households occupied by renter',
    'Rating',
]

# Job 3000 followed by twelve jobs 3005, 3008, ..., 3038 (step of 3).
job_ids = [3000] + list(range(3005, 3005 + 3 * 12, 3))

# Epoch number as a string; it is substituted into the checkpoint file name.
epoch = '59'

In [3]:
# Read both input CSVs into DataFrames: the base table first, then the
# semisynthetic coefficient table.
base_df = pd.read_csv(filepath_or_buffer=base_file)
coeff_df = pd.read_csv(filepath_or_buffer=coeffs_file)

In [4]:
# Map each short type name in `types` to its `type_idxs` value from the base
# table. Only the distinct (typeagency, type_idxs) pairs are needed.
type_df = base_df[['typeagency', 'type_idxs']].drop_duplicates()
indices = {}
for type_name, type_id in types.items():
    # Single .loc lookup (row mask + column) instead of chained indexing.
    matches = type_df.loc[type_df['typeagency'] == type_id, 'type_idxs']
    if matches.empty:
        # Fail with a message naming the offending entry rather than the
        # opaque out-of-bounds IndexError that `.iloc[0]` would raise.
        raise KeyError(
            "typeagency {!r} (for type {!r}) not found in base_df".format(type_id, type_name)
        )
    # If a typeagency maps to several indices, the first one is used.
    idx = matches.iloc[0]
    indices[type_name] = idx

In [None]:
# Collect one averaged coefficient vector per job whose checkpoint exists.
# Jobs without a checkpoint file are skipped; the number found is printed.
checkpoint_file = '{}/job{}/model-epoch={}.ckpt'
counter = 0
pred_coeffs = []

# Loop invariants: the rows to select and the device to load onto.
selected_type_idxs = list(indices.values())
cpu_device = torch.device('cpu')

for job_id in job_ids:
    checkpoint_file_formatted = checkpoint_file.format(results_dir, job_id, epoch)
    if not os.path.exists(checkpoint_file_formatted):
        continue
    counter += 1
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    checkpoint = torch.load(checkpoint_file_formatted, map_location=cpu_device)
    # Presumably `pt_layer` is laid out as (type, covariate) -- TODO confirm.
    coeffs = checkpoint['state_dict']['pt_layer']
    # Pick the rows for the configured types and average over them (dim 0).
    pred_coeffs.append(coeffs[selected_type_idxs].mean(dim=0))

print('num done = {}'.format(counter))

num done = 13


In [None]:
# Summarise the per-job coefficient vectors: mean across jobs and a 95%
# normal-approximation interval term for each covariate.
pred_coeffs = np.array(pred_coeffs)
num_jobs = pred_coeffs.shape[0]

# numpy's std defaults to ddof=0 (population std); dividing it by sqrt(n - 1)
# is algebraically the same as sample std (ddof=1) / sqrt(n), i.e. the
# standard error of the mean. 1.96 is the two-sided 95% normal critical value.
# NOTE(review): this is the +/- half-width, although the column is labelled
# "width"; with few jobs a t critical value would be wider -- confirm intent.
stderr_coeffs = 1.96 * pred_coeffs.std(axis=0) / np.sqrt(num_jobs - 1)

df = pd.DataFrame({
    'covariates': covariates,
    'mean_coeffs': pred_coeffs.mean(axis=0),
    '95%_CI_width': stderr_coeffs,
})

# Display the table ordered from largest to smallest mean coefficient.
df.sort_values(by='mean_coeffs', ascending=False)

Unnamed: 0,covariates,mean_coeffs,95%_CI_width
2,Bachelors degree population,0.233551,0.029734
5,Households occupied by renter,0.173637,0.027838
4,Median age,0.154986,0.010347
0,log(Population density),0.148471,0.035298
3,White population,-0.072002,0.013358
1,log(Median income),-0.125985,0.023547
6,Rating,-0.192985,0.009148
