In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
import os
import shapely.wkt
import pyproj
from pyproj import Geod
import shapely
from shapely.geometry import box, Polygon, Point
from shapely.ops import orient
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import multiprocessing as mp
import time

In [2]:
OAK_FP = '/oak/stanford/groups/deho/building_compliance/'
OUTPUT_FP = os.path.join(OAK_FP, 'outputs/Population-Estimates/outputs/')
cf_fp = '/oak/stanford/groups/deho/building_compliance/CloudFactory/'
fp = '/oak/stanford/groups/deho/building_compliance/outputs/Population-Estimates/final/'

In [3]:
df_processed = pd.read_csv(os.path.join(cf_fp, 'results', 'Batch3', 'processed_buildings.csv'), 
                           dtype={'apn': str})

subset = df_processed[(df_processed['2016_a'] >= 11.15) | (df_processed['2020_a'] >= 11.15)]

## Merge with sampling bins

In [5]:
bins = pd.read_csv(os.path.join(OUTPUT_FP, 'Strata/sj_parcels_bins.csv'))
bins_sample = pd.read_csv(os.path.join(OUTPUT_FP, '../CloudFactory/Iteration1/sampled_APNs_bins.csv'))
bins_sample = pd.read_csv(os.path.join(OUTPUT_FP, '../CloudFactory/Iteration3/complete_sampled_APNs.csv'))

In [6]:
bins_sample['Bin'] = bins_sample['Bin'].apply(lambda x: x.split('.')[0])

## 1. Bootstrap CIs (parallelized)

Define helper functions

In [6]:
def make_global(bins_all_res, bins_sample, merged, col):
    global bins_all_res_global
    global merged_global
    global bins_sample_global
    global col_global

    bins_all_res_global = bins_all_res
    merged_global = merged
    bins_sample_global = bins_sample
    col_global = col
    
def driver(i):
    bin_n = bins_all_res_global['Bin'].value_counts()
    bin_sample_n = bins_sample_global['Bin'].value_counts()
    unique_bins = list(bins_all_res_global['Bin'].unique())
    sampled_unique_bins = list(bins_sample_global['Bin'].unique())

    results_df = pd.DataFrame(columns=['bin', 't', 'ybar', 'N', 'n', 'var', 'tot_var'])
    for b in unique_bins:
        if b in sampled_unique_bins: # we drop 2 bins
            np.random.seed((os.getpid() * int(time.time())) % 123456789)
            bootstrap = bins_sample[bins_sample['Bin'] == b].sample(frac=1.0, replace=True)
            value_counts = bootstrap['APN'].value_counts()

            overlap = list(set(bootstrap['APN'].unique()) & set(merged_global['APN'].unique()))
            overlap_counts = value_counts.loc[overlap]
            sub = pd.DataFrame(columns=merged_global.columns)

            for idx, val in overlap_counts.iteritems():
                for i in range(val):
                    sub.loc[len(sub)] = merged_global[merged_global['APN'] == idx].iloc[0].tolist()

            sub = sub[[col_global, 'APN']].groupby('APN').agg(list)
            sub[col_global] = sub[col_global].apply(lambda x: np.sum(x))

            n_j = bin_sample_n[b]
            N_j = bin_n[b]

            y_avg_j = sub[col_global].sum()/n_j
            t_j = N_j*y_avg_j
            if n_j == 1:
                n_j += 1 # safeguard against div by 0 error
            var_j = np.sum(np.square(np.array(sub[col_global].tolist() + \
                                          [0] * (n_j-len(sub))) - y_avg_j)) / (n_j - 1)
            total_var_j = (1-(n_j/N_j))*N_j*N_j*(var_j/n_j)

#             t += t_j
#             total_var += total_var_j
            results_df.loc[len(results_df)] = [b, t_j, y_avg_j, N_j, n_j, var_j, total_var_j]

    return results_df

def bootstrap_mp(subset, bins_all_res, bins_sample, n, col):
    
    merged = subset[['apn', '2016', '2020', 'change', 
                 'construction', 'change_area1.2']].merge(bins_all_res, left_on='apn', 
                                                       right_on='APN', how='left').drop(columns=['apn'])
    
    nprocs = mp.cpu_count()
    
    
    p = mp.Pool(processes=nprocs, initializer=make_global, 
                initargs=(bins_all_res, bins_sample, merged, col, ))
    
    results_df = pd.DataFrame(columns=['bin', 't', 'ybar', 'N', 'n', 'var', 'tot_var'])
    for x in tqdm(p.imap_unordered(driver, range(n)), total=n):
        results_df = pd.concat([results_df, x])
    
    return results_df

**IF RUNNING: save should be FALSE**

In [None]:
save = False

In [43]:
col = 'construction'
results_df = bootstrap_mp(subset, bins, bins_sample, 1000, col)

if save:
    results_df.to_csv(os.path.join(fp, 'batch3', f'bin_estimates_boostrap_{col}.csv'), index=False)

100%|██████████| 1000/1000 [27:46<00:00,  1.67s/it]


In [44]:
col = 'change'
results_df = bootstrap_mp(subset, bins, bins_sample, 1000, col)

if save:
    results_df.to_csv(os.path.join(fp, 'batch3', f'bin_estimates_boostrap_{col}.csv'), index=False)

100%|██████████| 1000/1000 [28:11<00:00,  1.69s/it]


In [45]:
col = 'change_area1.2'
results_df = bootstrap_mp(subset, bins, bins_sample, 1000, col)

if save:
    results_df.to_csv(os.path.join(fp, 'batch3', f'bin_estimates_boostrap_{col}.csv'), index=False)

100%|██████████| 1000/1000 [27:48<00:00,  1.67s/it]


## 2. PopEstimate CIs (not parallelized)

In [48]:
bin_n = bins['Bin'].value_counts()
bin_sample_n = bins_sample['Bin'].value_counts()

unique_bins = list(bins['Bin'].unique())
sampled_unique_bins = list(bins_sample['Bin'].unique())

merged = subset[['apn', '2016', '2020', 'change', 
                 'construction', 'change_area1.2']].merge(bins, left_on='apn', 
                                                       right_on='APN', how='left').drop(columns=['apn'])

In [46]:
def run_popestimate(merged, col, save_df=None):
    results_df = pd.DataFrame(columns=['bin', 't', 'ybar', 'N', 'n', 'var', 'tot_var'])
    t = 0
    total_var = 0
    t_list = []
    for b in unique_bins:
        if b in sampled_unique_bins: # we drop 2 bins
            sub = merged[merged['Bin'] == b]
            sub = sub[[col, 'APN']].groupby('APN').agg(list)
            sub[col] = sub[col].apply(lambda x: np.sum(x))

            n_j = bin_sample_n[b]
            N_j = bin_n[b]

            y_avg_j = sub[col].sum()/n_j
            t_j = N_j*y_avg_j
            if n_j == 1:
                n_j += 1 # safeguard against div by 0 error
            var_j = np.sum(np.square(np.array(sub[col].tolist() + \
                                          [0] * (n_j-len(sub))) - y_avg_j)) / (n_j - 1)
            total_var_j = (1-(n_j/N_j))*N_j*N_j*(var_j/n_j)
            t_list.append(t_j)
            t += t_j
            total_var += total_var_j
            results_df.loc[len(results_df)] = [b, t_j, y_avg_j, N_j, n_j, var_j, total_var_j]

    if save_df is not None:
        results_df.to_csv(save_df, index=False)
        
    return t, total_var

In [54]:
col = 'change_area1.2'
t, total_var = run_popestimate(merged, col, os.path.join(fp, 'batch3', f'bin_estimates_{col}.csv'))

In [None]:
col = 'change'
t, total_var = run_popestimate(merged, col, os.path.join(fp, 'batch3', f'bin_estimates_{col}.csv'))

In [None]:
col = 'construction'
t, total_var = run_popestimate(merged, col, os.path.join(fp, 'batch3', f'bin_estimates_{col}.csv'))