In [1]:
import urllib.request as urllib
import zipfile
import pandas as pd
pd.set_option('display.max_columns', None)
import os
import matplotlib.pyplot as plt
import time
import numpy as np

In [2]:
# csv parameter options:
# csv=0 --> AMI State, Counties, Cities 2018.csv
# csv=1 --> AMI Census Tracts 2018.csv
# csv=2 --> FPL State, Counties, Cities 2018.csv
# csv=3 --> FPL Census Tracts 2018.csv
# csv=4 --> SMI State, Counties, Cities 2018.csv
# csv=5 --> SMI Census Tracts 2018.csv

def get_LEAD_DataFrame(state_abbrev, csv=1):
    try:
        url = 'https://data.openei.org/files/573/' + state_abbrev + '-2018-LEAD-data.zip'
    except:
        url = 'https://data.openei.org/files/573/' + state_abbrev + '-2018-LEAD-data%20(1).zip'
    filehandle, _ = urllib.urlretrieve(url)
    zip_file_object = zipfile.ZipFile(filehandle, 'r')
    my_file = zip_file_object.namelist()[csv]
    df = pd.read_csv(zip_file_object.open(my_file))
    zip_file_object.close()
    return df

In [3]:
df = get_LEAD_DataFrame('DC')
df.head()

Unnamed: 0,ABV,FIP,TEN,YBL6,BLD,HFL,AMI68,UNITS,HINCP*UNITS,ELEP*UNITS,GASP*UNITS,FULP*UNITS,HINCP UNITS,ELEP UNITS,GASP UNITS,FULP UNITS,HCOUNT,ECOUNT,GCOUNT,FCOUNT,HINCP,ELEP,GASP,FULP
0,DC,11001000100,OWNER,1940-59,1 ATTACHED,BOTTLED GAS,0-30%,3.177028e-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,
1,DC,11001000100,OWNER,1940-59,1 ATTACHED,BOTTLED GAS,100%+,1.008098e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,
2,DC,11001000100,OWNER,1940-59,1 ATTACHED,BOTTLED GAS,30-60%,3.775287e-09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,
3,DC,11001000100,OWNER,1940-59,1 ATTACHED,BOTTLED GAS,60-80%,1.124989e-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,
4,DC,11001000100,OWNER,1940-59,1 ATTACHED,BOTTLED GAS,80-100%,2.186435e-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,


In [21]:
def get_expenditures_df(sa):
    df = get_LEAD_DataFrame(sa)
    fips = list(df.FIP.unique())
    avg_elec = [0 for _ in fips]
    avg_gas = [0 for _ in fips]
    avg_fuel = [0 for _ in fips]
    avg_nrg = [0 for _ in fips]
    avg_inc = [0 for _ in fips]
    units = [0 for _ in fips]
    nrg_burd = [0 for _ in fips]
    for i in range(len(fips)):
        df2 = df[df['FIP']==fips[i]]
        if(sum(df2['UNITS']>0)):
            avg_elec[i] = round(sum(df2['ELEP*UNITS'])/sum(df2['UNITS']),1)
            avg_gas[i] = round(sum(df2['GASP*UNITS'])/sum(df2['UNITS']),1)
            avg_fuel[i] = round(sum(df2['FULP*UNITS'])/sum(df2['UNITS']),1)
            avg_inc[i] = round(sum(df2['HINCP*UNITS'])/sum(df2['UNITS']))
            avg_nrg[i] = round((sum(df2['ELEP*UNITS'])+sum(df2['GASP*UNITS'])+sum(df2['FULP*UNITS']))/sum(df2['UNITS']))
            units[i] = round(sum(df2.UNITS))
            if(avg_inc[i]>0):
                nrg_burd[i] = avg_nrg[i]/avg_inc[i]
    exp = pd.DataFrame({'FIP':fips,'Total Units':units,'Average Household Income':avg_inc,'Annual Energy Expenditures':avg_nrg,'Energy Burden':nrg_burd,'Annual Electricity Expenditures':avg_elec,'Annual Gas Expenditures':avg_gas,'Annual Fuel Oil Expenditures':avg_fuel})
    
    return exp

In [None]:
for s in ['DC']:
    exp = get_expenditures_df(s)
    exp.to_csv('Tracts/Energy Expenditures/'s+'_NREL_LEAD_Energy_Expenditures.csv')

In [8]:
tenures = ['OWNER','RENTER']
yearbuilts = ['1940-59', '1960-79', '1980-99', '2000-09', '2010+', 'BEFORE 1940']
bldtypes = ['1 ATTACHED', '1 DETACHED', 'MOBILE_TRAILER', '10-19 UNIT',
       '2 UNIT', '3-4 UNIT', '5-9 UNIT', 'BOAT_RV_VAN', '50+ UNIT',
       '20-49 UNIT']
heatfuels = ['BOTTLED GAS', 'ELECTRICITY', 'OTHER', 'SOLAR', 'UTILITY GAS',
       'WOOD', 'COAL', 'NONE', 'FUEL OIL']
fpls = ['0-100%', '100-150%', '150-200%', '200-400%', '400%+']

In [28]:
def process_numpy_all_categories(df_np,sa):
    columns = ['State','Tenure','Year Built','Structure Type','Heating Fuel',
           'Federal Poverty Line','Units','HINCP*UNITS','ELEP*UNITS','GASP*UNITS','FULP*UNITS']
    final_arr = [np.array([0 for _ in range(11)])]
    for t in tenures:
        for y in yearbuilts:
            for b in bldtypes:
                for h in heatfuels:
                    for f in fpls:
                        temp_df = df_np[(df_np[:,2]==t)&(df_np[:,3]==y)&(df_np[:,4]==b)&(df_np[:,5]==h)&(df_np[:,6]==f), :]
                        temp_arr = [np.array([sa, t, y, b, h, f]+[round(sum(temp_df[:,i]),3) for i in range(7,12)])]
                        final_arr = np.append(final_arr, temp_arr, axis=0)
    df_final = pd.DataFrame(final_arr, columns = columns)
    df_final.drop([0],inplace=True)
    hincp, elep, gasp, fulp = [],[],[],[]
    df_final['Household Income'] = [round(float(df_final['HINCP*UNITS'][i])/float(df_final['Units'][i]),2) if float(df_final['Units'][i])>0 else 0 for i in df_final.index]
    df_final['Electricity Payments ($)'] = [round(float(df_final['ELEP*UNITS'][i])/float(df_final['Units'][i]),2) if float(df_final['Units'][i])>0 else 0 for i in df_final.index]
    df_final['Gas Payments ($)'] = [round(float(df_final['GASP*UNITS'][i])/float(df_final['Units'][i]),2) if float(df_final['Units'][i])>0 else 0 for i in df_final.index]
    df_final['Fuel Oil Payments ($)'] = [round(float(df_final['FULP*UNITS'][i])/float(df_final['Units'][i]),2) if float(df_final['Units'][i])>0 else 0 for i in df_final.index]
    df_final = df_final.drop(columns=['HINCP*UNITS','ELEP*UNITS','GASP*UNITS','FULP*UNITS'])
    df_final['Energy Burden (% Income)'] = [round(sum(df_final.iloc[i,8:])/float(df_final.iloc[i,7]),4)*100 for i in range(len(df_final.index))]
    
    return df_final

In [29]:
def process_numpy_single_category(df_np,sa):
    columns2 = ['State','Metric Type','Metric Name','Units','HINCP*UNITS','ELEP*UNITS','GASP*UNITS','FULP*UNITS']
    final_arr2 = [np.array([0 for _ in range(8)])]
    for t in tenures:
        temp_arr = [np.array([sa, 'Tenure', t]+[round(sum(df_np[(df_np[:,2]==t),:][:,i]),3) for i in range(7,12)])]
        final_arr2 = np.append(final_arr2, temp_arr, axis=0)
    for y in yearbuilts:
        temp_arr = [np.array([sa, 'Year Built', y]+[round(sum(df_np[(df_np[:,3]==y),:][:,i]),3) for i in range(7,12)])]
        final_arr2 = np.append(final_arr2, temp_arr, axis=0)
    for b in bldtypes:
        temp_arr = [np.array([sa, 'Building Type', b]+[round(sum(df_np[(df_np[:,4]==b),:][:,i]),3) for i in range(7,12)])]
        final_arr2 = np.append(final_arr2, temp_arr, axis=0)
    for h in heatfuels:
        temp_arr = [np.array([sa, 'Heating Fuel', h]+[round(sum(df_np[(df_np[:,5]==h),:][:,i]),3) for i in range(7,12)])]
        final_arr2 = np.append(final_arr2, temp_arr, axis=0)
    for f in fpls:
        temp_arr = [np.array([sa, 'Percent of FPL', f]+[round(sum(df_np[(df_np[:,6]==f),:][:,i]),3) for i in range(7,12)])]
        final_arr2 = np.append(final_arr2, temp_arr, axis=0)
    df_final2 = pd.DataFrame(final_arr2, columns = columns2)
    df_final2.drop([0],inplace=True)
    hincp, elep, gasp, fulp = [],[],[],[]
    df_final2['Household Income'] = [round(float(df_final2['HINCP*UNITS'][i])/float(df_final2['Units'][i]),2) if float(df_final2['Units'][i])>0 else 0 for i in df_final2.index]
    df_final2['Electricity Payments ($)'] = [round(float(df_final2['ELEP*UNITS'][i])/float(df_final2['Units'][i]),2) if float(df_final2['Units'][i])>0 else 0 for i in df_final2.index]
    df_final2['Gas Payments ($)'] = [round(float(df_final2['GASP*UNITS'][i])/float(df_final2['Units'][i]),2) if float(df_final2['Units'][i])>0 else 0 for i in df_final2.index]
    df_final2['Fuel Oil Payments ($)'] = [round(float(df_final2['FULP*UNITS'][i])/float(df_final2['Units'][i]),2) if float(df_final2['Units'][i])>0 else 0 for i in df_final2.index]
    df_final2 = df_final2.drop(columns=['HINCP*UNITS','ELEP*UNITS','GASP*UNITS','FULP*UNITS'])
    df_final2['Energy Burden (% Income)'] = [round(sum(df_final2.iloc[i,5:])/float(df_final2.iloc[i,4]),4)*100 for i in range(len(df_final2.index))]
    
    return df_final2

In [33]:
def create_LEAD_state_csv(sa):
    st1 = time.time()
    df = get_LEAD_DataFrame(sa, csv=3)
    print('Time for CSV Download:', round(time.time()-st1,2),'Seconds')
    
    df_np = df.to_numpy()
    
    st2 = time.time()
    df_one = process_numpy_single_category(df_np, sa)
    print('Time for single category DF:', round(time.time()-st2,2),'Seconds')
    
    st3 = time.time()
    df_all = process_numpy_all_categories(df_np, sa)
    print('Time for all Categories DF:', round(time.time()-st2,3),'Seconds')
    
    return df_all, df_one

In [38]:
states = ["AL", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA", 
          "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", "MA", "MI", "MN", "MS", 
          "MO", "MT", "NE", "NV", "NH", "NJ", "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", 
          "RI", "SC", "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]

In [44]:
for sa in states[5:]:
    print(sa)
    stst = time.time()
    df_all, df_one = create_LEAD_state_csv(sa)
    df_all.to_csv('States/'+sa+'_LEAD_All_Categories.csv')
    df_one.to_csv('States/'+sa+'_LEAD_Summary.csv')
    print(sa, 'completed, time: ',round(time.time()-stst,3),'seconds')

AR
Time for CSV Download: 34.86 Seconds
Time for single category DF: 8.41 Seconds




Time for all Categories DF: 210.29 Seconds
AR completed, time:  245.914 seconds
CA
Time for CSV Download: 162.23 Seconds
Time for single category DF: 263.5 Seconds
Time for all Categories DF: 3263.353 Seconds
CA completed, time:  3721.605 seconds
CO
Time for CSV Download: 28.76 Seconds
Time for single category DF: 11.95 Seconds
Time for all Categories DF: 377.515 Seconds
CO completed, time:  407.855 seconds
