### This code reads in all raw CSV files and merges them into a CSV that Stata can input easily 

 This code makes a dataset with 
 
$y$ = (2016 solar additions in MW)/(solar MW capacity built < 2016)


$x$ = (net metering, RPS, RPS w/Solar, avg. retail electricity price, solar index, regulated/not, population, other 2016 cap additions, community solar, region)

In [94]:
import pandas as pd
import numpy as np
import datetime
import json
import glob

# pd.set_option('display.max_rows', None)

In [95]:
StataFiles = '/Users/rachelanderson/Dropbox (Princeton)/Data for Tax Equity Project/StataDatasets/'
jsonDir = '/Users/rachelanderson/Dropbox (Princeton)/Data for Tax Equity Project/JSONFiles/'

### 1. Load in all raw CSV datafiles and merge...

### Clean generation data

In [96]:
# Import generating data (annual)

data = pd.ExcelFile('/Users/rachelanderson/Dropbox (Princeton)/Data for Tax Equity Project/eia_summary_data/annual_generation_state.xls')

ann_generation = data.parse(sheetname=data.sheet_names[0], skiprows=1)
ann_generation.columns = [x.lower().replace(" ","_") for x in ann_generation.columns]
tot_gen = ann_generation[ann_generation['type_of_producer'].str.contains('Total Electric Power')]

### Clean capacity data

In [97]:
# Import annual capacity data

cap_data = pd.read_csv('/Users/rachelanderson/Dropbox (Princeton)/Data for Tax Equity Project/eia_capacity_data/existcapacity_annual.csv', header=1)
cap_data.columns = [x.lower().replace(" ","_") for x in cap_data.columns]
cap_data['summer_cap']=pd.to_numeric(cap_data['summer_capacity_(megawatts)'].str.replace(",",""),errors='coerce')
cap_data['nameplate_cap']=pd.to_numeric(cap_data['nameplate_capacity_(megawatts)'].str.replace(",",""),errors='coerce')

In [98]:
solar_cap = cap_data[(cap_data['producer_type']=="Total Electric Power Industry") &
                     (cap_data['fuel_source'].str.contains("Solar"))]

In [99]:
solar_gen = tot_gen[tot_gen['energy_source']== "Solar Thermal and Photovoltaic"]

In [100]:
reg1_data = solar_cap.merge(solar_gen, left_on=['year','state_code'],right_on=['year','state'],how='inner')
reg1_data.to_csv('/Users/rachelanderson/Dropbox (Princeton)/Data for Tax Equity Project/StataDatasets/cap_gen.csv')

### Clean generator dataset

In [101]:
# Import generator data (survey of all generators in 2018)

gen_data = pd.read_csv('/Users/rachelanderson/Dropbox (Princeton)/Data for Tax Equity Project/eia_data/eia8602018/merged_new.csv')
gen_data = gen_data.drop(gen_data.columns[gen_data.columns.str.contains("Unnamed")], axis=1)
gen_data['operating_year'] = gen_data['operating_year'].astype(int)
gen_data['operating_month'] = gen_data['operating_month'].astype(int)
gen_data['operating_date'] = pd.to_datetime([f'{y}-{m}-01' for y, m in zip(gen_data.operating_year, gen_data.operating_month)])
gen_data['summer_cap']=pd.to_numeric(gen_data['summer_capacity_mw'].str.replace(",","").replace(" ",""))
gen_data['winter_cap']=pd.to_numeric(gen_data['winter_capacity_mw'].str.replace(",","").replace(" ",""))

  interactivity=interactivity, compiler=compiler, result=result)


In [102]:
solar = gen_data[gen_data['technology'] == "Solar Photovoltaic"]
solar = solar[solar['grant_program']=='none']

In [103]:
solar_cap_monthly = solar.groupby(['plant_state','operating_date','operating_month','operating_year'])[['summer_cap','nameplate_cap', 'winter_cap']].sum().reset_index()
res_price_monthly = price_data[['date', 'state', 'residential_price']]

KeyError: "['date'] not in index"

In [104]:
monthly_vars = solar_cap_monthly.merge(res_price_monthly, how='right', left_on = ['operating_date', 'plant_state'], right_on=['date','state'])
monthly_vars = monthly_vars.drop(columns=['plant_state','operating_date'], axis = 1)
monthly_vars=monthly_vars.fillna(0)

In [105]:
monthly_vars.to_csv('/Users/rachelanderson/Dropbox (Princeton)/Data for Tax Equity Project/monthly_data.csv')

### Merge all of the variables together

In [157]:
grant_data = pd.read_excel('../../../Data for Tax Equity Project/treasury_data/grant_awards.xlsx',header=1)

In [158]:
loan_funds = grant_data.groupby('State').Funded.sum().reset_index()

In [159]:
solar_cap_ann = solar.groupby(['plant_state','operating_year'])[['summer_cap','nameplate_cap', 'winter_cap']].sum().reset_index()

In [160]:
master_df = solar_cap_ann.merge(loan_funds, how='outer',left_on='plant_state',right_on='State')

In [161]:
price_data = pd.read_csv(StataFiles + 'RawCSV/monthly_prices_by_state.csv')
# take mean of monthly observations so they link up nicely with ann capacity data
avg_price_data=price_data.groupby(['state','year']).mean().reset_index()
avg_price_data.drop([x for x in tt.columns if "Unnamed" in x], axis=1, inplace=True)
avg_price_data = avg_price_data[~avg_price_data.isna()]
avg_price_data.to_csv(StataFiles + 'RawCSV/avg_prices_by_state.csv')

In [162]:
master_df = master_df.merge(avg_price_data, how='outer',left_on=['plant_state', 'operating_year'],right_on=['state','year'])

### Apply the recoding json dicts

In [163]:
recode_dict = {}
for fileName in glob.glob(jsonDir + "*.txt"):
    dict_name = fileName.split(jsonDir)[-1].split('.txt')[0]
    with open(fileName) as outfile:
        recode_dict[dict_name] = json.load(outfile)


In [164]:
master_df['regulated']=master_df['plant_state'].map(recode_dict['regulated_dict'])
master_df['region'] = master_df['plant_state'].map(recode_dict['region_dict'])
master_df['rps'] = master_df['plant_state'].map(recode_dict['rps_dict'])
master_df['net_meter'] = master_df['plant_state'].map(recode_dict['net_meter'])

In [165]:
master_df.drop([x for x in master_df.columns if "Unnamed" in x], axis=1,inplace=True)
master_df = master_df[~master_df['plant_state'].isna()]
# remove years with missing price data
master_df = master_df[master_df['operating_year']>=2009].sort_values('year')
master_df.rename(columns={'Funded': 'funding_1603'},inplace=True)
master_df.drop(['State','plant_state'],axis=1,inplace=True)

In [146]:
master_df.to_csv(StataFiles + 'master_dataset.csv')

In [166]:
master_df

Unnamed: 0,operating_year,summer_cap,nameplate_cap,winter_cap,funding_1603,state,year,month,residential_price,perc_residential_rev,perc_commercial_rev,perc_industrial_rev,regulated,region,rps,net_meter
158,2009.0,18.4,18.4,16.6,1064165000.0,NJ,2009.0,6.5,16.219167,0.408875,0.497263,0.090174,Deregulated,PJM,1.0,1.0
88,2009.0,9.0,9.0,9.0,1260238000.0,IL,2009.0,6.5,11.33,0.398281,0.364589,0.233601,Deregulated,MISO,1.0,0.0
215,2009.0,0.9,1.0,0.7,771235800.0,PA,2009.0,6.5,11.709167,0.444288,0.321858,0.228883,Deregulated,PJM,1.0,1.0
61,2009.0,25.0,25.0,25.0,381374800.0,FL,2009.0,6.5,12.399167,0.551099,0.38682,0.061736,Regulated,Southeast,0.0,1.0
11,2009.0,4.0,4.0,4.0,1444632000.0,AZ,2009.0,6.5,10.566667,0.489986,0.399502,0.110512,Regulated,Southwest,1.0,0.0
38,2009.0,5.9,6.1,5.9,545401300.0,CO,2009.0,6.5,9.976667,0.410045,0.384767,0.204349,Regulated,Southwest,1.0,1.0
26,2009.0,14.5,15.3,14.0,7045618000.0,CA,2009.0,6.5,14.665,0.386835,0.465643,0.145429,Deregulated,CAISO,1.0,1.0
107,2010.0,3.4,3.8,2.8,422410600.0,MA,2010.0,6.5,14.601667,0.382153,0.325723,0.28933,Deregulated,ISO-NE,1.0,1.0
62,2010.0,23.7,26.2,23.7,381374800.0,FL,2010.0,6.5,11.4575,0.569049,0.367319,0.06333,Regulated,Southeast,0.0,1.0
168,2010.0,30.0,30.6,30.0,230826300.0,NM,2010.0,6.5,10.475833,0.375655,0.41055,0.213795,Regulated,Southwest,1.0,0.0


In [141]:
price_data[price_data['state']=='WA']

Unnamed: 0.1,Unnamed: 0,year,month,state,residential_price,perc_residential_rev,perc_commercial_rev,perc_industrial_rev
46,11677,2009,1,WA,7.54,0.544498,0.312339,0.14315
96,11728,2009,2,WA,7.55,0.528611,0.325376,0.145999
146,11779,2009,3,WA,7.53,0.521001,0.328526,0.150457
196,11830,2009,4,WA,7.6,0.499511,0.341697,0.158774
246,11881,2009,5,WA,7.68,0.462142,0.362044,0.17579
296,11932,2009,6,WA,7.82,0.428867,0.38966,0.181459
346,11983,2009,7,WA,7.79,0.423274,0.375546,0.201164
396,12034,2009,8,WA,7.86,0.41069,0.382837,0.206404
446,12085,2009,9,WA,7.83,0.402977,0.392688,0.204296
496,12136,2009,10,WA,7.73,0.419596,0.379606,0.200766
