### This code reads in all raw CSV files and merges them into a CSV that Stata can input easily 

This code builds a dataset with the following outcome and explanatory variables:
 
$y$ = (2016 solar capacity additions in MW) / (solar capacity in MW built before 2016)


$x$ = (net metering, RPS, RPS w/Solar, avg. retail electricity price, solar index, regulated/not, population, other 2016 cap additions, community solar, region)

In [1]:
import pandas as pd
import numpy as np
import datetime
import json
import glob

# pd.set_option('display.max_rows', None)

In [95]:
# Absolute, machine-specific folders used by the cells below:
#   StataFiles - CSVs read from / written for the Stata analysis
#   jsonDir    - recoding dictionaries (JSON stored in *.txt files)
StataFiles, jsonDir = (
    '/Users/rachelanderson/Dropbox (Princeton)/Data for Tax Equity Project/StataDatasets/',
    '/Users/rachelanderson/Dropbox (Princeton)/Data for Tax Equity Project/JSONFiles/',
)

### 1. Load in all raw CSV datafiles and merge...

### Clean generation data

In [96]:
# Import generating data (annual)

data = pd.ExcelFile('/Users/rachelanderson/Dropbox (Princeton)/Data for Tax Equity Project/eia_summary_data/annual_generation_state.xls')

# Pass the sheet positionally: the legacy `sheetname=` keyword is no longer
# accepted by pandas, while the first positional argument works in every version.
ann_generation = data.parse(data.sheet_names[0], skiprows=1)
# Normalise headers: lower-case, spaces -> underscores.
ann_generation.columns = [x.lower().replace(" ", "_") for x in ann_generation.columns]
# Keep only the industry-wide total rows.  na=False makes blank producer cells
# count as non-matches instead of putting NaN into the boolean mask (which
# pandas rejects when indexing).
tot_gen = ann_generation[ann_generation['type_of_producer'].str.contains('Total Electric Power', na=False)]

### Clean capacity data

In [97]:
# Import annual capacity data (column names are taken from the second line of the file)

_capacity_csv = '/Users/rachelanderson/Dropbox (Princeton)/Data for Tax Equity Project/eia_capacity_data/existcapacity_annual.csv'
cap_data = pd.read_csv(_capacity_csv, header=1)
# Lower-case the headers and swap spaces for underscores.
cap_data.columns = [_name.lower().replace(" ","_") for _name in cap_data.columns]
# Build numeric copies of the two capacity columns: strip the thousands
# separators, then coerce anything unparseable to NaN.
for _numeric_col, _raw_col in (
    ('summer_cap', 'summer_capacity_(megawatts)'),
    ('nameplate_cap', 'nameplate_capacity_(megawatts)'),
):
    _without_commas = cap_data[_raw_col].str.replace(",","")
    cap_data[_numeric_col] = pd.to_numeric(_without_commas, errors='coerce')

In [98]:
# Capacity rows reported for the whole industry whose fuel source mentions "Solar".
_is_industry_total = cap_data['producer_type'] == "Total Electric Power Industry"
_is_solar_fuel = cap_data['fuel_source'].str.contains("Solar")
solar_cap = cap_data[_is_industry_total & _is_solar_fuel]

In [99]:
# Generation rows whose energy source is exactly solar thermal + PV.
solar_gen = tot_gen.loc[tot_gen['energy_source'].eq("Solar Thermal and Photovoltaic")]

In [100]:
# Pair each solar capacity row with the solar generation row for the same
# year and state; only state-years present in both tables are kept.
reg1_data = pd.merge(
    solar_cap,
    solar_gen,
    how='inner',
    left_on=['year', 'state_code'],
    right_on=['year', 'state'],
)
_cap_gen_csv = '/Users/rachelanderson/Dropbox (Princeton)/Data for Tax Equity Project/StataDatasets/cap_gen.csv'
reg1_data.to_csv(_cap_gen_csv)

### Clean generator dataset

In [3]:
# Import generator data (survey of all generators in 2018)

# low_memory=False parses each column in one pass so its dtype is inferred
# consistently rather than chunk by chunk.
# NOTE(review): the stray output line after this cell looks like the tail of a
# pandas mixed-dtype warning from this read -- confirm.
gen_data = pd.read_csv('/Users/rachelanderson/Dropbox (Princeton)/Data for Tax Equity Project/eia_data/eia8602018/merged_clean_address.csv', low_memory=False)
# Drop every column whose name contains "Unnamed" (presumably index columns
# left over from an earlier to_csv).
gen_data = gen_data.drop(gen_data.columns[gen_data.columns.str.contains("Unnamed")], axis=1)
gen_data['operating_year'] = gen_data['operating_year'].astype(int)
gen_data['operating_month'] = gen_data['operating_month'].astype(int)
# First day of the in-service month; used later as the monthly merge key.
gen_data['operating_date'] = pd.to_datetime([f'{y}-{m}-01' for y, m in zip(gen_data.operating_year, gen_data.operating_month)])
# Strip thousands separators and spaces before converting to numbers.  Both
# steps must go through the `.str` accessor: a plain `Series.replace(" ", "")`
# only rewrites cells that are exactly " " and leaves embedded spaces in place.
gen_data['summer_cap'] = pd.to_numeric(gen_data['summer_capacity_mw'].str.replace(",", "").str.replace(" ", ""))
gen_data['winter_cap'] = pd.to_numeric(gen_data['winter_capacity_mw'].str.replace(",", "").str.replace(" ", ""))

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# Solar PV generators whose grant_program is recorded as 'none'.
_is_solar_pv = gen_data['technology'] == "Solar Photovoltaic"
_has_no_grant = gen_data['grant_program'] == 'none'
solar = gen_data[_is_solar_pv & _has_no_grant]

In [13]:
# Sum nameplate capacity by technology, state and operating year, then save
# the result as a CSV.
_form860_totals = gen_data.groupby(['technology', 'plant_state', 'operating_year'])['nameplate_cap'].sum()
_form860_csv = "/Users/rachelanderson/Dropbox (Princeton)/Tax Equity Code/Stata_Analysis/MakeStataData/InputData/annual_capacity_form860.csv"
_form860_totals.reset_index().to_csv(_form860_csv)

In [6]:
# Solar capacity (summer / nameplate / winter) summed per state and operating month.
solar_cap_monthly = (
    solar.groupby(['plant_state', 'operating_date', 'operating_month', 'operating_year'])
    [['summer_cap', 'nameplate_cap', 'winter_cap']]
    .sum()
    .reset_index()
)
# Load the monthly price table here so this cell does not depend on a later
# cell having been run first (a top-to-bottom run would otherwise hit a
# NameError on price_data).
price_data = pd.read_csv(StataFiles + 'RawCSV/monthly_prices_by_state.csv')
# NOTE(review): 'date' is later merged against the datetime 'operating_date'
# column -- confirm the two share a dtype.
res_price_monthly = price_data[['date', 'state', 'residential_price']]

In [104]:
# Right join: every row of the price table is kept, matched on month and state.
# After dropping the duplicate key columns, every remaining missing value
# (e.g. months with no matching solar capacity) is set to 0.
_monthly_joined = pd.merge(
    solar_cap_monthly,
    res_price_monthly,
    how='right',
    left_on=['operating_date', 'plant_state'],
    right_on=['date', 'state'],
)
monthly_vars = _monthly_joined.drop(['plant_state', 'operating_date'], axis=1).fillna(0)

In [105]:
# Write the monthly table to disk.
_monthly_csv = '/Users/rachelanderson/Dropbox (Princeton)/Data for Tax Equity Project/monthly_data.csv'
monthly_vars.to_csv(_monthly_csv)

### Merge all of the variables together

In [157]:
# Grant awards workbook; column names are taken from the second row.
_grant_xlsx = '../../../Data for Tax Equity Project/treasury_data/grant_awards.xlsx'
grant_data = pd.read_excel(_grant_xlsx, header=1)

In [158]:
# Total of the 'Funded' column per state, returned as a two-column frame.
_funded_by_state = grant_data.groupby('State')['Funded'].sum()
loan_funds = _funded_by_state.reset_index()

In [159]:
# Solar capacity (summer / nameplate / winter) summed per state and operating year.
_capacity_columns = ['summer_cap','nameplate_cap', 'winter_cap']
_by_state_year = solar.groupby(['plant_state','operating_year'])
solar_cap_ann = _by_state_year[_capacity_columns].sum().reset_index()

In [160]:
# Attach per-state funding totals to the annual capacity rows; the outer join
# keeps states that appear in only one of the two tables.
master_df = pd.merge(
    solar_cap_ann,
    loan_funds,
    how='outer',
    left_on='plant_state',
    right_on='State',
)

In [161]:
price_data = pd.read_csv(StataFiles + 'RawCSV/monthly_prices_by_state.csv')
# take mean of monthly observations so they link up nicely with ann capacity data.
# numeric_only=True skips text columns, which recent pandas refuses to average.
avg_price_data = price_data.groupby(['state', 'year']).mean(numeric_only=True).reset_index()
# Drop leftover "Unnamed" columns, looked up on avg_price_data itself.
avg_price_data = avg_price_data.drop(
    [x for x in avg_price_data.columns if "Unnamed" in x], axis=1
)
# NOTE(review): no missing-value filter is applied here.  Indexing a frame
# with `~frame.isna()` only masks cell by cell and leaves the rows unchanged,
# so it is omitted; add an explicit dropna(subset=[...]) if rows with missing
# prices should be removed.
avg_price_data.to_csv(StataFiles + 'RawCSV/avg_prices_by_state.csv')

In [162]:
# Add the annual average prices, matching state and year; the outer join keeps
# state-years that appear in only one of the two tables.
master_df = pd.merge(
    master_df,
    avg_price_data,
    how='outer',
    left_on=['plant_state', 'operating_year'],
    right_on=['state', 'year'],
)

### Apply the recoding json dicts

In [163]:
# Load every recoding dictionary: <jsonDir>/<name>.txt  ->  recode_dict[<name>]
recode_dict = {}
for _json_path in glob.glob(jsonDir + "*.txt"):
    # Strip the directory prefix, then the '.txt' suffix, to get the key.
    _file_part = _json_path.split(jsonDir)[-1]
    _key = _file_part.split('.txt')[0]
    with open(_json_path) as _handle:
        _parsed = json.load(_handle)
    recode_dict[_key] = _parsed


In [164]:
# Derive state-level attributes by mapping plant_state through each recoding
# dictionary: (new column, key in recode_dict).
for _new_column, _dict_key in (
    ('regulated', 'regulated_dict'),
    ('region', 'region_dict'),
    ('rps', 'rps_dict'),
    ('net_meter', 'net_meter'),
):
    master_df[_new_column] = master_df['plant_state'].map(recode_dict[_dict_key])

In [165]:
# Drop any "Unnamed" columns.
_unnamed_columns = [_col for _col in master_df.columns if "Unnamed" in _col]
master_df = master_df.drop(_unnamed_columns, axis=1)
# Keep only rows that have a plant_state.
master_df = master_df[master_df['plant_state'].notna()]
# remove years with missing price data
master_df = master_df[master_df['operating_year'] >= 2009]
master_df = master_df.sort_values('year')
master_df = master_df.rename(columns={'Funded': 'funding_1603'})
# Drop two of the state key columns, State and plant_state.
master_df = master_df.drop(['State', 'plant_state'], axis=1)

### Save the csv for stata analysis

In [167]:
# Final output consumed by the Stata analysis.
_master_csv = StataFiles + 'master_dataset.csv'
master_df.to_csv(_master_csv)