In [425]:
import pandas as pd
import numpy as np
import json
from plotnine import *
import matplotlib as mpl
mpl.rcParams['figure.dpi']= 200

%matplotlib inline



pd.options.display.max_rows = 999


In [454]:
dataDir = '/Users/rachelanderson/Dropbox (Princeton)/Data for Tax Equity Project/eia_860m/'
jsonDir = '/Users/rachelanderson/Dropbox (Princeton)/Data for Tax Equity Project/JSONFiles/'
figDir = '/Users/rachelanderson/Dropbox (Princeton)/newFigDir/'

In [383]:
with open(jsonDir + 'final_parent_dict.json') as outfile:
    parent_map = json.load(outfile)

In [384]:
#import data
data = pd.ExcelFile(dataDir + 'february_generator2020.xlsx')
df = data.parse(sheetname=data.sheet_names[0], skiprows=1)

#clean column names
df.columns = [x.lower().replace(" ","_") for x in df.columns]
df.columns = [x.split('_(mw)')[:-1][0] if "(mw)" in x else x for x in df.columns]

In [385]:
df['company_name'] = df.entity_name.str.lower()
df['parent_company'] = df.company_name.map(parent_map)

In [386]:
# merge in the plant data to get ferc status

plant_df = pd.read_excel("/Users/rachelanderson/Dropbox (Princeton)/Data for Tax Equity Project/eia_data/eia8602018/2___Plant_Y2018.xlsx")
plant_df.columns = plant_df.iloc[0]
plant_df.columns = plant_df.columns.str.replace(" ","_").str.lower()

common_cols = []
for x in df.columns:
    if x in plant_df.columns:
        if "plant" not in x:
            common_cols.append(x)
            
gen_df = plant_df.drop(common_cols,axis=1).merge(df, on='plant_name')

## make tables for select states

In [444]:
gen_df['purpa_qf'] = (gen_df['ferc_small_power_producer_status']=='Y')
solar = gen_df[(gen_df['technology']=="Solar Photovoltaic")]
solar['cap_bin'] = pd.cut(solar.nameplate_capacity,bins=[0,5,20,80, max(solar.nameplate_capacity)], labels=['0-5','6-20','21-80','80+'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [445]:
state_by_year = solar.groupby(['state','operating_year']).nameplate_capacity.sum().reset_index()
state_by_yr_tab = state_by_year.pivot(index='state', columns='operating_year',values='nameplate_capacity').fillna(0)
state_by_yr_tab.columns = [int(x) for x in state_by_yr_tab.columns]

In [446]:
# barely any solar pre-2008 so consolidate in 1 column
pre_2008 = solar[solar['operating_year']<=2008].groupby('state').nameplate_capacity.sum()
state_tab = state_by_yr_tab.drop([x for x in state_by_yr_tab.columns if x<=2008],axis=1)
state_tab.insert(column='pre_2008',value=pre_2008, loc=0)
state_tab['pre_2008'] = state_tab['pre_2008'].fillna(0)

In [447]:
# add cumsum column 
state_tab['tot_cap'] = solar.groupby('state').nameplate_capacity.sum()

# add perc_qf column
state_tab['qf_cap'] = solar[solar['purpa_qf']].groupby('state').nameplate_capacity.sum().fillna(0)
state_tab['perc_qf']=(solar[solar['purpa_qf']].groupby('state').nameplate_capacity.sum()/state_tab.tot_cap).fillna(0)


In [448]:
top_solar_states = state_tab.sort_values(by='tot_cap',ascending=False).head(9).index

In [451]:
top_solar=solar[(solar['state'].isin(top_solar_states))&(solar['operating_year']>2010)][['state','nameplate_capacity','cap_bin','operating_year']]

In [457]:
states = ggplot(top_solar, aes(x='cap_bin',fill='state')) + geom_histogram(bins=25) + facet_wrap(['state'],scales='free') + xlab("Nameplate capacity \n") + theme(legend_title =element_blank())
ggplot.save(states,figDir + 'top_states.png', dpi=300)



In [414]:
top_developers = solar.groupby('parent_company').nameplate_capacity.sum().sort_values(ascending=False).head(9)

In [459]:
top_develop = solar[solar['parent_company'].isin(top_developers.index)]
temp_util_map = {10: '8me',
                95: 'NextEra',
                1248: 'ConEd',
                128: 'Southern',
                320: 'Berkshire Hathaway',
                227: 'Sustainable Power Group',
                566: 'CD Arevon',
                1442: 'Dominion',
                1348: 'Cypress Creek Renewables'}
top_develop['parent_name'] = top_develop.parent_company.map(temp_util_map)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [427]:
plot_one = ggplot(top_develop, aes(x='nameplate_capacity',fill='parent_name')) + geom_histogram(bins=25) + facet_wrap(['parent_name'],scales='free_y')  
plotnine.ggplot.save(plot_one,'/Users/rachelanderson/Dropbox (Princeton)/Figures/top_developer_size.png', dpi=300)



In [465]:
t= ggplot(top_develop, aes(x='cap_bin',fill='parent_name')) + geom_histogram() + facet_wrap(['parent_name'],scales='free_y') 
ggplot.save(t,figDir + 'developer_bins.png',dpi=300)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
