## Code to clean up energy source variables to identify renewables 

In [3]:
import ast

import numpy as np
import pandas as pd
%matplotlib inline

In [5]:
# Load the master list of proposed generators and discard the 'Unnamed: 0'
# column (presumably an index saved by an earlier to_csv -- confirm).
proposed_gen_data = (
    pd.read_csv('../../Clean Data/proposed_gen_master_list.csv')
    .drop(columns='Unnamed: 0')
)

  interactivity=interactivity, compiler=compiler, result=result)


In [175]:
# Lookup tables translating the raw prime mover and energy source codes into
# readable labels, following https://www.eia.gov/survey/form/eia_860/instructions.pdf
# Codes that share a label are expanded with dict.fromkeys; insertion order is
# the same as listing every code one by one.
prime_mover_dict = {
    **dict.fromkeys(['BA', 'CE', 'CP', 'FW', 'ES'], 'storage'),
    'ST': 'steam turbine',
    'GT': 'gas turbine',
    'IC': 'combustion engine',
    **dict.fromkeys(['CA', 'CT', 'CS', 'CC'], 'combined cycle'),
    **dict.fromkeys(['HA', 'HB', 'HK', 'HY'], 'hydro'),
    'PS': 'pumped storage',
    'BT': 'binary turbine',
    'PV': 'PV',
    'WT': 'onshore wind',
    'WS': 'offshore wind',
    'FC': 'fuel cell',
    'OT': 'other',
}

energy_source_dict = {
    # coal ('RC' keeps its own 'refined coal' label)
    **dict.fromkeys(['ANT', 'BIT', 'LIG', 'SGC', 'SUB', 'WC'], 'coal'),
    'RC': 'refined coal',
    'SC': 'coal',

    # petroleum
    **dict.fromkeys(
        ['DFO', 'JF', 'KER', 'PC', 'PG', 'RFO', 'SGP', 'WO'], 'petroleum'
    ),

    # natural gas + other gases
    **dict.fromkeys(['BFG', 'NG', 'OG', 'SG'], 'gas'),

    # solid renewable fuels (bio)
    **dict.fromkeys(['AB', 'MSW', 'OBS', 'WDS'], 'bio'),

    # liquid renewable (biomass) fuels
    **dict.fromkeys(['OBL', 'SLW', 'BLQ', 'WDL'], 'bio'),

    # gaseous renewable (biomass) fuels
    **dict.fromkeys(['LFG', 'OBG'], 'bio'),

    # all other renewables
    'SUN': 'solar',
    'WND': 'wind',
    'GEO': 'geothermal',
    'WAT': 'hydro',  # also used for pumped storage

    # other sources
    'NUC': 'nuclear',
    'PUR': 'steam',
    'WH': 'waste heat',  # unknown fuel source?
    'TDF': 'tires',  # tire-derived source??
    'MWH': 'storage',
    'OTH': 'other',
}

# Energy source codes that flag a generator as renewable.
# Note that none of the 'bio' codes above are included.
renewList = ['SUN', 'WND', 'GEO', 'WAT']

In [255]:
def _parse_energy_codes(raw):
    """Turn one stringified list such as "['NG', nan, nan]" into ['NG'].

    Anything that is not a string (e.g. a missing cell), or that does not
    parse to a list/tuple, yields an empty list.
    """
    if not isinstance(raw, str):
        return []
    # 'nan' is not a Python literal, so swap it for the sentinel 0 before
    # parsing. literal_eval only accepts literals, so text in the data can
    # never be executed as code (unlike eval).
    parsed = ast.literal_eval(raw.replace('nan', '0'))
    if not isinstance(parsed, (list, tuple)):
        return []
    return [code for code in parsed if code != 0]


def id_sources(df, prime_mover_col, energy_source_col,
               prime_mover_map=None, energy_source_map=None, renewables=None):
    """Add readable prime mover / energy source columns to ``df``.

    The frame is modified in place and also returned. Four columns are added:

    * ``pm_clean``       -- ``prime_mover_col`` mapped through ``prime_mover_map``
                            (codes missing from the map become NaN).
    * ``sources``        -- list of labels, one per non-missing code found in
                            ``energy_source_col``.
    * ``renew``          -- True when any of the row's codes is in ``renewables``.
    * ``primary_source`` -- first entry of ``sources``, or "" when it is empty.

    Parameters
    ----------
    df : pandas.DataFrame
    prime_mover_col : str
        Column holding prime mover codes.
    energy_source_col : str
        Column holding stringified lists of energy source codes,
        e.g. "['NG', nan, nan]".
    prime_mover_map, energy_source_map : dict, optional
        Code -> label lookups. Default to the notebook-level
        ``prime_mover_dict`` and ``energy_source_dict``.
    renewables : iterable of str, optional
        Codes counted as renewable. Defaults to the notebook-level ``renewList``.

    Raises
    ------
    KeyError
        If an energy source code is absent from ``energy_source_map``.
    ValueError, SyntaxError
        If a cell of ``energy_source_col`` is not a valid Python literal.
    """
    if prime_mover_map is None:
        prime_mover_map = prime_mover_dict
    if energy_source_map is None:
        energy_source_map = energy_source_dict
    if renewables is None:
        renewables = renewList
    renewable_codes = set(renewables)

    df['pm_clean'] = df[prime_mover_col].map(prime_mover_map)

    sources = []
    renew = []
    primary_source = []
    for raw in df[energy_source_col]:
        codes = _parse_energy_codes(raw)
        labels = [energy_source_map[code] for code in codes]
        sources.append(labels)
        renew.append(any(code in renewable_codes for code in codes))
        primary_source.append(labels[0] if labels else "")

    df['sources'] = sources
    df['renew'] = renew
    df['primary_source'] = primary_source
    return df

In [256]:
# Keep only rows whose 'year' is 2008 or later. The explicit .copy() detaches
# the filtered frame from the original, so the columns that id_sources adds in
# place are written to an independent frame rather than to a slice (which is
# what triggers pandas' SettingWithCopyWarning).
proposed_gen_data = proposed_gen_data[proposed_gen_data['year'] >= 2008].copy()
proposed_gen_data = id_sources(proposed_gen_data, 'prime_mover', 'energy_source')

In [260]:
# Save the filtered, annotated frame next to the other cleaned data sets.
proposed_gen_data.to_csv(
    path_or_buf="../../Clean Data/proposed_gen_master_list_post08.csv"
)

# group utilities by address... one way of determining common owner. 

## How many entries appear in multiple years, i.e. how many unique entries do I actually have? The raw count is 22,195 entries, but some seem to belong to the same plant...

In [297]:
## Goal is to create a unique ID for each unit in the sample because I am worried that some units will appear in multiple data sets.

## Solution is to start with creating group_IDs for entries that have the same (plant code, utility name, nameplate cap, generator id) combinations

# Uses the public .ffill() / .ngroup() API instead of the deprecated
# fillna(method='ffill') and the private .grouper.group_info internals.
# Only the key columns are forward-filled, since only they affect the grouping.
# NOTE(review): forward-filling copies a key from the previous row into any
# row where it is missing -- confirm that is the intended treatment of gaps.
# NOTE(review): rows whose key is still missing after the fill (only possible
# at the very top of the frame) are not assigned to any group.
group_keys = ['plant_code', 'utility_name', 'nameplate_cap', 'generator_id']
proposed_gen_data['temp_group_id'] = (
    proposed_gen_data[group_keys]
    .ffill()
    .groupby(group_keys)
    .ngroup()
)

#len(proposed_gen_data['temp_group_id'].unique())
# There are 10,380 unique temp_group_ids, so now I need to figure out why there are repeated codes

In [298]:
# I do this by selecting observations that are coded to the same group_ids

# Collect every row whose temp_group_id is shared with at least one other row.
# Each intermediate gets its own name so that special_ids is only ever the
# resulting DataFrame, and the ids are picked with a boolean mask on the index
# rather than a Python loop of label lookups.
group_sizes = proposed_gen_data.groupby('temp_group_id').size()
repeated_group_ids = group_sizes.index[group_sizes > 1]
special_ids = proposed_gen_data[
    proposed_gen_data['temp_group_id'].isin(repeated_group_ids)
]

# special_ids.shape[0]
# There are 16843 entries with non-unique group_ids.

In [23]:
# Now I look to find out how many of the repeated entries are appearing in multiple years
# NOTE(review): this counts distinct 'status' values per group, not distinct
# years -- confirm which of the two is intended.
status_by_group = special_ids.groupby('temp_group_id')['status']
df = status_by_group.nunique()
df
# For example this thing appears in multiple years... but it's cancelled
# special_ids[special_ids['temp_group_id']==1957].filter(['utility_name','plant_name','status'])

NameError: name 'special_ids' is not defined

In [313]:
# Readable names for the generator status codes (name -> code).
status_codes = {
    # IP: Planned new generator cancelled, indefinitely postponed, or no longer in resource plan
    'cancelled': 'IP',
    # TS: Construction complete, but not yet in commercial operation
    'complete_not_operating': 'TS',
    # P: Planned for installation but regulatory approvals not initiated; not under construction
    'planned_not_started': 'P',
    # L: Regulatory approvals pending; not under construction but site preparation could be underway
    'reg_approval_pending': 'L',
    # T: Regulatory approvals received; but not under construction but site preparation could be underway
    'reg_approval_received': 'T',
    # U: Under construction, less than or equal to 50 percent complete (based on construction time to date of operation)
    'under_construction': 'U',
    # V: Under construction, more than 50 percent complete (based on construction time to date of operation)
    'almost_done': 'V',
    # OT: Other (described in Comments)
    'other': 'OT',
}


In [312]:
# Frequency of each status code among the rows with a repeated temp_group_id.
special_ids.loc[:, 'status'].value_counts()

IP    6011
P     3412
L     2063
U     1810
V     1424
T     1372
TS     721
OT      30
Name: status, dtype: int64

In [204]:
# Show the rows that belong to temp_group_id 1091.
multi_yr_groups.loc[multi_yr_groups['temp_group_id'] == 1091]

Unnamed: 0,bypass_heat_recovery,carbon_capture,chp,cofire_fuels,cogen,curr_sceheduled_month,curr_scheduled_year,distributed_gen,duct_burners,energy_source,...,technology,transportation,ultrasupercritical,unit_code,utility_id,utility_name,winter_cap_eia,winter_cap_resp,year,temp_group_id
7700,,,,,N,7.0,2010.0,,,"['NG', nan, nan, nan, nan, nan]",...,,[],,,1015,Austin Energy,50.4,47.3,2008,1091
9822,,,,,N,7.0,2010.0,,,"['NG', nan, nan, nan, nan, nan]",...,,[],,,1015,Austin Energy,50.4,47.3,2009,1091
