# make_proposed_gen_master.py
## Creates a master file of all proposed-generator entries from the annual EIA Form 860 data, 2004-2018

In [644]:
import pandas as pd
import json
import numpy as np

In [645]:
## Load each year's proposed-generator CSV (annual EIA 860 forms) into a dict
## keyed by year: {year: DataFrame}

proposed_gen_dfs = {}
eia_root = "../Data for Tax Equity Project/eia_data/eia860"
for year in range(2004, 2019):
    year_str = str(year)
    csv_path = eia_root + year_str + "/proposed_gen_" + year_str + ".csv"
    proposed_df = pd.read_csv(csv_path)
    # Lower-case every column header before storing the frame.
    proposed_df.columns = [str.lower(name) for name in proposed_df.columns]
    proposed_gen_dfs[year] = proposed_df

    ## Uncomment to print the number of entries for each year
    # print(year_str + ": " + str(proposed_df.shape[0]))

# Merge DataFrames 
### This step first standardizes the column names (variable names change from year to year) and then merges the yearly DataFrames
### 1. Standardize

In [646]:
## Record the column names of every yearly DataFrame in proposed_gen_dfs
## Key = year
## Value = column index of that year's DataFrame
all_cols = {yr: frame.columns for yr, frame in proposed_gen_dfs.items()}

## Flat list of every column name from every year (repeats are kept; apply
## np.unique to it to see the distinct names)
unique_cols = []
for year_columns in all_cols.values():
    unique_cols.extend(year_columns)
# print(np.unique(unique_cols))


In [647]:
## Function to list the column names in df_dict[key] that newColDict does not cover
def check_cols_in_colDict(key, df_dict, newColDict):
    """Return the names stored under ``df_dict[key]`` that are unknown to ``newColDict``.

    ``newColDict`` maps each standardized name to a collection of old names;
    a name counts as known when it appears in any of those collections.
    The result is a numpy array holding the unknown names in their original
    order.
    """
    # Pool every old name listed anywhere in the dictionary.
    known_names = []
    for aliases in newColDict.values():
        known_names.extend(aliases)

    candidates = df_dict[key]
    # Boolean mask: True where the candidate name is NOT in the dictionary.
    is_unmapped = [name not in known_names for name in candidates]
    return np.array(candidates)[is_unmapped]

In [648]:
## Load the column-name dictionary from JSON, then collect every column name
## (over all years) that the dictionary does not cover
dict_file = '../Tax Equity Code/Clean Data/eia_proposed_gen_col_dict.json'

with open(dict_file) as json_file:
    col_dict = json.load(json_file)

# One flat list: the uncovered names of each year, appended year after year.
missing_cols = []
for yr_key in all_cols:
    missing_cols.extend(check_cols_in_colDict(yr_key, all_cols, col_dict))

## only energy source/startup source not yet coded
# print(np.unique(missing_cols))

### 2. Merge

In [649]:
# start by recoding existing columns
# Below is function that takes care of this

def standardize_df_colnames(year, old_df, new_col_dict):
    """Return a copy of ``old_df`` whose columns use the standardized names.

    Parameters
    ----------
    year
        Value written to the ``year`` column of every row.
    old_df : pandas.DataFrame
        One year's raw data. It is not modified.
    new_col_dict : dict
        Maps each standardized column name to the collection of old column
        names that should be renamed to it.

    Returns
    -------
    pandas.DataFrame
        Columns are sorted by label and contain:

        * one column per key of ``new_col_dict`` (all NaN when none of its
          old names occurs in ``old_df``);
        * ``energy_source``, ``transportation`` and ``startup_source``: each
          cell is a list with that row's values from the unmapped columns
          whose name contains "energy", "transportation" or "startup"
          (checked in that order, so a column is used by one group only);
        * ``year``.

        Every other unmapped column is discarded. When several columns end up
        with the same label, only the first one is kept.

    Raises
    ------
    ValueError
        If a ``year`` column already exists after standardization
        (raised by ``DataFrame.insert``).
    """
    # Map old column names to the standardized ones. If an old name is listed
    # under several standardized names, the last listing wins.
    rename_map = {
        old_name: std_name
        for std_name, old_names in new_col_dict.items()
        for old_name in old_names
        if old_name in old_df.columns
    }
    standard_df = old_df.rename(columns=rename_map)

    # Add NaN columns for standardized variables this year's data lacks.
    for std_name in new_col_dict:
        if std_name not in standard_df.columns:
            standard_df[std_name] = np.nan

    # Everything that is still not a standardized name gets removed; the
    # energy / transportation / startup columns are remembered first so their
    # values can be flattened into a single list-valued column each.
    leftover = [c for c in standard_df.columns if c not in new_col_dict]
    grouped = {"energy": [], "transportation": [], "startup": []}
    for col in leftover:
        for keyword, members in grouped.items():
            # Non-string labels cannot match a keyword; they are just dropped.
            if isinstance(col, str) and keyword in col:
                members.append(col)
                break
    # Single drop instead of one frame copy per leftover column.
    standard_df = standard_df.drop(columns=leftover)

    def rows_as_lists(colnames):
        # Build one list per row directly from the underlying values. Casting
        # to object first keeps each cell's own value (no int -> float
        # upcasting across columns). Works for zero rows and zero columns.
        row_values = old_df[colnames].astype(object).to_numpy().tolist()
        return pd.Series(row_values, index=old_df.index, dtype=object)

    standard_df['energy_source'] = rows_as_lists(grouped["energy"])
    standard_df['transportation'] = rows_as_lists(grouped["transportation"])
    standard_df['startup_source'] = rows_as_lists(grouped["startup"])

    # add a variable called "year"
    standard_df.insert(0, "year", year)

    # Sort columns by label so every year's frame has the same column order.
    standard_df = standard_df.sort_index(axis=1)

    # Several old names may have been renamed to the same standardized name;
    # keep only the first column for each label.
    standard_df = standard_df.loc[:, ~standard_df.columns.duplicated()]
    return standard_df
       

In [650]:
# Standardize every yearly frame; the list follows the order of proposed_gen_dfs.
standard_dfs = [
    standardize_df_colnames(yr, yearly_df, col_dict)
    for yr, yearly_df in proposed_gen_dfs.items()
]

In [651]:
# Stack the standardized yearly frames row-wise. concat is called without
# ignore_index, so each frame keeps its own row labels in the merged result.
merged = pd.concat(standard_dfs, axis=0)

# Write the master list to disk (the row labels are written as the first column).
master_csv_path = '../Tax Equity Code/Clean Data/proposed_gen_master_list.csv'
merged.to_csv(master_csv_path)