Merge pull request #19 from varoozhan/workflow
Data Hub Pull Request
temcdrm committed Sep 15, 2023
2 parents 8093546 + e547845 commit 866cb63
Showing 25 changed files with 12,845 additions and 6 deletions.
37 changes: 37 additions & 0 deletions .github/workflows/python-package.yml
@@ -0,0 +1,37 @@
name: Python Package

on:
push:
paths:
- 'hubdata/input/**' # Only run when changes are pushed to this directory

jobs:
build-linux:
runs-on: ubuntu-latest
strategy:
max-parallel: 5

steps:
- uses: actions/checkout@v3
- name: Set up Python 3.10
uses: actions/setup-python@v3
with:
python-version: '3.10'

- name: Create and activate virtual environment
run: |
python -m venv venv
source venv/bin/activate
- name: Install dependencies
run: |
source venv/bin/activate
python -m pip install --upgrade pip
pip install -r data/requirements.txt
- name: Test
run: |
source venv/bin/activate
# python data/main.py
PYTHONPATH=. pytest data/test/test_data.py
23 changes: 23 additions & 0 deletions data/Readme.md
@@ -0,0 +1,23 @@
# How to run
#### Run the script directly from the command line:
python data/main.py (from the repository root, assuming the packages in data/requirements.txt are installed)

# Improvements to be applied:
1. In the initial_processing.py file,
the years and sizes variables for the solar_trace data are hardcoded. This should be improved by reading all the years and sizes available from the original input file.
The current version (filename: "pnnl_utility_timelines_summary.xlsx" as of 9-6-23) of the SolarTRACE input file has the following columns:

`state | ahj | geo_id | utility | eia_id | All | 2017 | 2018 | 2019 | 2020 | 2021`, followed by repeated groups of `Installs | Pre Ix N | Final IX N | Full IX | Total IX` (the group repeats for each year and size combination).

Assuming the numbered years start from column 7 (2017 in this case), this may be used as a starting point to extract the years present in the file. Another approach is to read the first row, which in the current version (filename: "pnnl_utility_timelines_summary.xlsx" as of 9-6-23) of the SolarTRACE input file has the following columns:
Total installs 2017 | 0-10kW 2018 | 0-10kW 2019 | 0-10kW 2020 | 0-10kW 2021 | 0-10kW 2017 | 11-50kW 2018 | 11-50kW 2019 | 11-50kW 2020 | 11-50kW 2021 | 11-50kW

NOTE: the read_input function already reads the input files, so the years could be extracted in that module instead; a hedged sketch of the idea follows below.
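
For instance, a minimal sketch of reading the years from that first header row (the function name `extract_years`, the `first_year_col` offset, and the year bounds are assumptions for illustration, not part of the current code):

```python
import pandas as pd

def extract_years(xlsx_path, sheet, first_year_col=6):
    # Read only the first header row, without treating it as column names.
    header = pd.read_excel(xlsx_path, sheet_name=sheet, header=None, nrows=1)
    years = []
    for cell in header.iloc[0, first_year_col:]:
        # Year cells are plain numbers such as 2017; the repeated per-year
        # labels ("Installs", "Pre Ix N", ...) are not numeric and are skipped.
        try:
            year = int(cell)
        except (TypeError, ValueError):
            continue
        if 1990 < year < 2100:
            years.append(year)
    return sorted(set(years))

# Expected for the current file: [2017, 2018, 2019, 2020, 2021]
```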

2. In the main_processing.py file,
the column indices for the solar_trace data should ideally be read from the file. After finding the ending year's index (11, corresponding to 2021, in the current version "pnnl_utility_timelines_summary.xlsx" as of 9-6-23), the remaining columns repeat in blocks of 14 (the first 7 for PV-only and the second 7 for PV+Storage); see the sketch below.

In this same file, the while loops can be encapsulated in a helper so the duplicated code is removed.
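
A hedged sketch of walking those blocks once the years are known (the helper name `iter_year_blocks`, the `first_block_col` offset, and the 7/7 split inside each 14-column block are assumptions based on the description above):

```python
def iter_year_blocks(df, years, first_block_col=11, block_width=14):
    # Yield (year, pv_only, pv_plus_storage) column slices, one block per year:
    # the first 7 columns of a block are PV-only, the next 7 are PV+Storage.
    for i, year in enumerate(years):
        start = first_block_col + i * block_width
        pv_only = df.iloc[:, start:start + 7]
        pv_plus_storage = df.iloc[:, start + 7:start + block_width]
        yield year, pv_only, pv_plus_storage
```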

3. In the helper_methods.py file,
the get_api_result function may return None values, which should be cleaned up before writing the output file. Currently these None values result in 0 lat / 0 lon entries in the output file, which may look odd; a small sketch of one possible cleanup follows below.
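
A small sketch of one possible cleanup, run before write_output so its fillna(0.0) never sees these rows (the helper name `drop_unresolved` is hypothetical; 'lat'/'lon' are the columns produced by api_call):

```python
def drop_unresolved(df):
    # Remove rows whose API lookup returned None so they do not become
    # misleading 0.0 lat / 0.0 lon entries in the output CSV.
    return df.dropna(subset=['lat', 'lon'])
```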
Empty file added data/__init_.py
Empty file.
98 changes: 98 additions & 0 deletions data/helper/helper_methods.py
@@ -0,0 +1,98 @@
import pandas as pd
from pathlib import Path
import requests, json

pd.options.mode.chained_assignment = None # default='warn'

RESERVATION_COLS = ["GEOID","BASENAME","NAME","Lat","Lon"]
COUNTY_COLS = ["POPULATION", "POP_SQMI", "SQMI", "ORIG_FID"]

# Define the components of the file path
SCRIPT_DIR = Path(__file__).resolve().parent # Get the directory that the current script is in
PARENT_DIR = SCRIPT_DIR.parent.parent # Get the parent directory
INPUT_DIR = PARENT_DIR / 'hubdata' / 'input' # Get the input directory
INPUT_FILE = 'CENTROIDS' # Choose the input file that contains centroids (keep this uppercase for consistency)
API_STRING = 'https://nominatim.openstreetmap.org/search?format=json' # Current NOMINATIM URL


def get_centroids():
    '''
    This function interprets and extracts information from the county centroids file, which holds lat/lon values for each county.
    It is used directly in the queued up data processing (queues_2021_clean_data.xlsx) and indirectly
    in solarTRACE by converting state abbreviations to state names and geo_ids.
    '''
all_files = list(INPUT_DIR.glob('*'))
for file in all_files: # Loop over all files
if INPUT_FILE in file.name.upper(): # Check if 'centroids' is in the file name
centroids_county = pd.read_csv(file).drop(COUNTY_COLS, axis=1)
            # Combine the state and county columns to form the 'state_county' column
centroids_county_geoid = centroids_county.rename(columns={"FIPS": "geo_id"})
centroids_county_geoid['NAME'] = centroids_county_geoid['NAME'].apply(lambda x: x.split(' ')[0].lower())
centroids_county_geoid['state_county'] = centroids_county_geoid.STATE_ABBR.str.cat(centroids_county_geoid.NAME, sep='_')
selected_cenrtroids = centroids_county_geoid[['STATE_NAME', 'STATE_ABBR', 'state_county', 'lat', 'lon']]
selected_cenrtroids['state_county'] = selected_cenrtroids['state_county'].astype('string') # Change the type from object to string to use the string class.

return selected_cenrtroids


def read_input(input_str, sheet, header=None, index_col=None):
    '''
    This function reads the input files. As of the first version of this script, only two .xlsx files
    use this module: the solarTRACE file (currently pnnl_utility_timelines_summary.xlsx) and the Berkeley Lab
    file (queues_2021_clean_data.xlsx).
    Args:
        input_str: a string that tells the module which input file is being read.
        sheet: the Excel sheet that holds the information needed for processing.
        header: used for the solarTRACE data because it contains extra header rows.
        index_col: indicates that the file has an index column.
    Returns:
        A pandas dataframe (df) (the rest of the module only modifies this df)
    '''
# centroids_reservations = pd.read_csv('AIANNHA_centroids.csv')[COLS]
all_files = list(INPUT_DIR.glob('*'))
for file in all_files: # Loop over all files
if input_str in file.name: # Check if input string is in the file name
xlsx = pd.ExcelFile(file) # Read the corresponding file
df = pd.read_excel(xlsx, sheet, header=header, index_col=index_col)
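            # If the caller did not pass header/index_col, re-read with pandas defaults (first row becomes the header).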
if header is None and index_col is None:
df = pd.read_excel(xlsx, sheet)
return df
return None

def add_state_full(df):
'''
    This function adds each state's spelled-out name to the df using the centroids file. The full name is used for searching
    through the API from the final_processing.py file.
'''
temp_df = df.copy()
centroids = get_centroids()
state_dict = centroids.set_index('STATE_ABBR')['STATE_NAME'].to_dict()
temp_df.insert(1, 'state_full', df['state'].map(state_dict))
return temp_df


def get_api_result(row):
'''
    This function calls the API for a single row. The for loop that goes over all the rows lives in the
    final_processing.py file, implemented in the api_call function.
'''
try:
if row.get('ahj') is None:
return pd.Series([None, None])
        # Access fields by name (the row comes from df[['state_full', 'ahj']] in final_processing.py).
        row_state = row['state_full']
        # Drop the trailing token of the AHJ name and rejoin it into a plain string for the query.
        row_ahj = ' '.join(row['ahj'].split()[:-1])
        response = requests.get(f'{API_STRING}&state={row_state}&city={row_ahj}')
data = response.json()
if data and len(data) >= 1:
result = pd.Series([data[0]['lat'], data[0]['lon']])
return result
else:
return pd.Series([None, None]) # Default values

except json.JSONDecodeError: # Catch the exception
return pd.Series([None, None]) # Return default values

def write_output(df, output_dir):
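    '''Fill missing values with 0.0 and write the dataframe to a CSV file without the index column.'''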
df.fillna(0.0).to_csv(output_dir, index=False)
9 changes: 9 additions & 0 deletions data/main.py
@@ -0,0 +1,9 @@
import solar_trace as st
import qu as qu

def main_function():
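    '''Run the Queued Up pipeline, then the SolarTRACE pipeline.'''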
qu.qu_driver()
st.st_driver()

if __name__ == "__main__":
main_function()
33 changes: 33 additions & 0 deletions data/qu.py
@@ -0,0 +1,33 @@
from helper.helper_methods import read_input, write_output, get_centroids
from queued_up.initial_output import cleanup, init_output
from queued_up.solar_wind import final_output
from pathlib import Path

QUEUED = 'queues'
QUEUED_SHEET = 'data'
SCRIPT_DIR = Path(__file__).resolve().parent # Get the directory that the current script is in
PARENT_DIR = SCRIPT_DIR.parent # Get the parent directory
OUTPUT_DIR = PARENT_DIR / 'hubdata' / 'output' / 'queued_up.csv'  # Output file path



def qu_driver():
'''
Queued Up module Driver
'''
    # Read the US_county_centroids.csv file containing lat/lon for each county
    centroids = get_centroids()
    # Open and read the input file
    input_df = read_input(QUEUED, QUEUED_SHEET)
    # Initial cleanup of the raw queue data
    clean_df = cleanup(input_df)
    # Aggregate per-county sums, maxima, and counts
    initial_df = init_output(clean_df)
    # Add solar/wind/neither project counts and megawatt maxima
    final_df = final_output(clean_df, initial_df)
    # Attach lat/lon by joining the county centroids on state_county
    result = final_df.merge(centroids, how='inner', on=['state_county'])

write_output(result, OUTPUT_DIR)


Empty file added data/queued_up/__init__.py
Empty file.
65 changes: 65 additions & 0 deletions data/queued_up/initial_output.py
@@ -0,0 +1,65 @@
import pandas as pd

COLS = ["q_status", "q_date", "on_date", "project_name", "utility", "county_1", "type_clean", "mw1", "mw2", "state"]

'''
This function does the initial cleanup:
1. Formats the date columns to datetime
2. Calculates the number of days between on_date and q_date
3. Fills empty entries with 0.0s and Unknowns
4. Combines the state abbreviation and county name into a "state_county" column with values [state abbreviation]_[county name]
5. Removes entries with 0 or negative mw from the mw1 column, since we are not interested in that data at this point
'''
def cleanup(df):
df = df[COLS]
# Convert "q_date" column to date/time format (NOTE: There is no time section)
df['q_date'] = pd.to_datetime(df['q_date'], errors='coerce')
# Convert "on_date" column to date/time format (NOTE: Only 1571 entries have on_date)
df['on_date'] = pd.to_datetime(df['on_date'], errors='coerce')
# Count number of days between on_date and q_date
df['num_days'] = (df['on_date']-df['q_date']).dt.days
# Fill the NaN values with respective defaults
df['mw2'] = df['mw2'].fillna(0.0)
df['mw1'] = df['mw1'].fillna(0.0)
df['num_days'] = df['num_days'].fillna(0.0)
df['project_name'] = df['project_name'].fillna('UNKNOWN')
# create a state_county Column as a pseudo unique identifier for each row
df['state_county'] = df.state.str.cat(df.county_1, sep='_')
# Remove 'mw1's with 0 or negative values.
df = df[df['mw1']>0]

return df


def init_output(df):
    '''
    This function groups by the state_county column and then aggregates to:
    1. Get the sum of mw1 and mw2
    2. Get the max between mw1 and mw2
    3. Get the max of num_days
    4. Get project counts
    5. Get q_status counts (computed but not yet included in the final concat)
    6. Concatenate all the entries of the type_clean column for each state_county grouping
    '''
# Count utilities
utility_count = df.groupby(['state_county'])['utility'].nunique()
# Aggregate counties and sum all the floating point columns per county (NOTE: only mw1 and mw2 are summed.)
# print(df)
mw_sum = df.groupby(['state_county']).sum(numeric_only=True)
# Get max of mw1 and mw2
mw_max = mw_sum[['mw1', 'mw2']].max(axis=1)
# Aggregate counties and take the max of the days for each county
days = df.groupby(['state_county'])['num_days'].max()
# Get project counts
project_count = df.groupby(['state_county'])['project_name'].count()
    # Get status counts (TODO: decide whether these should be added to the final output)
status_count = df.groupby(['state_county'])['q_status'].count()
# Aggregate types of energy generation for each
type_clean = df.fillna('Unknown').groupby('state_county').agg({'type_clean': lambda d: ", ".join(set(d))}).reset_index()
    # format type_clean to match the previous pd series with state_county as index
    type_clean = type_clean.set_index('state_county')
    # Combine the series to create an initial version of the queued up output
queued_up_initial = pd.concat([utility_count, mw_max, project_count, days, type_clean['type_clean']], axis=1, keys=['utility_count', 'mw_max', 'project_count', 'days_max', 'type_clean']).reset_index()

return queued_up_initial

73 changes: 73 additions & 0 deletions data/queued_up/solar_wind.py
@@ -0,0 +1,73 @@
import pandas as pd

ENERGY_TYPE_LIST = ["Solar", "Wind", "Neither"]
STATUS_LIST = ["withdrawn","operational", "active", "suspended"]
BASE = 'state_county'
PARAMETER = 'project_name'
BASE_COL = 'state_county'
COL = ['mw1', 'mw2']



def final_output(clean_df, initial_df):
    '''
    This function derives the number of projects for solar, wind, and neither.
    It also derives the max of mw1 and mw2 for solar, wind, and neither.
    '''
# Project counts based on solar, wind and neither
    result_col = list()
col_names = []
for each_type in ENERGY_TYPE_LIST:
for each_status in STATUS_LIST:
col_names.append("_".join(["proj_count", "_".join([each_type, each_status])]))
result_col.append(proj_count(BASE, PARAMETER, clean_df, each_type, each_status))

# Max of mw based on solar, wind and neither
result_col_mw = list()
col_names_mw = list()
for each_type in ENERGY_TYPE_LIST:
for each_status in STATUS_LIST:
col_names_mw.append("_".join(["mw", "_".join([each_type, each_status])]))
result_col_mw.append(megawatt_max(BASE_COL, COL, clean_df, each_type, each_status))

# Cleanup and do initial concatenation of the results
    queued_up_combos = pd.concat(result_col, axis=1, keys=col_names).reset_index().fillna(0)
    queued_up_combos_mw = pd.concat(result_col_mw, axis=1, keys=col_names_mw).reset_index().fillna(0)

# Do a final merge
queued_up_temp = pd.merge(initial_df, queued_up_combos_mw, how='outer', on='state_county')
queued_up_final = pd.merge(queued_up_temp, queued_up_combos, how='outer', on='state_county')
queued_up_final['state_county'] = queued_up_final.state_county.astype('string')
return queued_up_final



def proj_count(parameter, col, df, energy_type, status):
    '''
    Helper for the final_output function above to get the project count for each type and status.
    parameter is the groupby key (state_county) and col is the column that is counted (project_name).
    '''
    # Compare case-insensitively because ENERGY_TYPE_LIST uses "Neither" (capitalised).
    if energy_type.lower() == "neither":
        result = df[~df.type_clean.str.contains("Wind|Solar").fillna(False)]
    else:
        result = df[df.type_clean.str.contains(energy_type).fillna(False)]
    result = result[result.q_status.str.contains(status)]
    result = result.groupby([parameter])[col].count()
    return result



def megawatt_max(base, c, df, energy_type, status):
    '''
    Helper for the final_output function above to get the megawatt max for each type and status.
    base is the groupby key (state_county) and c is the list of megawatt columns (mw1, mw2).
    '''
    # Compare case-insensitively because ENERGY_TYPE_LIST uses "Neither" (capitalised).
    if energy_type.lower() == "neither":
        result = df[~df.type_clean.str.contains("Wind|Solar").fillna(False)]
    else:
        result = df[df.type_clean.str.contains(energy_type).fillna(False)]
    result = result[result.q_status.str.contains(status)]
    result = result.groupby([base]).sum(numeric_only=True)
    result = result[c].max(axis=1)
    return result
4 changes: 4 additions & 0 deletions data/requirements.txt
@@ -0,0 +1,4 @@
pandas
requests
openpyxl
pytest
20 changes: 20 additions & 0 deletions data/solarTRACE/final_processing.py
@@ -0,0 +1,20 @@
import pandas as pd
from helper.helper_methods import get_api_result


def api_call(df):
'''
    This function calls the Nominatim API, looping over all rows of the SolarTRACE data to find a lat/lon for each row.
'''
df_temp = df.copy()
df_temp[['lat', 'lon']] = df[['state_full', 'ahj']].apply(get_api_result, axis=1)
return df_temp


def final_processing(api_df,weighted_df):
'''
    This function concatenates the API results from api_call above with the weighted results from main_processing,
    adding the lat/lon columns to create the final df for the output file.
'''
result = pd.concat([api_df.iloc[:, list(range(6)) + list(range(-22, 0))], weighted_df.iloc[:,-10:]],axis=1).drop('geo_id', axis=1).reset_index()
return result