Access OGE outputs from Amazon S3 #338

Merged · 5 commits · Feb 1, 2024
1 change: 1 addition & 0 deletions Pipfile
@@ -20,6 +20,7 @@ seaborn = "*"
sqlalchemy = "*"
statsmodels = "*"
coloredlogs = "*"
s3fs = {extras = ["boto3"], version = "==2023.12.2"}
"catalystcoop.pudl" = {git = "git+https://github.com/singularity-energy/pudl.git@oge_release"}
gridemissions = {git = "git+https://github.com/singularity-energy/gridemissions"}

2,063 changes: 989 additions & 1,074 deletions Pipfile.lock

Large diffs are not rendered by default.

15 changes: 14 additions & 1 deletion README.md
@@ -84,13 +84,26 @@ Notebooks are organized into five directories based on their purpose
- `work_in_progress`: temporary notebooks being used for development purposes on specific branches

### Data Structure
All manual reference tables are stored in `src/oge/reference_tables`.

All files downloaded/created as part of the pipeline are stored in your HOME directory (e.g., `users/user.name/`), as sketched after this list:
- `HOME/open_grid_emissions_data/downloads` contains all files that are downloaded by functions in `load_data`
- `HOME/open_grid_emissions_data/outputs` contains intermediate outputs from the data pipeline, i.e., any files created by our code that are not final results
- `HOME/open_grid_emissions_data/results` contains all final output files that will be published
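
These locations are resolved by the helper functions in `oge.filepaths`. A minimal sketch, assuming the default local data store and that the helpers join the store root as their names suggest (printed paths are illustrative):

```python
from oge.filepaths import data_folder, downloads_folder

# With OGE_DATA_STORE unset, paths resolve under HOME/open_grid_emissions_data.
print(data_folder())       # e.g. /Users/user.name/open_grid_emissions_data/
print(downloads_folder())  # e.g. /Users/user.name/open_grid_emissions_data/downloads/
```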

## Importing OGE as a Package in your Project
OGE is not yet available on PyPI but can be installed from GitHub. For example, if you are using `pipenv` for your project, add `oge = {git="https://github.com/singularity-energy/open-grid-emissions.git"}` to your Pipfile.

Note that you don't need to run the pipeline to generate the output data, as these are available on Amazon Simple Storage Service (S3). Simply set the `OGE_DATA_STORE` environment variable to `s3` in the **\_\_init\_\_.py** file of your project to fetch OGE data from Amazon S3. Additionally, you can tell OGE not to write logs to a file by overriding the default configuration of the OGE logger.

To summarize, your **\_\_init\_\_.py** file would then look like this:
```python
import os

from oge.logging_util import configure_root_logger
os.environ["OGE_DATA_STORE"] = "s3"
configure_root_logger(logfile=None)
```
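
Because the S3 locations are plain `s3://` URLs, any library that understands `s3fs` can read the published files directly. A minimal sketch, assuming `pandas` is installed; the object key below is a hypothetical placeholder (browse the bucket for real filenames):

```python
import pandas as pd

# Hypothetical object key, for illustration only.
url = "s3://open-grid-emissions/open_grid_emissions_data/results/example.csv"

# storage_options={"anon": True} requests anonymous access, which suffices for
# a public bucket when no AWS credentials are configured locally.
df = pd.read_csv(url, storage_options={"anon": True})
```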

## Development Setup
If you would like to run the code on your own computer and/or contribute updates to the code, the following steps can help get you started.

3 changes: 1 addition & 2 deletions environment.yml
@@ -3,11 +3,9 @@ channels:
- defaults
- conda-forge
dependencies:
- black # development: code formatting
- blas=*=openblas # prevent mkl implementation of blas
- cvxopt
- cvxpy=1.2.1 # used by gridemissions, newer version not working as of 12/12/2022
- flake8 # development: linter
- ipykernel
- nomkl # prevent mkl implementation of blas
- notebook
@@ -23,6 +21,7 @@ dependencies:
- qdldl-python==0.1.5,!=0.1.5.post2 # used for gridemissions, newer version not working as of 12/12/2022
- requests>=2.28.1
- ruff
- s3fs
- seaborn # used by gridemissions
- setuptools # used for pudl
- sqlalchemy
1 change: 1 addition & 0 deletions pyproject.toml
@@ -32,6 +32,7 @@ dependencies = [
"sqlalchemy",
"statsmodels",
"coloredlogs",
"s3fs[boto3] == 2023.12.2",
"catalystcoop-pudl@git+https://github.com/singularity-energy/pudl.git@oge_release",
"gridemissions@git+https://github.com/singularity-energy/gridemissions.git",
]
5 changes: 5 additions & 0 deletions src/oge/data_pipeline.py
@@ -69,6 +69,11 @@ def print_args(args: argparse.Namespace, logger):

def main(args):
"""Runs the OGE data pipeline."""
if os.getenv("OGE_DATA_STORE") in ["s3", "2"]:
raise OSError(
"Invalid OGE_DATA_STORE environment variable. Should be 'local' or '1'"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like that we've prevented someone from running the data pipeline if the store is set to s3. However, it is possible that someone could still run part of the pipeline (for example in a notebook), and I just want to make sure that it is not possible for someone to write/update data to the s3, and to only read from it? Maybe this is a setting in the bucket itself, but what happens if to try to run an output_data command when the store is set to s3?

)
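
On the client side, one way to make reads provably non-writing (a sketch only; the bucket policy itself remains the authoritative control) is to connect anonymously, so no write-capable credentials are ever attached:

```python
import s3fs

# Anonymous client: no credentials are attached, so any write attempt is
# rejected by S3 regardless of what the calling code does.
fs = s3fs.S3FileSystem(anon=True)
print(fs.ls("open-grid-emissions/open_grid_emissions_data"))
```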

    args = get_args()
    year = args.year

5 changes: 1 addition & 4 deletions src/oge/eia930.py
@@ -2,7 +2,6 @@
import re
from datetime import timedelta
import os
from os.path import join

import oge.load_data as load_data
from oge.column_checks import get_dtypes
@@ -152,9 +151,7 @@ def clean_930(year: int, small: bool = False, path_prefix: str = ""):
    # Adjust
    logger.info("Adjusting EIA-930 time stamps")
    df = manual_930_adjust(df)
    df.to_csv(
        join(data_folder, "eia930_raw.csv")
    )  # Will be read by gridemissions workflow
    df.to_csv(data_folder + "eia930_raw.csv")  # Will be read by gridemissions workflow

    # Run cleaning
    logger.info("Running physics-based data cleaning")
21 changes: 14 additions & 7 deletions src/oge/filepaths.py
@@ -2,11 +2,20 @@
import os


def top_folder(rel=""):
    """
    Returns a path relative to the top-level repo folder.
def get_data_store():
    """Set data location"""
    store = os.getenv("OGE_DATA_STORE")
    if store is None:
        return os.path.join(os.path.expanduser("~"), "open_grid_emissions_data")
    elif store == "1" or store.lower() == "local":
        return os.path.join(os.path.expanduser("~"), "open_grid_emissions_data")
    elif store == "2" or store.lower() == "s3":
        return "s3://open-grid-emissions/open_grid_emissions_data"


    This will work regardless of where the function is imported or called from.
def top_folder(rel=""):
    """Returns a path relative to the top-level repo folder. This will work regardless
    of where the function is imported or called from.
    """
    return os.path.join(
        os.path.abspath(os.path.join(os.path.realpath(__file__), "../")), rel
@@ -19,9 +28,7 @@ def reference_table_folder(rel=""):

def data_folder(rel=""):
    """Returns a path relative to the `data` folder."""
    return os.path.join(
        os.path.join(os.path.expanduser("~"), "open_grid_emissions_data"), rel
    )
    return os.path.join(get_data_store(), rel)


def downloads_folder(rel=""):
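
For reference, a minimal sketch of how the new `get_data_store()` switch resolves under each setting (return values per the diff above; `get_data_store()` reads the environment variable at call time, so it can be flipped between calls):

```python
import os

from oge.filepaths import get_data_store

os.environ["OGE_DATA_STORE"] = "local"  # or "1"; unset behaves the same
print(get_data_store())                 # /Users/<you>/open_grid_emissions_data

os.environ["OGE_DATA_STORE"] = "s3"     # or "2"
print(get_data_store())                 # s3://open-grid-emissions/open_grid_emissions_data
```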