In [1]:
import os
import logging
# `logging.config` is a submodule; `import logging` alone does not guarantee it is loaded, so
# import it explicitly instead of relying on some other package having imported it first.
import logging.config


# Directory that holds this notebook's `logging.conf` and its `data/` folder.
CURRENT_DIR = '/code/examples/datasets_ipynb'

# Configure logging from the ini-style config file that sits next to the notebook.
# `disable_existing_loggers=False` keeps loggers that were created before this call enabled.
logging.config.fileConfig(
    os.path.join(CURRENT_DIR, 'logging.conf'),
    disable_existing_loggers=False
)

In [2]:
"""
This file defines classes that hide the logic/path for saving and loading specific datasets that
are used across this project, as well as providing a brief description for each dataset.

To define a new dataset, create an attribute in Datasets.__init__() following the existing pattern.

The DATA variable is assigned an instance of the Datasets class and can be imported into other
scripts/notebooks.

To load the dataset called `the_dataset`, use the following code:

```
from source.services.data import DATA
df = DATA.the_dataset.load()
```

To save the dataset called `the_dataset`, use the following code:

```
from source.services.data import DATA

df = ...logic..
DATA.the_dataset.save(df)
```
"""
import os
import datetime
import logging
import pickle


def read_pickle(path):
    """
    Simple helper function that loads and returns a pickled object from disk.

    NOTE: unpickling can execute arbitrary code, so this should only be pointed at files that
    were produced by a trusted source.

    Args:
        path:
            File path where the pickled object will be read from.

    Returns:
        The object that was stored in the file.
    """
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


def to_pickle(obj, path):
    """
    Helper function that saves a pickled object.

    NOTE: If there is an existing file that matches `path`, that file is kept as a backup by
    renaming it to `<path>.<timestamp>` (timestamp format `%Y_%m_%d_%H_%M_%S`). If a backup with
    that name already exists (e.g. two saves within the same second), a numeric suffix is
    appended (`<path>.<timestamp>.1`, `<path>.<timestamp>.2`, ...) so that an earlier backup is
    never overwritten. The intent is to cache files in case there is an issue or any desire to
    look at past datasets.

    The object is first pickled to `<path>.tmp` and only moved into place once serialization has
    succeeded, so a failed save leaves the existing file (if any) untouched.

    Args:
        obj:
            the object to save
        path:
            File path where the pickled object will be stored.

    Raises:
        Whatever `pickle.dump` or the underlying file operations raise; the temporary file is
        removed before the exception propagates.
    """
    tmp_path = f'{path}.tmp'
    try:
        # Serialize first: if `obj` cannot be pickled we must not have touched `path` yet.
        with open(tmp_path, 'wb') as handle:
            pickle.dump(obj, handle)

        if os.path.isfile(path):
            timestamp = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
            backup_path = f'{path}.{timestamp}'
            # The timestamp only has one-second resolution; pick a free name rather than
            # letting `os.rename` clobber a backup made earlier in the same second.
            collision = 0
            while os.path.exists(backup_path):
                collision += 1
                backup_path = f'{path}.{timestamp}.{collision}'
            os.rename(path, backup_path)

        os.replace(tmp_path, path)
    finally:
        # Only still present when something above failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


class DataLoader:
    """Handle for a single named dataset: holds its metadata and saves/loads it as a pickle."""

    def __init__(self, name: str, description: str, dependencies: list, path: str) -> None:
        """
        Store the dataset's metadata. Nothing is read from or written to disk here.

        Args:
            name:
                Name of the dataset; used in log messages.
            description:
                Brief description of the dataset.
            dependencies:
                List of things this dataset depends on (presumably upstream source tables).
            path:
                File path that `load()` reads from and `save()` writes to.
        """
        self._path = path
        self._dependencies = dependencies
        self._description = description
        self._name = name

    def load(self):
        """Log the operation, then return the object unpickled from this dataset's path."""
        message = f"Loading data `{self._name}` from `{self._path}`"
        logging.info(message)
        loaded = read_pickle(path=self._path)
        return loaded

    def save(self, data):
        """Log the operation, then pickle `data` to this dataset's path via `to_pickle`."""
        message = f"Saving data `{self._name}` to `{self._path}`"
        logging.info(message)
        to_pickle(obj=data, path=self._path)


class Datasets:
    """
    class that defines all of the datasets available globally to the project.

    Each dataset is a `DataLoader` stored as an instance attribute; the properties below
    discover those attributes dynamically, so a new dataset only has to be added in `__init__`.
    """
    def __init__(self) -> None:
        """Use this function to define datasets by following the existing pattern."""
        self.dataset_1 = DataLoader(
            name='dataset_1',
            description="Dataset description",
            dependencies=['SNOWFLAKE.SCHEMA.TABLE'],
            path=os.path.join(CURRENT_DIR, 'data/dataset_1.pkl'),
        )

    @property
    def datasets(self) -> list[str]:
        """
        Returns the names of the datasets available.

        These are the names of the attributes holding a `DataLoader`, in the (sorted) order
        produced by `dir()`.
        """
        cls = type(self)
        return [
            attr for attr in dir(self)
            # Properties are skipped without being evaluated: reading `self.datasets` (or any
            # property built on top of it) from here would recurse infinitely.
            if not isinstance(getattr(cls, attr, None), property)
            and isinstance(getattr(self, attr), DataLoader)
        ]

    @property
    def descriptions(self) -> list[dict]:
        """
        Returns the names and descriptions of the datasets available.

        One dict per dataset, with the keys `dataset` and `description`.
        """
        loaders = [getattr(self, attr) for attr in self.datasets]
        return [
            dict(dataset=loader._name, description=loader._description)
            for loader in loaders
        ]

    @property
    def dependencies(self) -> list[dict]:
        """
        Returns the names and dependencies of the datasets available.

        One dict per dataset, with the keys `dataset` and `dependencies`.
        """
        loaders = [getattr(self, attr) for attr in self.datasets]
        return [
            dict(dataset=loader._name, dependencies=loader._dependencies)
            for loader in loaders
        ]


# Create a single module-level `Datasets` instance that can be imported into other
# scripts/notebooks; individual datasets are then reached as attributes (e.g. `DATA.dataset_1`).
DATA = Datasets()

---

# Usage

In [3]:
# Show the data directory before anything has been saved (only `.gitkeep` in the recorded run).
os.listdir(os.path.join(CURRENT_DIR, 'data'))

['.gitkeep']

In [4]:
import pandas as pd

# Build a small demo frame and save it through the `dataset_1` loader; this is the first save,
# so there is no earlier file for `to_pickle` to back up.
df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
DATA.dataset_1.save(df)

2023-03-04 19:59:59 - INFO     | Saving data `dataset_1` to `/code/examples/datasets_ipynb/data/dataset_1.pkl`


In [5]:
# The save above should have created `dataset_1.pkl` in the data directory.
os.listdir(os.path.join(CURRENT_DIR, 'data'))

['.gitkeep', 'dataset_1.pkl']

In [6]:
# Load the dataset back from disk; the result is displayed as the cell output.
DATA.dataset_1.load()

2023-03-04 19:59:59 - INFO     | Loading data `dataset_1` from `/code/examples/datasets_ipynb/data/dataset_1.pkl`


Unnamed: 0,a,b
0,1,4
1,2,5
2,3,6


In [7]:
# Save a modified copy (every 2 replaced by 'a'). A file already exists at the dataset's path,
# so `to_pickle` renames it with a timestamp suffix before writing the new one.
DATA.dataset_1.save(df.replace(2, 'a'))

2023-03-04 19:59:59 - INFO     | Saving data `dataset_1` to `/code/examples/datasets_ipynb/data/dataset_1.pkl`


In [8]:
# The directory now holds the current pickle plus the timestamped backup of the previous one.
os.listdir(os.path.join(CURRENT_DIR, 'data'))

['.gitkeep', 'dataset_1.pkl', 'dataset_1.pkl.2023_03_04_19_59_59']

In [9]:
# Loading again returns the most recently saved version (the one containing 'a').
DATA.dataset_1.load()

2023-03-04 19:59:59 - INFO     | Loading data `dataset_1` from `/code/examples/datasets_ipynb/data/dataset_1.pkl`


Unnamed: 0,a,b
0,1,4
1,a,5
2,3,6


---

# Properties

In [10]:
# Names of all attributes on `DATA` that hold a `DataLoader`.
DATA.datasets

['dataset_1']

In [11]:
# `descriptions` is a list of dicts, so it converts directly to a frame; the index is hidden
# in the rendered table.
pd.DataFrame(DATA.descriptions).style.hide(axis='index')

dataset,description
dataset_1,Dataset description


In [12]:
# Same presentation for `dependencies`: one row per dataset, index hidden.
pd.DataFrame(DATA.dependencies).style.hide(axis='index')

dataset,dependencies
dataset_1,['SNOWFLAKE.SCHEMA.TABLE']


---