In [1]:
import os
import logging
import logging.config


# Directory that holds this notebook's `logging.conf` and the `data/` folder used below.
CURRENT_DIR = '/code/examples/datasets_ipynb'

# `fileConfig` lives in the `logging.config` submodule, which a bare `import logging` does not
# load; it is imported explicitly above so this cell also works on a fresh interpreter.
# `disable_existing_loggers=False` keeps loggers that were created before this call enabled.
logging.config.fileConfig(
    os.path.join(CURRENT_DIR, 'logging.conf'),
    disable_existing_loggers=False
)

In [2]:
"""
This file defines classes that hide the logic/path for saving and loading specific datasets that
are used across this project, as well as providing a brief description for each dataset.

To define a new dataset, create a property in Datasets.__init__() following the existing pattern.

The DATA variable is assigned an instance of the Datasets class and can be imported into other
scripts/notebooks.

To load the dataset called `the_dataset`, use the following code:

```
from source.services.data import DATA
df = DATA.the_dataset.load()
```

To save the dataset called `the_dataset`, use the following code:

```
from source.services.data import DATA

df = ...logic..
DATA.the_dataset.save(df)
```
"""
import os
import datetime
import logging
import pickle


def read_pickle(path):
    """
    Load and return the object stored in a pickle file.

    NOTE: unpickling can execute arbitrary code, so only use this on files you trust.

    Args:
        path:
            File path the pickled object is read from.

    Returns:
        The object reconstructed by `pickle.load`.
    """
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


def to_pickle(obj, path):
    """
    Helper function that saves a pickled object.

    NOTE: If there is an existing file that matches `path`, that file is kept as a backup by
    renaming it to `<path>.<timestamp>` (timestamp format `YYYY_mm_dd_HH_MM_SS`). The intent is
    to cache files in case there is an issue or any desire to look at past datasets. If a backup
    with that name already exists (e.g. several saves within the same second), a numeric suffix
    (`.1`, `.2`, ...) is appended so that an earlier backup is never overwritten.

    The object is first pickled to a temporary file next to `path` and only moved into place
    once pickling has succeeded, so a failed save leaves the existing file untouched.

    Args:
        obj:
            the object to save
        path:
            File path where the pickled object will be stored.

    Raises:
        Whatever `open`, `pickle.dump`, `os.rename` or `os.replace` raise (e.g. `OSError` for a
        missing directory, a pickling error for an object that cannot be pickled).
    """
    # Write to a sibling temporary file first; the pid keeps concurrent processes apart.
    tmp_path = f'{path}.tmp.{os.getpid()}'
    try:
        with open(tmp_path, 'wb') as handle:
            pickle.dump(obj, handle)
    except BaseException:
        # Do not leave a partial temporary file behind when pickling fails.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

    if os.path.isfile(path):
        timestamp = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
        backup_path = f'{path}.{timestamp}'
        # The timestamp only has one-second resolution; pick a name that is still free.
        counter = 1
        while os.path.exists(backup_path):
            backup_path = f'{path}.{timestamp}.{counter}'
            counter += 1
        os.rename(path, backup_path)

    # `path` no longer exists (or never did); move the finished pickle into place.
    os.replace(tmp_path, path)

class DataWrapper:
    """Bundles a dataset's description and dependencies with the logic to save/load it."""
    def __init__(self, description: str, dependencies: list, path: str):
        """
        Args:
            description: human-readable description of the dataset
            dependencies: what the dataset depends on
            path:
                directory to save to and load from. NOTE: this should **not** contain the file
                name; the file name is `<name>.pkl`, where `name` is assigned after construction
                (the `Datasets` class sets it to the property name).
        """
        self.description = description
        self.dependencies = dependencies
        self.path = path
        # Unset until assigned from outside; `load`/`save` refuse to run while it is empty.
        self.name = None

    def _pickle_path(self):
        """Return `<path>/<name>.pkl`; asserts that `name` has been assigned."""
        assert self.name
        return os.path.join(self.path, self.name + '.pkl')

    def load(self):
        """Log the source file and return the object unpickled from it."""
        source = self._pickle_path()
        logging.info(f"Loading data `{self.name}` from `{source}`")
        return read_pickle(path=source)

    def save(self, data):
        """Log the target file and pickle `data` to it."""
        target = self._pickle_path()
        logging.info(f"Saving data `{self.name}` to `{target}`")
        to_pickle(obj=data, path=target)


class Datasets:
    """
    Class that defines all of the datasets available globally to the project.

    Every dataset is a `DataWrapper` stored as an instance attribute; the attribute name doubles
    as the dataset name (and therefore as the pickle file name used by the wrapper).
    """
    def __init__(self) -> None:
        """Use this function to define datasets by following the existing pattern."""

        ####
        # DEFINE DATASETS HERE
        ####
        self.dataset_1 = DataWrapper(
            description="Dataset description",
            dependencies=['SNOWFLAKE.SCHEMA.TABLE'],
            path=os.path.join(CURRENT_DIR, 'data'),
        )
        self.other_dataset_2 = DataWrapper(
            description="Other dataset description",
            dependencies=['dataset_1'],
            path=os.path.join(CURRENT_DIR, 'data'),
        )
        ####
        # END OF DATASETS
        ####

        # Dynamically set the `name` of every DataWrapper to the attribute it is stored under;
        # this forces the names to match the property names and avoids having to repeat the
        # name when defining the property and again when constructing the wrapper.
        for dataset_name in self.datasets:
            getattr(self, dataset_name).name = dataset_name

    @property
    def datasets(self) -> list[str]:
        """
        Returns the names of the datasets available.

        The names are the attributes of this object whose value is a `DataWrapper`, in the
        (alphabetical) order produced by `dir()`.
        """
        # These properties call `self.datasets` themselves; evaluating them via `getattr`
        # below would recurse, so they are skipped up front.
        ignore = {'datasets', 'descriptions', 'dependencies'}
        return [
            attr for attr in dir(self)
            if attr not in ignore and isinstance(getattr(self, attr), DataWrapper)
        ]

    @property
    def descriptions(self) -> list[dict[str, str]]:
        """
        Returns the names and descriptions of the datasets available.

        One dict per dataset with the keys `dataset` and `description`.
        """
        return [
            {'dataset': name, 'description': getattr(self, name).description}
            for name in self.datasets
        ]

    @property
    def dependencies(self) -> list[dict]:
        """
        Returns the names and dependencies of the datasets available.

        One dict per dataset with the keys `dataset` and `dependencies`.
        """
        return [
            {'dataset': name, 'dependencies': getattr(self, name).dependencies}
            for name in self.datasets
        ]


# module-level instance that other scripts/notebooks can import
DATA = Datasets()

# sanity check: every wrapper's `name` must equal the attribute it is stored under
assert all(
    getattr(DATA, dataset_name).name == dataset_name
    for dataset_name in DATA.datasets
)

---

# Usage

In [3]:
# List the data directory before any dataset has been saved.
os.listdir(os.path.join(CURRENT_DIR, 'data'))

['.gitkeep']

In [4]:
# NOTE(review): this import sits mid-notebook; consider moving it to the first (imports) cell.
import pandas as pd

# Build a small example frame and persist it as `dataset_1`.
# No file exists yet, so this first save does not create a backup.
df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
DATA.dataset_1.save(df)

2023-03-04 20:42:42 - INFO     | Saving data `dataset_1` to `/code/examples/datasets_ipynb/data/dataset_1.pkl`


In [5]:
# The data directory should now contain `dataset_1.pkl`.
os.listdir(os.path.join(CURRENT_DIR, 'data'))

['.gitkeep', 'dataset_1.pkl']

In [6]:
# Read the frame back from disk to confirm the round trip.
DATA.dataset_1.load()

2023-03-04 20:42:42 - INFO     | Loading data `dataset_1` from `/code/examples/datasets_ipynb/data/dataset_1.pkl`


Unnamed: 0,a,b
0,1,4
1,2,5
2,3,6


In [7]:
# Change a value and save again under the same dataset; the previous `dataset_1.pkl` is kept
# as a timestamped backup rather than being overwritten.
df = df.replace(2, 'a')
DATA.dataset_1.save(df)

2023-03-04 20:42:42 - INFO     | Saving data `dataset_1` to `/code/examples/datasets_ipynb/data/dataset_1.pkl`


In [8]:
# Both the current pickle and the timestamped backup of the earlier save should be listed.
os.listdir(os.path.join(CURRENT_DIR, 'data'))

['.gitkeep', 'dataset_1.pkl', 'dataset_1.pkl.2023_03_04_20_42_42']

In [9]:
# Loading reads `dataset_1.pkl`, i.e. the most recently saved version.
DATA.dataset_1.load()

2023-03-04 20:42:43 - INFO     | Loading data `dataset_1` from `/code/examples/datasets_ipynb/data/dataset_1.pkl`


Unnamed: 0,a,b
0,1,4
1,a,5
2,3,6


In [10]:
# Save a modified copy under a second dataset; `df` itself is not reassigned here.
DATA.other_dataset_2.save(df.replace('a', 'asdf'))

2023-03-04 20:42:43 - INFO     | Saving data `other_dataset_2` to `/code/examples/datasets_ipynb/data/other_dataset_2.pkl`


In [11]:
# Read the second dataset back from its own pickle file.
DATA.other_dataset_2.load()

2023-03-04 20:42:43 - INFO     | Loading data `other_dataset_2` from `/code/examples/datasets_ipynb/data/other_dataset_2.pkl`


Unnamed: 0,a,b
0,1,4
1,asdf,5
2,3,6


In [15]:
# The in-memory `df` still holds 'a': the `other_dataset_2` save above pickled a replaced copy.
# NOTE(review): this cell's execution count (15) is out of sequence with its position in the
# notebook; restart the kernel and run all cells before sharing.
df

Unnamed: 0,a,b
0,1,4
1,a,5
2,3,6


---

# Properties

In [12]:
# Names of all datasets registered on `DATA`.
DATA.datasets

['dataset_1', 'other_dataset_2']

In [13]:
# Tabulate each dataset's name and description, hiding the index column.
pd.DataFrame(DATA.descriptions).style.hide(axis='index')

dataset,description
dataset_1,Dataset description
other_dataset_2,Other dataset description


In [14]:
# Tabulate each dataset's name and dependencies, hiding the index column.
pd.DataFrame(DATA.dependencies).style.hide(axis='index')

dataset,dependencies
dataset_1,['SNOWFLAKE.SCHEMA.TABLE']
other_dataset_2,['dataset_1']


---