In [1]:
import os
import logging


CURRENT_DIR = '/code/examples/datasets_ipynb'

logging.config.fileConfig(
    os.path.join(CURRENT_DIR, 'logging.conf'),
    disable_existing_loggers=False
)

In [2]:
import abc

class Shape(metaclass=abc.ABCMeta):
    def __init__(self, a) -> None:
        self.a = a

    @abc.abstractmethod
    def area(self):
        pass

    @abc.abstractmethod
    def perimeter(self):
        pass

class Rectangle(Shape):
    def __init__(self, length, width):
        super().__init__(a=1)
        self.length = length
        self.width = width

    def area(self):
        return self.length * self.width

    def perimeter(self):
        return 2 * (self.length + self.width)

Rectangle(2, 3)

<__main__.Rectangle at 0xffff9d7ae1d0>

In [3]:
"""
This file defines classes that hides the logic/path for saving and loading specific datasets that
are used across this project, as well as providing a brief description for each dataset.

To define a new dataset, create a property in Datasets.__init__() following the existing patthern.

The DATA variable is assigned an instance of the Datasets class and can be imported into other
scripts/notebooks.

To load the dataset called `the_dataset`, use the following code:

```
from source.services.data import DATA
df = DATA.the_dataset.load()
```

To save the dataset called `the_dataset`, use the following code:

```
from source.services.data import DATA

df = ...logic..
DATA.the_dataset.save(df)
```
"""
import os
import datetime
import logging
import pickle
from abc import ABC, abstractmethod, abstractproperty

import pandas as pd


class DataPersistence(ABC):
    """
    Class that wraps the logic of saving/loading/describing a given dataset.
    Meant to be subclassed with specific types of loaders (e.g. pickle, database, etc.)
    """
    def __init__(self, description: str, dependencies: list):
        """
        Args:
            description: description of the dataset
            dependencies: dependencies of the dataset
        """
        self.description = description
        self.dependencies = dependencies
        self.name = None  # this is set dynamically

    @abstractmethod
    def _load(self):
        pass

    @abstractmethod
    def _save(self, data):
        pass

    def load(self):
        assert self.name
        return self._load()
    
    def save(self, data):
        assert self.name
        self._save(data)

class FileDataPersistence(DataPersistence):
    """
    Class that wraps the logic of saving/loading/describing a given dataset to the file-system.
    Adds logic for backing up datasets if they are being saved and already exist (i.e. renaming
    the file with a timestamp)
    Meant to be subclassed with specific types of loaders (e.g. pickle, csv, etc.)
    """
    def __init__(self, description: str, dependencies: list, directory: str):
        """
        Args:
            description: description of the dataset
            dependencies: dependencies of the dataset
        """
        super().__init__(description, dependencies)
        self.directory = directory

    @abstractmethod
    def _load(self):
        """Logic to load the `data`"""
        pass

    @abstractmethod
    def _save(self, data):
        """Logic to save the `data`"""
        pass

    @abstractproperty
    def path(self):
        """Full path (directory and file name) to load/save."""
        pass

    def load(self):
        assert self.name
        logging.info(f"Loading data `{self.name}` from `{self.path}`")
        return self._load()
    
    def save(self, data):
        assert self.name
        logging.info(f"Saving data `{self.name}` to `{self.path}`")
        # if the file already exists, save it to another name
        if os.path.isfile(self.path):
            timestamp = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
            new_name = self.path + '.' + timestamp
            logging.info(f"Backing up current data `{self.name}` to `{new_name}`")
            os.rename(self.path, new_name)
        self._save(data)


class PickledDataLoader(FileDataPersistence):
    """
    Class that wraps the logic of saving/loading/describing a given dataset.
    """
    def __init__(self, description: str, dependencies: list, directory: str):
        """
        Args:
            description: description of the dataset
            dependencies: dependencies of the dataset
            directory:
                the directory to save to and load from. NOTE: this should **not** contain the file name
                which is assigned at a later point in time based on the property name in the
                `Datasets` class.
        """
        super().__init__(description, dependencies, directory)

    @property
    def path(self):
        return os.path.join(self.directory, self.name + '.pkl')

    def _load(self):
        with open(self.path, 'rb') as handle:
            unpickled_object = pickle.load(handle)
        return unpickled_object

    def _save(self, data):
        with open(self.path, 'wb') as handle:
            pickle.dump(data, handle)


class CsvDataLoader(FileDataPersistence):
    """
    Class that wraps the logic of saving/loading/describing a given dataset.
    """
    def __init__(self, description: str, dependencies: list, directory: str):
        """
        Args:
            description: description of the dataset
            dependencies: dependencies of the dataset
            directory:
                the path to save to and load from. NOTE: this should **not** contain the file name
                which is assigned at a later point in time based on the property name in the
                `Datasets` class.
        """
        super().__init__(description, dependencies, directory)

    @property
    def path(self):
        return os.path.join(self.directory, self.name + '.pkl')

    def _load(self):
        return pd.read_csv(self.path)

    def _save(self, data: pd.DataFrame):
        data.to_csv(self.path, index=None)


class Datasets:
    """class that defines all of the datasets available globally to the project."""
    def __init__(self) -> None:
        """Use this function to define datasets by following the existing pattern."""

        ####
        # DEFINE DATASETS HERE
        ####
        self.dataset_1 = PickledDataLoader(
            description="Dataset description",
            dependencies=['SNOWFLAKE.SCHEMA.TABLE'],
            directory=os.path.join(CURRENT_DIR, 'data'),
        )
        self.other_dataset_2 = PickledDataLoader(
            description="Other dataset description",
            dependencies=['dataset_1'],
            directory=os.path.join(CURRENT_DIR, 'data'),
        )
        self.dataset_3_csv = CsvDataLoader(
            description="Other dataset description",
            dependencies=['dataset_1'],
            directory=os.path.join(CURRENT_DIR, 'data'),
        )
        ####
        # END OF DATASETS
        ####
        # dynamically set the name property in the DataPersistence object in all of the object;
        # I don't love this design, but it forces the names to match the property name and reduces
        # the redundancy of duplicating the name when defining the property and passing in the name
        # ot the loader
        for dataset in self.datasets:
            dataset_obj = getattr(self, dataset)
            dataset_obj.name = dataset

    @property
    def datasets(self) -> list[str]:
        """Returns the names of the datasets available."""
        ignore = set(['datasets', 'descriptions', 'dependencies'])
        return [
            attr for attr in dir(self)
            if attr not in ignore and isinstance(getattr(self, attr), DataPersistence)
        ]

    @property
    def descriptions(self) -> dict[str]:
        """Returns the names and descriptions of the datasets available."""
        return [
            dict(
                dataset=x,
                description=getattr(self, x).description
            )
            for x in self.datasets
        ]

    @property
    def dependencies(self) -> dict[str]:
        """Returns the names and dependencies of the datasets available."""
        return [
            dict(
                dataset=x,
                dependencies=getattr(self, x).dependencies
            )
            for x in self.datasets
        ]


# create a global object that can be imported into other scripts
DATA = Datasets()

# ensure all names got set properly
assert all([getattr(DATA, x).name == x for x in DATA.datasets])

---

# Usage

In [4]:
os.listdir(os.path.join(CURRENT_DIR, 'data'))

['.DS_Store', '.gitkeep']

In [5]:
import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
DATA.dataset_1.save(df)
DATA.dataset_3_csv.save(df)

2023-03-19 20:51:02 - INFO     | Saving data `dataset_1` to `/code/examples/datasets_ipynb/data/dataset_1.pkl`
2023-03-19 20:51:02 - INFO     | Saving data `dataset_3_csv` to `/code/examples/datasets_ipynb/data/dataset_3_csv.pkl`


In [6]:
os.listdir(os.path.join(CURRENT_DIR, 'data'))

['.DS_Store', '.gitkeep', 'dataset_1.pkl', 'dataset_3_csv.pkl']

In [7]:
DATA.dataset_1.load()

2023-03-19 20:51:06 - INFO     | Loading data `dataset_1` from `/code/examples/datasets_ipynb/data/dataset_1.pkl`


Unnamed: 0,a,b
0,1,4
1,2,5
2,3,6


In [8]:
DATA.dataset_3_csv.load()

2023-03-19 20:51:07 - INFO     | Loading data `dataset_3_csv` from `/code/examples/datasets_ipynb/data/dataset_3_csv.pkl`


Unnamed: 0,a,b
0,1,4
1,2,5
2,3,6


In [9]:
assert (DATA.dataset_1.load() == DATA.dataset_3_csv.load()).all().all()

2023-03-19 20:51:07 - INFO     | Loading data `dataset_1` from `/code/examples/datasets_ipynb/data/dataset_1.pkl`
2023-03-19 20:51:07 - INFO     | Loading data `dataset_3_csv` from `/code/examples/datasets_ipynb/data/dataset_3_csv.pkl`


In [10]:
df = df.replace(2, 'a')
DATA.dataset_1.save(df)

2023-03-19 20:51:11 - INFO     | Saving data `dataset_1` to `/code/examples/datasets_ipynb/data/dataset_1.pkl`
2023-03-19 20:51:11 - INFO     | Backing up current data `dataset_1` to `/code/examples/datasets_ipynb/data/dataset_1.pkl.2023_03_19_20_51_11`


In [11]:
df = df.replace(2, 'a')
DATA.dataset_3_csv.save(df)

2023-03-19 20:51:27 - INFO     | Saving data `dataset_3_csv` to `/code/examples/datasets_ipynb/data/dataset_3_csv.pkl`
2023-03-19 20:51:27 - INFO     | Backing up current data `dataset_3_csv` to `/code/examples/datasets_ipynb/data/dataset_3_csv.pkl.2023_03_19_20_51_27`


In [12]:
os.listdir(os.path.join(CURRENT_DIR, 'data'))

['.DS_Store',
 '.gitkeep',
 'dataset_1.pkl',
 'dataset_1.pkl.2023_03_19_20_51_11',
 'dataset_3_csv.pkl',
 'dataset_3_csv.pkl.2023_03_19_20_51_27']

In [13]:
DATA.dataset_1.load()

2023-03-19 20:51:54 - INFO     | Loading data `dataset_1` from `/code/examples/datasets_ipynb/data/dataset_1.pkl`


Unnamed: 0,a,b
0,1,4
1,a,5
2,3,6


In [16]:
DATA.dataset_3_csv.load()

2023-03-19 20:52:27 - INFO     | Loading data `dataset_3_csv` from `/code/examples/datasets_ipynb/data/dataset_3_csv.pkl`


Unnamed: 0,a,b
0,1,4
1,a,5
2,3,6


In [17]:
DATA.other_dataset_2.save(df.replace('a', 'asdf'))

2023-03-19 20:53:01 - INFO     | Saving data `other_dataset_2` to `/code/examples/datasets_ipynb/data/other_dataset_2.pkl`


In [18]:
DATA.other_dataset_2.load()

2023-03-19 20:53:02 - INFO     | Loading data `other_dataset_2` from `/code/examples/datasets_ipynb/data/other_dataset_2.pkl`


Unnamed: 0,a,b
0,1,4
1,asdf,5
2,3,6


---

# Properties

In [19]:
DATA.datasets

['dataset_1', 'dataset_3_csv', 'other_dataset_2']

In [20]:
pd.DataFrame(DATA.descriptions).style.hide(axis='index')

dataset,description
dataset_1,Dataset description
dataset_3_csv,Other dataset description
other_dataset_2,Other dataset description


In [21]:
pd.DataFrame(DATA.dependencies).style.hide(axis='index')

dataset,dependencies
dataset_1,['SNOWFLAKE.SCHEMA.TABLE']
dataset_3_csv,['dataset_1']
other_dataset_2,['dataset_1']


---