-
Notifications
You must be signed in to change notification settings - Fork 292
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add update_column
and add_column
to SingleTableMetadata
#915
Merged
pvk-developer
merged 12 commits into
V1.0.0.dev
from
issue-877-add-update-column-and-add-column-to-stm
Jul 22, 2022
Merged
Changes from 10 commits
Commits
Show all changes
12 commits
Select commit
Hold shift + click to select a range
c5dc211
Implement update / add columns. *WIP
pvk-developer 3f8ccf0
Add docstrings
pvk-developer 2126d3d
Fix call functions
pvk-developer 31b2c47
WIP: Unit tests in progress
pvk-developer c9aaf44
Finish unit tests
pvk-developer 8e262fd
Address comments
pvk-developer 169020a
Fix multiple python version erroring
pvk-developer cd6829e
fix exception
pvk-developer 1626b89
Fix error msg
pvk-developer 72913f2
Address comments
pvk-developer 7272d89
Address comments about frozensets
pvk-developer 214a504
Bump macos version.
pvk-developer File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,9 @@ | ||
"""Single Table Metadata.""" | ||
|
||
import copy | ||
import json | ||
import re | ||
from copy import deepcopy | ||
from datetime import datetime | ||
from pathlib import Path | ||
|
||
import pandas as pd | ||
|
@@ -12,16 +14,86 @@ | |
class SingleTableMetadata: | ||
"""Single Table Metadata class.""" | ||
|
||
_EXPECTED_KWARGS = { | ||
'numerical': ['representation'], | ||
'datetime': ['datetime_format'], | ||
'categorical': ['order', 'order_by'], | ||
'boolean': [], | ||
'text': ['regex_format'], | ||
} | ||
|
||
_DTYPES_TO_SDTYPES = { | ||
'i': 'numerical', | ||
'f': 'numerical', | ||
'O': 'categorical', | ||
'b': 'boolean', | ||
'M': 'datetime', | ||
} | ||
|
||
_NUMERICAL_REPRESENTATIONS = [ | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can we make this a frozenset instead of a list? That way it is immutable. |
||
'int', 'int64', 'int32', 'int16', 'int8', | ||
'uint', 'uint64', 'uint32', 'uint16', 'uint8', | ||
'float', 'float64', 'float32', 'float16', 'float8', | ||
] | ||
KEYS = ['columns', 'primary_key', 'alternate_keys', 'constraints', 'SCHEMA_VERSION'] | ||
SCHEMA_VERSION = 'SINGLE_TABLE_V1' | ||
|
||
def _validate_numerical(self, column_name, **kwargs): | ||
representation = kwargs.get('representation') | ||
if representation and representation not in self._NUMERICAL_REPRESENTATIONS: | ||
raise ValueError( | ||
f"Invalid value for 'representation' {representation} for column '{column_name}'.") | ||
|
||
@staticmethod | ||
def _validate_datetime(column_name, **kwargs): | ||
datetime_format = kwargs.get('datetime_format') | ||
if datetime_format: | ||
try: | ||
formated_date = datetime.now().strftime(datetime_format) | ||
except Exception as exception: | ||
raise ValueError( | ||
f"Invalid datetime format string '{datetime_format}' " | ||
f"for datetime column '{column_name}'." | ||
) from exception | ||
|
||
matches = re.findall('(%.)|(%)', formated_date) | ||
if matches: | ||
raise ValueError( | ||
f"Invalid datetime format string '{datetime_format}' " | ||
f"for datetime column '{column_name}'." | ||
) | ||
|
||
@staticmethod | ||
def _validate_categorical(column_name, **kwargs): | ||
order = kwargs.get('order') | ||
order_by = kwargs.get('order_by') | ||
if order and order_by: | ||
raise ValueError( | ||
f"Categorical column '{column_name}' has both an 'order' and 'order_by' " | ||
'attribute. Only 1 is allowed.' | ||
) | ||
if order_by and order_by not in ('numerical_value', 'alphabetical'): | ||
raise ValueError( | ||
f"Unknown ordering method '{order_by}' provided for categorical column " | ||
f"'{column_name}'. Ordering method must be 'numerical_value' or 'alphabetical'." | ||
) | ||
if (isinstance(order, list) and not len(order)) or\ | ||
(not isinstance(order, list) and order is not None): | ||
raise ValueError( | ||
f"Invalid order value provided for categorical column '{column_name}'. " | ||
"The 'order' must be a list with 1 or more elements." | ||
) | ||
|
||
@staticmethod | ||
def _validate_text(column_name, **kwargs): | ||
regex = kwargs.get('regex_format') | ||
try: | ||
re.compile(regex) | ||
except Exception as exception: | ||
raise ValueError( | ||
f"Invalid regex format string '{regex}' for text column '{column_name}'." | ||
) from exception | ||
|
||
def __init__(self): | ||
self._columns = {} | ||
self._primary_key = None | ||
|
@@ -36,6 +108,88 @@ def __init__(self): | |
'SCHEMA_VERSION': self.SCHEMA_VERSION | ||
} | ||
|
||
def _validate_unexpected_kwargs(self, column_name, sdtype, **kwargs): | ||
expected_kwargs = self._EXPECTED_KWARGS.get(sdtype, ['pii']) | ||
unexpected_kwargs = set(list(kwargs)) - set(expected_kwargs) | ||
if unexpected_kwargs: | ||
unexpected_kwargs = list(unexpected_kwargs) | ||
unexpected_kwargs.sort() | ||
unexpected_kwargs = ', '.join(unexpected_kwargs) | ||
raise ValueError( | ||
f"Invalid values '({unexpected_kwargs})' for {sdtype} column '{column_name}'.") | ||
|
||
def _validate_column_exists(self, column_name): | ||
if column_name not in self._columns: | ||
raise ValueError( | ||
f"Column name ('{column_name}') does not exist in the table. " | ||
"Use 'add_column' to add new column." | ||
) | ||
|
||
def _validate_column(self, column_name, sdtype, **kwargs): | ||
self._validate_unexpected_kwargs(column_name, sdtype, **kwargs) | ||
if sdtype == 'categorical': | ||
self._validate_categorical(column_name, **kwargs) | ||
elif sdtype == 'numerical': | ||
self._validate_numerical(column_name, **kwargs) | ||
elif sdtype == 'datetime': | ||
self._validate_datetime(column_name, **kwargs) | ||
elif sdtype == 'text': | ||
self._validate_text(column_name, **kwargs) | ||
|
||
def add_column(self, column_name, **kwargs): | ||
"""Add a column to the ``SingleTableMetadata``. | ||
|
||
Args: | ||
column_name (str): | ||
The column name to be added. | ||
|
||
kwargs (type): | ||
Any additional key word arguments for the column, where ``sdtype`` is required. | ||
|
||
Raises: | ||
- ``ValueError`` if the column already exists. | ||
- ``ValueError`` if the ``kwargs`` do not contain ``sdtype``. | ||
- ``ValueError`` if the column has unexpected values or ``kwargs`` for the given | ||
``sdtype``. | ||
""" | ||
if column_name in self._columns: | ||
raise ValueError( | ||
f"Column name '{column_name}' already exists. Use 'update_column' " | ||
'to update an existing column.' | ||
) | ||
|
||
sdtype = kwargs.get('sdtype') | ||
if sdtype is None: | ||
raise ValueError(f"Please provide a 'sdtype' for column '{column_name}'.") | ||
|
||
self._validate_column(column_name, **kwargs) | ||
self._columns[column_name] = deepcopy(kwargs) | ||
|
||
def update_column(self, column_name, **kwargs): | ||
"""Update an existing column in the ``SingleTableMetadata``. | ||
|
||
Args: | ||
column_name (str): | ||
The column name to be updated. | ||
**kwargs (type): | ||
Any key word arguments that describe metadata for the column. | ||
|
||
Raises: | ||
- ``ValueError`` if the column doesn't already exist in the ``SingleTableMetadata``. | ||
- ``ValueError`` if the column has unexpected values or ``kwargs`` for the current | ||
``sdtype``. | ||
""" | ||
self._validate_column_exists(column_name) | ||
_kwargs = deepcopy(kwargs) | ||
if 'sdtype' in kwargs: | ||
sdtype = kwargs.pop('sdtype') | ||
else: | ||
sdtype = self._columns[column_name]['sdtype'] | ||
_kwargs['sdtype'] = sdtype | ||
|
||
self._validate_column(column_name, sdtype, **kwargs) | ||
self._columns[column_name] = _kwargs | ||
|
||
def detect_from_dataframe(self, data): | ||
"""Detect the metadata from a ``pd.DataFrame`` object. | ||
|
||
|
@@ -99,7 +253,7 @@ def to_dict(self): | |
elif value: | ||
metadata[key] = value | ||
|
||
return copy.deepcopy(metadata) | ||
return deepcopy(metadata) | ||
|
||
def _set_metadata_dict(self, metadata): | ||
"""Set a ``metadata`` dictionary to the current instance. | ||
|
@@ -110,7 +264,7 @@ def _set_metadata_dict(self, metadata): | |
""" | ||
self._metadata = {} | ||
for key in self.KEYS: | ||
value = copy.deepcopy(metadata.get(key)) | ||
value = deepcopy(metadata.get(key)) | ||
if key == 'constraints' and value: | ||
value = [Constraint.from_dict(constraint_dict) for constraint_dict in value] | ||
|
||
|
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can we make this a frozendict as well?