Skip to content

Commit

Permalink
feat(eda): save imdt as json file
Browse files Browse the repository at this point in the history
  • Loading branch information
Waterpine authored and jinglinpeng committed Oct 26, 2021
1 parent 87e77df commit 7867386
Show file tree
Hide file tree
Showing 2 changed files with 130 additions and 4 deletions.
101 changes: 100 additions & 1 deletion dataprep/eda/intermediate.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
"""
Intermediate class
"""
from typing import Any, Dict, Tuple, Union
from typing import Any, Dict, Tuple, Union, Optional

from pathlib import Path
import json
import os
import numpy as np
import pandas as pd


Expand All @@ -27,6 +31,101 @@ def __init__(self, *args: Any, **kwargs: Any) -> None:
else:
raise ValueError("Unsupported initialization")

def save(self, path: Optional[str] = None) -> None:
    """
    Save the intermediate as a JSON file.

    Parameters
    ----------
    path: Optional[str], default None
        Where the intermediate will be saved. A leading ``~`` is expanded
        to the user's home directory.

        * If omitted, ``imdt.json`` is written to the current working directory.
        * If it is an existing directory, ``imdt.json`` is written inside it.
        * If it ends with ``.json``, it is used as the output file.
        * If it has no extension, ``.json`` is appended.

    Raises
    ------
    ValueError
        If ``path`` has an extension other than ``.json``.
    """
    if path:
        # Expand "~" once and use the expanded path for both writing and
        # reporting, so the printed location is where the file really is.
        saved_file_path = Path(path).expanduser()
        extension = os.path.splitext(path)[1]

        if saved_file_path.is_dir():
            saved_file_path = saved_file_path / "imdt.json"
        elif not extension:
            saved_file_path = Path(f"{saved_file_path}.json")
        elif extension != ".json":
            raise ValueError(
                f"Format '{extension}' is not supported (supported formats: json)"
            )
    else:
        saved_file_path = Path.cwd() / "imdt.json"

    # pylint: disable=no-member
    # Shallow copy: top-level values are replaced in the copy, not in self.
    inter_dict: Dict[str, Any] = {key: self[key] for key in self.keys()}
    self._standardize_type(inter_dict)
    with open(saved_file_path, "w", encoding="utf-8") as outfile:
        json.dump(inter_dict, outfile, indent=4)
    print(f"Intermediate has been saved to {saved_file_path}!")

def _standardize_type(self, inter_dict: Dict[str, Any]) -> None:
    """
    Convert values of ``inter_dict`` to JSON-friendly built-in types, in place.

    The following conversions are applied to each value:

    * nested ``dict``: processed recursively (the nested dict itself is mutated)
    * NumPy integer scalar: ``int``
    * NumPy floating scalar: ``float``
    * ``np.ndarray``: nested ``list`` via ``tolist()``
    * ``tuple``: new tuple whose ``np.ndarray`` items are replaced by lists
    * ``pd.DataFrame``: ``dict`` via ``to_dict()``

    Any other value is left untouched.

    Parameters
    ----------
    inter_dict: Dict[str, Any]
        The dictionary whose values are converted. It is modified in place.
    """
    # Only existing keys are reassigned, so the dict size never changes
    # and iterating over items() while assigning is safe.
    for key, value in inter_dict.items():
        if isinstance(value, dict):
            self._standardize_type(value)
        # The abstract bases cover every sized NumPy scalar type and avoid
        # aliases such as np.float_, which was removed in NumPy 2.0.
        elif isinstance(value, np.integer):
            inter_dict[key] = int(value)
        elif isinstance(value, np.floating):
            inter_dict[key] = float(value)
        elif isinstance(value, np.ndarray):
            inter_dict[key] = value.tolist()
        elif isinstance(value, tuple):
            inter_dict[key] = tuple(
                item.tolist() if isinstance(item, np.ndarray) else item for item in value
            )
        elif isinstance(value, pd.DataFrame):
            inter_dict[key] = value.to_dict()


class ColumnsMetadata:
"""Container for storing each column's metadata."""
Expand Down
33 changes: 30 additions & 3 deletions docs/source/user_guide/eda/introduction.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,31 @@
"An example report can be downloaded [here](../../_static/images/create_report/titanic_dp.html)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get the intermediate data\n",
"\n",
"DataPrep.EDA separates the computation and rendering, so that you can just compute the intermediate data and render it using other plotting libraries. \n",
"\n",
"For each `plot` function, there is a corresponding `compute` function, which returns the computed intermediates used for rendering. For example, for `plot_correlation(df)` function, you can get the intermediates using `compute_correlation(df)`. It's a dictionary, and you can also save it to a json file."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from dataprep.eda import compute_correlation\n",
"from dataprep.datasets import load_dataset\n",
"df = load_dataset(\"titanic\")\n",
"imdt = compute_correlation(df)\n",
"imdt.save(\"imdt.json\")\n",
"imdt"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand All @@ -272,9 +297,11 @@
"metadata": {
"celltoolbar": "Edit Metadata",
"hide_input": false,
"interpreter": {
"hash": "1f1c771aa7e44c1d88ee5cde649fa00b963c4e0914d7adcf5ffc0b0d1ad3cd5b"
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"display_name": "Python 3.7.3 64-bit ('.venv': poetry)",
"name": "python3"
},
"language_info": {
Expand All @@ -287,7 +314,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.5"
"version": "3.7.3"
},
"toc": {
"base_numbering": 1,
Expand Down

0 comments on commit 7867386

Please sign in to comment.