From 7d4ccaca35716c28a10f18bc0deda30c4e01d7d8 Mon Sep 17 00:00:00 2001 From: roedoejet Date: Mon, 2 Sep 2019 14:22:41 -0700 Subject: [PATCH] add pkl support and tqdm to parsers --- .gitignore | 4 +++- MANIFEST.in | 3 ++- mtd/languages/manifest_schema.json | 2 +- mtd/parsers/csv_parser.py | 3 ++- mtd/parsers/dict_parser.py | 9 +-------- mtd/parsers/gsheet_parser.py | 3 ++- mtd/parsers/json_parser.py | 5 +++-- mtd/parsers/psv_parser.py | 3 ++- mtd/parsers/tsv_parser.py | 3 ++- mtd/parsers/xlsx_parser.py | 3 ++- mtd/parsers/xml_parser.py | 3 ++- mtd/tests/test_json_parser.py | 15 ++++++++++++--- mtd/version.py | 2 +- requirements.txt | 3 ++- 14 files changed, 37 insertions(+), 24 deletions(-) diff --git a/.gitignore b/.gitignore index 8af8c65..3f4ebf2 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,6 @@ mtd/languages/*/gsheet/*.json .mypy_cache .coverage htmlcov -docs/_build/ \ No newline at end of file +docs/_build/ +*.cprof +*.prof \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in index bb1bda8..f3f2d33 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -9,4 +9,5 @@ recursive-include mtd/tests/test_data/ * recursive-include mtd/static/ * recursive-exclude * *.py[co] recursive-exclude * *~ -recursive-exclude * *.orig \ No newline at end of file +recursive-exclude * *.orig +recursive-exclude mtd/tests/test_data/exports/ * \ No newline at end of file diff --git a/mtd/languages/manifest_schema.json b/mtd/languages/manifest_schema.json index 31d570f..272c237 100644 --- a/mtd/languages/manifest_schema.json +++ b/mtd/languages/manifest_schema.json @@ -38,7 +38,7 @@ "properties": { "file_type": { "type": "string", - "enum": ["csv", "json", "psv", "tsv", "xlsx", "xml"] + "enum": ["csv", "json", "psv", "pkl", "tsv", "xlsx", "xml"] }, "name": { "type": "string" diff --git a/mtd/parsers/csv_parser.py b/mtd/parsers/csv_parser.py index 77e48c2..ca84215 100644 --- a/mtd/parsers/csv_parser.py +++ b/mtd/parsers/csv_parser.py @@ -6,6 +6,7 @@ from jsonschema.exceptions import ValidationError from mtd.parsers.utils import ResourceManifest from typing import Dict, List, Union +from tqdm import tqdm class Parser(BaseParser): ''' @@ -29,7 +30,7 @@ def __init__(self, manifest: ResourceManifest, resource_path: str): def resolve_targets(self) -> List[dict]: word_list = [] - for entry in self.resource: + for entry in tqdm(self.resource): word_list.append(self.fill_entry_template(self.entry_template, entry, lambda x, y: x[int(y)])) return word_list diff --git a/mtd/parsers/dict_parser.py b/mtd/parsers/dict_parser.py index 25b4d3f..71891e1 100644 --- a/mtd/parsers/dict_parser.py +++ b/mtd/parsers/dict_parser.py @@ -20,11 +20,4 @@ def __init__(self, manifest: ResourceManifest, resource: Union[dict, list]): self.resource = resource if "location" in self.manifest: self.resource = resolve_pointer(self.resource, self.manifest['location']) - self.entry_template = self.manifest['targets'] - - def parse(self) -> Dict[str, Union[dict, pd.DataFrame]]: - try: - data = self.resolve_targets() - return {"manifest": self.manifest, "data": pd.DataFrame(data)} - except JsonPointerException as e: - raise e \ No newline at end of file + self.entry_template = self.manifest['targets'] \ No newline at end of file diff --git a/mtd/parsers/gsheet_parser.py b/mtd/parsers/gsheet_parser.py index a747f23..b2d596e 100644 --- a/mtd/parsers/gsheet_parser.py +++ b/mtd/parsers/gsheet_parser.py @@ -3,6 +3,7 @@ from mtd.exceptions import UnsupportedFiletypeError from mtd.parsers.utils import ResourceManifest from typing import Dict, List, Tuple, Union +from tqdm import tqdm import gspread class Parser(BaseParser): @@ -40,7 +41,7 @@ def getCellValue(self, entry: Dict, col: str) -> str: def resolve_targets(self) -> List[dict]: word_list = [] - for entry in self.resource: + for entry in tqdm(self.resource): word_list.append(self.fill_entry_template(self.entry_template, entry, self.getCellValue)) return word_list diff --git a/mtd/parsers/json_parser.py b/mtd/parsers/json_parser.py index 1c3f7f5..8f32a38 100644 --- a/mtd/parsers/json_parser.py +++ b/mtd/parsers/json_parser.py @@ -8,6 +8,7 @@ from mtd.parsers.utils import ResourceManifest from typing import Dict, List, Union from jsonpath_rw import parse as json_parse +from tqdm import tqdm class Parser(BaseParser): ''' @@ -36,7 +37,7 @@ def getValueFromJsonPath(self, entry: dict, path: str): def resolve_targets(self) -> List[dict]: word_list = [] - for entry in self.resource: + for entry in tqdm(self.resource): word_list.append(self.fill_entry_template(self.entry_template, entry, self.getValueFromJsonPath)) return word_list @@ -82,6 +83,7 @@ def fill_listof_entry_template(self, listof_dict: dict, entry, convert_function) new_els.append(new_el) return new_els + def fill_entry_template(self, entry_template: dict, entry, convert_function) -> dict: '''This recursive function "fills in" the data according to the resource manifest. This is a slight modification from the one used by all parsers. @@ -91,7 +93,6 @@ def fill_entry_template(self, entry_template: dict, entry, convert_function) -> :param function convert_function: A function that takes an entry and a path and returns the "filled in" object ''' new_lemma = {} - for k, v in entry_template.items(): if isinstance(v, dict): if "listof" in v: diff --git a/mtd/parsers/psv_parser.py b/mtd/parsers/psv_parser.py index 0d6bac1..70ed34a 100644 --- a/mtd/parsers/psv_parser.py +++ b/mtd/parsers/psv_parser.py @@ -6,6 +6,7 @@ from jsonschema.exceptions import ValidationError from mtd.parsers.utils import ResourceManifest from typing import Dict, List, Union +from tqdm import tqdm class Parser(BaseParser): ''' @@ -30,7 +31,7 @@ def __init__(self, manifest: ResourceManifest, resource_path: str): def resolve_targets(self) -> List[dict]: word_list = [] - for entry in self.resource: + for entry in tqdm(self.resource): word_list.append(self.fill_entry_template(self.entry_template, entry, lambda x, y: x[int(y)])) return word_list diff --git a/mtd/parsers/tsv_parser.py b/mtd/parsers/tsv_parser.py index 564dc68..802cb17 100644 --- a/mtd/parsers/tsv_parser.py +++ b/mtd/parsers/tsv_parser.py @@ -6,6 +6,7 @@ from jsonschema.exceptions import ValidationError from mtd.parsers.utils import ResourceManifest from typing import Dict, List, Union +from tqdm import tqdm class Parser(BaseParser): ''' @@ -30,7 +31,7 @@ def __init__(self, manifest: ResourceManifest, resource_path: str): def resolve_targets(self): word_list = [] - for entry in self.resource: + for entry in tqdm(self.resource): word_list.append(self.fill_entry_template(self.entry_template, entry, lambda x, y: x[int(y)])) return word_list diff --git a/mtd/parsers/xlsx_parser.py b/mtd/parsers/xlsx_parser.py index 88b6733..cd6de6b 100644 --- a/mtd/parsers/xlsx_parser.py +++ b/mtd/parsers/xlsx_parser.py @@ -5,6 +5,7 @@ from mtd.parsers.utils import ResourceManifest from openpyxl.cell.cell import Cell from typing import Dict, List, Tuple, Union +from tqdm import tqdm class Parser(BaseParser): ''' @@ -53,7 +54,7 @@ def getCellValue(self, entry: Tuple[Cell, ...], col: str) -> str: def resolve_targets(self) -> List[dict]: word_list = [] - for entry in self.resource: + for entry in tqdm(self.resource): word_list.append(self.fill_entry_template(self.entry_template, entry, self.getCellValue)) return word_list diff --git a/mtd/parsers/xml_parser.py b/mtd/parsers/xml_parser.py index 7c88caf..4e36b3b 100644 --- a/mtd/parsers/xml_parser.py +++ b/mtd/parsers/xml_parser.py @@ -3,6 +3,7 @@ from mtd.parsers.utils import ResourceManifest from typing import Dict, List, Union import pandas as pd +from tqdm import tqdm class Parser(BaseParser): ''' @@ -29,7 +30,7 @@ def getValueFromXpath(self, entry: etree._Element, xpath: str) -> str: def resolve_targets(self) -> List[dict]: word_list = [] - for entry in self.resource: + for entry in tqdm(self.resource): word_list.append(self.fill_entry_template(self.entry_template, entry, self.getValueFromXpath)) return word_list diff --git a/mtd/tests/test_json_parser.py b/mtd/tests/test_json_parser.py index c2f38f9..d6ddb98 100644 --- a/mtd/tests/test_json_parser.py +++ b/mtd/tests/test_json_parser.py @@ -2,14 +2,20 @@ import json from mtd.tests import SAMPLE_DATA_DF, SAMPLE_DATA_OBJ, SAMPLE_DATA_DF_REDUCED, SAMPLE_DATA_OBJ_REDUCED, SAMPLE_DATA_DF_REDUCED_EMPTY, SAMPLE_DATA_OBJ_REDUCED_EMPTY import os -from unittest import TestCase +from unittest import main, TestCase from mtd.parsers import parse class JsonParserTest(TestCase): def setUp(self): self.path = os.path.dirname(json_path.__file__) - self.data = [(os.path.join(self.path, 'data.json'), SAMPLE_DATA_DF, SAMPLE_DATA_OBJ), (os.path.join(self.path, 'data_reduced.json'), SAMPLE_DATA_DF_REDUCED_EMPTY, SAMPLE_DATA_OBJ_REDUCED_EMPTY)] + self.data = [(os.path.join(self.path, 'data.json'), SAMPLE_DATA_DF, + SAMPLE_DATA_OBJ), + (os.path.join(self.path, 'data_reduced.json'), + SAMPLE_DATA_DF_REDUCED_EMPTY, + SAMPLE_DATA_OBJ_REDUCED_EMPTY)] self.manifest = os.path.join(self.path, 'manifest.json') + self.large_manifest = os.path.join(self.path, 'large_data_manifest.json') + self.large_data_path = os.path.join(self.path, 'data_large.json') self.maxDiff = None def test_data_df_matches_sample(self): @@ -25,4 +31,7 @@ def test_data_obj_matches_sample(self): for data in self.data: parsed_data = parse(self.manifest, data[0]) parsed_data_obj = parsed_data['data'].to_dict(orient='records') - self.assertEqual(data[2], parsed_data_obj) \ No newline at end of file + self.assertEqual(data[2], parsed_data_obj) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/mtd/version.py b/mtd/version.py index d6a3b4d..db31a24 100644 --- a/mtd/version.py +++ b/mtd/version.py @@ -1 +1 @@ -__version__ = "0.14.20190816" \ No newline at end of file +__version__ = "0.14.20190902" \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 47551fd..a221e95 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,4 +14,5 @@ oauth2client flask_restful flask_cors requests -lxml \ No newline at end of file +lxml +tqdm \ No newline at end of file