Skip to content

Commit

Permalink
add pkl support and tqdm to parsers
Browse files (browse the repository at this point in the history)
  • Loading branch information
roedoejet committed Sep 2, 2019
1 parent 84f1a3d commit 7d4ccac
Show file tree
Hide file tree
Showing 14 changed files with 37 additions and 24 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,6 @@ mtd/languages/*/gsheet/*.json
.mypy_cache
.coverage
htmlcov
docs/_build/
docs/_build/
*.cprof
*.prof
3 changes: 2 additions & 1 deletion MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@ recursive-include mtd/tests/test_data/ *
recursive-include mtd/static/ *
recursive-exclude * *.py[co]
recursive-exclude * *~
recursive-exclude * *.orig
recursive-exclude * *.orig
recursive-exclude mtd/tests/test_data/exports/ *
2 changes: 1 addition & 1 deletion mtd/languages/manifest_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
"properties": {
"file_type": {
"type": "string",
"enum": ["csv", "json", "psv", "tsv", "xlsx", "xml"]
"enum": ["csv", "json", "psv", "pkl", "tsv", "xlsx", "xml"]
},
"name": {
"type": "string"
Expand Down
3 changes: 2 additions & 1 deletion mtd/parsers/csv_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from jsonschema.exceptions import ValidationError
from mtd.parsers.utils import ResourceManifest
from typing import Dict, List, Union
from tqdm import tqdm

class Parser(BaseParser):
'''
Expand All @@ -29,7 +30,7 @@ def __init__(self, manifest: ResourceManifest, resource_path: str):

def resolve_targets(self) -> List[dict]:
word_list = []
for entry in self.resource:
for entry in tqdm(self.resource):
word_list.append(self.fill_entry_template(self.entry_template, entry, lambda x, y: x[int(y)]))
return word_list

Expand Down
9 changes: 1 addition & 8 deletions mtd/parsers/dict_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,4 @@ def __init__(self, manifest: ResourceManifest, resource: Union[dict, list]):
self.resource = resource
if "location" in self.manifest:
self.resource = resolve_pointer(self.resource, self.manifest['location'])
self.entry_template = self.manifest['targets']

def parse(self) -> Dict[str, Union[dict, pd.DataFrame]]:
try:
data = self.resolve_targets()
return {"manifest": self.manifest, "data": pd.DataFrame(data)}
except JsonPointerException as e:
raise e
self.entry_template = self.manifest['targets']
3 changes: 2 additions & 1 deletion mtd/parsers/gsheet_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from mtd.exceptions import UnsupportedFiletypeError
from mtd.parsers.utils import ResourceManifest
from typing import Dict, List, Tuple, Union
from tqdm import tqdm
import gspread

class Parser(BaseParser):
Expand Down Expand Up @@ -40,7 +41,7 @@ def getCellValue(self, entry: Dict, col: str) -> str:

def resolve_targets(self) -> List[dict]:
word_list = []
for entry in self.resource:
for entry in tqdm(self.resource):
word_list.append(self.fill_entry_template(self.entry_template, entry, self.getCellValue))
return word_list

Expand Down
5 changes: 3 additions & 2 deletions mtd/parsers/json_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from mtd.parsers.utils import ResourceManifest
from typing import Dict, List, Union
from jsonpath_rw import parse as json_parse
from tqdm import tqdm

class Parser(BaseParser):
'''
Expand Down Expand Up @@ -36,7 +37,7 @@ def getValueFromJsonPath(self, entry: dict, path: str):

def resolve_targets(self) -> List[dict]:
word_list = []
for entry in self.resource:
for entry in tqdm(self.resource):
word_list.append(self.fill_entry_template(self.entry_template, entry, self.getValueFromJsonPath))
return word_list

Expand Down Expand Up @@ -82,6 +83,7 @@ def fill_listof_entry_template(self, listof_dict: dict, entry, convert_function)
new_els.append(new_el)
return new_els


def fill_entry_template(self, entry_template: dict, entry, convert_function) -> dict:
'''This recursive function "fills in" the data according to the resource manifest. This is a slight modification from the one used by all parsers.
Expand All @@ -91,7 +93,6 @@ def fill_entry_template(self, entry_template: dict, entry, convert_function) ->
:param function convert_function: A function that takes an entry and a path and returns the "filled in" object
'''
new_lemma = {}

for k, v in entry_template.items():
if isinstance(v, dict):
if "listof" in v:
Expand Down
3 changes: 2 additions & 1 deletion mtd/parsers/psv_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from jsonschema.exceptions import ValidationError
from mtd.parsers.utils import ResourceManifest
from typing import Dict, List, Union
from tqdm import tqdm

class Parser(BaseParser):
'''
Expand All @@ -30,7 +31,7 @@ def __init__(self, manifest: ResourceManifest, resource_path: str):

def resolve_targets(self) -> List[dict]:
word_list = []
for entry in self.resource:
for entry in tqdm(self.resource):
word_list.append(self.fill_entry_template(self.entry_template, entry, lambda x, y: x[int(y)]))
return word_list

Expand Down
3 changes: 2 additions & 1 deletion mtd/parsers/tsv_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from jsonschema.exceptions import ValidationError
from mtd.parsers.utils import ResourceManifest
from typing import Dict, List, Union
from tqdm import tqdm

class Parser(BaseParser):
'''
Expand All @@ -30,7 +31,7 @@ def __init__(self, manifest: ResourceManifest, resource_path: str):

def resolve_targets(self):
word_list = []
for entry in self.resource:
for entry in tqdm(self.resource):
word_list.append(self.fill_entry_template(self.entry_template, entry, lambda x, y: x[int(y)]))
return word_list

Expand Down
3 changes: 2 additions & 1 deletion mtd/parsers/xlsx_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from mtd.parsers.utils import ResourceManifest
from openpyxl.cell.cell import Cell
from typing import Dict, List, Tuple, Union
from tqdm import tqdm

class Parser(BaseParser):
'''
Expand Down Expand Up @@ -53,7 +54,7 @@ def getCellValue(self, entry: Tuple[Cell, ...], col: str) -> str:

def resolve_targets(self) -> List[dict]:
word_list = []
for entry in self.resource:
for entry in tqdm(self.resource):
word_list.append(self.fill_entry_template(self.entry_template, entry, self.getCellValue))
return word_list

Expand Down
3 changes: 2 additions & 1 deletion mtd/parsers/xml_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from mtd.parsers.utils import ResourceManifest
from typing import Dict, List, Union
import pandas as pd
from tqdm import tqdm

class Parser(BaseParser):
'''
Expand All @@ -29,7 +30,7 @@ def getValueFromXpath(self, entry: etree._Element, xpath: str) -> str:

def resolve_targets(self) -> List[dict]:
word_list = []
for entry in self.resource:
for entry in tqdm(self.resource):
word_list.append(self.fill_entry_template(self.entry_template, entry, self.getValueFromXpath))
return word_list

Expand Down
15 changes: 12 additions & 3 deletions mtd/tests/test_json_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,20 @@
import json
from mtd.tests import SAMPLE_DATA_DF, SAMPLE_DATA_OBJ, SAMPLE_DATA_DF_REDUCED, SAMPLE_DATA_OBJ_REDUCED, SAMPLE_DATA_DF_REDUCED_EMPTY, SAMPLE_DATA_OBJ_REDUCED_EMPTY
import os
from unittest import TestCase
from unittest import main, TestCase
from mtd.parsers import parse

class JsonParserTest(TestCase):
def setUp(self):
self.path = os.path.dirname(json_path.__file__)
self.data = [(os.path.join(self.path, 'data.json'), SAMPLE_DATA_DF, SAMPLE_DATA_OBJ), (os.path.join(self.path, 'data_reduced.json'), SAMPLE_DATA_DF_REDUCED_EMPTY, SAMPLE_DATA_OBJ_REDUCED_EMPTY)]
self.data = [(os.path.join(self.path, 'data.json'), SAMPLE_DATA_DF,
SAMPLE_DATA_OBJ),
(os.path.join(self.path, 'data_reduced.json'),
SAMPLE_DATA_DF_REDUCED_EMPTY,
SAMPLE_DATA_OBJ_REDUCED_EMPTY)]
self.manifest = os.path.join(self.path, 'manifest.json')
self.large_manifest = os.path.join(self.path, 'large_data_manifest.json')
self.large_data_path = os.path.join(self.path, 'data_large.json')
self.maxDiff = None

def test_data_df_matches_sample(self):
Expand All @@ -25,4 +31,7 @@ def test_data_obj_matches_sample(self):
for data in self.data:
parsed_data = parse(self.manifest, data[0])
parsed_data_obj = parsed_data['data'].to_dict(orient='records')
self.assertEqual(data[2], parsed_data_obj)
self.assertEqual(data[2], parsed_data_obj)

if __name__ == '__main__':
main()
2 changes: 1 addition & 1 deletion mtd/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.14.20190816"
__version__ = "0.14.20190902"
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,5 @@ oauth2client
flask_restful
flask_cors
requests
lxml
lxml
tqdm

0 comments on commit 7d4ccac

Please sign in to comment.