From c231e4e6453668595bc47dc1f9a0ebaf130e4968 Mon Sep 17 00:00:00 2001 From: roedoejet Date: Fri, 25 Jan 2019 20:56:43 -0800 Subject: [PATCH] added gsheet parser --- config-ayajuthem.js | 0 mtd/dictionary.py | 21 ++++++++++++- mtd/languages/__init__.py | 3 +- mtd/languages/config_schema.json | 15 ++++----- mtd/parsers/__init__.py | 8 +++-- mtd/parsers/gsheet_parser.py | 52 ++++++++++++++++++++++++++++++++ mtd/parsers/utils.py | 5 ++- mtd/tests/run.py | 2 +- mtd/tests/test_dict_parser.py | 9 ++++++ newbuild.py | 1 + 10 files changed, 102 insertions(+), 14 deletions(-) create mode 100644 config-ayajuthem.js create mode 100644 mtd/parsers/gsheet_parser.py diff --git a/config-ayajuthem.js b/config-ayajuthem.js new file mode 100644 index 0000000..e69de29 diff --git a/mtd/dictionary.py b/mtd/dictionary.py index 7272163..5916ecc 100644 --- a/mtd/dictionary.py +++ b/mtd/dictionary.py @@ -158,10 +158,29 @@ def return_formatted_config(self, form: str="js") -> Union[str, dict]: config_template_object = {"L1": {"name": self.config['L1'], "lettersInLanguage": self.config['alphabet']}, "L2": {"name": self.config['L2']}} + ## Add transducer name that converts search queries + if 'L1_compare_transducer_name' in self.config: + config_template_object['L1']['compare'] = self.config['L1_compare_transducer_name'] if form == 'obj': return config_template_object elif form == 'js': - return f"var config = {json.dumps(config_template_object)}" + ## Add adhoc_vars + adhoc_vars = '' + if "adhoc_vars" in self.config: + adhoc_vars = [] + for av in self.config['adhoc_vars']: + for k,v in av.items(): + adhoc_vars.append(f"var {k} = {v};") + adhoc_vars = "\n".join(adhoc_vars) + ## Add transducers + # for data_obj in self.data_objs: + # transducers = [] + # if "transducers" in data_obj['manifest']: + # transducers = data_obj['manifest']['transducers'] + # transducer_obj = Transducer(transducers) + # transducers.append(transducer_obj.return_js_template(transducers)) + # transducers_js = "\n".join([t.return_js_template() for t in transducers]) + return f"var config = {json.dumps(config_template_object)}" + adhoc_vars elif form == 'json': return json.dumps(config_template_object) diff --git a/mtd/languages/__init__.py b/mtd/languages/__init__.py index bd13d8c..7a6e772 100644 --- a/mtd/languages/__init__.py +++ b/mtd/languages/__init__.py @@ -5,6 +5,7 @@ from jsonschema.exceptions import ValidationError from urllib.parse import urlparse from typing import Union +import requests ldir = os.path.dirname(ldir.__file__) @@ -27,7 +28,7 @@ def __init__(self, config_object: Union[str, dict]): if isinstance(config_object, dict): self._config = config_object elif 'http' in urlparse(config_object).scheme: - r = requests.get(config_path) + r = requests.get(config_object) self._config = r.json() else: if not os.path.isabs(config_object): diff --git a/mtd/languages/config_schema.json b/mtd/languages/config_schema.json index 4c25fdd..2d1f400 100644 --- a/mtd/languages/config_schema.json +++ b/mtd/languages/config_schema.json @@ -6,6 +6,7 @@ "properties": { "L1": {"type": "string"}, "L2": {"type": "string"}, + "L1_compare_transducer_name": {"type": "string"}, "optional_field_name": {"type": "string"}, "alphabet": { "oneOf": [ @@ -23,7 +24,13 @@ "role": {"type": "string"}, "name": {"type": "string"} } - }} + }}, + "adhoc_vars": { + "type": "array", + "items": { + "type": "object" + } + } }, "required": ["L1", "L2"] }, @@ -45,12 +52,6 @@ "required": ["manifest", "resource"] } }, - "adhoc_vars": { - "type": "array", - "items": { - "type": "object" - } - }, "github_credentials_path": {"type": "string"} }, "required": ["config", "data"] diff --git a/mtd/parsers/__init__.py b/mtd/parsers/__init__.py index d5ac2f3..8b02f90 100644 --- a/mtd/parsers/__init__.py +++ b/mtd/parsers/__init__.py @@ -4,9 +4,11 @@ import re import requests from mtd.parsers.utils import ResourceManifest -from mtd.parsers import request_parser, dict_parser +from mtd.parsers import request_parser, dict_parser, gsheet_parser from mtd.exceptions import MissingFileError, UnsupportedFiletypeError from urllib.parse import urlparse +from pandas import DataFrame +from gspread.models import Spreadsheet from .. import exceptions @@ -25,10 +27,10 @@ def parse(manifest, resource_dict_or_path): ''' if not isinstance(manifest, ResourceManifest): manifest = ResourceManifest(manifest) - if isinstance(resource_dict_or_path, dict) or isinstance(resource_dict_or_path, list): parser = dict_parser.Parser(manifest, resource_dict_or_path) - + elif isinstance(resource_dict_or_path, Spreadsheet): + parser = gsheet_parser.Parser(manifest, resource_dict_or_path) # If resource is URL, use request parser elif 'http' in urlparse(resource_dict_or_path).scheme: parser = request_parser.Parser(manifest, resource_dict_or_path) diff --git a/mtd/parsers/gsheet_parser.py b/mtd/parsers/gsheet_parser.py new file mode 100644 index 0000000..a747f23 --- /dev/null +++ b/mtd/parsers/gsheet_parser.py @@ -0,0 +1,52 @@ +import pandas as pd +from mtd.parsers.utils import BaseParser +from mtd.exceptions import UnsupportedFiletypeError +from mtd.parsers.utils import ResourceManifest +from typing import Dict, List, Tuple, Union +import gspread + +class Parser(BaseParser): + ''' + Parse data for MTD **TODO: test worksheet location. Skipheader in manifest skips first row. Location in manifest decides worksheet. + + :param ResourceManifest manifest: Manifest for parser + :param str resource_path: path to file + ''' + def __init__(self, manifest: ResourceManifest, resource: gspread.models.Spreadsheet): + self.manifest = manifest + try: + work_book = resource + if "location" in self.manifest: + work_sheet = work_book.get_worksheet(self.manifest['location']) + else: + work_sheet = work_book.get_worksheet(0) + if "skipheader" in self.manifest and self.manifest['skipheader']: + min_row = 1 + else: + min_row = 0 + self.resource = work_sheet.get_all_records()[min_row:] + except: + raise UnsupportedFiletypeError('Google Spreadsheet') + + self.entry_template = self.manifest['targets'] + + def getCellValue(self, entry: Dict, col: str) -> str: + ''' Given a gspread record dict, return the value of the key matching the header in the record + ''' + for k,v in entry.items(): + if k == col: + return v + return '' + + def resolve_targets(self) -> List[dict]: + word_list = [] + for entry in self.resource: + word_list.append(self.fill_entry_template(self.entry_template, entry, self.getCellValue)) + return word_list + + def parse(self) -> Dict[str, Union[dict, pd.DataFrame]]: + try: + data = self.resolve_targets() + return {"manifest": self.manifest, "data": pd.DataFrame(data)} + except Exception as e: + print(e) \ No newline at end of file diff --git a/mtd/parsers/utils.py b/mtd/parsers/utils.py index c8ba95a..d9fa7ca 100644 --- a/mtd/parsers/utils.py +++ b/mtd/parsers/utils.py @@ -6,6 +6,7 @@ from mtd.tests import logger from typing import Dict, List, Union from jsonpath_rw import jsonpath, parse as json_parse +import requests class ResourceManifest(): '''A manifest file for a given resource. @@ -65,6 +66,7 @@ def validate(self, manifest): self.warn_extra_properties_in(schema_properties, manifest_properties) self.warn_extra_properties_in(schema_targets, manifest_targets) validate(manifest, self.schema) + return manifest except ValidationError as e: raise ValidationError(f"Attempted to validate the manifest file, but got {e}. Please refer to the Mother Tongues data manifest schema.") @@ -96,7 +98,8 @@ class BaseParser(): between many of the format specific parsers """ def __init__(self): - pass + # to be overwritten by parsers + self.manifest = None def return_manifest_key_type(self, key: str, manifest: dict) -> Union[dict, list, str]: '''Given a key in a nested dict, return the type of the corresponding value diff --git a/mtd/tests/run.py b/mtd/tests/run.py index 4c4d6f0..4222145 100644 --- a/mtd/tests/run.py +++ b/mtd/tests/run.py @@ -29,7 +29,7 @@ loader.loadTestsFromTestCase(test) # for test in [JsonParserTest] # for test in (CsvParserTest, PsvParserTest, TsvParserTest, XlsxParserTest) - # for test in [XmlParserTest] + # for test in [DictParserTest] for test in (CsvParserTest, DictParserTest, JsonParserTest, PsvParserTest, RequestsParserTest, TsvParserTest, XlsxParserTest, XmlParserTest) ] diff --git a/mtd/tests/test_dict_parser.py b/mtd/tests/test_dict_parser.py index 7dbbf10..fec9354 100644 --- a/mtd/tests/test_dict_parser.py +++ b/mtd/tests/test_dict_parser.py @@ -12,6 +12,15 @@ def setUp(self): self.manifest = os.path.join(self.path, 'dict_manifest.json') self.maxDiff = None + # def test_dict_manifest(self): + # '''Check manifest loaded as dict NOT WORKING + # ''' + # with open(self.manifest, 'r') as f: + # json_manifest = json.load(f) + # for data in self.data: + # parsed_data = parse(json_manifest, data[0]) + # self.assertTrue(parsed_data['data'].equals(data[1])) + def test_data_df_matches_sample(self): '''Check test Dict or List data is parsed and looks like ground truth data. ''' diff --git a/newbuild.py b/newbuild.py index 34a0c6a..aaec911 100644 --- a/newbuild.py +++ b/newbuild.py @@ -1,6 +1,7 @@ from mtd import create_dictionary from mtd.languages import LanguageConfig + dictionary = create_dictionary('/Users/pinea/mothertongues/mtd/languages/ayajuthem/config.json') # dictionary = create_dictionary('/Users/pinea/Desktop/testayajuthm/config.json') # dictionary.export_raw_data('C:\\Users\\pinea\\Desktop', export_type="html")