Skip to content

Commit

Permalink
added gsheet parser
Browse files Browse the repository at this point in the history
  • Loading branch information
roedoejet committed Jan 26, 2019
1 parent c69862a commit c231e4e
Show file tree
Hide file tree
Showing 10 changed files with 102 additions and 14 deletions.
Empty file added config-ayajuthem.js
Empty file.
21 changes: 20 additions & 1 deletion mtd/dictionary.py
Expand Up @@ -158,10 +158,29 @@ def return_formatted_config(self, form: str="js") -> Union[str, dict]:
config_template_object = {"L1": {"name": self.config['L1'],
"lettersInLanguage": self.config['alphabet']},
"L2": {"name": self.config['L2']}}
## Add transducer name that converts search queries
if 'L1_compare_transducer_name' in self.config:
config_template_object['L1']['compare'] = self.config['L1_compare_transducer_name']
if form == 'obj':
return config_template_object
elif form == 'js':
return f"var config = {json.dumps(config_template_object)}"
## Add adhoc_vars
adhoc_vars = ''
if "adhoc_vars" in self.config:
adhoc_vars = []
for av in self.config['adhoc_vars']:
for k,v in av.items():
adhoc_vars.append(f"var {k} = {v};")
adhoc_vars = "\n".join(adhoc_vars)
## Add transducers
# for data_obj in self.data_objs:
# transducers = []
# if "transducers" in data_obj['manifest']:
# transducers = data_obj['manifest']['transducers']
# transducer_obj = Transducer(transducers)
# transducers.append(transducer_obj.return_js_template(transducers))
# transducers_js = "\n".join([t.return_js_template() for t in transducers])
return f"var config = {json.dumps(config_template_object)}" + adhoc_vars
elif form == 'json':
return json.dumps(config_template_object)

Expand Down
3 changes: 2 additions & 1 deletion mtd/languages/__init__.py
Expand Up @@ -5,6 +5,7 @@
from jsonschema.exceptions import ValidationError
from urllib.parse import urlparse
from typing import Union
import requests

ldir = os.path.dirname(ldir.__file__)

Expand All @@ -27,7 +28,7 @@ def __init__(self, config_object: Union[str, dict]):
if isinstance(config_object, dict):
self._config = config_object
elif 'http' in urlparse(config_object).scheme:
r = requests.get(config_path)
r = requests.get(config_object)
self._config = r.json()
else:
if not os.path.isabs(config_object):
Expand Down
15 changes: 8 additions & 7 deletions mtd/languages/config_schema.json
Expand Up @@ -6,6 +6,7 @@
"properties": {
"L1": {"type": "string"},
"L2": {"type": "string"},
"L1_compare_transducer_name": {"type": "string"},
"optional_field_name": {"type": "string"},
"alphabet": {
"oneOf": [
Expand All @@ -23,7 +24,13 @@
"role": {"type": "string"},
"name": {"type": "string"}
}
}}
}},
"adhoc_vars": {
"type": "array",
"items": {
"type": "object"
}
}
},
"required": ["L1", "L2"]
},
Expand All @@ -45,12 +52,6 @@
"required": ["manifest", "resource"]
}
},
"adhoc_vars": {
"type": "array",
"items": {
"type": "object"
}
},
"github_credentials_path": {"type": "string"}
},
"required": ["config", "data"]
Expand Down
8 changes: 5 additions & 3 deletions mtd/parsers/__init__.py
Expand Up @@ -4,9 +4,11 @@
import re
import requests
from mtd.parsers.utils import ResourceManifest
from mtd.parsers import request_parser, dict_parser
from mtd.parsers import request_parser, dict_parser, gsheet_parser
from mtd.exceptions import MissingFileError, UnsupportedFiletypeError
from urllib.parse import urlparse
from pandas import DataFrame
from gspread.models import Spreadsheet


from .. import exceptions
Expand All @@ -25,10 +27,10 @@ def parse(manifest, resource_dict_or_path):
'''
if not isinstance(manifest, ResourceManifest):
manifest = ResourceManifest(manifest)

if isinstance(resource_dict_or_path, dict) or isinstance(resource_dict_or_path, list):
parser = dict_parser.Parser(manifest, resource_dict_or_path)

elif isinstance(resource_dict_or_path, Spreadsheet):
parser = gsheet_parser.Parser(manifest, resource_dict_or_path)
# If resource is URL, use request parser
elif 'http' in urlparse(resource_dict_or_path).scheme:
parser = request_parser.Parser(manifest, resource_dict_or_path)
Expand Down
52 changes: 52 additions & 0 deletions mtd/parsers/gsheet_parser.py
@@ -0,0 +1,52 @@
import pandas as pd
from mtd.parsers.utils import BaseParser
from mtd.exceptions import UnsupportedFiletypeError
from mtd.parsers.utils import ResourceManifest
from typing import Dict, List, Tuple, Union
import gspread

class Parser(BaseParser):
    '''
    Parse data for MTD from a Google Spreadsheet.

    **TODO: test worksheet location. Skipheader in manifest skips first row. Location in manifest decides worksheet.

    :param ResourceManifest manifest: Manifest for parser
    :param gspread.models.Spreadsheet resource: spreadsheet whose worksheet records are parsed
    :raises UnsupportedFiletypeError: if the worksheet or its records cannot be read
    '''
    def __init__(self, manifest: ResourceManifest, resource: gspread.models.Spreadsheet):
        self.manifest = manifest
        try:
            # The manifest's "location" picks the worksheet index; default to the first sheet.
            if "location" in self.manifest:
                work_sheet = resource.get_worksheet(self.manifest['location'])
            else:
                work_sheet = resource.get_worksheet(0)
            # NOTE(review): get_all_records() presumably already uses the first sheet row
            # as the record keys, so "skipheader" drops the first *record* here - confirm
            # this is the intended behaviour (see TODO in the class docstring).
            if "skipheader" in self.manifest and self.manifest['skipheader']:
                min_row = 1
            else:
                min_row = 0
            self.resource = work_sheet.get_all_records()[min_row:]
        except Exception as e:
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt/SystemExit propagate, and chain the original
            # error so the real cause stays visible in the traceback.
            raise UnsupportedFiletypeError('Google Spreadsheet') from e

        self.entry_template = self.manifest['targets']

    def getCellValue(self, entry: Dict, col: str) -> str:
        ''' Given a gspread record dict, return the value of the key matching the header in the record.

        Returns an empty string when the record has no such key.
        '''
        return entry.get(col, '')

    def resolve_targets(self) -> List[dict]:
        ''' Fill the manifest's entry template once per record and return the resulting list.
        '''
        return [
            self.fill_entry_template(self.entry_template, entry, self.getCellValue)
            for entry in self.resource
        ]

    def parse(self) -> Dict[str, Union[dict, pd.DataFrame]]:
        ''' Return a dict with the manifest and a DataFrame built from the resolved entries.

        Best effort: if resolving the targets or building the DataFrame fails,
        the error is printed and None is returned implicitly.
        '''
        try:
            data = self.resolve_targets()
            return {"manifest": self.manifest, "data": pd.DataFrame(data)}
        except Exception as e:
            print(e)
5 changes: 4 additions & 1 deletion mtd/parsers/utils.py
Expand Up @@ -6,6 +6,7 @@
from mtd.tests import logger
from typing import Dict, List, Union
from jsonpath_rw import jsonpath, parse as json_parse
import requests

class ResourceManifest():
'''A manifest file for a given resource.
Expand Down Expand Up @@ -65,6 +66,7 @@ def validate(self, manifest):
self.warn_extra_properties_in(schema_properties, manifest_properties)
self.warn_extra_properties_in(schema_targets, manifest_targets)
validate(manifest, self.schema)
return manifest
except ValidationError as e:
raise ValidationError(f"Attempted to validate the manifest file, but got {e}. Please refer to the Mother Tongues data manifest schema.")

Expand Down Expand Up @@ -96,7 +98,8 @@ class BaseParser():
between many of the format specific parsers
"""
def __init__(self):
pass
# to be overwritten by parsers
self.manifest = None

def return_manifest_key_type(self, key: str, manifest: dict) -> Union[dict, list, str]:
'''Given a key in a nested dict, return the type of the corresponding value
Expand Down
2 changes: 1 addition & 1 deletion mtd/tests/run.py
Expand Up @@ -29,7 +29,7 @@
loader.loadTestsFromTestCase(test)
# for test in [JsonParserTest]
# for test in (CsvParserTest, PsvParserTest, TsvParserTest, XlsxParserTest)
# for test in [XmlParserTest]
# for test in [DictParserTest]
for test in (CsvParserTest, DictParserTest, JsonParserTest, PsvParserTest, RequestsParserTest, TsvParserTest, XlsxParserTest, XmlParserTest)
]

Expand Down
9 changes: 9 additions & 0 deletions mtd/tests/test_dict_parser.py
Expand Up @@ -12,6 +12,15 @@ def setUp(self):
self.manifest = os.path.join(self.path, 'dict_manifest.json')
self.maxDiff = None

# def test_dict_manifest(self):
# '''Check manifest loaded as dict NOT WORKING
# '''
# with open(self.manifest, 'r') as f:
# json_manifest = json.load(f)
# for data in self.data:
# parsed_data = parse(json_manifest, data[0])
# self.assertTrue(parsed_data['data'].equals(data[1]))

def test_data_df_matches_sample(self):
'''Check test Dict or List data is parsed and looks like ground truth data.
'''
Expand Down
1 change: 1 addition & 0 deletions newbuild.py
@@ -1,6 +1,7 @@
from mtd import create_dictionary
from mtd.languages import LanguageConfig


dictionary = create_dictionary('/Users/pinea/mothertongues/mtd/languages/ayajuthem/config.json')
# dictionary = create_dictionary('/Users/pinea/Desktop/testayajuthm/config.json')
# dictionary.export_raw_data('C:\\Users\\pinea\\Desktop', export_type="html")
Expand Down

0 comments on commit c231e4e

Please sign in to comment.