Skip to content

Commit

Permalink
feat(connector): don't support xml website anymore
Browse files Browse the repository at this point in the history
All the supported websites return JSON. Moreover, supporting XML
responses requires us to add
lxml as a dependency.
We can add this back if we find a website that returns XML.
  • Loading branch information
dovahcrow committed Oct 22, 2020
1 parent a0f8d4e commit fa173a0
Show file tree
Hide file tree
Showing 4 changed files with 3 additions and 98 deletions.
53 changes: 1 addition & 52 deletions dataprep/connector/implicit_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,15 @@
where ImplicitDatabase is a conceptual model describes
a website and ImplicitTable describes an API endpoint.
"""
from io import StringIO
from json import load as jload
from json import loads as jloads
from pathlib import Path
from typing import Any, Dict, List, Union

import pandas as pd
from jsonpath_ng import parse as jparse
from lxml import etree # pytype: disable=import-error

from dataprep.connector.schema.defs import ConfigDef

from ..errors import UnreachableError
from .schema import ConfigDef

_TYPE_MAPPING = {
Expand Down Expand Up @@ -45,10 +41,8 @@ def from_response(self, payload: str) -> pd.DataFrame:
ctype = self.config.response.ctype # pylint: disable=no-member
if ctype == "application/json":
rows = self.from_json(payload)
elif ctype == "application/xml":
rows = self.from_xml(payload)
else:
raise UnreachableError
raise NotImplementedError(f"{ctype} not supported")

return pd.DataFrame(rows)

Expand Down Expand Up @@ -98,51 +92,6 @@ def from_json(self, data: str) -> Dict[str, List[Any]]:

return table_data

def from_xml(self, data: str) -> Dict[str, List[Any]]:
    """
    Create rows from xml string.

    The document is parsed with lxml, the rows are selected with the XPath
    in ``response.table_path`` and, for every column declared in
    ``response.schema_``, the column's ``target`` XPath is evaluated
    relative to each row.

    Parameters
    ----------
    data
        The XML document as a string.

    Returns
    -------
    Dict[str, List[Any]]
        Mapping from column name to the list of cell values, one per row.
        A cell is ``None`` when the column XPath matches nothing, the raw
        list of matches for ``"object"`` columns, and the single match
        converted through ``_TYPE_MAPPING`` otherwise.

    Raises
    ------
    NotImplementedError
        If the response orient is anything other than ``"records"``.
    AssertionError
        If a row yields more than one match for a column whose type is
        not ``"object"``.
    """
    respdef = self.config.response
    # NOTE(review): presumably the declaration is dropped because lxml
    # rejects str input that carries an encoding declaration -- confirm.
    data = data.replace('<?xml version="1.0" encoding="UTF-8"?>', "")

    root = etree.parse(StringIO(data))
    data_rows = root.xpath(respdef.table_path)

    if respdef.orient != "records":
        # TODO: split orient
        raise NotImplementedError

    table_data: Dict[str, List[Any]] = {}
    for column_name, column_def in respdef.schema_.items():
        column_target = column_def.target
        column_type = column_def.type

        col: List[Any] = []
        for data_row in data_rows:
            maybe_cell_value = data_row.xpath(column_target)

            if not maybe_cell_value:
                col.append(None)
            elif column_type == "object":
                # Object columns keep every match, untouched.
                col.append(maybe_cell_value)
            elif len(maybe_cell_value) == 1:
                (cell_value,) = maybe_cell_value
                if cell_value is not None:
                    # Even when the XPath matched something, the value
                    # itself might be None, in which case we skip the
                    # type conversion.
                    cell_value = _TYPE_MAPPING[column_type](cell_value)
                col.append(cell_value)
            else:
                # Raised explicitly instead of using `assert`, so the
                # check still runs under `python -O`.
                raise AssertionError(
                    f"{column_name}: {maybe_cell_value} is not {column_type}"
                )

        table_data[column_name] = col

    return table_data


class ImplicitDatabase:
"""
Expand Down
2 changes: 1 addition & 1 deletion dataprep/connector/schema/defs.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,7 +370,7 @@ def merge_type(a: str, b: str) -> str: # pylint: disable=invalid-name


class ResponseDef(BaseDef):
ctype: str = Field(regex=r"^(application/xml|application/json)$")
ctype: str = Field("application/json", const=True)
table_path: str
schema_: Dict[str, SchemaFieldDef] = Field(alias="schema")
orient: str = Field(regex=r"^(records|split)$")
Expand Down
45 changes: 1 addition & 44 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@ bottleneck = "^1.3.2"
jsonschema = "~3.2"
requests = "~2.24"
jinja2 = "~2.11"
lxml = "~4.5"
tqdm = "~4.48"
tornado = "5.0.2"
jsonpath-ng = "~1.5"
Expand Down

0 comments on commit fa173a0

Please sign in to comment.