Skip to content

Commit

Permalink
Issue #5: added pre-trained codec for XML
Browse files Browse the repository at this point in the history
  • Loading branch information
soxofaan committed Jul 14, 2019
1 parent 4f7a689 commit d3cbbe5
Show file tree
Hide file tree
Showing 8 changed files with 69 additions and 3 deletions.
Binary file modified dahuffman/codecs/json-compact.pickle
Binary file not shown.
Binary file modified dahuffman/codecs/json.pickle
Binary file not shown.
Binary file added dahuffman/codecs/xml.pickle
Binary file not shown.
1 change: 1 addition & 0 deletions tests/test_codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ def test_get_path():
'shakespeare',
'shakespeare-lower',
'json',
'xml',
])
def test_encode_decode(name):
codec = load(name)
Expand Down
2 changes: 2 additions & 0 deletions train/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,5 @@ on publicly available data sets (e.g. English text, JSON files, ...).
Usage example:

python train/shakespeare.py

Then, copy generated codec files to `dahuffman/codecs`.
14 changes: 11 additions & 3 deletions train/json-data.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,20 +26,28 @@ def main():
"https://data.cdc.gov/api/views/e6fc-ccez/rows.json",
"https://data.cityofnewyork.us/api/views/jb7j-dtam/rows.json",
"https://data.cityofnewyork.us/api/views/zt9s-n5aj/rows.json",
"https://data.cityofchicago.org/api/views/kn9c-c2s2/rows.json",
"https://data.cityofnewyork.us/api/views/5t4n-d72c/rows.json",
"https://data.cdc.gov/api/views/6rkc-nb2q/rows.json",
"https://data.sfgov.org/api/views/j4sj-j2nf/rows.json",
"https://data.kingcounty.gov/api/views/gmen-63jm/rows.json",
"https://data.mo.gov/api/views/vpge-tj3s/rows.json",
]

_log.info('Building frequency tables')
frequencies_raw = Counter()
frequencies_compact = Counter()
for url in urls:
path = download(url, 'json-' + hashlib.md5(url.encode('utf-8')).hexdigest() + '.json')
path = download(url, 'json-data/' + hashlib.md5(url.encode('utf-8')).hexdigest() + '.json')
with path.open('r') as f:
raw = f.read()
frequencies_raw.update(raw)
# Only take first N bytes.
# Large files probably have a lot of structural repetition, which skews the frequencies
frequencies_raw.update(raw[:100000])

# Parse and re-encode to compact JSON
compact = json.dumps(json.loads(raw), separators=(',', ':'))
frequencies_compact.update(compact)
frequencies_compact.update(compact[:100000])

# TODO add more metadata
_log.info(f'Frequencies raw {len(frequencies_raw)}: {frequencies_raw}')
Expand Down
1 change: 1 addition & 0 deletions train/train_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def download(url: str, path: str) -> Path:
ensure_dir(path.parent)
_log.info(f'Downloading {url}')
with requests.get(url) as r:
r.raise_for_status()
with path.open('wb') as f:
for chunk in r.iter_content(chunk_size=1024):
f.write(chunk)
Expand Down
54 changes: 54 additions & 0 deletions train/xml-data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import hashlib
import logging
from collections import Counter

from dahuffman import HuffmanCodec
from train.train_utils import download, CODECS

_log = logging.getLogger()


def main():
    """Train a Huffman codec on character frequencies gathered from public
    XML data sets and save it as the pre-trained ``xml`` codec
    (written to ``dahuffman/codecs/xml.pickle``).
    """
    logging.basicConfig(level=logging.INFO)

    # XML data sets from https://www.data.gov/
    urls = [
        "https://data.cityofnewyork.us/api/views/kku6-nxdu/rows.xml",
        "https://data.cdc.gov/api/views/bi63-dtpu/rows.xml",
        "https://data.cdc.gov/api/views/cjae-szjv/rows.xml",
        "https://data.cityofnewyork.us/api/views/25th-nujf/rows.xml",
        "https://data.ct.gov/api/views/kbxi-4ia7/rows.xml",
        "https://data.cityofchicago.org/api/views/pfsx-4n4m/rows.xml",
        "https://data.cdc.gov/api/views/6vp6-wxuq/rows.xml",
        "https://www.sba.gov/sites/default/files/data.xml",
        "https://data.cdc.gov/api/views/e6fc-ccez/rows.xml",
        "https://data.cityofnewyork.us/api/views/jb7j-dtam/rows.xml",
        "https://data.cityofnewyork.us/api/views/zt9s-n5aj/rows.xml",
        "https://gisdata.nd.gov/Metadata/ISO/xml/metadata_Roads_MileMarkers.xml",
        "https://data.cityofchicago.org/api/views/kn9c-c2s2/rows.xml",
        "https://data.cityofnewyork.us/api/views/5t4n-d72c/rows.xml",
        "https://data.cdc.gov/api/views/6rkc-nb2q/rows.xml",
        "https://gisdata.nd.gov/Metadata/ISO/xml/metadata_Airports.xml",
        "https://data.sfgov.org/api/views/j4sj-j2nf/rows.xml",
        "https://data.kingcounty.gov/api/views/gmen-63jm/rows.xml",
        "https://data.mo.gov/api/views/vpge-tj3s/rows.xml",
    ]

    _log.info('Building frequency tables')
    frequencies = Counter()
    for url in urls:
        # Cache each download locally under a file name derived from the URL's MD5.
        path = download(url, 'xml-data/' + hashlib.md5(url.encode('utf-8')).hexdigest() + '.xml')
        # Decode explicitly as UTF-8 (the common encoding for these feeds) instead of
        # relying on the locale default, which can raise UnicodeDecodeError on some
        # platforms (e.g. cp1252 on Windows). errors='replace' keeps training going
        # on the occasional malformed byte.
        with path.open('r', encoding='utf-8', errors='replace') as f:
            # Only take the first N characters.
            # Large files probably have a lot of structural repetition, which skews the frequencies
            raw = f.read(100000)
        frequencies.update(raw)

    # TODO add more metadata
    _log.info(f'Frequencies raw {len(frequencies)}: {frequencies}')
    codec = HuffmanCodec.from_frequencies(frequencies)
    codec.save(CODECS / "xml.pickle", metadata={"frequencies": frequencies})


# Standard script entry point: run the training only when executed directly,
# not when this module is imported.
if __name__ == '__main__':
    main()

0 comments on commit d3cbbe5

Please sign in to comment.