Skip to content

Commit

Permalink
Issue #5: added pre-trained codec for XML
Browse files Browse the repository at this point in the history
  • Loading branch information
soxofaan committed Jul 14, 2019
1 parent 4f7a689 commit d3cbbe5
Show file tree
Hide file tree
Showing 8 changed files with 69 additions and 3 deletions.
Binary file modified dahuffman/codecs/json-compact.pickle
Binary file not shown.
Binary file modified dahuffman/codecs/json.pickle
Binary file not shown.
Binary file added dahuffman/codecs/xml.pickle
Binary file not shown.
1 change: 1 addition & 0 deletions tests/test_codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ def test_get_path():
'shakespeare',
'shakespeare-lower',
'json',
'xml',
])
def test_encode_decode(name):
codec = load(name)
Expand Down
2 changes: 2 additions & 0 deletions train/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,5 @@ on publicly available data sets (e.g. English text, JSON files, ...).
Usage example:

python train/shakespeare.py

Then, copy generated codec files to `dahuffman/codecs`.
14 changes: 11 additions & 3 deletions train/json-data.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,20 +26,28 @@ def main():
"https://data.cdc.gov/api/views/e6fc-ccez/rows.json",
"https://data.cityofnewyork.us/api/views/jb7j-dtam/rows.json",
"https://data.cityofnewyork.us/api/views/zt9s-n5aj/rows.json",
"https://data.cityofchicago.org/api/views/kn9c-c2s2/rows.json",
"https://data.cityofnewyork.us/api/views/5t4n-d72c/rows.json",
"https://data.cdc.gov/api/views/6rkc-nb2q/rows.json",
"https://data.sfgov.org/api/views/j4sj-j2nf/rows.json",
"https://data.kingcounty.gov/api/views/gmen-63jm/rows.json",
"https://data.mo.gov/api/views/vpge-tj3s/rows.json",
]

_log.info('Building frequency tables')
frequencies_raw = Counter()
frequencies_compact = Counter()
for url in urls:
path = download(url, 'json-' + hashlib.md5(url.encode('utf-8')).hexdigest() + '.json')
path = download(url, 'json-data/' + hashlib.md5(url.encode('utf-8')).hexdigest() + '.json')
with path.open('r') as f:
raw = f.read()
frequencies_raw.update(raw)
# Only take first N bytes.
# Large files probably have a lot of structural repetition, which skews the frequencies
frequencies_raw.update(raw[:100000])

# Parse and re-encode to compact JSON
compact = json.dumps(json.loads(raw), separators=(',', ':'))
frequencies_compact.update(compact)
frequencies_compact.update(compact[:100000])

# TODO add more metadata
_log.info(f'Frequencies raw {len(frequencies_raw)}: {frequencies_raw}')
Expand Down
1 change: 1 addition & 0 deletions train/train_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def download(url: str, path: str) -> Path:
ensure_dir(path.parent)
_log.info(f'Downloading {url}')
with requests.get(url) as r:
r.raise_for_status()
with path.open('wb') as f:
for chunk in r.iter_content(chunk_size=1024):
f.write(chunk)
Expand Down
54 changes: 54 additions & 0 deletions train/xml-data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import hashlib
import logging
from collections import Counter

from dahuffman import HuffmanCodec
from train.train_utils import download, CODECS

_log = logging.getLogger()


def main():
    """Train a Huffman codec on character frequencies gathered from public
    XML data sets and save it as the pre-trained ``xml`` codec
    (written to ``dahuffman/codecs/xml.pickle``).
    """
    logging.basicConfig(level=logging.INFO)

    # XML data sets from https://www.data.gov/
    urls = [
        "https://data.cityofnewyork.us/api/views/kku6-nxdu/rows.xml",
        "https://data.cdc.gov/api/views/bi63-dtpu/rows.xml",
        "https://data.cdc.gov/api/views/cjae-szjv/rows.xml",
        "https://data.cityofnewyork.us/api/views/25th-nujf/rows.xml",
        "https://data.ct.gov/api/views/kbxi-4ia7/rows.xml",
        "https://data.cityofchicago.org/api/views/pfsx-4n4m/rows.xml",
        "https://data.cdc.gov/api/views/6vp6-wxuq/rows.xml",
        "https://www.sba.gov/sites/default/files/data.xml",
        "https://data.cdc.gov/api/views/e6fc-ccez/rows.xml",
        "https://data.cityofnewyork.us/api/views/jb7j-dtam/rows.xml",
        "https://data.cityofnewyork.us/api/views/zt9s-n5aj/rows.xml",
        "https://gisdata.nd.gov/Metadata/ISO/xml/metadata_Roads_MileMarkers.xml",
        "https://data.cityofchicago.org/api/views/kn9c-c2s2/rows.xml",
        "https://data.cityofnewyork.us/api/views/5t4n-d72c/rows.xml",
        "https://data.cdc.gov/api/views/6rkc-nb2q/rows.xml",
        "https://gisdata.nd.gov/Metadata/ISO/xml/metadata_Airports.xml",
        "https://data.sfgov.org/api/views/j4sj-j2nf/rows.xml",
        "https://data.kingcounty.gov/api/views/gmen-63jm/rows.xml",
        "https://data.mo.gov/api/views/vpge-tj3s/rows.xml",
    ]

    _log.info('Building frequency tables')
    frequencies = Counter()
    for url in urls:
        # Cache each download locally under a file name derived from the URL's MD5.
        path = download(url, 'xml-data/' + hashlib.md5(url.encode('utf-8')).hexdigest() + '.xml')
        # Decode explicitly as UTF-8 (the common encoding for these feeds) instead of
        # relying on the locale default, which can raise UnicodeDecodeError on some
        # platforms (e.g. cp1252 on Windows). errors='replace' keeps training going
        # on the occasional malformed byte.
        with path.open('r', encoding='utf-8', errors='replace') as f:
            # Only take the first N characters.
            # Large files probably have a lot of structural repetition, which skews the frequencies
            raw = f.read(100000)
        frequencies.update(raw)

    # TODO add more metadata
    _log.info(f'Frequencies raw {len(frequencies)}: {frequencies}')
    codec = HuffmanCodec.from_frequencies(frequencies)
    codec.save(CODECS / "xml.pickle", metadata={"frequencies": frequencies})


# Standard script entry point: run the training only when executed directly,
# not when this module is imported.
if __name__ == '__main__':
    main()

0 comments on commit d3cbbe5

Please sign in to comment.