# MC4-tw Data Preprocessing

In [10]:
import gzip
import json
from tqdm.auto import tqdm
import re
from collections import Counter
from urllib.parse import urlparse

In [11]:
# Expected SHA-1 (first 6 hex digits): 0ba597 *../data/mc4_trad_chinese_train_000_of_16.jsonl.gz
# The file is available at https://drive.google.com/drive/folders/1zXabzDGmwJ_ogv7RcqyI8cK6LZNi1acg?usp=sharing
!sha1sum ../data/mc4_trad_chinese_train_000_of_16.jsonl.gz

0ba5972511b71004b2e6e6a5a3d0c6e14eb3f9a8 *../data/mc4_trad_chinese_train_000_of_16.jsonl.gz


In [12]:
# Stream the gzip-compressed JSON Lines file and keep only the records whose
# URL contains ".tw/" or "/tw." (presumably a cheap marker for Taiwanese
# sites -- confirm against the intended corpus definition).
data_path = "../data/mc4_trad_chinese_train_000_of_16.jsonl.gz"
with gzip.open(data_path, mode="rt", encoding="UTF-8") as fin:
  parsed = (json.loads(raw_line) for raw_line in tqdm(fin))
  data = [
    rec for rec in parsed
    if any(marker in rec["url"] for marker in (".tw/", "/tw."))
  ]


0it [00:00, ?it/s]

In [13]:
# Number of records that passed the URL substring filter.
len(data)

63975

## Distribution of TLDs (the label directly before `.tw` in each hostname)

In [14]:

def get_domain(url):
  """Return the DNS label immediately preceding the final ``.tw`` of a URL's host.

  For a host such as ``www.example.com.tw`` this is the second-level
  category (``"com"``); for a host registered directly under ``.tw`` with a
  subdomain (``www.example.tw``) it is the registered name (``"example"``).

  Args:
    url: URL string to inspect.

  Returns:
    The captured label, or ``"(na)"`` when the URL cannot be parsed, has no
    hostname, or the hostname does not end in ``.<label>.tw``.
  """
  try:
    hostname = urlparse(url).hostname
  except ValueError:
    # urlparse rejects some malformed authorities (e.g. an unbalanced "[").
    return "(na)"
  if not hostname:
    return "(na)"

  # Anchor on the end of the host so that ".tw" really is the top-level
  # domain; an unanchored pattern also matched hosts like "a.b.twitter.com".
  # A trailing root dot ("example.com.tw.") is tolerated.
  mat = re.search(r"\.(\w+)\.tw$", hostname.rstrip("."))
  return mat.group(1) if mat else "(na)"
# Ten most frequent labels returned by get_domain across the filtered records.
Counter(get_domain(rec["url"]) for rec in data).most_common(10)

[('com', 35371),
 ('(na)', 13257),
 ('edu', 2620),
 ('org', 2014),
 ('gov', 1602),
 ('idv', 742),
 ('taaze', 499),
 ('fingermedia', 349),
 ('qmap', 326),
 ('net', 270)]

## Selecting target TLDs

In [15]:
# Build a balanced sample: for each targeted label, keep the first
# PER_TLD_QUOTA records in the order they appear in `data`.
TARGET_TLDS = ("com", "edu", "org", "gov")
PER_TLD_QUOTA = 250

proc_data = []
tlds_counter = Counter()
for item_x in data:
  tld = get_domain(item_x["url"])
  # "(na)" hosts and labels outside TARGET_TLDS are skipped, as are records
  # for a label whose quota is already full.
  if tld not in TARGET_TLDS or tlds_counter[tld] >= PER_TLD_QUOTA:
    continue
  proc_data.append({"tld": tld, **item_x})
  tlds_counter[tld] += 1
  # Stop scanning once every target label has reached its quota.
  if all(tlds_counter[t] >= PER_TLD_QUOTA for t in TARGET_TLDS):
    break


In [16]:
# Per-label record counts in the sampled subset.
Counter(entry["tld"] for entry in proc_data)

Counter({'com': 250, 'org': 250, 'edu': 250, 'gov': 250})

## Writing out

In [17]:
# Serialize the sampled records as pretty-printed JSON, keeping non-ASCII
# characters unescaped.
output_path = "../data/tlds_data_mc4_tw_000.json"
with open(output_path, mode="w", encoding="UTF-8") as sink:
  json.dump(proc_data, sink, ensure_ascii=False, indent=2)

In [18]:
# Expected SHA-1 (first 6 hex digits): 7943ed *../data/tlds_data_mc4_tw_000.json
!sha1sum ../data/tlds_data_mc4_tw_000.json

7943edf15019216fb6016368eb675c7f21c33dd6 *../data/tlds_data_mc4_tw_000.json
