# MC4-tw Data Preprocess

In [1]:
import gzip
import json
from tqdm.auto import tqdm
import re
from collections import Counter
from urllib.parse import urlparse

The input data file is available at [Link](https://drive.google.com/drive/folders/1zXabzDGmwJ_ogv7RcqyI8cK6LZNi1acg?usp=sharing).
```
Input Hash
0ba597 *../data/mc4_trad_chinese_train_000_of_16.jsonl.gz
```

In [9]:
!sha1sum ../data/mc4_trad_chinese_train_000_of_16.jsonl.gz

0ba5972511b71004b2e6e6a5a3d0c6e14eb3f9a8 *../data/mc4_trad_chinese_train_000_of_16.jsonl.gz


In [2]:
# Stream the gzipped JSONL shard line by line and keep only the records whose
# URL contains the substring ".tw/" or "/tw.".
data_path = "../data/mc4_trad_chinese_train_000_of_16.jsonl.gz"
data = []
with gzip.open(data_path, mode="rt", encoding="UTF-8") as fin:
  for line in tqdm(fin):
    record = json.loads(line)
    if any(marker in record["url"] for marker in (".tw/", "/tw.")):
      data.append(record)


0it [00:00, ?it/s]

In [3]:
len(data)

63975

## Distribution of TLDs (second-level labels under `.tw`)

In [4]:

def get_domain(url):
  """Return the label immediately to the left of the host's ``.tw`` suffix.

  For ``http://www.example.com.tw/`` the result is ``"com"``; for
  ``http://www.taaze.tw/`` it is ``"taaze"``.

  Args:
    url: URL string to inspect.

  Returns:
    The matched label, or ``"(na)"`` when the URL has no hostname, cannot be
    parsed, or its hostname does not end in ``.<label>.tw``. A label must be
    preceded by a dot, so a bare ``example.tw`` host also yields ``"(na)"``.
  """
  try:
    hostname = urlparse(url).hostname
  except ValueError:
    # urlparse rejects malformed netlocs (e.g. unbalanced IPv6 brackets).
    return "(na)"
  if not hostname:
    return "(na)"

  # Anchor at the end of the hostname so that hosts which merely contain
  # ".<label>.tw" somewhere (e.g. "a.x.twitter.com", "www.com.tw.example.com")
  # are not misclassified. The optional "\.?" keeps hosts written with a
  # trailing root dot ("www.example.com.tw.") matching.
  mat = re.search(r"\.(\w+)\.tw\.?$", hostname)
  return mat.group(1) if mat else "(na)"
# Ten most frequent labels returned by get_domain over the filtered records.
Counter(get_domain(entry["url"]) for entry in data).most_common(10)

[('com', 35371),
 ('(na)', 13257),
 ('edu', 2620),
 ('org', 2014),
 ('gov', 1602),
 ('idv', 742),
 ('taaze', 499),
 ('fingermedia', 349),
 ('qmap', 326),
 ('net', 270)]

## Selecting target TLDs

In [5]:
# Walk the filtered records in order and keep at most 1500 per target label
# ("com", "edu", "org", "gov"); each kept record gains a "tlds" field.
proc_data = []
tlds_counter = Counter()
for doc in data:
  label = get_domain(doc["url"])
  if label not in ("com", "edu", "org", "gov"):
    continue
  if tlds_counter[label] >= 1500:
    # Quota for this label is already filled.
    continue
  tlds_counter[label] += 1
  proc_data.append({"tlds": label, **doc})


In [6]:
# Sanity check: number of selected records per label.
Counter(record["tlds"] for record in proc_data)

Counter({'com': 1500, 'org': 1500, 'edu': 1500, 'gov': 1500})

## Writing out

In [7]:
# Dump the selected records as pretty-printed JSON, keeping non-ASCII text
# unescaped in the output file.
with open(
    "../data/tlds_data_mc4_tw_000.json", mode="w", encoding="UTF-8"
) as fout:
  json.dump(proc_data, fout, indent=2, ensure_ascii=False)

```
Output Hash
4727bb *../data/tlds_data_mc4_tw_000.json
```

In [8]:
!sha1sum ../data/tlds_data_mc4_tw_000.json

4727bb8285d8a0c0b9761de958ebfbd29c337a0e *../data/tlds_data_mc4_tw_000.json
