In [1]:
# a49586 ../data/tlds_data_mc4_tw_000.json
!sha1sum ../data/tlds_data_mc4_tw_000.json

a495865eb20822c8eac1f56f976ea7e0cca44f55 *../data/tlds_data_mc4_tw_000.json


In [2]:
from pathlib import Path
import json
data = json.loads(Path("../data/tlds_data_mc4_tw_000.json").read_text(encoding="UTF-8"))
len(data)

8000

## Simple tokens

In [3]:
import re
from tqdm.auto import tqdm
from collections import Counter

tok_pat = re.compile(r"[a-zA-Z0-9./%<>]+|[^\s\x00-\x31]")
tok_freq = Counter()
for data_x in tqdm(data):
  toks = tok_pat.findall(data_x["text"])
  tok_freq.update(toks)

  0%|          | 0/8000 [00:00<?, ?it/s]

In [4]:
len(tok_freq)

53696

In [5]:
[(tok,freq) for tok, freq in tok_freq.most_common() if re.match("[‰∏Ä-Èæú]", tok)][:10]

[('ÁöÑ', 88862),
 ('‰∏Ä', 30795),
 ('Â≠∏', 27586),
 ('‰∫∫', 27391),
 ('ÊòØ', 25457),
 ('Âúã', 24166),
 ('Êúâ', 23559),
 ('Â§ß', 22822),
 ('‰∏≠', 22133),
 ('Âπ¥', 21987)]

In [6]:
[(tok,freq) for tok, freq in tok_freq.most_common() if re.match(".{2,}", tok)][:10]

[('10', 3076),
 ('11', 2480),
 ('12', 2319),
 ('2019', 2099),
 ('...', 1977),
 ('2018', 1805),
 ('iPhone', 1775),
 ('00', 1695),
 ('the', 1540),
 ('30', 1470)]

In [7]:
[(tok,freq) for tok, freq in tok_freq.most_common() if re.match(".{2,}", tok)][100:110]

[('XS', 280),
 ('Taiwan', 279),
 ('1A', 272),
 ('..', 271),
 ('false', 269),
 ('TwBsBall', 265),
 ('4.', 263),
 ('101', 257),
 ('else', 257),
 ('sqrt', 257)]

In [8]:
[(tok,freq) for tok, freq in tok_freq.most_common() if re.match("[\U0001F300-\U0001FAFF]", tok)][:10]

[('üëâ', 21),
 ('üî∏', 12),
 ('üå∏', 11),
 ('üì£', 10),
 ('üìå', 6),
 ('üôã', 6),
 ('üéØ', 5),
 ('üî•', 4),
 ('üåè', 4),
 ('üèÉ', 4)]

In [9]:
from nltk.util import bigrams

bg_freq = Counter()
for data_x in tqdm(data):
  toks = tok_pat.findall(data_x["text"])
  bgs = ["".join(x) for x in bigrams(toks)]
  bg_freq.update(bgs)

  0%|          | 0/8000 [00:00<?, ?it/s]

In [10]:
print([x[0] for x in bg_freq.most_common(100)], sep=" ")

['Âè∞ÁÅ£', 'ÊàëÂÄë', 'Ê¥ªÂãï', 'ÊïôËÇ≤', 'Â§ßÂ≠∏', 'Ë®àÁï´', 'ÂèØ‰ª•', 'ÊïôÂ≠∏', 'ÊúçÂãô', 'Ôºå‰∏¶', 'ÂúãÈöõ', '‰∏ÄÂÄã', 'Â∑•‰Ωú', 'Â≠∏Áîü', 'ÂÖ¨Âè∏', 'Ë≥áË®ä', 'Ëá™Â∑±', '„ÄçÔºå', 'Á†îÁ©∂', 'Ôºå‰ΩÜ', 'Ôºå‰πü', 'ÊîøÂ∫ú', 'Ôºå‰ª•', 'ÊñáÂåñ', 'ÔºåÊàë', 'ÊôÇÈñì', 'Ôºå‰∏ç', 'ÔºåÂú®', 'Ë®≠Ë®à', 'ÁîüÊ¥ª', '‰∏≠ÂøÉ', 'ÁôºÂ±ï', 'ÂúãÂ∞è', 'Ë≥áÊñô', '‰∏ñÁïå', 'ÂæåÔºå', 'ÔºåÈÄô', 'Êèê‰æõ', '‰∏≠Âúã', 'ÔºåËÄå', 'Ëá∫ÁÅ£', '‰ΩøÁî®', 'ÁÆ°ÁêÜ', '‰øùË≠∑', 'Ê≤íÊúâ', 'ÔºåËÆì', 'Â∞±ÊòØ', 'Êñ∞ËÅû', 'Á¨¨‰∏Ä', 'Âè∞Âåó', 'Âõ†ÁÇ∫', 'Á∂ìÊøü', 'È´òÈõÑ', 'Â≠∏Áøí', 'ÁßëÊäÄ', 'Â≠∏Ë®à', 'Áõ∏Èóú', 'Áï´Ë°®', 'ÊôÇÔºå', 'ÁéªÁíÉ', 'ÔºåÂõ†', 'Áî¢Ê•≠', 'Â≠∏Ê†°', 'ÂÖ®ÁêÉ', 'Áí∞Â¢É', '‰ªñÂÄë', 'ÊïôÊéà', 'Âπ¥Â∫¶', 'ÈÄ≤Ë°å', 'ÂúãÂÆ∂', 'Á§æÊúÉ', 'Ôºå‰∏Ä', 'ËóùË°ì', 'ÁæéÂúã', 'ÔºåÂ∞±', 'ÂïèÈ°å', '‰∏≠Ôºå', '‰ºÅÊ•≠', 'Êó•Êúü', 'ÈÄôÂÄã', '..', 'Êó•Êú¨', 'Ë™≤Á®ã', 'Â§ñÔºå', 'ÈñãÂßã', 'ËÄÅÂ∏´', 'ÂåóÂ∏Ç', 'Á≥ªÁµ±', '‰ª•Âèä', 'Â§ßÂÆ∂', 'Ë°®Á§∫', 'Ôºö„Äå', 'ÈÄèÈÅé', 'Ôºå‰ªñ', 'ÂßîÂì°', 'Á∂≤Ë∑Ø', 'ÔºåÊòØ', 'Âêà‰Ωú', 'Ëæ¶ÁêÜ', 'ÂÆâÂÖ®']


## Cleaned dataset

In [11]:
import re
from tqdm.auto import tqdm
from collections import Counter

stop_pat = re.compile(r"[^\u3300-\u9fffÔºå„ÄÇÔºÅÔºü\s]")
space_pat = re.compile(r"\s+")
ctok_freq = Counter()
cleaned_data = []
for data_x in tqdm(data):  
  proc_text = stop_pat.sub("", data_x["text"])
  data_x["text"] = space_pat.sub(" ", proc_text)
  cleaned_data.append(data_x)  

  0%|          | 0/8000 [00:00<?, ?it/s]

In [12]:
out_path = Path("../data/tlds_data_mc4_tw_000.zhonly.json")
out_path.write_text(json.dumps(cleaned_data, ensure_ascii=False, indent=2), encoding="UTF-8")

5550906

In [13]:
# e43acb *../data/tlds_data_mc4_tw_000.zhonly.json
!sha1sum ../data/tlds_data_mc4_tw_000.zhonly.json

e43acb8fd8e44e8a149e3e25b37ca1734d7dee43 *../data/tlds_data_mc4_tw_000.zhonly.json
