In [None]:
# This notebook tries several tokenization methods for Chinese language to prepare for NLP analysis

In [None]:
import pandas as pd
import numpy as np

# Load the cleaned dataset; the path is relative to the notebook's working directory.
# Later cells read the 'title', 'description' and 'fulltext' columns from this frame.
df = pd.read_parquet('../processed_data/clean_data_final.parquet')

In [None]:
# Extra normalisation of the 'fulltext' column:
#   1. rewrite "p子" -> "骗子" and "p局" -> "骗局";
#   2. strip non-breaking spaces (\xa0) and ideographic spaces (\u3000).
mapping = {
    'p子': '骗子',
    'p局': '骗局',
}
drop_words = r'\xa0|\u3000'

# Both substitutions are regex-based and applied in the same order as before.
df['fulltext'] = (
    df['fulltext']
    .replace(mapping, regex=True)
    .replace(drop_words, '', regex=True)
)

# Tokenization using HanLP

In [None]:
# Processing Data with HanLP
import hanlp
# Bare attribute access: a notebook cell only displays its last expression, so this line shows nothing.
hanlp.pretrained.tok.ALL 
# Cell output: the keys of the ALL mapping, i.e. the identifiers of the available tokenizers.
hanlp.pretrained.tok.ALL.keys() 

In [None]:
# Tokenization
# Load the pretrained tokenizer registered under COARSE_ELECTRA_SMALL_ZH.
tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
# Echo the loaded object as the cell output.
tok

In [None]:
# Preview the 'description' values at row positions 150-160.
for i in range(150, 161):
    print(df['description'].iloc[i])

In [None]:
# Pipeline: split the text into sentences, then tokenize each sentence with `tok`.
HanLP = (
    hanlp.pipeline()
    .append(hanlp.utils.rules.split_sentence)
    .append(tok)
)
# Last stage concatenates the per-sentence token lists into a single flat list.
HanLP.append(lambda sents: sum(sents, []))

In [None]:
# Run the pipeline on the descriptions at row positions 5-10 and print the tokens.
for i in range(5, 11):
    print(HanLP(df['description'].iloc[i]))

In [None]:
# Call the tokenizer directly (no pipeline) on the full texts at row positions 1-4.
for i in range(1, 5):
    print(tok(df['fulltext'].iloc[i]))

In [None]:
# Try the CTB9_TOK_ELECTRA_BASE tokenization model on the same rows (positions 1-4).
tok_test_1 = hanlp.load(hanlp.pretrained.tok.CTB9_TOK_ELECTRA_BASE)
for i in range(1, 5):
    print(tok_test_1(df['fulltext'].iloc[i]))

In [None]:
# CTB9_TOK_ELECTRA_BASE seems to work better than the COARSE one, so I will use the output of tok_test_1 from here on.
# To-do:
# Compare with other tokenization models and select the best one.
# Need to learn how to determine which works the best. 

In [None]:
# Rebuild the sentence-split + tokenize pipeline around tok_test_1.
HanLP_1 = (
    hanlp.pipeline()
    .append(hanlp.utils.rules.split_sentence)
    .append(tok_test_1)
)
# Flatten the per-sentence token lists into one list per input text.
# This stage has to be attached to HanLP_1 itself: attaching it to the HanLP pipeline would
# leave HanLP_1 returning a list of lists and stack an extra flatten stage onto HanLP.
HanLP_1.append(lambda sents: sum(sents, []))

In [None]:
from tqdm.notebook import tqdm

# Tokenize every title with the HanLP_1 pipeline and store the result in 'tok_title'.
# The column starts as a copy of 'title' so that it exists before the cell-by-cell writes.
df['tok_title'] = df['title']
dst_idx = df.columns.get_loc('tok_title')
src_idx = df.columns.get_loc('title')
for i in tqdm(range(len(df))):
    # Read the title outside the try block so the diagnostic print below always refers to
    # the current row and never to an unbound or stale variable.
    title = df.iat[i, src_idx]
    try:
        df.iat[i, dst_idx] = HanLP_1(title)
    except Exception:
        # Report the offending row, then re-raise so the failure is not hidden.
        print(i, title)
        raise

In [None]:
# Inspect the tokenized titles at row positions 150-160.
for i in range(150, 161):
    print(df['tok_title'].iloc[i])

In [None]:
# Tokenize "fulltext" with tok_test_1.
# The pipeline produced a list of lists in each row, which is not wanted, so the tokenizer
# is applied directly here instead of going through the pipeline.
from tqdm.notebook import tqdm
# Registers the progress_apply method used on the next line.
tqdm.pandas()
# One tokenizer call per row; the result is stored in the new 'tok' column.
df['tok'] = df['fulltext'].progress_apply(tok_test_1)

In [None]:
# DataFrame.rename returns a new frame and leaves the original untouched, so the result
# must be assigned back for the 'tok' -> 'han_tok' rename to take effect.
df = df.rename(columns={'tok': 'han_tok'})

# Tokenization using Jieba

In [None]:
# ! pip install jieba

In [None]:
# encoding=utf-8
import jieba

def jieba_tok(text):
    """Segment ``text`` with ``jieba.cut`` and return the tokens as a list."""
    return [token for token in jieba.cut(text)]

# Tokenize every 'fulltext' value with jieba_tok and keep the token lists in 'jieba_tok'.
df['jieba_tok'] = df['fulltext'].apply(jieba_tok)
# Show the first rows as a quick sanity check.
df['jieba_tok'].head()

In [None]:
# Display the full texts that contain '货拉拉'.
df.loc[df['fulltext'].str.contains('货拉拉'), 'fulltext']

In [None]:
# Add these terms to jieba's dictionary, then re-tokenize 'fulltext' with the updated dictionary.
for custom_word in ('运满满', '货车帮', '满帮', '货拉拉'):
    jieba.add_word(custom_word)

df['jieba_tok'] = df['fulltext'].apply(jieba_tok)

In [None]:
# Show the first rows of the re-tokenized column.
df['jieba_tok'].head() 

# Remove stopwords, punctuation, and other noises

In [None]:
# Load the stopword list, one entry per line (surrounding whitespace stripped).
# The encoding is passed explicitly because open() otherwise falls back to the platform
# default, which is not UTF-8 everywhere and would garble or reject Chinese text.
# NOTE(review): assumes cn_stopwords.txt is UTF-8 encoded — confirm.
with open('../Document/cn_stopwords.txt', 'r', encoding='utf-8') as file:
    stopwords = [line.strip() for line in file]
    print(stopwords)

In [None]:
import string
# ASCII punctuation, one character per list entry.
punctuation = list(string.punctuation)

import re
# Written like a regex character class, but this string is never compiled: list(pattern) below
# just splits it into single characters. The '[', ']', '\' and '-' characters are therefore
# list entries themselves rather than regex syntax.
# NOTE(review): the CJK character '一' is part of this string, so the token '一' is treated as noise — confirm this is intended.
pattern = r'[é《》（）〔〕.,·+？！()/\%％。，、；;&"“”：:【】=—-丨~－一～─┤Ｔ*‘’!#$§<>?@\_`{|}﹝﹞¨©°À×÷ˉΣαβπФ‐–―‘’…‰※ⅡⅢⅰⅱⅲⅳⅴⅵ←↑→↓√∥≠≦≧⊙①②③④⑤⑥⑦⑧⑨⑩⑴⑵⒋┤╱■□▲►◆◇○◎●★☟〇〈〉「」『-]'
sign = list(pattern)

# Everything to be filtered out: stopwords, ASCII punctuation and the extra symbols above.
noises = stopwords + punctuation + sign

import re
# Matches tokens made only of digits with at most one '.'; note that it also matches the
# empty string and a lone '.'.
number_re = re.compile(r'^[0-9]*(\.)?[0-9]*$')
# Quick sanity checks for the number pattern.
assert number_re.match('1.2')
assert not number_re.match('1.2x')

def remove_noises(word_list):
    """Return the tokens of ``word_list`` that are neither noise nor numeric.

    A token is dropped when it appears in the module-level ``noises`` collection or
    when ``number_re`` matches it. The order of the remaining tokens is preserved.

    Args:
        word_list: iterable of string tokens.

    Returns:
        A new list containing the tokens that were kept.
    """
    # Hash-based lookup instead of scanning the ``noises`` list once per token.
    # The set is built per call so later changes to ``noises`` are always honoured.
    noise_lookup = set(noises)
    return [
        word for word in word_list
        if word not in noise_lookup and not number_re.match(word)
    ]

from tqdm.notebook import tqdm
# Registers the progress_apply method used below.
tqdm.pandas()

# Filter every jieba token list and keep the result in 'clean_tok'.
df['clean_tok'] = df['jieba_tok'].progress_apply(remove_noises)
# Cell output: True if any row ended up as None instead of a list.
any(i is None for i in df['clean_tok'])

In [None]:
# Remove more chaos by checking word frequency
from collections import Counter
from itertools import chain

# Flatten all token lists in a single linear pass. sum(lists, []) copies the growing
# accumulator on every addition, which is quadratic in the total number of tokens.
words_des = list(chain.from_iterable(df['clean_tok']))
ct = Counter(words_des)
ct

In [None]:
# Leftover punctuation-like tokens to filter out.
# Every entry needs its own trailing comma: two adjacent string literals are silently
# concatenated by Python, which would merge '。。' and '！！！！' into one bogus entry.
chaos = [
    '【', '】', '...', '）', '（', '#', '[', ']', '！！！', ',', '——', '-', '！！', '*', '.', '……', '+',
    '!', ':', '/', '--', '|', '(', '~', '…', '—', ')', '～', '？？？', ';', '·', '？？', '。。。', '。。',
    '！！！！', '---', '「', '」', 'quot', '..', '●', '**',
]


def remove_chaos(word_list):
    """Return ``word_list`` without the tokens listed in ``chaos``.

    Args:
        word_list: iterable of string tokens.

    Returns:
        A new list with the remaining tokens in their original order.
    """
    return [word for word in word_list if word not in chaos]

# Apply remove_chaos to the 'filtered_title' and 'filtered_description' columns.
# NOTE(review): neither column is created anywhere in the visible notebook, so this raises
# KeyError unless they already exist in the loaded parquet file — confirm. The 'clean_tok'
# column built earlier is not passed through remove_chaos here.
df['filtered_title'] = df['filtered_title'].progress_apply(remove_chaos)
df['filtered_description'] = df['filtered_description'].progress_apply(remove_chaos)
# Cell output: True if any row ended up as None instead of a list.
any(i is None for i in df['filtered_title'])

In [None]:
# Persist the frame, including the token columns added in this notebook.
df.to_parquet('../processed_data/tokenized_data.parquet')

# Generate document-term matrix

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
def _identity_analyzer(tokens):
    """Return an already-tokenized document unchanged."""
    return tokens


# Each row of df['clean_tok'] is already a list of tokens, so the vectorizer gets an identity
# analyzer and receives one token list per document. Flattening all rows into one long list
# would make every single token its own "document", and the default analyzer would
# re-tokenize the text (lower-casing it and dropping one-character tokens).
vectorizer = CountVectorizer(analyzer=_identity_analyzer)

# Rows = documents (one per DataFrame row), columns = vocabulary terms.
dtm = vectorizer.fit_transform(df['clean_tok'])

In [None]:
def to_pickle(obj, file_name):
    """Serialize ``obj`` to ``file_name`` using the highest pickle protocol.

    Args:
        obj: any picklable object.
        file_name: path of the file to create or overwrite.
    """
    import pickle

    with open(file_name, 'wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)

# Store the sparse matrix returned by fit_transform for later reuse.
to_pickle(dtm, '../processed_data/dtm_sparse_mat.pkl')

In [None]:
# Densify only the first 20 rows of the matrix for inspection, labelling the columns with
# the vectorizer's vocabulary.
dtm_sampled = pd.DataFrame(dtm[:20].toarray(), columns=vectorizer.get_feature_names_out())
dtm_sampled