In [1]:
# Import necessary libraries
from bs4 import BeautifulSoup
import re
import gensim
import jieba
import pandas as pd
import warnings
import string

# Silence DeprecationWarning noise for the rest of the session
warnings.simplefilter("ignore", category=DeprecationWarning)

# Input locations, grouped so they are easy to repoint
MERGED_POSTS_CSV = 'LIHKG_merged.csv'
USER_DICT_PATH = "../dict/dict.txt"

# Read the merged dataset ("utf-8-sig" tolerates a leading byte-order mark).
# NOTE(review): the previews show an 'Unnamed: 0' column, which usually means the
# CSV was saved together with its index -- confirm whether index_col=0 is wanted.
df = pd.read_csv(MERGED_POSTS_CSV, encoding="utf-8-sig")

# Register the custom vocabulary with jieba before any segmentation happens
jieba.load_userdict(USER_DICT_PATH)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/6k/lvj277mx6834z6rvr_nxth_00000gn/T/jieba.cache
Loading model cost 0.544 seconds.
Prefix dict has been built successfully.


In [2]:
# Define functions for cleaning and tokenization
def clean(sentences):
    """Normalise raw post text ahead of word segmentation.

    Each entry goes through these steps, in order:

    1. a leading ``d-d-d h:m(n.n KB|MB)`` prefix is removed (presumably an
       attachment stamp -- confirm against the raw data);
    2. the text is lower-cased;
    3. URLs starting with ``http`` are dropped;
    4. HTML markup is reduced to its text content;
    5. ASCII punctuation and common full-width punctuation is deleted;
    6. a few boilerplate fragments are removed;
    7. whitespace runs collapse to one space and the ends are trimmed.

    Args:
        sentences: Iterable of raw strings (for example a DataFrame column).
            Entries that are not strings -- such as the NaN pandas uses for an
            empty CSV cell -- become ``''`` instead of raising ``TypeError``,
            so the result stays aligned with the input rows.

    Returns:
        list[str]: One cleaned string per input entry, in input order.
    """
    # Everything below is loop-invariant, so it is built once per call.
    header_re = re.compile(r"^\d{1,2}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}\(\d+\.\d+\s(?:KB|MB)\)\s*")
    url_re = re.compile(r'http\S+')  # also matches https:// links
    whitespace_re = re.compile(r'\s+')  # raw string: '\s' is not a valid str escape
    punctuation_table = str.maketrans('', '', string.punctuation + '，。！？；：‘’“”（）《》【】、')
    # Removed as plain substrings, in this order ('已pm' has to precede 'pm').
    # NOTE(review): being substring matches, these also hit longer Latin words
    # (e.g. the "pm" inside "rpm") -- confirm that is acceptable.
    noise_fragments = ('emoji', '已pm', 'pm', 'thx', 'pls', 'kb', 'mb')

    cleaned = []
    for sent in sentences:
        if not isinstance(sent, str):
            cleaned.append('')
            continue
        sent = header_re.sub('', sent)
        sent = sent.lower()
        sent = url_re.sub('', sent)
        sent = BeautifulSoup(sent, 'html.parser').get_text(separator=' ')
        sent = sent.translate(punctuation_table)
        for fragment in noise_fragments:
            sent = sent.replace(fragment, '')
        sent = whitespace_re.sub(' ', sent).strip()
        cleaned.append(sent)
    return cleaned

# Clean the merged title + message text and keep it next to the raw column
cleaned_posts = clean(df['merged'])
df['clean'] = cleaned_posts
df.head()

Unnamed: 0,thread_id,title,msg,merged,clean
0,10167,窮人真係唔撚好生仔,窮仲要生三四件\n樓主屋企幾件,窮人真係唔撚好生仔\n窮仲要生三四件\n樓主屋企幾件,窮人真係唔撚好生仔 窮仲要生三四件 樓主屋企幾件
1,21387,文思怡生仔冇屎忽,搭車諗住睇下片又見到你 屙屎想聽下古典音樂放鬆䏦約肌見到又係你搞到我當堂縮一縮 連睇肥媽...,文思怡生仔冇屎忽\n搭車諗住睇下片又見到你 屙屎想聽下古典音樂放鬆䏦約肌見到又係你搞到我當...,文思怡生仔冇屎忽 搭車諗住睇下片又見到你 屙屎想聽下古典音樂放鬆䏦約肌見到又係你搞到我當堂縮...
2,23740,想養兒防老嘅唔該生多個啦~,你老味生得一個想點呀?\n後生就兩人搵錢三個人用\n退左休就一人搵錢三個人用\n好撚精鳩明囉...,想養兒防老嘅唔該生多個啦~\n你老味生得一個想點呀?\n後生就兩人搵錢三個人用\n退左休就一...,想養兒防老嘅唔該生多個啦 你老味生得一個想點呀 後生就兩人搵錢三個人用 退左休就一人搵錢三個...
3,38885,★支那人d傳宗接代、血脈論思想 究竟累死幾多人？,垃圾支那文化\n#fuck#\n14億人\n好少聽鬼佬話緊張 傳宗接代，擔幡買水 \n\n係...,★支那人d傳宗接代、血脈論思想 究竟累死幾多人？\n垃圾支那文化\n#fuck#\n14億人...,★支那人d傳宗接代血脈論思想 究竟累死幾多人 垃圾支那文化 fuck 14億人 好少聽鬼佬話...
4,49199,失身夜後會有幾多個落仔post?,爸爸媽媽唔好唔要我呀\n#:D#b,失身夜後會有幾多個落仔post?\n爸爸媽媽唔好唔要我呀\n#:D#b,失身夜後會有幾多個落仔post 爸爸媽媽唔好唔要我呀 db


In [3]:
# Stop-word sets keyed by file path, so each list is read from disk only once
# rather than once per segmented sentence.
_STOPWORD_CACHE = {}


def _load_stopwords(path):
    """Return the stripped lines of ``path`` as a frozenset, cached after the first read."""
    if path not in _STOPWORD_CACHE:
        with open(path, 'r', encoding='utf-8') as handle:
            _STOPWORD_CACHE[path] = frozenset(line.strip() for line in handle)
    return _STOPWORD_CACHE[path]


def segment(sentence, stopwords=None):
    """Segment ``sentence`` with jieba and keep only non-stop-word Chinese tokens.

    Args:
        sentence: Text to segment; surrounding whitespace is ignored.
        stopwords: Optional container of tokens to discard. When omitted, the
            lines of ``../dict/canton.txt`` are loaded (once) and used.

    Returns:
        str: The kept tokens, each followed by one space (so a non-empty result
        ends with a trailing space), or ``''`` when nothing is kept.
    """
    if stopwords is None:
        stopwords = _load_stopwords('../dict/canton.txt')

    kept = [
        word
        for word in jieba.cut(sentence.strip())
        # The range test is a lexicographic string comparison, so it is decided
        # by the token's first character (U+4E00..U+9FA5).
        # NOTE(review): tokens starting with ideographs outside that block
        # (CJK extension blocks) are dropped as well -- confirm this is intended.
        if word not in stopwords and word.strip() and '\u4e00' <= word <= '\u9fa5'
    ]
    return ''.join(word + ' ' for word in kept)

# Segment every cleaned post; results are stored row-for-row in 'tokenz'
df['tokenz'] = list(map(segment, df['clean']))
df.head()

Unnamed: 0,thread_id,title,msg,merged,clean,tokenz
0,10167,窮人真係唔撚好生仔,窮仲要生三四件\n樓主屋企幾件,窮人真係唔撚好生仔\n窮仲要生三四件\n樓主屋企幾件,窮人真係唔撚好生仔 窮仲要生三四件 樓主屋企幾件,窮人 唔撚好 生仔 窮 仲要 生三 四件 樓主 屋企 幾件
1,21387,文思怡生仔冇屎忽,搭車諗住睇下片又見到你 屙屎想聽下古典音樂放鬆䏦約肌見到又係你搞到我當堂縮一縮 連睇肥媽...,文思怡生仔冇屎忽\n搭車諗住睇下片又見到你 屙屎想聽下古典音樂放鬆䏦約肌見到又係你搞到我當...,文思怡生仔冇屎忽 搭車諗住睇下片又見到你 屙屎想聽下古典音樂放鬆䏦約肌見到又係你搞到我當堂縮...,文思 怡 生仔冇屎忽 搭車 諗住睇 下片 見到 屙屎 聽下 古典音樂 放鬆 約 肌 見到 又...
2,23740,想養兒防老嘅唔該生多個啦~,你老味生得一個想點呀?\n後生就兩人搵錢三個人用\n退左休就一人搵錢三個人用\n好撚精鳩明囉...,想養兒防老嘅唔該生多個啦~\n你老味生得一個想點呀?\n後生就兩人搵錢三個人用\n退左休就一...,想養兒防老嘅唔該生多個啦 你老味生得一個想點呀 後生就兩人搵錢三個人用 退左休就一人搵錢三個...,養兒防老 唔該 生 多個 你老味 生得 點呀 後生 兩人 搵錢 三個 人用 退左 休 一人 ...
3,38885,★支那人d傳宗接代、血脈論思想 究竟累死幾多人？,垃圾支那文化\n#fuck#\n14億人\n好少聽鬼佬話緊張 傳宗接代，擔幡買水 \n\n係...,★支那人d傳宗接代、血脈論思想 究竟累死幾多人？\n垃圾支那文化\n#fuck#\n14億人...,★支那人d傳宗接代血脈論思想 究竟累死幾多人 垃圾支那文化 fuck 14億人 好少聽鬼佬話...,支那人 傳宗接代 血脈 思想 究竟 累死 幾多人 垃圾 支那 文化 好少 聽 鬼佬 話緊 張...
4,49199,失身夜後會有幾多個落仔post?,爸爸媽媽唔好唔要我呀\n#:D#b,失身夜後會有幾多個落仔post?\n爸爸媽媽唔好唔要我呀\n#:D#b,失身夜後會有幾多個落仔post 爸爸媽媽唔好唔要我呀 db,失身 夜 後會 多個 落仔 爸爸媽媽 唔好 唔要我


In [4]:
# Define a function to process text data
def process(texts, min_count=10, threshold=10):
    """Drop stop words from segmented posts and merge frequent collocations.

    Args:
        texts: Iterable of documents, each a string of whitespace-separated
            tokens (the notebook passes the output of ``segment``).
        min_count: ``min_count`` given to both gensim ``Phrases`` models.
        threshold: ``threshold`` given to both gensim ``Phrases`` models.

    Returns:
        list[list[str]]: For every document, its remaining tokens after stop
        words are removed and the bigram and trigram phrase models are applied.
    """
    def read_words(path):
        # `with` closes the file; a set makes the membership tests below O(1).
        with open(path, 'r', encoding='UTF-8') as handle:
            return {line.strip() for line in handle}

    # General stop words plus the Cantonese-specific list.
    excluded = read_words('../dict/stopwords.txt') | read_words('../dict/canton.txt')

    def remove_words(doc):
        return [word for word in gensim.utils.simple_preprocess(str(doc), min_len=2, deacc=True)
                if word not in excluded]

    # Phrases expects every sentence to be a list of tokens. Given the raw
    # strings it would iterate single characters, so tokenise before training
    # and train on the same token lists the models are later applied to.
    tokenized = [remove_words(doc) for doc in texts]

    bigram = gensim.models.Phrases(tokenized, min_count=min_count, threshold=threshold)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram = gensim.models.Phrases(bigram_mod[tokenized], min_count=min_count, threshold=threshold)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    merged = [list(trigram_mod[bigram_mod[doc]]) for doc in tokenized]

    # Defensive pass kept from the original: drop anything that starts with a
    # non-word character (presumably redundant after simple_preprocess).
    return [[word for token in doc for word in token.split() if not re.match(r'[^\w\s]', word)]
            for doc in merged]

# Replace each row's segmented string with its processed token list
processed_docs = process(df['tokenz'])
df['tokenz'] = processed_docs

# Generator that strips punctuation from each string (no splitting happens here)
def sent_to_words_space(sentences):
    """Lazily yield each input string with every non-word, non-whitespace character removed."""
    strip_punct = re.compile(r'[^\w\s]').sub
    yield from (strip_punct('', text) for text in sentences)

# Collapse each row's token list into a single space-separated string.
# Joining directly avoids round-tripping through the list's repr
# ("['a', 'b']") and regex-stripping the brackets, quotes and commas.
# Rows that already hold a string are left as they are.
msg_space_words = [
    tokens if isinstance(tokens, str) else ' '.join(tokens)
    for tokens in df['tokenz']
]

# Store the joined strings (one per row) back in the 'tokenz' column
df['tokenz'] = msg_space_words

# Preview the final DataFrame
df.head()

Unnamed: 0,thread_id,title,msg,merged,clean,tokenz
0,10167,窮人真係唔撚好生仔,窮仲要生三四件\n樓主屋企幾件,窮人真係唔撚好生仔\n窮仲要生三四件\n樓主屋企幾件,窮人真係唔撚好生仔 窮仲要生三四件 樓主屋企幾件,窮人 唔撚好 生仔 生三 四件 屋企 幾件
1,21387,文思怡生仔冇屎忽,搭車諗住睇下片又見到你 屙屎想聽下古典音樂放鬆䏦約肌見到又係你搞到我當堂縮一縮 連睇肥媽...,文思怡生仔冇屎忽\n搭車諗住睇下片又見到你 屙屎想聽下古典音樂放鬆䏦約肌見到又係你搞到我當...,文思怡生仔冇屎忽 搭車諗住睇下片又見到你 屙屎想聽下古典音樂放鬆䏦約肌見到又係你搞到我當堂縮...,文思 生仔冇屎忽 搭車 諗住睇 下片 屙屎 聽下 古典音樂 放鬆 當堂 縮一縮 連睇 肥媽 ...
2,23740,想養兒防老嘅唔該生多個啦~,你老味生得一個想點呀?\n後生就兩人搵錢三個人用\n退左休就一人搵錢三個人用\n好撚精鳩明囉...,想養兒防老嘅唔該生多個啦~\n你老味生得一個想點呀?\n後生就兩人搵錢三個人用\n退左休就一...,想養兒防老嘅唔該生多個啦 你老味生得一個想點呀 後生就兩人搵錢三個人用 退左休就一人搵錢三個...,養兒防老 你老味 生得 點呀 後生 兩人 搵錢 人用 退左 搵錢 人用 精鳩明 佩服 六體 ...
3,38885,★支那人d傳宗接代、血脈論思想 究竟累死幾多人？,垃圾支那文化\n#fuck#\n14億人\n好少聽鬼佬話緊張 傳宗接代，擔幡買水 \n\n係...,★支那人d傳宗接代、血脈論思想 究竟累死幾多人？\n垃圾支那文化\n#fuck#\n14億人...,★支那人d傳宗接代血脈論思想 究竟累死幾多人 垃圾支那文化 fuck 14億人 好少聽鬼佬話...,支那人 傳宗接代 血脈 思想 累死 幾多人 垃圾 支那 文化 好少 鬼佬 話緊 傳宗接代 擔...
4,49199,失身夜後會有幾多個落仔post?,爸爸媽媽唔好唔要我呀\n#:D#b,失身夜後會有幾多個落仔post?\n爸爸媽媽唔好唔要我呀\n#:D#b,失身夜後會有幾多個落仔post 爸爸媽媽唔好唔要我呀 db,失身 後會 落仔 爸爸媽媽 唔要我


In [5]:
# Persist the processed frame; "utf-8-sig" writes a byte-order mark at the start
TOKEN_CSV_PATH = "token.csv"
df.to_csv(TOKEN_CSV_PATH, index=False, encoding='utf-8-sig')