In [1]:
# Import necessary libraries
from bs4 import BeautifulSoup
import re
import gensim
import jieba
import pandas as pd
import warnings
import string

# Silence every DeprecationWarning for the rest of the session
warnings.simplefilter("ignore", category=DeprecationWarning)

# Input locations used by this cell
DATASET_CSV = 'DISCUSS_merged.csv'
USER_DICT_PATH = "../dict/dict.txt"

# Read the dataset; "utf-8-sig" tolerates a leading byte-order mark
df = pd.read_csv(DATASET_CSV, encoding="utf-8-sig")

# Register the user-defined dictionary with jieba
jieba.load_userdict(USER_DICT_PATH)

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/6k/lvj277mx6834z6rvr_nxth_00000gn/T/jieba.cache
Loading model cost 0.657 seconds.
Prefix dict has been built successfully.


In [2]:
# Define the text-cleaning function (tokenization is defined in a later cell)
def clean(sentences):
    """Normalise raw texts into lower-case, punctuation-free strings.

    Each item goes through these steps, in order:

    1. strip a leading ``d-d-d d:d(<size> KB|MB)`` header,
    2. lower-case,
    3. drop URLs (anything starting with ``http`` up to the next whitespace),
    4. strip HTML markup, keeping the text separated by spaces,
    5. delete ASCII and common full-width Chinese punctuation,
    6. delete a fixed list of noise fragments,
    7. collapse runs of whitespace and trim both ends.

    Note that step 6 is a plain substring removal, so fragments such as
    ``kb`` / ``mb`` / ``pm`` are also removed from inside longer Latin words.

    Args:
        sentences: Iterable of raw texts. Missing values (``None`` / NaN)
            are treated as empty strings; any other non-string value is
            converted with ``str`` before cleaning.

    Returns:
        list[str]: One cleaned string per input item, in input order.
    """
    # Loop-invariant helpers are built once instead of once per text.
    header_re = re.compile(r"^\d{1,2}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}\(\d+\.\d+\s(?:KB|MB)\)\s*")
    # `http\S+` already covers https links, so no second alternative is needed.
    url_re = re.compile(r'http\S+')
    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    whitespace_re = re.compile(r'\s+')
    punctuation_table = str.maketrans('', '', string.punctuation + '，。！？；：‘’“”（）《》【】、')
    # Order matters: '已pm' must be removed before the shorter 'pm'.
    noise_fragments = ('emoji', '已pm', 'pm', 'thx', 'pls', 'kb', 'mb', '向左走', '向右走', '死死')

    cleaned = []
    for sent in sentences:
        if not isinstance(sent, str):
            # Empty cells in a CSV arrive as NaN; do not let them crash the regexes.
            is_missing = sent is None or (pd.api.types.is_scalar(sent) and pd.isna(sent))
            sent = '' if is_missing else str(sent)

        sent = header_re.sub('', sent)
        sent = sent.lower()
        sent = url_re.sub('', sent)
        sent = BeautifulSoup(sent, 'html.parser').get_text(separator=' ')
        sent = sent.translate(punctuation_table)
        for fragment in noise_fragments:
            sent = sent.replace(fragment, '')
        sent = whitespace_re.sub(' ', sent).strip()
        cleaned.append(sent)
    return cleaned

# Store the normalised text next to the raw column and preview the first rows
cleaned_texts = clean(df['merged'])
df['clean'] = cleaned_texts
df.head()

Unnamed: 0,thread_id,title,msg,merged,clean
0,27171152,"我個40歲同事教識我：「窮人真係唔好生仔」,我個40歲同事教識我：「窮人真係唔好生仔」",40歲同事應該收入大概兩萬多，但生左個仔之後佢連lunch嗌杯凍檸茶都唔敢，所有支出都要收歸...,"我個40歲同事教識我：「窮人真係唔好生仔」,我個40歲同事教識我：「窮人真係唔好生仔」\n4...",我個40歲同事教識我「窮人真係唔好生仔」我個40歲同事教識我「窮人真係唔好生仔」 40歲同事...
1,27171152,我個40歲同事教識我：「窮人真係唔好生仔」,40歲同事應該收入大概兩萬多，但生左個仔之後佢連lunch嗌杯凍檸茶都唔敢，所有支出都要收歸...,我個40歲同事教識我：「窮人真係唔好生仔」\n40歲同事應該收入大概兩萬多，但生左個仔之後佢...,我個40歲同事教識我「窮人真係唔好生仔」 40歲同事應該收入大概兩萬多但生左個仔之後佢連lu...
2,27154661,都唔明咁多窮L40歲仲PATPAT痕想生仔女,"如果你地有錢都一早生左啦係咪,無錢的你地好心你地就唔好為左怕寂寞害下一代輸在起跑線啦,生仔女...","都唔明咁多窮L40歲仲PATPAT痕想生仔女\n如果你地有錢都一早生左啦係咪,無錢的你地好心...",都唔明咁多窮l40歲仲patpat痕想生仔女 如果你地有錢都一早生左啦係咪無錢的你地好心你地...
3,27154849,鳳尾短鯛生bb！,"養咗唔夠兩星期琴晚發現,下一步應該點做\n無\nthanks, 如果 孵左出嚟上網睇話用煮熟...","鳳尾短鯛生bb！\n養咗唔夠兩星期琴晚發現,下一步應該點做\n無\nthanks, 如果 孵...",鳳尾短鯛生bb 養咗唔夠兩星期琴晚發現下一步應該點做 無 thanks 如果 孵左出嚟上網睇...
4,27169597,食開素會唔會對陀緊嘅BB有影響?,"食左素2年,而家有左4周,,會唔會BB有影響?\n最好做多d 運動,再食下d補充品\n有咩運...","食開素會唔會對陀緊嘅BB有影響?\n食左素2年,而家有左4周,,會唔會BB有影響?\n最好做...",食開素會唔會對陀緊嘅bb有影響 食左素2年而家有左4周會唔會bb有影響 最好做多d 運動再食...


In [3]:
# Stopword sets keyed by file path, so each file is read only once per kernel
# session (re-running this cell resets the cache and re-reads the file).
_STOPWORD_CACHE = {}


def _load_stopwords(path='../dict/canton.txt'):
    """Return the stripped lines of a one-word-per-line file as a frozenset."""
    if path not in _STOPWORD_CACHE:
        with open(path, 'r', encoding='utf-8') as handle:
            _STOPWORD_CACHE[path] = frozenset(line.strip() for line in handle)
    return _STOPWORD_CACHE[path]


def segment(sentence, stopwords=None):
    """Segment a text with jieba and keep only non-stopword Chinese tokens.

    Args:
        sentence: Text to segment.
        stopwords: Optional collection of tokens to drop. When omitted, the
            list in ``../dict/canton.txt`` is loaded (once) and used.

    Returns:
        str: The kept tokens, each followed by a single space (so a non-empty
        result ends with a trailing space; an empty result is ``''``).
    """
    if stopwords is None:
        stopwords = _load_stopwords()

    kept = [
        word
        for word in jieba.cut(sentence.strip())
        if word not in stopwords
        and word.strip()
        # String comparison is lexicographic, so this range test is decided
        # by the token's leading character(s), not by every character in it.
        and '\u4e00' <= word <= '\u9fa5'
    ]
    return ''.join(word + ' ' for word in kept)

# Segment every cleaned text and preview the first rows
df['tokenz'] = list(map(segment, df['clean']))
df.head()

Unnamed: 0,thread_id,title,msg,merged,clean,tokenz
0,27171152,"我個40歲同事教識我：「窮人真係唔好生仔」,我個40歲同事教識我：「窮人真係唔好生仔」",40歲同事應該收入大概兩萬多，但生左個仔之後佢連lunch嗌杯凍檸茶都唔敢，所有支出都要收歸...,"我個40歲同事教識我：「窮人真係唔好生仔」,我個40歲同事教識我：「窮人真係唔好生仔」\n4...",我個40歲同事教識我「窮人真係唔好生仔」我個40歲同事教識我「窮人真係唔好生仔」 40歲同事...,我個 歲 同事 教識 窮人 好生 仔 我個 歲 同事 教識 窮人 好生 仔 歲 同事 收入 ...
1,27171152,我個40歲同事教識我：「窮人真係唔好生仔」,40歲同事應該收入大概兩萬多，但生左個仔之後佢連lunch嗌杯凍檸茶都唔敢，所有支出都要收歸...,我個40歲同事教識我：「窮人真係唔好生仔」\n40歲同事應該收入大概兩萬多，但生左個仔之後佢...,我個40歲同事教識我「窮人真係唔好生仔」 40歲同事應該收入大概兩萬多但生左個仔之後佢連lu...,我個 歲 同事 教識 窮人 好生 仔 歲 同事 收入 大概 兩萬多 生左 個仔 嗌 杯 凍檸...
2,27154661,都唔明咁多窮L40歲仲PATPAT痕想生仔女,"如果你地有錢都一早生左啦係咪,無錢的你地好心你地就唔好為左怕寂寞害下一代輸在起跑線啦,生仔女...","都唔明咁多窮L40歲仲PATPAT痕想生仔女\n如果你地有錢都一早生左啦係咪,無錢的你地好心...",都唔明咁多窮l40歲仲patpat痕想生仔女 如果你地有錢都一早生左啦係咪無錢的你地好心你地...,唔明 咁多 窮 歲仲 痕想 生仔 女 有錢 一早 生左 無錢 好心你 唔好 為左 怕 寂寞 ...
3,27154849,鳳尾短鯛生bb！,"養咗唔夠兩星期琴晚發現,下一步應該點做\n無\nthanks, 如果 孵左出嚟上網睇話用煮熟...","鳳尾短鯛生bb！\n養咗唔夠兩星期琴晚發現,下一步應該點做\n無\nthanks, 如果 孵...",鳳尾短鯛生bb 養咗唔夠兩星期琴晚發現下一步應該點做 無 thanks 如果 孵左出嚟上網睇...,鳳尾 短 鯛生 養 唔夠 兩 星期 琴晚 發現 下一步 點做 孵 左 出嚟 上網 煮熟 蛋黃...
4,27169597,食開素會唔會對陀緊嘅BB有影響?,"食左素2年,而家有左4周,,會唔會BB有影響?\n最好做多d 運動,再食下d補充品\n有咩運...","食開素會唔會對陀緊嘅BB有影響?\n食左素2年,而家有左4周,,會唔會BB有影響?\n最好做...",食開素會唔會對陀緊嘅bb有影響 食左素2年而家有左4周會唔會bb有影響 最好做多d 運動再食...,食開 素 會唔 會對 陀 緊 有影響 食左 素 有左 周 會唔 有影響 最好做 運動 食下 ...


In [4]:
# Define a function to process text data
def process(texts):
    """Filter tokens and merge frequent bigrams/trigrams into single tokens.

    Each document is tokenised with ``gensim.utils.simple_preprocess``
    (tokens shorter than 2 characters are dropped), tokens listed in
    ``../dict/stopwords.txt`` or ``../dict/canton.txt`` are removed, and
    bigram/trigram phrase models are then trained on and applied to the
    resulting token lists.

    Args:
        texts: Iterable of documents. It is iterated more than once, so it
            must be re-iterable (e.g. a list or a pandas Series). Each
            document is converted with ``str`` before tokenisation.

    Returns:
        list[list[str]]: One token list per input document, in input order.
    """
    def read_wordlist(path):
        # `with` closes the file; a set gives O(1) membership tests.
        with open(path, 'r', encoding='UTF-8') as handle:
            return {line.strip() for line in handle}

    # Load stopwords and Cantonese-specific words
    excluded = read_wordlist('../dict/stopwords.txt') | read_wordlist('../dict/canton.txt')

    def remove_words(doc):
        """Tokenise one document and drop excluded tokens."""
        return [word for word in gensim.utils.simple_preprocess(str(doc), min_len=2, deacc=True)
                if word not in excluded]

    # Tokenise first: Phrases expects an iterable of token lists. Training it
    # directly on the raw strings would make it iterate over characters, so
    # the learned phrases could never match the tokens they are applied to.
    docs = [remove_words(doc) for doc in texts]

    # Build bigram and trigram models on the same token lists they are applied to
    bigram = gensim.models.Phrases(docs, min_count=10, threshold=10)
    trigram = gensim.models.Phrases(bigram[docs], min_count=10, threshold=10)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    # Merge detected phrases, then drop any token that starts with a
    # non-word, non-space character.
    merged = [list(trigram_mod[bigram_mod[doc]]) for doc in docs]
    return [[word for token in doc for word in token.split() if not re.match(r'[^\w\s]', word)]
            for doc in merged]

# Run the token-processing step on the 'tokenz' column and store its result there
processed_docs = process(df['tokenz'])
df['tokenz'] = processed_docs

# Define a generator that removes punctuation from each string (it does not split the strings into words)
def sent_to_words_space(sentences):
    """Lazily yield each input string with punctuation removed.

    Every character that is neither a word character nor whitespace is
    deleted; the strings are not split into words.

    Args:
        sentences: Iterable of strings.

    Yields:
        str: The corresponding input string without punctuation.
    """
    strip_punctuation = re.compile(r'[^\w\s]').sub
    for text in sentences:
        yield strip_punctuation('', text)

# Take the string form of every 'tokenz' value as a plain Python list of strings
msg_space = df['tokenz'].astype(str).tolist()

# Strip punctuation from each of those strings (for a stringified token list
# this removes the brackets, quotes and commas, leaving space-separated tokens)
msg_space_words = [*sent_to_words_space(msg_space)]

# Store the resulting strings (one per row) back in the 'tokenz' column
df['tokenz'] = msg_space_words

# Preview the first rows of the final DataFrame
df.head()

Unnamed: 0,thread_id,title,msg,merged,clean,tokenz
0,27171152,"我個40歲同事教識我：「窮人真係唔好生仔」,我個40歲同事教識我：「窮人真係唔好生仔」",40歲同事應該收入大概兩萬多，但生左個仔之後佢連lunch嗌杯凍檸茶都唔敢，所有支出都要收歸...,"我個40歲同事教識我：「窮人真係唔好生仔」,我個40歲同事教識我：「窮人真係唔好生仔」\n4...",我個40歲同事教識我「窮人真係唔好生仔」我個40歲同事教識我「窮人真係唔好生仔」 40歲同事...,同事 教識 窮人 好生 同事 教識 窮人 好生 同事 收入 兩萬多 生左 個仔 凍檸茶 支出...
1,27171152,我個40歲同事教識我：「窮人真係唔好生仔」,40歲同事應該收入大概兩萬多，但生左個仔之後佢連lunch嗌杯凍檸茶都唔敢，所有支出都要收歸...,我個40歲同事教識我：「窮人真係唔好生仔」\n40歲同事應該收入大概兩萬多，但生左個仔之後佢...,我個40歲同事教識我「窮人真係唔好生仔」 40歲同事應該收入大概兩萬多但生左個仔之後佢連lu...,同事 教識 窮人 好生 同事 收入 兩萬多 生左 個仔 凍檸茶 支出 收歸 家有 見到佢 肉...
2,27154661,都唔明咁多窮L40歲仲PATPAT痕想生仔女,"如果你地有錢都一早生左啦係咪,無錢的你地好心你地就唔好為左怕寂寞害下一代輸在起跑線啦,生仔女...","都唔明咁多窮L40歲仲PATPAT痕想生仔女\n如果你地有錢都一早生左啦係咪,無錢的你地好心...",都唔明咁多窮l40歲仲patpat痕想生仔女 如果你地有錢都一早生左啦係咪無錢的你地好心你地...,唔明 歲仲 痕想 生仔 有錢 生左 無錢 好心你 為左 寂寞 下一代 輸在 起跑線 生仔 呢...
3,27154849,鳳尾短鯛生bb！,"養咗唔夠兩星期琴晚發現,下一步應該點做\n無\nthanks, 如果 孵左出嚟上網睇話用煮熟...","鳳尾短鯛生bb！\n養咗唔夠兩星期琴晚發現,下一步應該點做\n無\nthanks, 如果 孵...",鳳尾短鯛生bb 養咗唔夠兩星期琴晚發現下一步應該點做 無 thanks 如果 孵左出嚟上網睇...,鳳尾 鯛生 下一步 上網 煮熟 蛋黃 恭喜 影多 幾張 恭喜 野生 鳳尾 失敗 故事 早晨 ...
4,27169597,食開素會唔會對陀緊嘅BB有影響?,"食左素2年,而家有左4周,,會唔會BB有影響?\n最好做多d 運動,再食下d補充品\n有咩運...","食開素會唔會對陀緊嘅BB有影響?\n食左素2年,而家有左4周,,會唔會BB有影響?\n最好做...",食開素會唔會對陀緊嘅bb有影響 食左素2年而家有左4周會唔會bb有影響 最好做多d 運動再食...,食開 會對 有影響 食左 有影響 最好做 運動 食下 補充品 運動 孕婦 補充品 食邊 行多...


In [6]:
# Write the processed frame to disk without the index column
df.to_csv(path_or_buf="token.csv", index=False, encoding='utf-8-sig')