In [None]:
# Import necessary libraries
from bs4 import BeautifulSoup
import re
import gensim
import jieba
import pandas as pd
import warnings
import string

# Silence every DeprecationWarning for the rest of the session.
# Note: this also hides deprecation notices raised by this notebook's own code.
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Load the dataset into the global DataFrame `df`; later cells read its 'merged' column.
# "utf-8-sig" decodes UTF-8 and drops a leading byte-order mark if one is present.
df = pd.read_csv('GOLDEN_merged.csv', encoding="utf-8-sig")

# Load the user-defined dictionary into jieba before any segmentation is done.
jieba.load_userdict("../dict/dict.txt")

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/6k/lvj277mx6834z6rvr_nxth_00000gn/T/jieba.cache
Loading model cost 0.568 seconds.
Prefix dict has been built successfully.


In [None]:
# Define functions for cleaning and tokenization
def clean(sentences):
    """Normalise raw texts ahead of tokenization.

    Each entry goes through these steps, in order:

    1. strip a leading prefix shaped like ``d-d-d d:d(n.n KB|MB)``
       (looks like a timestamp plus a file size -- presumably attachment
       metadata; confirm against the data);
    2. lowercase;
    3. delete every run of non-space characters that starts with ``http``;
    4. drop HTML markup, keeping the text content;
    5. delete ASCII and common full-width CJK punctuation;
    6. delete a fixed list of noise substrings;
    7. collapse whitespace runs to one space and trim both ends.

    Args:
        sentences: Iterable of raw texts. ``None`` and float NaN (how pandas
            represents a missing cell) are treated as empty strings; any
            other non-string entry is converted with ``str`` first.

    Returns:
        list[str]: One cleaned string per input entry, in input order.
    """
    # Everything below is loop-invariant, so it is built once per call
    # rather than once per sentence.
    header_re = re.compile(r"^\d{1,2}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}\(\d+\.\d+\s(?:KB|MB)\)\s*")
    # "http\S+" already matches anything "https\S+" would match.
    url_re = re.compile(r'http\S+')
    # Raw string: a plain '\s+' literal is an invalid escape sequence.
    whitespace_re = re.compile(r'\s+')
    punctuation_table = str.maketrans('', '', string.punctuation + '，。！？；：‘’“”（）《》【】、')
    # Removed sequentially and in this order: '已pm' has to go before the
    # bare 'pm', otherwise the '已' would be left behind.
    noise_substrings = ('emoji', '已pm', 'pm', 'thx', 'pls', 'kb', 'mb', '向左走', '向右走', '死死')

    cleaned = []
    for sent in sentences:
        if not isinstance(sent, str):
            # `sent != sent` is only true for NaN.
            is_missing = sent is None or (isinstance(sent, float) and sent != sent)
            sent = '' if is_missing else str(sent)

        # The header pattern is case-sensitive (KB|MB), so it must run
        # before lowercasing.
        sent = header_re.sub('', sent)
        sent = sent.lower()
        # NOTE(review): URLs are removed before HTML is parsed, so a URL
        # inside a tag attribute (e.g. href="http://...") is cut up to the
        # next whitespace, which can swallow the rest of the tag. Confirm
        # whether HTML stripping should run first.
        sent = url_re.sub('', sent)
        sent = BeautifulSoup(sent, 'html.parser').get_text(separator=' ')
        sent = sent.translate(punctuation_table)
        for noise in noise_substrings:
            sent = sent.replace(noise, '')
        sent = whitespace_re.sub(' ', sent).strip()
        cleaned.append(sent)
    return cleaned

# Clean every entry of the 'merged' column and store the result in a new 'clean' column.
df['clean'] = clean(df['merged'])
# Preview the first rows to check the new column.
df.head()

In [None]:
# Stopword sets keyed by file path, so each file is read once per kernel
# session instead of once per segmented sentence.
_STOPWORD_CACHE = {}


def _load_stopwords(path):
    """Return the stopwords stored one per line in ``path`` as a cached frozenset."""
    if path not in _STOPWORD_CACHE:
        # The context manager closes the handle; the original left it open.
        with open(path, 'r', encoding='utf-8') as handle:
            _STOPWORD_CACHE[path] = frozenset(line.strip() for line in handle)
    return _STOPWORD_CACHE[path]


def segment(sentence, stopwords=None):
    """Segment a text with jieba and keep only non-stopword Chinese tokens.

    Args:
        sentence: Text to segment; leading and trailing whitespace is ignored.
        stopwords: Optional container of tokens to drop. When omitted, the
            tokens listed one per line in ``../dict/canton.txt`` are used
            (loaded once and cached).

    Returns:
        str: The kept tokens, each followed by a single space (so a
        non-empty result ends with a space); ``''`` when nothing is kept.
    """
    if stopwords is None:
        stopwords = _load_stopwords('../dict/canton.txt')

    kept = [
        word
        for word in jieba.cut(sentence.strip())
        if word not in stopwords
        and len(word.strip()) > 0
        # NOTE(review): this is a lexicographic string comparison, so it is
        # effectively decided by the word's first character -- a word that
        # starts with a character in U+4E00..U+9FA5 but continues with other
        # characters still passes. Kept unchanged to preserve existing
        # output; confirm whether an all-characters check is intended.
        and (word >= '\u4e00' and word <= '\u9fa5')
    ]
    return ''.join(word + ' ' for word in kept)

# Segment every cleaned text; each 'tokenz' cell holds the string returned by segment().
df['tokenz'] = [segment(sent) for sent in df['clean']]
# Preview the first rows to check the new column.
df.head()

In [None]:
# Define a function to process text data
def process(texts):
    """Filter tokens and merge frequent collocations into phrase tokens.

    Bigram and trigram ``Phrases`` models are trained on the tokenized
    documents. Each document is then run through
    ``gensim.utils.simple_preprocess`` (``min_len=2``, ``deacc=True``),
    stripped of the words listed in ``../dict/stopwords.txt`` and
    ``../dict/canton.txt``, and finally passed through both phrase models.

    Args:
        texts: Iterable of documents. Each document is either a string of
            whitespace-separated tokens or an iterable of token strings.

    Returns:
        list[list[str]]: One token list per input document, in input order.
    """
    def read_wordlist(path):
        # The context manager closes the file; a set gives O(1) lookups.
        with open(path, 'r', encoding='UTF-8') as handle:
            return {line.strip() for line in handle}

    # Phrases needs every document as a list of tokens. Given raw strings it
    # iterates them character by character, and `bigram[texts]` treats a
    # sequence of strings as one single document, so the models would never
    # learn word-level collocations. Split into token lists up front.
    token_lists = [doc.split() if isinstance(doc, str) else list(doc) for doc in texts]

    # Build bigram and trigram models on the tokenized documents.
    bigram = gensim.models.Phrases(token_lists, min_count=10, threshold=10)
    trigram = gensim.models.Phrases(bigram[token_lists], min_count=10, threshold=10)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    # A word is dropped when it appears in either list.
    excluded = read_wordlist('../dict/stopwords.txt') | read_wordlist('../dict/canton.txt')

    def remove_words(tokens):
        return [word for word in gensim.utils.simple_preprocess(' '.join(tokens), min_len=2, deacc=True)
                if word not in excluded]

    processed = []
    for tokens in token_lists:
        phrased = trigram_mod[bigram_mod[remove_words(tokens)]]
        # Drop any token that starts with a character that is neither a
        # word character nor whitespace.
        processed.append([word for item in phrased for word in item.split()
                          if not re.match(r'[^\w\s]', word)])
    return processed

# Replace the segmented strings in 'tokenz' with the token lists returned by process().
# Note: this overwrites the column in place, so re-running this cell on its own
# feeds already-processed data back into process().
df['tokenz'] = process(df['tokenz'])

# Lazily strip punctuation from each sentence (the sentences are not split into words)
def sent_to_words_space(sentences):
    """Yield every sentence with its punctuation removed.

    Any character that is neither a word character nor whitespace is
    deleted; everything else, including spacing, is left untouched.

    Args:
        sentences: Iterable of strings.

    Yields:
        str: The punctuation-free version of each input string, in order.
    """
    strip_punctuation = re.compile(r'[^\w\s]').sub
    yield from (strip_punctuation('', text) for text in sentences)

# Cast every 'tokenz' cell to its string form (a token list becomes text like "['a', 'b']")
df['tokenz'] = df['tokenz'].astype(str)

# Collect the stringified cells into a plain Python list of strings
msg_space = df.tokenz.values.tolist()
# Strip the punctuation (brackets, quotes, commas), leaving one space-separated string per row
msg_space_words = list(sent_to_words_space(msg_space))

# Write the punctuation-free strings (one per row, not lists) back to the 'tokenz' column
df['tokenz'] = msg_space_words

# Display the first rows of the final DataFrame
df.head()

In [None]:
# Save the DataFrame to token.csv, encoded as UTF-8 with a BOM and without the index column.
df.to_csv("token.csv", encoding='utf-8-sig', index=False)