In [None]:
import re
import jieba
import cntext as ct
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords

import unicodedata


from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance
from bertopic import BERTopic
from umap import UMAP
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer
from topictuner import TopicModelTuner as TMT


# Download the NLTK 'stopwords' corpus so that stopwords.words() below can load it.
nltk.download('stopwords') 
# English stop words held as a set; read by remove_stopwords_en.
stop_words = set(stopwords.words('english'))


# Compiled stop-word patterns keyed by the (frozen) vocabulary they were built from,
# so the regex is assembled once per vocabulary instead of once per comment.
_STOPWORD_PATTERNS = {}


def _stopword_pattern(vocabulary):
    """Return a compiled whole-word regex matching any word in `vocabulary`."""
    key = frozenset(vocabulary)
    compiled = _STOPWORD_PATTERNS.get(key)
    if compiled is None:
        # Longest alternatives first: otherwise "don" can win over "don't" and leave
        # a stray "'" behind. Sorting also makes the result independent of set
        # iteration order, which varies between interpreter runs.
        ordered = sorted(key, key=lambda word: (-len(word), word))
        compiled = re.compile(r'\b(?:' + '|'.join(re.escape(word) for word in ordered) + r')\b')
        _STOPWORD_PATTERNS[key] = compiled
    return compiled


def remove_stopwords_en(text, words=None):
    """Lower-case `text`, remove English stop words and normalise whitespace.

    Args:
        text: Input string.
        words: Optional iterable of stop words. Defaults to the module-level
            `stop_words` set when omitted.

    Returns:
        The lower-cased text with every whole-word stop-word occurrence replaced
        by a space, runs of whitespace collapsed to one space, and leading and
        trailing whitespace stripped.
    """
    vocabulary = stop_words if words is None else words
    filtered_text = text.lower()
    # An empty vocabulary would yield a pattern that matches the empty string at
    # every word boundary, so skip the substitution entirely in that case.
    if vocabulary:
        filtered_text = _stopword_pattern(vocabulary).sub(' ', filtered_text)
    return re.sub(r'\s+', ' ', filtered_text).strip()



# Chinese stop words taken from the 'STOPWORDS' -> 'chinese' entry of cntext's STOPWORDS.pkl.
stopwords_cn = ct.load_pkl_dict('STOPWORDS.pkl')['STOPWORDS']['chinese'] 

# Hashed snapshot of stopwords_cn used for per-token lookups. The loaded container is
# presumably a list (TODO confirm), which would make every `in` test a linear scan.
# NOTE: this is a snapshot; rebuild it if stopwords_cn is modified later.
_stopwords_cn_lookup = frozenset(stopwords_cn)

def clean_text(text): 
    """Segment `text` with jieba and drop tokens that are Chinese stop words.

    Args:
        text: Input string.

    Returns:
        The remaining tokens joined by single spaces. Tokens are kept exactly as
        jieba produced them, so whitespace tokens that are not stop words survive.
    """
    return ' '.join(token for token in jieba.lcut(text) if token not in _stopwords_cn_lookup)

 


def extract_and_remove_emojis(text):
    """Pull square-bracketed codes (e.g. ``[xxx]``) out of `text`.

    Returns:
        A ``(cleaned_text, emojis_str)`` tuple: the text with every bracketed
        code deleted, and the codes themselves joined by single spaces.
    """
    bracket_code = re.compile(r'\[.*?\]')
    found_codes = bracket_code.findall(text)
    stripped = bracket_code.sub('', text)
    return stripped, ' '.join(found_codes)

def extract_at(text):
    """Pull ``@mention`` tokens out of `text`.

    A mention is ``@`` followed by non-whitespace characters plus at most one
    trailing space; that trailing space is part of the captured mention.

    Returns:
        A ``(cleaned_text, at_str)`` tuple: the text with mentions deleted, and
        the captured mentions joined by single spaces.
    """
    mention = re.compile(r'@\S+ ?')
    mentions = mention.findall(text)
    return mention.sub('', text), ' '.join(mentions)

def remove_meaningless_haha(text): 
    """Strip runs of two or more '哈' (plus any trailing non-word characters).

    A run is left alone when the match would be immediately followed by '姐'.

    Returns:
        A ``(cleaned_text, haha_str)`` tuple: the text without the laughter
        runs, and the removed runs joined by single spaces.
    """
    laughter = re.compile(r'哈{2,}(?!姐)[^\w]*')
    removed_runs = laughter.findall(text)
    remaining = laughter.sub('', text)
    return remaining, ' '.join(removed_runs)

def remove_meaningless_haha_en(text):
    """Delete runs of two or more consecutive "ha" from `text`, ignoring case.

    Matching is case-insensitive because this runs before the text is
    lower-cased; otherwise "Hahaha" would be cut down to "Ha" and "HAHA" would
    be left untouched.

    Args:
        text: Input string.

    Returns:
        The text with every such run removed. Surrounding whitespace is kept.
    """
    return re.sub(r'(ha){2,}', '', text, flags=re.IGNORECASE)


# Character class listing the ASCII and full-width punctuation marks to delete.
punctuation_to_remove = r'[（）()：:\[\]【】「」|\/…·—_!！?？，“”"]'

def remove_punctuation(text):
    """Return `text` with every character matched by `punctuation_to_remove` deleted."""
    matcher = re.compile(punctuation_to_remove)
    return matcher.sub('', text)

# Below are the preprocessing functions for English text

def extract_emojis_and_symbols(text):
    """Pull Unicode emoji/symbol runs and regional-indicator pairs out of `text`.

    A match is either one or more consecutive code points from the listed
    emoji/symbol ranges, or exactly two consecutive regional-indicator code
    points (U+1F1E6-U+1F1FF).

    Returns:
        A ``(cleaned_text, emojis_and_symbols_str)`` tuple: the text with all
        matches deleted, and the matches joined by single spaces.
    """
    emoji_run = re.compile(r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F\U00002700-\U000027BF]+|[\U0001F1E6-\U0001F1FF]{2}')
    matches = emoji_run.findall(text)
    without_emojis = emoji_run.sub('', text)
    return without_emojis, ' '.join(matches)

def remove_fyp(text):
    """Delete the stand-alone tags fyp / foryou / foryoupage / fypシ / fypツ.

    Matching is case-insensitive and whole-word only; the result is stripped of
    leading and trailing whitespace.
    """
    tag = re.compile(r'\bfyp\b|\bforyou\b|\bforyoupage\b|\bfypシ\b|\bfypツ\b', flags=re.IGNORECASE)
    return tag.sub('', text).strip()


def remove_fyp_strings(text):
    """Delete words that start with "fyp" or "foryou", plus trailing whitespace.

    Any ASCII letters following the prefix belong to the removed word (e.g.
    "fypviral", "foryoupage"). Matching is case-insensitive, in line with
    `remove_fyp`, so "FYPviral" or "ForYouPage" are removed as well.

    Args:
        text: Input string.

    Returns:
        The text with every such word, and the whitespace after it, removed.
    """
    pattern_to_remove = r'\b(fyp|foryou)[a-zA-Z]*\s*'

    return re.sub(pattern_to_remove, '', text, flags=re.IGNORECASE)

def remove_tiktok(text):
    """Delete the platform names "抖音" and "tiktok" from `text`.

    * "tiktok" is matched case-insensitively and only as a whole word, so
      "TikTok" is removed while compounds such as "chinesetiktok" are kept
      (the word boundary already protects them; the former negative lookahead
      placed after the boundary could never fail and has been dropped).
    * "抖音" is matched anywhere. Chinese characters count as word characters,
      so a word-boundary test would fail whenever "抖音" sits inside running
      Chinese text. This mirrors how `remove_china` treats "中国".

    Args:
        text: Input string.

    Returns:
        The text with the names removed and outer whitespace stripped.
    """
    pattern = r'抖音|\btiktok\b'
    return re.sub(pattern, '', text, flags=re.IGNORECASE).strip()

def remove_china(text):
    """Delete "中国" (anywhere) and the whole word "china" (any case) from `text`.

    The result is stripped of leading and trailing whitespace.
    """
    country = re.compile(r'(中国|\bchina\b)', flags=re.IGNORECASE)
    return country.sub('', text).strip()



def remove_invalid_words(text):
    """Keep only words made of basic Latin letters, ASCII digits and CJK ideographs.

    The text is split into runs of word characters. A run is kept only when
    every character in it is one of:

    * a CJK unified ideograph in U+4E00-U+9FFF,
    * an ASCII digit 0-9,
    * an upper- or lower-case letter whose Unicode name contains "LATIN" but
      not "WITH" (i.e. no accented/decorated letters).

    Previously digits could never pass: the code tested ``isdigit()`` and the
    'Nd' category, but also required "LATIN" in the character name, which digit
    names such as "DIGIT ONE" do not contain. Words like "2023" or "covid19"
    were therefore always dropped.

    Args:
        text: Input string.

    Returns:
        The surviving words joined by single spaces.
    """
    def is_valid_char(char):
        if '\u4e00' <= char <= '\u9fff':
            return True
        if '0' <= char <= '9':
            return True
        if unicodedata.category(char) in ('Lu', 'Ll'):
            char_name = unicodedata.name(char, '')
            return 'LATIN' in char_name and 'WITH' not in char_name
        # Everything else (underscore, other scripts, other digit forms) is rejected.
        return False

    # \w already covers CJK ideographs, so one alternative is enough.
    words = re.findall(r'\b\w+\b', text)
    return ' '.join(word for word in words if all(is_valid_char(char) for char in word))


In [None]:
# Preprocessing Chinese comments
# NOTE: this cell and the English preprocessing cell both assign `df`, `df_cleaned`
# and `docs`; whichever one runs last determines what the modelling cells see.

# Everything is read as str, so missing values become the literal string 'nan'.
df = pd.read_csv('douyin_CN_relevant_UTF8.csv').astype(str)

# df = pd.read_csv('cnews.csv').astype(str)

#print(df['comment'].head(10))

# Each extractor returns (cleaned_text, extracted); zip(*...) splits that into two columns.
df['text_cleaned'], df['emojis'] = zip(*df['comment'].apply(extract_and_remove_emojis)) # emoji
df['text_cleaned'], df['at'] = zip(*df['text_cleaned'].apply(extract_at)) # @
df['text_cleaned'], df['haha'] = zip(*df['text_cleaned'].apply(remove_meaningless_haha)) # 哈哈哈
df['text_cleaned'] = df['text_cleaned'].apply(clean_text) # stopwords
# Turn whitespace-only results into '' so they are dropped below; other rows are left untouched.
df['text_cleaned'] = df['text_cleaned'].apply(lambda x: x.strip() if isinstance(x, str) and x.strip() == '' else x) 


# Map the 'nan' strings produced by astype(str) back to real missing values.
df = df.replace('nan', pd.NA)

# Assign the result back instead of calling replace(..., inplace=True) on the column
# selection: the chained in-place form warns in recent pandas and does not update
# `df` under Copy-on-Write, which would let empty documents through.
df['text_cleaned'] = df['text_cleaned'].replace('', pd.NA)


# Keep only rows that still have text after cleaning.
df_cleaned = df.dropna(subset=['text_cleaned'], how='any')


print(df_cleaned['text_cleaned'].head(302))
print(df_cleaned['haha'].head(302))


# Documents passed to the sentence encoder and to BERTopic.
docs = df_cleaned['text_cleaned'].tolist() 

In [None]:
# Preprocessing English comments
# NOTE: this cell and the Chinese preprocessing cell both assign `df`, `df_cleaned`
# and `docs`; whichever one runs last determines what the modelling cells see.

# Everything is read as str, so missing values become the literal string 'nan'.
df = pd.read_csv('filtered_tiktok_US_30w.csv').astype(str)


print(df['comment'].head(20))

# Each extractor returns (cleaned_text, extracted); zip(*...) splits that into two columns.
df['text_cleaned'], df['emojis_and_symbols'] = zip(*df['comment'].apply(extract_emojis_and_symbols)) # emoji
df['text_cleaned'], df['at'] = zip(*df['text_cleaned'].apply(extract_at)) # @

df['text_cleaned'] = df['text_cleaned'].apply(remove_fyp)
df['text_cleaned'] = df['text_cleaned'].apply(remove_fyp_strings)
df['text_cleaned'] = df['text_cleaned'].apply(remove_tiktok)
df['text_cleaned'] = df['text_cleaned'].apply(remove_china)
# df['text_cleaned'] = df['text_cleaned'].str.replace('#', ' ', regex=False)  # "#"
df['text_cleaned'], df['haha'] = zip(*df['text_cleaned'].apply(remove_meaningless_haha)) # 哈哈哈
df['text_cleaned'] = df['text_cleaned'].apply(remove_meaningless_haha_en) # hahaha

df['text_cleaned'] = df['text_cleaned'].apply(remove_punctuation) 

df['text_cleaned'] = df['text_cleaned'].apply(remove_invalid_words) # only English and Chinese

# jieba segmentation + Chinese stop words, then English stop words (which also lower-cases).
df['text_cleaned'] = df['text_cleaned'].apply(clean_text) 

df['text_cleaned'] = df['text_cleaned'].apply(remove_stopwords_en) 

# Turn whitespace-only results into '' so they are dropped below; other rows are left untouched.
df['text_cleaned'] = df['text_cleaned'].apply(lambda x: x.strip() if isinstance(x, str) and x.strip() == '' else x) 


# Map the 'nan' strings produced by astype(str) back to real missing values.
df = df.replace('nan', pd.NA)

# Assign the result back instead of calling replace(..., inplace=True) on the column
# selection: the chained in-place form warns in recent pandas and does not update
# `df` under Copy-on-Write, which would let empty documents through.
df['text_cleaned'] = df['text_cleaned'].replace('', pd.NA)


# Keep only rows that still have text after cleaning.
df_cleaned = df.dropna(subset=['text_cleaned'], how='any')

print(df_cleaned['text_cleaned'].head(20))


# Documents passed to the sentence encoder and to BERTopic.
docs = df_cleaned['text_cleaned'].tolist()

In [None]:
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance
from bertopic import BERTopic
from umap import UMAP
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer
from topictuner import TopicModelTuner as TMT




# Sentence encoder loaded from a local directory; this one is used for the English run.
sentence_model = SentenceTransformer("/root/paraphrase-multilingual-MiniLM-L12-v2-main") # English

# Alternative encoder for the Chinese run: enable this line instead of the one above.
#sentence_model = SentenceTransformer("/root/m3e-base") # Chinese

# Encode `docs` once; the embeddings are reused for the PCA initialisation and
# passed to BERTopic.fit_transform so the documents are not encoded a second time.
embeddings = sentence_model.encode(docs, show_progress_bar=True)


def rescale(x, inplace=False):
    """Scale an embedding matrix so optimization will not have convergence issues.

    Every value is divided by 10000 times the standard deviation of the first
    column.

    Args:
        x: 2-D array-like of embeddings (indexed as ``x[:, 0]``).
        inplace: When True, `x` itself is modified and returned; otherwise a
            copy is made first and `x` is left unchanged.

    Returns:
        The rescaled array.
    """
    scaled = x if inplace else np.array(x, copy=True)
    divisor = np.std(scaled[:, 0]) * 10000
    scaled /= divisor
    return scaled


# Project the sentence embeddings onto 5 principal components and rescale them.
# The result is passed to UMAP as its `init` in the next cell, so the number of
# components here has to match UMAP's n_components.
pca_embeddings = rescale(PCA(n_components=5).fit_transform(embeddings))





In [None]:
from hdbscan import HDBSCAN


# Topic-word representation: MMR is active; KeyBERTInspired is kept as a commented-out alternative.
# representation_model = KeyBERTInspired()
representation_model = MaximalMarginalRelevance(diversity=0.2)

# Start UMAP from PCA embeddings
# NOTE(review): no random_state is set, so the reduction (and therefore the topics)
# may differ between runs - confirm whether reproducibility is needed.
umap_model = UMAP(
    n_neighbors=30,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    init=pca_embeddings,
    angular_rp_forest=True
)

# from sklearn.feature_extraction.text import CountVectorizer

# vectorizer_model = CountVectorizer()  

# Clustering step; these two values are the ones the TopicTuner cells further down search over.
hdbscan_model = HDBSCAN(min_cluster_size=300, min_samples=30)



# Assemble the BERTopic pipeline from the components configured above.
topic_model = BERTopic(
                       hdbscan_model=hdbscan_model, # If TMT is used, comment out the parameter
                       representation_model=representation_model,
                       embedding_model=sentence_model,
                       #embedding_model="/root/m3e-base",
                       umap_model=umap_model,
                       #vectorizer_model=vectorizer_model,
                       verbose=True,
                       low_memory=True,
                       calculate_probabilities=False)

In [None]:
# Fit the model on the documents, reusing the precomputed sentence embeddings.
topics, probs = topic_model.fit_transform(docs, embeddings) 
#topic_model.get_topic_info()
# Compute a revised topic assignment; it is applied to the model via
# update_topics in the next cell. `topics` itself is left unchanged.
new_topics = topic_model.reduce_outliers(docs, topics) 

In [None]:
# Recompute the topic representations using the outlier-reduced assignments.
topic_model.update_topics(docs, topics=new_topics)

# NOTE(review): absolute path at the filesystem root - confirm it exists and is writable.
topic_model.save("/topic_modeling/model_name") # save your model

# topic_model = BERTopic.load("/topic_modeling/model_name") # load your model

# Last expression of the cell, so the topic table is displayed.
topic_model.get_topic_info()

In [None]:
# Topic table of `topic_model` (the model fitted with the hand-picked HDBSCAN settings).
topic_info = topic_model.get_topic_info()

# 'utf-8-sig' writes a BOM at the start of the file.
topic_info.to_csv('tiktok_comment_topic_modeling2.csv', index=False, encoding='utf-8-sig')

In [None]:
# To tune parameters using TMT, run the following blocks

from topictuner import TopicModelTuner as TMT
# Wrap the existing BERTopic model so TopicTuner can search clustering parameters for it.
tuned_model = TMT.wrapBERTopicModel(topic_model)

In [None]:
# Hand the tuner the sentence embeddings computed earlier instead of letting it create its own.
tuned_model.embeddings = embeddings

In [None]:
# NOTE(review): presumably runs the dimensionality reduction once so the searches
# below can reuse it - confirm against the TopicTuner documentation.
tuned_model.reduce()

In [None]:
# Random search. First argument: candidate integers 100..1199; second: a list of
# values between 0.1 and 1.
# NOTE(review): these look like min_cluster_size candidates and min_samples given
# as a fraction of min_cluster_size - confirm against the TopicTuner documentation.
lastRunResultsDF = tuned_model.randomSearch([*range(100,1200)], [.1, .25, .5, .75, 1]) 
lastRunResultsDF

In [None]:
# Grid-style search over candidate integers 100..999 and the values 0.10, 0.20, ..., 1.00.
# This overwrites `lastRunResultsDF` from the random search above.
lastRunResultsDF = tuned_model.pseudoGridSearch([*range(100,1000)], [x/100 for x in range(10,101,10)]) 
# Summarise the runs, sorted ascending by the 'number_uncategorized' column.
tuned_model.summarizeResults(lastRunResultsDF).sort_values(by=['number_uncategorized'])

In [None]:
# Persist the tuner under the relative path 'tuned_model' (resolved against the working directory).
tuned_model.save('tuned_model')

In [None]:
# Reload the tuner saved above, replacing the in-memory `tuned_model`.
tuned_model = TMT.load('tuned_model')

In [None]:
# Build a BERTopic model from the tuner using the chosen clustering parameters.
bt1 = tuned_model.getBERTopicModel(100,10) # Fill in min_cluster_size and min_samples with the best results measured above
# Fit the tuned model on the same documents and precomputed embeddings.
# This overwrites `topics` and `probs` from the earlier fit.
topics, probs = bt1.fit_transform(docs, embeddings)

In [None]:
# Export the topic table of the tuned model `bt1`. Previously this cell wrote
# `topic_info`, which still held the table of the earlier `topic_model`, so
# 'topic_info_latest.csv' did not reflect the tuned model at all.
tuned_topic_info = bt1.get_topic_info()

tuned_topic_info.to_csv('topic_info_latest.csv', index=False, encoding='utf-8')

# Last expression of the cell, so the table is displayed.
tuned_topic_info