In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Read the pre-processed corpus, then turn every entry of the token column
# into a plain Python list through its .tolist() method (the entries are
# presumably numpy arrays after the parquet round-trip — TODO confirm).
df = pd.read_parquet('../processed_data/final_data_2015-23')
for token_column in ['clean_tok']:
    df[token_column] = df[token_column].map(lambda f: f.tolist())

# Data Cleaning

In [None]:
# Count how often every token occurs across all documents so that leftover
# noise tokens ("chaos") can be spotted and added to the removal list.
from collections import Counter
from itertools import chain

# Flatten the per-document token lists in a single linear pass.
# sum(lists, []) re-copies the accumulated list for every document,
# which is quadratic in the corpus size.
words_des = list(chain.from_iterable(df['clean_tok']))
ct = Counter(words_des)
sorted_ct = ct.most_common()  # (word, count) pairs, most frequent first
for word, count in sorted_ct:
    print(f"{word}: {count}")

In [None]:
# Punctuation, symbol and markup tokens that are dropped from the token lists.
# Every entry must be followed by a comma: adjacent string literals are
# silently concatenated by Python into a single (wrong) token.
chaos = ['【', '】', '...', '）', '（', '#', '[', ']', '！！！', ',', '——', '-', '！！', '*', '.', '……', '+',
        '!', ':', '/', '--', '|', '(', '~', '…', '—', ')', '～', '？？？', ';', '·', '？？', '。。。', '。。',
        '！！！！', '---', '「', '」', 'quot', '..', '●', '**', '▎', '....', '😜' ]

# Set copy of `chaos` so each membership test is constant-time.
_chaos_lookup = frozenset(chaos)

def remove_chaos(word_list):
    """Return a new list holding the tokens of `word_list` that are not in `chaos`.

    Parameters
    ----------
    word_list : iterable of str
        Tokens of one document.

    Returns
    -------
    list of str
        The surviving tokens in their original order; the input is not modified.
    """
    return [word for word in word_list if word not in _chaos_lookup]

# Strip the noise tokens from every document's token list.
df['clean_tok'] = df['clean_tok'].apply(remove_chaos)
# Sanity check: evaluates to True if any entry of the column is None.
any(i is None for i in df['clean_tok'])

## Sentiment Analysis using SnowNLP

In [None]:
from snownlp import SnowNLP
from snownlp import sentiment
def get_sentiment(text):
    """Score `text` with SnowNLP.

    Returns the `sentiments` attribute of ``SnowNLP(text)``, or NaN when
    `text` has length zero.
    """
    if len(text) > 0:
        return SnowNLP(text).sentiments
    return np.nan

In [None]:
# Re-assemble each document by concatenating its tokens (no separator),
# then score the joined string with get_sentiment.
from tqdm.notebook import tqdm
tqdm.pandas()  # makes Series.progress_apply available
df['joined_tok'] = df['clean_tok'].progress_apply(''.join)
df['sentiment'] = df['joined_tok'].progress_apply(get_sentiment)
df['sentiment']

In [None]:
# Persist the frame, which by now also carries the 'joined_tok' and 'sentiment' columns.
df.to_parquet('../processed_data/with_sentiment.parquet')

In [None]:
# Check which rows have an empty "filtered_title" (length zero) and count them.
df['t_empty'] = df['filtered_title'].apply(lambda x: len(x) == 0)
print(df['t_empty'].value_counts())


In [None]:
# How many documents score above 0.5 (True) versus not (False).
print(df['sentiment'].gt(0.5).value_counts())

In [None]:
import seaborn as sns
# Histogram of the per-document sentiment scores.
sns.histplot(df['sentiment'])

In [None]:
# How many documents score below 0.2 (True) versus not (False).
print(df['sentiment'].lt(0.2).value_counts())

In [None]:
# Show the 50 most frequent tokens among documents with sentiment < 0.2.
from collections import Counter
from itertools import chain

neg_stmt = df[df['sentiment']<0.2]
# Flatten in one linear pass; sum(lists, []) is quadratic in the number of documents.
neg_stmt_words = list(chain.from_iterable(neg_stmt['clean_tok']))
ct_neg_words = Counter(neg_stmt_words)
print(ct_neg_words.most_common(50))

In [None]:
# Mean sentiment per year.
# NOTE(review): this needs an existing df['year'] column; the derivation kept
# below is commented out, so the column presumably already comes with the
# loaded data — confirm.
# df['year'] = df['date'].apply(lambda x: x[:4])
# df['year'] = pd.to_numeric(df['year'])

# Map each distinct year (ascending) to the mean sentiment of its rows.
yearly_mean_sentiment = {
    year: df.loc[df['year'] == year, 'sentiment'].mean()
    for year in sorted(df['year'].unique())
}

print(yearly_mean_sentiment)

# Plot the results
years = list(yearly_mean_sentiment)
mean_sentiments = list(yearly_mean_sentiment.values())

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(years, mean_sentiments, marker='o')
ax.set_title('Mean Sentiment Over Years')
ax.set_xlabel('Year')
ax.set_ylabel('Mean Sentiment')
ax.grid(True)
plt.show()


## Topic Modeling

In [None]:
# Generate the document-term matrix.
from sklearn.feature_extraction.text import CountVectorizer
# The documents are passed in as token lists, so both the tokenizer and the
# preprocessor are identity functions. max_df=0.9 / min_df=2 prune terms by
# document frequency (see the CountVectorizer documentation).
cv = CountVectorizer(max_df=0.9, min_df=2, tokenizer=lambda x: x, preprocessor=lambda x: x)
dtm = cv.fit_transform(df['clean_tok'])
dtm

### LDA in Sklearn

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
# LDA with 10 topics; random_state is fixed so repeated runs give the same model.
LDA = LatentDirichletAllocation(n_components=10, random_state=42)

In [None]:
# Fit the LDA model on the document-term matrix.
LDA.fit(dtm)

In [None]:
# Grab the vocabulary of words and look at one entry.
# NOTE(review): index 10000 is hard-coded and raises IndexError if the
# vocabulary has 10000 terms or fewer.
cv.get_feature_names_out()[10000]

In [None]:
# Grab the topics: one row of word weights per topic.
LDA.components_

In [None]:
# Pick one topic (index 9) whose highest-weighted words are inspected in the next cells.
single_topic = LDA.components_[9]

In [None]:
# argsort --> index positions sorted from least to greatest weight
single_topic.argsort()

In [None]:
top_twenty_words = single_topic.argsort()[-20:] # grab the last 20 indices returned by .argsort(), i.e. the 20 largest weights

In [None]:
# Print the vocabulary terms behind the selected indices, one per line.
vocabulary_terms = cv.get_feature_names_out()
for term in vocabulary_terms[top_twenty_words]:
    print(term)

In [None]:
# Print the highest-weighted words of every fitted topic.
# Iterating over LDA.components_ covers all topics (the model is built with
# n_components=10; a fixed range(0, 7) skipped the last three), and the header
# now reports the number of words that is actually printed.
n_top_words = 20
feature_names = cv.get_feature_names_out()  # looked up once instead of once per word
for i, topic_weights in enumerate(LDA.components_):
    print(f"The top {n_top_words} words for topic #{i}")
    # argsort is ascending, so the last n_top_words indices carry the largest weights
    for index in topic_weights.argsort()[-n_top_words:]:
        print(feature_names[index])

In [None]:
# Per-document topic distribution from the fitted model.
topic_results = LDA.transform(dtm)

In [None]:
topic_results.round(2) # shows the probability of each topic for each document, rounded to 2 decimals

In [None]:
# Dimensions of the document-term matrix.
dtm.shape

In [None]:
# Generate a column that assigns each document its most probable topic
# (column index of the row-wise maximum of topic_results).
df['topic_lda'] = topic_results.argmax(axis=1)
df

In [None]:
# Inspect the Python type of one 'fulltext' entry (second row).
type(df.iloc[1]['fulltext'])

### LDA using Gensim

In [None]:
# Code learned from: https://zhuanlan.zhihu.com/p/133779883

In [None]:
# ! pip install gensim

In [None]:
from pprint import pprint
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim import corpora, models, similarities

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim # don't skip this
import matplotlib.pyplot as plt

# Enable logging for gensim - optional.
# Only messages of level ERROR and above are emitted.
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

# Silence DeprecationWarning messages for the rest of the session.
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
# Build the gensim dictionary from the token lists.
id2word = corpora.Dictionary(df['clean_tok'])
print(id2word)

In [None]:
# Convert every document into a bag-of-words using the dictionary, which
# assigns a unique id to each word in the documents.
corpus = [id2word.doc2bow(token) for token in df['clean_tok']]
# Show only the first few documents; printing the whole corpus floods the output.
print(corpus[:5])

In [None]:
# The word behind each id can be looked up as follows (shown for the first document).
print([[(id2word[token_id], freq) for token_id, freq in doc_bow] for doc_bow in corpus[:1]])

In [None]:
# Build the LDA model (10 topics, fixed random_state).
# per_word_topics=True is passed here; format_topics_sentences later reads the
# first element of each transformed row.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                id2word=id2word,
                num_topics=10,
                random_state=100,
                update_every=1,
                chunksize=100,
                passes=10,
                alpha='auto',
                per_word_topics=True
                )

In [None]:
# The LDA model above is built with 10 different topics. Each topic is a combination of keywords, and each keyword
# contributes a certain weight to the topic; the weight reflects how much the keyword contributes to that topic.
# num_words is the number of keywords shown per topic.
pprint(lda_model.print_topics(num_words=20))

In [None]:
# Model perplexity and topic coherence offer a convenient way to judge how
# good a given topic model is. The coherence score is the more helpful of the two.

# log_perplexity() returns the per-word likelihood bound, not the perplexity
# itself. Perplexity is 2 ** (-bound), and only for that value is lower better.
per_word_bound = lda_model.log_perplexity(corpus)
print('Per-word likelihood bound: ', per_word_bound)
print('Perplexity: ', np.exp2(-per_word_bound))

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=df['clean_tok'], dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda) # topic coherence score

In [None]:
# Search for the best number of LDA topics: train one model per candidate
# topic count (2..14) and record its c_v coherence rounded to 3 decimals.
# Note: the loop rebinds `lda_model`, so after this cell the name refers to
# the last model trained here.
coherence_values = []
model_list = []
shared_lda_settings = dict(
    corpus=corpus,
    id2word=id2word,
    random_state=100,
    update_every=1,
    chunksize=200,
    passes=20,
    alpha='asymmetric',
    per_word_topics=True,
)
for num_topics in range(2, 15):
    lda_model = gensim.models.ldamodel.LdaModel(num_topics=num_topics, **shared_lda_settings)
    model_list.append(lda_model)
    coherencemodel = CoherenceModel(model=lda_model, texts=df['clean_tok'], dictionary=id2word, coherence='c_v')
    coherence_values.append(round(coherencemodel.get_coherence(), 3))

In [None]:
# Visualise the coherence score against the number of topics.
x = range(2,15,1)
# Label the line directly. Passing the bare string "coherence_values" to
# legend() is treated as a sequence of characters, so the entry read "c".
plt.plot(x, coherence_values, label="coherence_values")
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(loc='best')
plt.savefig('../Graphs/LDA_coherence_scores')
plt.show()

In [None]:
# Show the coherence score obtained for each number of topics.
# The loop variable must not be called `cv`: that name is bound to the
# CountVectorizer used by the sklearn LDA cells, and rebinding it to a float
# breaks those cells when they are re-run.
for m, score in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(score, 4))

In [None]:
# Based on the coherence scores, choose the model with the highest c_v value.
# NOTE(review): index 8 is hard-coded; with the range(2, 15) search it is the
# model trained with num_topics = 10 — confirm it is the best-scoring one.
# Select the model and print its topics
optimal_model = model_list[8]
model_topics = optimal_model.show_topics(formatted=False)
pprint(optimal_model.print_topics(num_words=5))

In [None]:
# ---------------------------- Find the dominant topic of every document ----------------------------
# Every document is a mixture of several topics, but usually one of them dominates. The function below
# extracts that dominant topic for each document and returns it, with its weight, in a tidy table.
def format_topics_sentences(ldamodel, corpus, texts):
    """Build a table with the dominant topic of each document.

    Parameters
    ----------
    ldamodel : object
        Model supporting ``ldamodel[corpus]`` (one result per document) and
        ``ldamodel.show_topic(topic_num)`` returning ``(word, weight)`` pairs.
    corpus : iterable
        The corpus to transform, one entry per document.
    texts : list or pandas.Series
        Original text per document, in the same order as `corpus`.

    Returns
    -------
    pandas.DataFrame
        Columns 'Dominant_Topic', 'Perc_Contribution' (rounded to 4 decimals)
        and 'Topic_Keywords', followed by the text column. There is one row
        per document, in corpus order. A document for which the model reports
        no topic gets NaN/None in the three topic columns, so rows stay
        aligned with their text.
    """
    keywords_by_topic = {}  # topic id -> keyword string, computed once per topic
    records = []
    for row in ldamodel[corpus]:
        # A tuple result carries the (topic_id, probability) list as its first
        # element (gensim with per_word_topics=True, presumably); otherwise the
        # row is taken to be that list itself.
        doc_topics = row[0] if isinstance(row, tuple) else row
        if len(doc_topics) == 0:
            records.append((float('nan'), float('nan'), None))
            continue

        # Dominant topic = highest probability; ties resolve to the first one listed.
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        if topic_num not in keywords_by_topic:
            wp = ldamodel.show_topic(topic_num)
            keywords_by_topic[topic_num] = ", ".join([word for word, prop in wp])
        records.append((int(topic_num), round(prop_topic, 4), keywords_by_topic[topic_num]))

    # Build the frame once instead of growing it with one concat per document.
    sent_topics_df = pd.DataFrame(records,
                                  columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

    # Add the original text as the last column. The index is reset so the text
    # is matched by position, not by whatever index `texts` happens to carry.
    contents = pd.Series(texts).reset_index(drop=True)
    return pd.concat([sent_topics_df, contents], axis=1)

In [None]:
# Dominant topic, its weight and keywords for every document, next to the document's full text.
df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=df['fulltext'])

In [None]:
# Display the resulting table.
df_topic_sents_keywords

In [None]:
# Turn the index into a document-number column and give all five columns
# their final headers.
df_dominant_topic = df_topic_sents_keywords.reset_index().set_axis(
    ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text'],
    axis=1,
)

# Show the first ten rows
# df_dominant_topic.to_excel(path+'resultsdatas.xlsx',index=False)
df_dominant_topic.head(10)

In [None]:
# ---------------------------- Most representative documents of each topic ----------------------------
# Display setting to show more characters in column
pd.options.display.max_colwidth = 100

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

# For every dominant topic keep the two documents with the highest contribution.
top_two_per_topic = [
    grp.sort_values(['Perc_Contribution'], ascending=False).head(2)
    for _, grp in sent_topics_outdf_grpd
]

# Stack the per-topic pieces and renumber the rows from zero.
sent_topics_sorteddf_mallet = pd.concat(top_two_per_topic, axis=0).reset_index(drop=True)

# Final column headers
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Representative Text"]

# Show the first ten rows
sent_topics_sorteddf_mallet.head(10)

In [None]:
# Frequency distribution of document lengths.
# NOTE(review): len() is applied to each Text entry; if those are strings this
# is a character count, not a word count as the axis label says — confirm.

df_dominant_topic = df_dominant_topic.dropna(axis=0)
doc_lens = [len(text) for text in df_dominant_topic.Text]

# Histogram of all document lengths
plt.figure(figsize=(16,7), dpi=160)
plt.hist(doc_lens, bins = 1000, color='navy')

# Summary statistics, written top-down at x=750 in steps of 10 on the y-axis
summary_stats = [
    ("Mean : ", np.mean(doc_lens)),
    ("Median : ", np.median(doc_lens)),
    ("Stdev : ", np.std(doc_lens)),
    ("1%ile : ", np.quantile(doc_lens, q=0.01)),
    ("99%ile : ", np.quantile(doc_lens, q=0.99)),
]
for (label, value), y_pos in zip(summary_stats, range(100, 50, -10)):
    plt.text(750, y_pos, label + str(round(value)))

plt.gca().set(xlim=(0, 1000), ylabel='Number of Documents', xlabel='Document Word Count')
plt.tick_params(size=16)
plt.xticks(np.linspace(0,1000,9))
plt.title('Distribution of Document Word Counts', fontdict=dict(size=22))
plt.show()

In [None]:
import seaborn as sns
import matplotlib.colors as mcolors
# One colour per subplot, taken from matplotlib's Tableau palette.
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()] # more colors: 'mcolors.XKCD_COLORS'

# 2x2 grid: only the documents whose dominant topic is 0, 1, 2 or 3 are plotted.
fig, axes = plt.subplots(2,2,figsize=(16,14), dpi=160, sharex=True, sharey=True)

for i, ax in enumerate(axes.flatten()):
    # Documents whose dominant topic is i
    df_dominant_topic_sub = df_dominant_topic.loc[df_dominant_topic.Dominant_Topic == i, :]
    doc_lens = [len(d) for d in df_dominant_topic_sub.Text]
    ax.hist(doc_lens, bins = 1000, color=cols[i])
    ax.tick_params(axis='y', labelcolor=cols[i], color=cols[i])
    # Density curve on a secondary y-axis. `fill` replaces the deprecated
    # `shade` argument of kdeplot.
    sns.kdeplot(doc_lens, color="black", fill=False, ax=ax.twinx())
    ax.set(xlim=(0, 1000), xlabel='Document Word Count')
    ax.set_ylabel('Number of Documents', color=cols[i])
    ax.set_title('Topic: '+str(i), fontdict=dict(size=16, color=cols[i]))

fig.tight_layout()
fig.subplots_adjust(top=0.90)
plt.xticks(np.linspace(0,1000,9))
fig.suptitle('Distribution of Document Word Counts by Dominant Topic', fontsize=22)
plt.show()