In [14]:
from functools import lru_cache

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
from gensim import corpora, models
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
# Make sure the NLTK stop-word corpus is present before any text is processed.
nltk.download('stopwords')

# Input CSV, given relative to the notebook's working directory.
DATA_CSV_PATH = '../../data/output_data_chula.csv'
df = pd.read_csv(DATA_CSV_PATH)

@lru_cache(maxsize=None)
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, built only once.

    Cached so that applying ``preprocess_text`` to a whole column does not
    reload the word list and rebuild the set for every single document.
    """
    return frozenset(stopwords.words('english'))


def preprocess_text(text, min_length=4):
    """Split ``text`` into lower-cased word tokens for topic modelling.

    Args:
        text: Raw text to tokenize. Any non-string value (``None``, NaN,
            numbers) produces an empty list instead of raising.
        min_length: Minimum number of characters a token needs in order to
            be kept. The default of 4 is equivalent to ``len(w) > 3``.

    Returns:
        list[str]: Runs of word characters from the lower-cased text that are
        not English stop words and are at least ``min_length`` characters
        long, in their original order.
    """
    # Missing or non-text cells have nothing to tokenize.
    if not isinstance(text, str):
        return []
    stop_words = _english_stop_words()
    tokens = RegexpTokenizer(r'\w+').tokenize(text.lower())
    return [w for w in tokens if len(w) >= min_length and w not in stop_words]

# Keep only rows that have a title, then tokenize every remaining title.
# NOTE(review): the column is named 'clean_abstracts' although it is derived
# from 'Title' -- confirm whether abstracts were intended.
df = df.dropna(subset=['Title'])
df['clean_abstracts'] = df['Title'].map(preprocess_text)

# Drop documents whose token list ended up empty after filtering.
has_tokens = df['clean_abstracts'].map(len).gt(0)
df = df.loc[has_tokens]

# Build the token <-> id mapping and the bag-of-words corpus from the token lists.
token_lists = df['clean_abstracts']
dictionary = corpora.Dictionary(token_lists)
corpus = list(map(dictionary.doc2bow, token_lists))

# Applying LDA
# Model settings are named so they are easy to locate and tune.
NUM_TOPICS = 5
LDA_PASSES = 15
# LDA training is randomized; a fixed seed keeps the topics identical
# across Restart & Run All.
LDA_RANDOM_STATE = 42

lda_model = models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=LDA_PASSES,
    random_state=LDA_RANDOM_STATE,
)

# Displaying the topics
# Each entry is a (topic id, "weight*word + ..." string) pair limited to five words.
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)

# Visualizing topics
# Only the visualization data is prepared here. A display call inside the
# `try` block is not the cell's final expression, so its result would be
# discarded without rendering; the rendering is done by the following cell,
# which ends with `pyLDAvis.display(lda_visualization)`.
try:
    pyLDAvis.enable_notebook()

    # sort_topics=False is passed so pyLDAvis does not reorder the topics.
    lda_visualization = gensimvis.prepare(lda_model, corpus, dictionary, sort_topics=False)
except ImportError as exc:
    # Report the actual cause instead of a generic message. Note that
    # `lda_visualization` stays undefined when this branch is taken.
    print(f"pyLDAvis visualization could not be prepared: {exc}")



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sirasittanrattanawong/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


(0, '0.016*"patients" + 0.013*"study" + 0.012*"thai" + 0.010*"thailand" + 0.008*"among"')
(1, '0.017*"proton" + 0.016*"production" + 0.012*"collisions" + 0.009*"search" + 0.007*"analysis"')
(2, '0.031*"thailand" + 0.010*"thai" + 0.008*"case" + 0.008*"study" + 0.005*"management"')
(3, '0.026*"using" + 0.021*"based" + 0.009*"detection" + 0.006*"model" + 0.006*"system"')
(4, '0.010*"activity" + 0.010*"effect" + 0.009*"properties" + 0.009*"acid" + 0.008*"cells"')


In [15]:
# Render the pyLDAvis data prepared in the previous cell. The call is this
# cell's final expression, so the object it returns is shown as the cell output.
pyLDAvis.display(lda_visualization)