# Init

In [1]:
!pip install nltk
import nltk
!pip install --upgrade gensim
!pip install pyLDAvis



In [2]:
import pickle

import pandas as pd
import pyLDAvis
import pyLDAvis.gensim_models
import pyLDAvis.lda_model
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
from gensim.models import LdaModel
from nltk.tokenize import word_tokenize
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Download the 'punkt_tab' NLTK resource (a no-op when it is already present).
# NOTE(review): presumably required by word_tokenize in the LDA section — confirm.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [4]:
# Load the pre-processed corpus frame from disk.
# NOTE: unpickling executes arbitrary code, so only load 'df.p' from a trusted source.
with open('df.p', 'rb') as pickle_file:
    df = pickle.load(pickle_file)

# Show the first rows as a sanity check of what was loaded.
df.head()

Unnamed: 0,text
0,programa treinamento jornalismo agroindústr su...
1,ofício enviar juiz sergio morar em este sexta ...
2,ex governador rio sérgio cabral transferir com...
3,aplauso vaia juiz federal sergio Moro responsá...
4,ex Executivo empreiteira odebrecht afirmar aco...


# TF-IDF

In [5]:
# TF-IDF configuration:
#   max_features  - cap the vocabulary at 5000 terms
#   min_df        - ignore terms that occur in fewer than 10 documents
#   strip_accents - fold accented characters to their ASCII equivalents
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    min_df=10,
    strip_accents='ascii',
)

In [6]:
# Fit the vectorizer on the documents and expose the weights as a dense
# DataFrame with one column per vocabulary term.
tfidf_matrix = pd.DataFrame(
    tfidf_vectorizer.fit_transform(df['text']).toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

# LDA

In [7]:
# Lower-case every document and split it into tokens with NLTK's Portuguese
# tokenizer settings; the result is one token list per document.
tokenized_docs = [
    word_tokenize(text.lower(), language="portuguese")
    for text in df['text']
]

In [8]:
# Build the gensim token <-> id mapping from the tokenised documents ...
dictionary = Dictionary(tokenized_docs)
# ... and convert each document to its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, tokenized_docs))

In [9]:
def compute_coherence_values(corpus, dictionary, texts, limit, start=2, step=1,
                             random_state=42, coherence='u_mass'):
    """Train one LDA model per candidate topic count and score its coherence.

    Args:
        corpus: Bag-of-words corpus used to train each ``LdaModel``.
        dictionary: gensim ``Dictionary`` passed as ``id2word`` to the model
            and as ``dictionary`` to the coherence model.
        texts: Tokenised documents handed to ``CoherenceModel``.
        limit: Exclusive upper bound of the topic counts to try.
        start: First topic count to try.
        step: Increment between consecutive topic counts.
        random_state: Seed forwarded to every ``LdaModel`` so runs are
            reproducible. Defaults to 42, the value previously hard-coded.
        coherence: Name of the coherence measure passed to
            ``CoherenceModel``. Defaults to ``'u_mass'``, the value
            previously hard-coded.

    Returns:
        A ``(model_list, coherence_values)`` tuple of parallel lists: the
        trained models and their coherence scores, in the order of
        ``range(start, limit, step)``. Both lists are empty when that range
        is empty.
    """
    model_list = []
    coherence_values = []
    for num_topics in range(start, limit, step):
        model = LdaModel(
            corpus=corpus,
            num_topics=num_topics,
            id2word=dictionary,
            random_state=random_state,
        )
        model_list.append(model)

        scorer = CoherenceModel(
            model=model,
            texts=texts,
            dictionary=dictionary,
            coherence=coherence,
        )
        coherence_values.append(scorer.get_coherence())
    return model_list, coherence_values

In [10]:
# `limit` is the exclusive upper bound of the sweep; with the function's
# default start of 2 and step of 1 the candidate topic counts are 2, 3 and 4.
limit = 5
model_list, coherence_values = compute_coherence_values(
    corpus,
    dictionary,
    tokenized_docs,
    limit,
)

In [11]:
# Position of the highest coherence score (the first one wins on ties).
optimal_model_index = max(range(len(coherence_values)), key=coherence_values.__getitem__)

# model_list is parallel to coherence_values, so the same index picks the model.
optimal_model = model_list[optimal_model_index]
optimal_num_topics = optimal_model.num_topics

print(f"Melhor número de tópicos: {optimal_num_topics}")

Melhor número de tópicos: 2


In [12]:
def get_document_topic(lda_model, corpus):
    """Return the dominant topic id of every document in ``corpus``.

    Args:
        lda_model: Object exposing ``get_document_topics(bow,
            minimum_probability=...)`` that returns ``(topic_id,
            probability)`` pairs.
        corpus: Iterable of bag-of-words documents.

    Returns:
        A list with one topic id per document, in corpus order. When several
        topics share the highest probability the first one returned by the
        model is kept.
    """
    def _dominant_topic(bow):
        # minimum_probability=0.0 asks the model for every topic, not only
        # those above its default reporting threshold.
        distribution = lda_model.get_document_topics(bow, minimum_probability=0.0)
        return max(distribution, key=lambda pair: pair[1])[0]

    return [_dominant_topic(bow) for bow in corpus]

In [13]:
# Dominant topic id for each document, aligned with the order of `corpus`.
document_topics = get_document_topic(lda_model=optimal_model, corpus=corpus)

In [14]:
# Keep the result under its own name instead of rebinding `df`: overwriting the
# loaded frame drops its 'text' column, so re-running this cell (or any earlier
# cell that reads df['text']) would fail with a KeyError.
doc_topics_df = pd.DataFrame({'doc': df['text'], 'topic': document_topics})
doc_topics_df.head()

Unnamed: 0,doc,topic
0,programa treinamento jornalismo agroindústr su...,1
1,ofício enviar juiz sergio morar em este sexta ...,0
2,ex governador rio sérgio cabral transferir com...,0
3,aplauso vaia juiz federal sergio Moro responsá...,0
4,ex Executivo empreiteira odebrecht afirmar aco...,0


In [15]:
def get_topic_words(lda_model, num_words=5):
    """Map every topic id of ``lda_model`` to its top ``num_words`` words.

    Args:
        lda_model: Object exposing ``show_topics(num_topics=..., num_words=...,
            formatted=False)`` that returns ``(topic_id, [(word, weight), ...])``
            pairs.
        num_words: How many words to keep per topic.

    Returns:
        A dict ``{topic_id: [word, ...]}`` keyed by the ids reported by the
        model.
    """
    # num_topics=-1 requests every topic; the default of 10 would silently
    # drop topics for larger models.
    topics = lda_model.show_topics(num_topics=-1, num_words=num_words, formatted=False)
    # Key by the id the model reports rather than by list position, so the
    # labels stay correct whatever subset or order show_topics returns.
    return {
        topic_id: [word for word, _ in word_weights]
        for topic_id, word_weights in topics
    }

# Collect the representative words of each topic of the selected model.
topic_words = get_topic_words(lda_model=optimal_model, num_words=5)

# Print one line per topic listing its most important words.
for topic, words in topic_words.items():
    print(f"Tópico {topic}: {' '.join(words)}")

Tópico 0: dizer poder fazer ano dia
Tópico 1: ano poder dizer grande em


# LDA visualizer

In [16]:
# Make pyLDAvis render its visualisations inline in the notebook.
pyLDAvis.enable_notebook()

In [17]:
# `optimal_model` is a gensim LdaModel, so it has to go through pyLDAvis'
# gensim adapter together with the bag-of-words corpus and dictionary it was
# trained on. pyLDAvis.lda_model.prepare is the scikit-learn adapter: it expects
# a fitted sklearn estimator plus a sparse document-term matrix, which is why
# passing the gensim model and the dense TF-IDF DataFrame raised an error.
pyLDAvis.gensim_models.prepare(optimal_model, corpus, dictionary, sort_topics=False, mds='tsne')