In [None]:
# Import libraries
import pandas as pd
import nltk
import gensim
from gensim import corpora
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.parsing.preprocessing import STOPWORDS
import matplotlib.pyplot as plt
from collections import Counter

In [None]:
# Load the dataset to analyse.
# Only one dataset / filter combination is meant to be active at a time; the
# alternatives stay commented out so they can be switched on by hand.

# --- Dataset II ---
#df = pd.read_csv('./dataset_II.csv')

# Keep only Non-Relevant posts
#df = df[df['Relevante'] == 'Não'].reset_index(drop=True)

# Keep only Relevant posts
#df = df[df['Relevante'] == 'Sim'].reset_index(drop=True)


# --- Dataset III (active) ---
df = pd.read_csv('./dataset_III.csv')

# Keep only Non-Relevant posts
#df = df[df['previsao_binaria'] == 0].reset_index(drop=True)

# Keep only Relevant posts (previsao_binaria equal to 1) and renumber the rows from 0
df = df.loc[df['previsao_binaria'].eq(1)].reset_index(drop=True)


# Keep only posts predicted as Low Relevance
#df = df[df['Relevância'] == 'Baixa'].reset_index(drop=True)

# Keep only posts predicted as Medium Relevance
#df = df[df['Relevância'] == 'Média'].reset_index(drop=True)

# Keep only posts predicted as High Relevance
#df = df[df['Relevância'] == 'Alta'].reset_index(drop=True)

In [None]:
# Show the DataFrame produced by the loading/filtering cell to inspect the selected posts
df

In [None]:
# Disabled cell: candidate list of extra stop words.
# The list is wrapped in a string literal, so `stop_words` is NOT defined when
# this cell runs; remove the surrounding triple quotes to re-enable it.
'''
# Remover novas stopwors
stop_words = ["aqui", "pode", "sobre", "fazer", "alguem", "tudo", "coisa", "novato", "bem",
              "vou", "sei", "boca", "algum", "alguns", "alguma", "algo", "nada", "bom", "entao",
              "quer", "the", "and", "you", "cara", "coisas", "sim", "ainda", "ver", "usar", 
              "assim", "acho"]
'''

In [None]:
# Disabled cell: tokenisation + stop-word removal, kept inside a string literal
# so none of it executes.
# NOTE(review): if re-enabled, it needs a `stop_words` list to be defined and it
# reads df['text'], whereas the active cells read df['full_text'] - confirm the
# intended column. The second list comprehension inside is a no-op copy.
'''
def preprocess_text_b(text):
    # Separa em tokens
    tokens = nltk.word_tokenize(text)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [word for word in tokens]
    return tokens

# Aplica a função preprocess_text_b a cada documento e cria uma lista de listas de tokens
documents = [preprocess_text_b(doc) for doc in df['text']]
'''

In [None]:
# Select the column that holds the post text
documents = df['full_text']

# Tokenise every document on whitespace.
# A missing value (NaN/None) becomes an EMPTY document instead of going through
# str(), which would turn it into the spurious token "nan". The row is kept (not
# dropped) so that position k in `texts`/`corpus` still refers to row k of `df`.
texts = [str(doc).split() if pd.notna(doc) else [] for doc in documents]

# Map every distinct token to an integer id
dictionary = corpora.Dictionary(texts)

# Prune the vocabulary by document frequency: drop tokens that occur in fewer
# than 5 documents (rare) or in more than 50% of the documents (too common).
# This is frequency-based only; no explicit stop-word list is applied here.
dictionary.filter_extremes(no_below=5, no_above=0.5)

# Bag-of-words corpus: one list of (token_id, count) pairs per document
corpus = [dictionary.doc2bow(text) for text in texts]

# Train the LDA model with 10 topics; random_state is fixed for reproducible runs
lda_model = LdaModel(
    corpus=corpus,
    num_topics=10,
    id2word=dictionary,
    random_state=51,
    passes=10,
    iterations=1000,
    minimum_probability=0.01,
)


# Print the most probable terms of every topic.
# get_topic_terms is called with its default arguments, so it returns the
# top-ranked (term_id, probability) pairs; no probability threshold is applied.
for topic_id in range(lda_model.num_topics):
    topic_terms = lda_model.get_topic_terms(topic_id)
    print(f"Tópico {topic_id}: {[dictionary[term_id] for term_id, _ in topic_terms]}")

print()  # Blank line for separation

# Assign every document to its dominant (highest-probability) topic.
# Inference is run exactly ONCE per document and the position in `corpus` is
# recorded directly. This keeps the per-topic index lists and the per-topic
# counts consistent with each other, and avoids matching documents by their
# bag-of-words content (which cannot tell apart documents with identical
# content and costs a full scan of the corpus for every topic).
document_indexes = [[] for _ in range(lda_model.num_topics)]
for doc_index, doc in enumerate(corpus):
    doc_topics = lda_model.get_document_topics(doc, minimum_probability=0)
    if doc_topics:
        top_topic = max(doc_topics, key=lambda topic_prob: topic_prob[1])[0]
        document_indexes[top_topic].append(doc_index)

# Number of documents per topic, derived from the same assignment
topic_count = [len(indexes) for indexes in document_indexes]

total_docs = 0
for i, count in enumerate(topic_count):
    print(f"Quantidade de documentos no tópico {i}: {count}")
    total_docs += count
print(f"Total de Documentos: {total_docs}")

# Print the lists of document indices (positions in `corpus`)
for i in range(lda_model.num_topics):
    print()  # Blank line to separate the topics
    print(f"Documentos do tópico {i}: {document_indexes[i]}")

In [None]:
# Build one long text from all non-missing posts.
# astype(str) guards against non-string cells (e.g. a post parsed as a number),
# which would otherwise make str.join raise TypeError.
text = ' '.join(df['full_text'].dropna().astype(str))

# Split the text into words (whitespace-separated)
words = text.split()

# Count the frequency of each word
word_counts = Counter(words)

# Extract the top 100 most frequent words as (word, count) pairs
top_words = word_counts.most_common(100)

# Separate the words from their counts.
# zip(*[]) yields nothing to unpack, so an empty selection is handled explicitly.
if top_words:
    word_list, count_list = zip(*top_words)
else:
    word_list, count_list = (), ()

# Create the numbered list of words (rank starts at 1)
numbered_word_list = [f"{rank} - {word}" for rank, word in enumerate(word_list, start=1)]

# Display the numbered list on the console
for numbered_word in numbered_word_list:
    print(numbered_word)

# Save the numbered list to a text file
with open("100_lista_palavras_mais_frequentes_III_rel.txt", "w", encoding="utf-8") as file:
    file.write('\n'.join(numbered_word_list))


# Bar chart of the most frequent words and their counts, saved as a PNG file
fig, ax = plt.subplots(figsize=(20, 12))
ax.bar(word_list, count_list)
ax.set_xlabel('Palavras (100 mais frequentes)')
ax.set_ylabel('Frequência')
# Rotate the word labels so they are drawn vertically
plt.setp(ax.get_xticklabels(), rotation=90)
fig.savefig("100palavras_mais_frequentes_CJ_DADOS_III_relevantes.png", dpi=500, bbox_inches='tight')