In [2]:
import re
import pandas as pd
import json
import numpy as np
import seaborn as sns
from bertopic import BERTopic

  from .autonotebook import tqdm as notebook_tqdm


In [52]:
def df_tweets_candidatos(json_filename):
    df = pd.read_json(json_filename).drop(columns = ['replies'])
    return df

tweets = df_tweets_candidatos('./datasets/dataset.json')


# Adicionar nomes dos candidatos no dataframe
for index, row in tweets.iterrows():
    candidato = ''
    match row.author_id:
        case 2670726740:
            candidato = 'lula'
        case 128372940:
            candidato = 'bolsonaro'
        case 33374761:
            candidato = 'ciro'
        case _:
            candidato: 'n/d'
    tweets.at[index, 'candidato'] = candidato

# Embaralhar tweets
tweets = tweets.sample(frac=1)
        

print(f'Número de tweets: {len(tweets)}\n')
print(tweets.head())

Número de tweets: 300

      author_id      conversation_id                created_at  \
8    2670726740  1560216543895719936 2022-08-18 10:46:26+00:00   
225   128372940  1559476614760628224 2022-08-16 09:46:13+00:00   
145    33374761  1559633242550050816 2022-08-16 20:08:38+00:00   
138    33374761  1559975507726372864 2022-08-17 18:48:38+00:00   
264   128372940  1557122679328620544 2022-08-09 21:52:31+00:00   

                                                  text                   id  \
8    RT @alexandrekalil: Bom dia, gente. Hoje tem @...  1560216543895719936   
225  . Cornélio Procópio/PR: 343,7 kg do entorpecen...  1559476616320925696   
145  🤷‍♂️ Afinal, nada mais justo que eles paguem u...  1559633252889026560   
138  Quando o discurso da esquerda se assemelha ao ...  1559975507726372864   
264  - Só os investimentos já garantidos nos último...  1557122681648054272   

     candidato  
8         lula  
225  bolsonaro  
145       ciro  
138       ciro  
264  bolsonaro  


In [53]:
# create model 
model = BERTopic(language="multilingual", verbose=True, min_topic_size=3, top_n_words=10, calculate_probabilities=True)

# convert to list 
docs = tweets.text.to_list()
 
tweets_topic, probabilities = model.fit_transform(docs)

Batches: 100%|██████████| 10/10 [00:07<00:00,  1.38it/s]
2022-08-23 18:00:00,790 - BERTopic - Transformed documents to Embeddings
2022-08-23 18:00:02,148 - BERTopic - Reduced dimensionality
2022-08-23 18:00:02,175 - BERTopic - Clustered reduced embeddings


Adicionar no dataframe o tópico identificado para cada tweet

In [55]:
print(tweets_topic)

topic_names = model.generate_topic_labels(nr_words=4)
print(topic_names)

for index, tweet_topic in enumerate(tweets_topic):
    tweets.at[index, 'topico'] = topic_names[tweet_topic + 1]

tweets.head()


[-1, 22, -1, 9, 13, -1, 1, 6, 23, 19, 2, 10, 6, 21, 20, 2, 0, 9, -1, -1, -1, -1, 6, 9, 11, 0, -1, 15, 0, 9, 7, 17, 7, 13, 15, 5, 19, 1, 18, 16, 14, -1, 3, -1, 10, 4, -1, 0, 16, 21, 0, 10, 13, 0, 6, 3, 2, 17, 10, -1, -1, 17, 12, -1, -1, 18, 0, 3, 9, 13, 9, 0, 10, 7, 3, 2, 0, -1, 7, 4, 5, 2, 3, 7, 7, -1, 17, 10, 21, 2, -1, -1, 0, -1, 18, -1, 2, -1, 19, 18, 14, 1, -1, 13, -1, 9, 6, 17, 18, 1, -1, 12, -1, 2, -1, -1, -1, -1, 4, -1, 15, 1, 10, 4, 0, 1, 3, 5, 1, -1, 11, 0, 14, 5, -1, 3, 4, 2, -1, 17, -1, 7, 12, 8, 20, 7, -1, 2, 14, 20, 12, 21, 20, 12, 0, 1, 6, -1, 2, 11, 8, 12, 2, 3, 4, 5, 4, 11, 5, 0, 5, -1, 6, 0, 12, 1, 0, 8, 8, -1, 3, 10, 5, 8, 1, -1, 6, 0, 8, 19, 14, 3, 6, 16, 6, 22, -1, 11, 5, 0, -1, 0, 11, -1, 22, 15, -1, 12, 3, 1, 19, 6, 11, 4, 23, 0, 16, 13, 4, 10, 6, -1, 17, -1, 1, -1, 2, 15, 5, 4, 19, -1, 4, 7, 13, 20, 3, 22, 5, 9, 16, 20, -1, -1, 23, -1, -1, 12, -1, -1, 11, 5, -1, 9, 4, 8, 0, 0, 16, 7, -1, -1, 4, 8, 0, -1, -1, -1, 1, 8, 1, 15, 13, -1, 3, 3, 14, -1, 7, -1, 8, -1, -1

Unnamed: 0,author_id,conversation_id,created_at,text,id,candidato,topico
8,2670726740,1560216543895719936,2022-08-18 10:46:26+00:00,"RT @alexandrekalil: Bom dia, gente. Hoje tem @...",1560216543895719936,lula,23_ao_servir_povo_fome
225,128372940,1559476614760628224,2022-08-16 09:46:13+00:00,". Cornélio Procópio/PR: 343,7 kg do entorpecen...",1559476616320925696,bolsonaro,-1_que_ciro_de_não
145,33374761,1559633242550050816,2022-08-16 20:08:38+00:00,"🤷‍♂️ Afinal, nada mais justo que eles paguem u...",1559633252889026560,ciro,7_recuperar_brasil_para_empregos
138,33374761,1559975507726372864,2022-08-17 18:48:38+00:00,Quando o discurso da esquerda se assemelha ao ...,1559975507726372864,ciro,-1_que_ciro_de_não
264,128372940,1557122679328620544,2022-08-09 21:52:31+00:00,- Só os investimentos já garantidos nos último...,1557122681648054272,bolsonaro,0_https_co_disponível_tiktok


In [16]:
freq = model.get_topic_info()
freq

Unnamed: 0,Topic,Count,Name
0,-1,43,-1_que_de_não_do
1,0,67,0_brasil_para_de_vamos
2,1,23,1_https_co_tiktok_disponível
3,2,20,2_ciro_https_co_todos
4,3,18,3_ano_de_mais_ton
5,4,16,4_bolsonaro_ele_ou_que
6,5,14,5_programa_renda_de_mínima
7,6,12,6_lula_equipelula_co_https
8,7,12,7_que_está_política_reeleição
9,8,12,8_de_em_menor_era


In [8]:
model.visualize_topics()

In [9]:
model.visualize_barchart(top_n_topics=5)

In [11]:
model.visualize_heatmap(width=750, height=750)

In [12]:
model.visualize_term_rank()

In [14]:
model.get_topic(1)

[('que', 0.04081668431231023),
 ('carta', 0.03331833918057665),
 ('democracia', 0.0320500737604863),
 ('uma', 0.03104389681357853),
 ('de', 0.028389795985861044),
 ('está', 0.02590884132384162),
 ('um', 0.02509598946908096),
 ('saudades', 0.02443072678449915),
 ('do', 0.022886887371567875),
 ('com', 0.02207844926745981)]

In [22]:
model.visualize_documents(docs)

TODO:
- Normalize, remove stop words, lower case, etc

Adicionar tópico do tweet no DF original
Adicionar nome do candidato no DF original