In [1]:
import pandas as pd
from tqdm import tqdm
from nltk.tokenize import word_tokenize
tqdm.pandas()
import re
import swifter

In [2]:
target_list = [
    'ig',
    'bo', 
    'cl', 
    'co', 
    'gl', 
    'lu'
    ]

dict_cp = {
    'cl':'Hydrox.',
    'lu':'Lula',
    'co':'Sinovac',
    'ig':'Church',
    'gl':'Globo TV',
    'bo':'Bolsonaro',
}


## Train test table

In [3]:
list_df = []

for target in tqdm(target_list):
    
    for split in ['train', 'test']:
    
        df_aux = pd.read_csv( f"../data/raw/r3_{target}_{split}_users.csv", sep = ';', encoding='utf-8-sig')
        df_aux['target'] = target
        df_aux['split'] = split
        list_df.append(df_aux)
    
df_users = pd.concat(list_df)

100%|██████████| 6/6 [00:33<00:00,  5.58s/it]


In [4]:
list_df = []

for target in tqdm(target_list):
    
    for split in ['train', 'test']:
    
        df_aux = pd.read_csv( f"../data/raw/{split}_r3_{target}_top_mentioned_timelines.csv", sep = ';', encoding='utf-8-sig')
        df_aux['target'] = target
        df_aux['split'] = split
        list_df.append(df_aux)
    
df_tmt = pd.concat(list_df)

100%|██████████| 6/6 [00:33<00:00,  5.59s/it]


## Analysis

### Count splits

In [5]:
df_users['target_formated'] = df_users.target.map(dict_cp)
df_users

Unnamed: 0,User_ID,Timeline,Stance,Polarity,Tweet_Seq,target,split,target_formated
0,r2_ig_1,@ posso nem comer meu pãozin de queijo em paz ...,tenho pra mim que grande parte senão todas as ...,against,2953,ig,train,Church
1,r2_ig_4,Fim de jogo ++ uma vitoria do meu Vascão # Hoj...,Cidade de Deus Alicate: quer saber vou entrar ...,for,4792,ig,train,Church
2,r2_ig_7,"Meu chefe é todo aleatório, do nada chega com ...",Acordei já sendo removida do grupo da igreja,against,248,ig,train,Church
3,r2_ig_8,veja a receita FILÉ COM MOLHO DE MOSTARDA # Di...,I liked a @ video culto infantil na igreja Ass...,for,45,ig,train,Church
4,r2_ig_10,"Oq tem de gente boa, tem de irritante # Não te...",Essa turma da igreja sao tão amorzinho smp con...,for,3809,ig,train,Church
...,...,...,...,...,...,...,...,...
267,r2_lu_1086,Gostei de um vídeo @ … com Sweet Carol | The N...,Eu deveria me espelhar no Lula e ler 55 página...,for,381,lu,test,Lula
268,r2_lu_1090,Show de bola! Que venham outros … # Essa renda...,Pqp quanta merda em um Tweet só! Pare de mistu...,against,899,lu,test,Lula
269,r2_lu_1091,"FOOOOOOOOOOOOOOGOOOOOOOOOOOOOOO!!!!!!! # ""200 ...",nem a Venezuela respeita mais o Brasil sem o L...,for,294,lu,test,Lula
270,r2_lu_1093,@ quem prejudica a imagem do Brasil não é o po...,Lula tem uma visão de mundo muito diversa de F...,for,2021,lu,test,Lula


In [6]:
df_count_splits = df_users.groupby(['target_formated', 'split']).size().reset_index(drop=False).rename({0:'count',"target_formated": "target"}, axis=1)
df_count_splits

Unnamed: 0,target,split,count
0,Bolsonaro,test,188
1,Bolsonaro,train,563
2,Church,test,599
3,Church,train,1796
4,Globo TV,test,411
5,Globo TV,train,1231
6,Hydrox.,test,574
7,Hydrox.,train,1721
8,Lula,test,272
9,Lula,train,816


In [7]:
print(df_count_splits.to_latex(index=False))

\begin{tabular}{llr}
\toprule
target & split & count \\
\midrule
Bolsonaro & test & 188 \\
Bolsonaro & train & 563 \\
Church & test & 599 \\
Church & train & 1796 \\
Globo TV & test & 411 \\
Globo TV & train & 1231 \\
Hydrox. & test & 574 \\
Hydrox. & train & 1721 \\
Lula & test & 272 \\
Lula & train & 816 \\
Sinovac & test & 774 \\
Sinovac & train & 2319 \\
\bottomrule
\end{tabular}



### Count per class

In [8]:
# Passo 1: Agrupamento e contagem (já feito)
grouped_df = df_users.groupby(["target_formated", "Polarity"]).size().to_frame(name='Count').reset_index()

# Passo 2: Transformar em uma tabela pivotada
pivot_df = grouped_df.pivot(index='target_formated', columns='Polarity', values='Count').fillna(0)
pivot_df['All'] = pivot_df['against'] + pivot_df['for']

# Passo 3: Adicionar linha totalizadora
pivot_df.loc['Overall'] = pivot_df.sum()

# Reordenar colunas para o formato desejado
pivot_df = pivot_df[['against', 'for', 'All']]

# Renomear colunas e índice
pivot_df.index.name = 'Target'
pivot_df.reset_index(inplace=True)

pivot_df


Polarity,Target,against,for,All
0,Bolsonaro,649,102,751
1,Church,1354,1041,2395
2,Globo TV,668,974,1642
3,Hydrox.,1154,1141,2295
4,Lula,570,518,1088
5,Sinovac,1416,1677,3093
6,Overall,5811,5453,11264


In [9]:
print(pivot_df.to_latex(index=False))

\begin{tabular}{lrrr}
\toprule
Target & against & for & All \\
\midrule
Bolsonaro & 649 & 102 & 751 \\
Church & 1354 & 1041 & 2395 \\
Globo TV & 668 & 974 & 1642 \\
Hydrox. & 1154 & 1141 & 2295 \\
Lula & 570 & 518 & 1088 \\
Sinovac & 1416 & 1677 & 3093 \\
Overall & 5811 & 5453 & 11264 \\
\bottomrule
\end{tabular}



### Count tokens

In [10]:
WORD = re.compile(r'\w+')
def tokenize(text):
    
    #return word_tokenize(text, language='portuguese')
    return WORD.findall(text)
    

In [11]:
df_users.head()

Unnamed: 0,User_ID,Timeline,Stance,Polarity,Tweet_Seq,target,split,target_formated
0,r2_ig_1,@ posso nem comer meu pãozin de queijo em paz ...,tenho pra mim que grande parte senão todas as ...,against,2953,ig,train,Church
1,r2_ig_4,Fim de jogo ++ uma vitoria do meu Vascão # Hoj...,Cidade de Deus Alicate: quer saber vou entrar ...,for,4792,ig,train,Church
2,r2_ig_7,"Meu chefe é todo aleatório, do nada chega com ...",Acordei já sendo removida do grupo da igreja,against,248,ig,train,Church
3,r2_ig_8,veja a receita FILÉ COM MOLHO DE MOSTARDA # Di...,I liked a @ video culto infantil na igreja Ass...,for,45,ig,train,Church
4,r2_ig_10,"Oq tem de gente boa, tem de irritante # Não te...",Essa turma da igreja sao tão amorzinho smp con...,for,3809,ig,train,Church


#### Stance

In [12]:
# count toknes in each column
df_users['count_tokens_Stance'] = df_users['Stance'].progress_apply(lambda text: len(tokenize(text)))

100%|██████████| 11264/11264 [00:00<00:00, 151717.87it/s]


In [13]:
df_stats_stance = pd.DataFrame({})

In [14]:
count_tk_stance = df_users.groupby(['target_formated'])['count_tokens_Stance'].sum().to_frame().reset_index()
count_tk_stance

Unnamed: 0,target_formated,count_tokens_Stance
0,Bolsonaro,14527
1,Church,54123
2,Globo TV,25399
3,Hydrox.,61487
4,Lula,24018
5,Sinovac,82604


In [15]:
df_stats_stance = pd.concat([df_stats_stance, count_tk_stance])
df_stats_stance

Unnamed: 0,target_formated,count_tokens_Stance
0,Bolsonaro,14527
1,Church,54123
2,Globo TV,25399
3,Hydrox.,61487
4,Lula,24018
5,Sinovac,82604


In [16]:
count_per_target = df_users.groupby(['target_formated']).size().reset_index().rename(columns = {0:"n_tweets"})
count_per_target

Unnamed: 0,target_formated,n_tweets
0,Bolsonaro,751
1,Church,2395
2,Globo TV,1642
3,Hydrox.,2295
4,Lula,1088
5,Sinovac,3093


In [17]:
df_stats_stance['W/tweets'] = count_tk_stance['count_tokens_Stance'] / count_per_target['n_tweets']
df_stats_stance

Unnamed: 0,target_formated,count_tokens_Stance,W/tweets
0,Bolsonaro,14527,19.343542
1,Church,54123,22.59833
2,Globo TV,25399,15.468331
3,Hydrox.,61487,26.791721
4,Lula,24018,22.075368
5,Sinovac,82604,26.706757


In [18]:
print(df_stats_stance.to_latex(
    index=False,
    float_format="%.2f"
))

\begin{tabular}{lrr}
\toprule
target_formated & count_tokens_Stance & W/tweets \\
\midrule
Bolsonaro & 14527 & 19.34 \\
Church & 54123 & 22.60 \\
Globo TV & 25399 & 15.47 \\
Hydrox. & 61487 & 26.79 \\
Lula & 24018 & 22.08 \\
Sinovac & 82604 & 26.71 \\
\bottomrule
\end{tabular}



#### Timeline

In [19]:
df_users['n_tweets_Timeline'] = df_users['Timeline'].progress_apply(lambda x: x.count(" # ") + 1)
df_users

100%|██████████| 11264/11264 [00:25<00:00, 449.35it/s]


Unnamed: 0,User_ID,Timeline,Stance,Polarity,Tweet_Seq,target,split,target_formated,count_tokens_Stance,n_tweets_Timeline
0,r2_ig_1,@ posso nem comer meu pãozin de queijo em paz ...,tenho pra mim que grande parte senão todas as ...,against,2953,ig,train,Church,38,3150
1,r2_ig_4,Fim de jogo ++ uma vitoria do meu Vascão # Hoj...,Cidade de Deus Alicate: quer saber vou entrar ...,for,4792,ig,train,Church,18,5836
2,r2_ig_7,"Meu chefe é todo aleatório, do nada chega com ...",Acordei já sendo removida do grupo da igreja,against,248,ig,train,Church,8,1646
3,r2_ig_8,veja a receita FILÉ COM MOLHO DE MOSTARDA # Di...,I liked a @ video culto infantil na igreja Ass...,for,45,ig,train,Church,13,680
4,r2_ig_10,"Oq tem de gente boa, tem de irritante # Não te...",Essa turma da igreja sao tão amorzinho smp con...,for,3809,ig,train,Church,19,5569
...,...,...,...,...,...,...,...,...,...,...
267,r2_lu_1086,Gostei de um vídeo @ … com Sweet Carol | The N...,Eu deveria me espelhar no Lula e ler 55 página...,for,381,lu,test,Lula,14,4520
268,r2_lu_1090,Show de bola! Que venham outros … # Essa renda...,Pqp quanta merda em um Tweet só! Pare de mistu...,against,899,lu,test,Lula,26,2173
269,r2_lu_1091,"FOOOOOOOOOOOOOOGOOOOOOOOOOOOOOO!!!!!!! # ""200 ...",nem a Venezuela respeita mais o Brasil sem o L...,for,294,lu,test,Lula,17,478
270,r2_lu_1093,@ quem prejudica a imagem do Brasil não é o po...,Lula tem uma visão de mundo muito diversa de F...,for,2021,lu,test,Lula,42,3734


In [20]:
df_users['count_tokens_Timeline'] = df_users['Timeline'].progress_apply(lambda text: len(tokenize(text)))

100%|██████████| 11264/11264 [02:08<00:00, 87.90it/s]


In [21]:
df_stats_timeline = pd.DataFrame({})

df_stats_timeline = df_users.groupby(['target_formated'])[['count_tokens_Timeline', 'n_tweets_Timeline']].sum().reset_index()

df_stats_timeline = pd.concat([
    df_stats_timeline, 
    count_per_target.rename(
        {'n_tweets': 'n_rows'}, axis =1 
        )['n_rows']], axis =1)

df_stats_timeline['Tweets/row'] = df_stats_timeline['n_tweets_Timeline'] / df_stats_timeline['n_rows'] 
df_stats_timeline['Tokens/row'] = df_stats_timeline['count_tokens_Timeline'] / df_stats_timeline['n_rows'] 

df_stats_timeline_total = df_stats_timeline.copy()

# dataframe com o total de tokens e numero total de tweets
df_stats_timeline_total = df_stats_timeline_total[['target_formated', 'count_tokens_Timeline', 'n_tweets_Timeline']]
df_stats_timeline_total.rename({
    'target_formated': 'Alvo', 
    'count_tokens_Timeline': 'Tokens',
    'n_tweets_Timeline': 'Tweets',
    }, axis = 1, inplace=True)


overall = df_stats_timeline_total[['Tokens', 'Tweets']].mean()
overall['Alvo'] = 'Total'
df_stats_timeline_total.loc[len(df_stats_timeline_total)] = overall







# dataframe com a media de tokens e tweets por linha
df_stats_timeline = df_stats_timeline[['target_formated', 'Tweets/row', 'Tokens/row']].rename({'target_formated': 'Target'}, axis =1)

overall = df_stats_timeline[['Tweets/row', 'Tokens/row']].mean()
overall['Target'] = 'Overall'

df_stats_timeline.loc[len(df_stats_timeline)] = overall

In [22]:
print(df_stats_timeline.to_latex(index=False,float_format="%.2f"))

\begin{tabular}{lrr}
\toprule
Target & Tweets/row & Tokens/row \\
\midrule
Bolsonaro & 3893.48 & 45340.20 \\
Church & 4006.55 & 48317.47 \\
Globo TV & 3999.78 & 46779.65 \\
Hydrox. & 2130.51 & 36819.52 \\
Lula & 3795.12 & 50616.54 \\
Sinovac & 2112.44 & 34900.99 \\
Overall & 3322.98 & 43795.73 \\
\bottomrule
\end{tabular}



In [23]:
print(df_stats_timeline_total.to_latex(index=False,float_format="%.2f"))

\begin{tabular}{lrr}
\toprule
Alvo & Tokens & Tweets \\
\midrule
Bolsonaro & 34050492.00 & 2924000.00 \\
Church & 115720349.00 & 9595698.00 \\
Globo TV & 76812192.00 & 6567638.00 \\
Hydrox. & 84500788.00 & 4889523.00 \\
Lula & 55070796.00 & 4129086.00 \\
Sinovac & 107948749.00 & 6533769.00 \\
Total & 79017227.67 & 5773285.67 \\
\bottomrule
\end{tabular}



#### Texts

In [24]:
df_tmt['n_tweets_Texts'] = df_tmt['Texts'].progress_apply(lambda x: x.count(" # ") + 1)
df_tmt

100%|██████████| 11264/11264 [00:14<00:00, 789.74it/s] 


Unnamed: 0,User_ID,Polarity,Texts,target,split,n_tweets_Texts
0,r2_ig_1,against,PQP ESSE DORAMA É MUITO FOADA(Sassy GoGo(Cheer...,ig,train,878
1,r2_ig_4,for,Golaço!!!!!!!!! # Manda geral do time principa...,ig,train,533
2,r2_ig_7,against,"@gabycunha86 Amanhã vou aí, deixa pra terça # ...",ig,train,956
3,r2_ig_8,for,3.4- O Centro de Coordenação da Operação está ...,ig,train,1153
4,r2_ig_10,for,"Me arrependi de excluir meu outro tt, agora ti...",ig,train,91
...,...,...,...,...,...,...
267,r2_lu_1086,for,zona oeste carioca vc sai na rua comete crimes...,lu,test,902
268,r2_lu_1090,against,@ferrazvitor Muito orgulho de estar ao lado de...,lu,test,992
269,r2_lu_1091,for,O jornal mais influente do planeta opina sobre...,lu,test,2931
270,r2_lu_1093,for,"Após atuação do MPF, ANS determina tratamento ...",lu,test,3107


In [25]:
df_tmt['count_tokens_Texts'] = df_tmt['Texts'].progress_apply(lambda text: len(tokenize(text)))

100%|██████████| 11264/11264 [02:02<00:00, 91.60it/s] 


In [26]:
df_tmt['target_formated'] = df_tmt.target.map(dict_cp)

In [27]:
df_stats_tmt = pd.DataFrame({})

df_stats_tmt = df_tmt.groupby(['target_formated'])[['count_tokens_Texts', 'n_tweets_Texts']].sum().reset_index()

df_stats_tmt = pd.concat([
    df_stats_tmt, 
    count_per_target.rename(
        {'n_tweets': 'n_rows'}, axis =1 
        )['n_rows']], axis =1)

df_stats_tmt['Tweets/row'] = df_stats_tmt['n_tweets_Texts'] / df_stats_tmt['n_rows'] 
df_stats_tmt['Tokens/row'] = df_stats_tmt['count_tokens_Texts'] / df_stats_tmt['n_rows'] 

df_stats_tmt_total = df_stats_tmt.copy()

# dataframe com o total de tokens e numero total de tweets
df_stats_tmt_total = df_stats_tmt[['target_formated', 'count_tokens_Texts', 'n_tweets_Texts']]
df_stats_tmt_total.rename({
    'target_formated': 'Alvo', 
    'count_tokens_Texts': 'Tokens',
    'n_tweets_Texts': 'Tweets',
    }, axis = 1, inplace=True)


overall = df_stats_tmt_total[['Tokens', 'Tweets']].mean()
overall['Alvo'] = 'Total'
df_stats_tmt_total.loc[len(df_stats_tmt_total)] = overall








df_stats_tmt = df_stats_tmt[['target_formated', 'Tweets/row', 'Tokens/row']].rename({'target_formated': 'Target'}, axis =1)

overall = df_stats_tmt[['Tweets/row', 'Tokens/row']].mean()
overall['Target'] = 'Overall'

df_stats_tmt.loc[len(df_stats_tmt)] = overall

print(df_stats_tmt.to_latex(
    index=False,
    float_format="%.2f"
))

\begin{tabular}{lrr}
\toprule
Target & Tweets/row & Tokens/row \\
\midrule
Bolsonaro & 1795.14 & 27580.76 \\
Church & 1974.23 & 27302.34 \\
Globo TV & 2243.35 & 33520.23 \\
Hydrox. & 2207.02 & 43878.94 \\
Lula & 1983.69 & 35794.42 \\
Sinovac & 2302.30 & 47583.59 \\
Overall & 2084.29 & 35943.38 \\
\bottomrule
\end{tabular}



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_stats_tmt_total.rename({
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_stats_tmt_total.loc[len(df_stats_tmt_total)] = overall


In [28]:
print(df_stats_tmt_total.to_latex(index=False,float_format="%.2f"))

\begin{tabular}{lrr}
\toprule
Alvo & Tokens & Tweets \\
\midrule
Bolsonaro & 20713148.00 & 1348147.00 \\
Church & 65389115.00 & 4728279.00 \\
Globo TV & 55040219.00 & 3683586.00 \\
Hydrox. & 100702164.00 & 5065112.00 \\
Lula & 38944325.00 & 2158256.00 \\
Sinovac & 147176039.00 & 7121011.00 \\
Total & 71327501.67 & 4017398.50 \\
\bottomrule
\end{tabular}

