## Imports

In [1]:
import pandas as pd
from tqdm import tqdm
import seaborn as sns
from nltk.tokenize import word_tokenize
import nltk

In [2]:
tqdm.pandas()

In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/semcovici/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Definitions

In [4]:
model_name = 'neuralmind/bert-base-portuguese-cased'

random_seed = 42

raw_data_path = '../data/raw/'
processed_data_path = '../data/processed/'
results_cr_path = '../reports/classification_reports/'
test_results_path = '../reports/test_results/'
reports_path = '../reports/'

target_list = ['ig','bo', 'cl', 'co', 'gl', 'lu']

## Read Data

In [5]:
data_list = []

for target in tqdm(target_list):
    
    # read data
    data_temp_train = pd.read_csv(
        raw_data_path + f'train_r3_{target}_top_mentioned_timelines.csv', 
        sep = ';', 
        encoding='utf-8-sig'
        )
    data_temp_test = pd.read_csv(
        raw_data_path + f'test_r3_{target}_top_mentioned_timelines.csv', 
        sep = ';', 
        encoding='utf-8-sig'
        )
    
    data_temp_train['target'] = target
    data_temp_test['target'] = target
    
    data_temp_train['split'] = "train"
    data_temp_test['split'] = "test"
    
    data_list.append(data_temp_train)
    data_list.append(data_temp_test)
    
data_tmt = pd.concat(data_list)

100%|██████████| 6/6 [00:21<00:00,  3.62s/it]


In [6]:
data_list = []

for target in tqdm(target_list):
    
    # read data
    data_temp_train = pd.read_csv(
        raw_data_path + f'r3_{target}_train_users.csv', 
        sep = ';', 
        encoding='utf-8-sig'
    )
    data_temp_test = pd.read_csv(
        raw_data_path + f'r3_{target}_test_users.csv', 
        sep = ';', 
        encoding='utf-8-sig'
        )
    
    
    
    
    data_temp_train['target'] = target
    data_temp_test['target'] = target
    
    data_temp_train['split'] = "train"
    data_temp_test['split'] = "test"
    
    data_list.append(data_temp_train)
    data_list.append(data_temp_test)
    
data_users = pd.concat(data_list)

100%|██████████| 6/6 [00:27<00:00,  4.55s/it]


In [7]:
data_users.split.value_counts()/len(data_users)

split
train    0.749822
test     0.250178
Name: count, dtype: float64

In [8]:

data_tmt.split.value_counts()/len(data_tmt)

split
train    0.749822
test     0.250178
Name: count, dtype: float64

In [9]:
def separate_comments(
    data,
    Texts_col = 'Texts',
    sep = ' # '
):
    
    # Separates texts into individual lines
    df_sep_comments = data.assign(Texts=data[Texts_col].str.split(sep)).explode(Texts_col)
    
    df_sep_comments.rename({"Texts":Texts_col},axis = 1)

    # Reindex the resulting DataFrame
    df_sep_comments.reset_index(drop=True, inplace = True)

    df_sep_comments.ffill(inplace = True)
    
    return df_sep_comments

In [10]:
dict_cp = {
    'cl':'Hydrox.',
    'lu':'Lula',
    'co':'Sinovac',
    'ig':'Church',
    'gl':'Globo TV',
    'bo':'Bolsonaro',
}

In [11]:
data_users

Unnamed: 0,User_ID,Timeline,Stance,Polarity,Tweet_Seq,target,split
0,r2_ig_1,@ posso nem comer meu pãozin de queijo em paz ...,tenho pra mim que grande parte senão todas as ...,against,2953,ig,train
1,r2_ig_4,Fim de jogo ++ uma vitoria do meu Vascão # Hoj...,Cidade de Deus Alicate: quer saber vou entrar ...,for,4792,ig,train
2,r2_ig_7,"Meu chefe é todo aleatório, do nada chega com ...",Acordei já sendo removida do grupo da igreja,against,248,ig,train
3,r2_ig_8,veja a receita FILÉ COM MOLHO DE MOSTARDA # Di...,I liked a @ video culto infantil na igreja Ass...,for,45,ig,train
4,r2_ig_10,"Oq tem de gente boa, tem de irritante # Não te...",Essa turma da igreja sao tão amorzinho smp con...,for,3809,ig,train
...,...,...,...,...,...,...,...
267,r2_lu_1086,Gostei de um vídeo @ … com Sweet Carol | The N...,Eu deveria me espelhar no Lula e ler 55 página...,for,381,lu,test
268,r2_lu_1090,Show de bola! Que venham outros … # Essa renda...,Pqp quanta merda em um Tweet só! Pare de mistu...,against,899,lu,test
269,r2_lu_1091,"FOOOOOOOOOOOOOOGOOOOOOOOOOOOOOO!!!!!!! # ""200 ...",nem a Venezuela respeita mais o Brasil sem o L...,for,294,lu,test
270,r2_lu_1093,@ quem prejudica a imagem do Brasil não é o po...,Lula tem uma visão de mundo muito diversa de F...,for,2021,lu,test


In [None]:
dict_config = {
    'top_mentioned_timelines':{
        'data': data_tmt,
        'columns': {
            'Texts': 1
        }
    },
    'users':{
        'data': data_users,
        'columns':{
            'Timeline': 1,
            'Stance': 0
        }
    }
    
}

In [12]:






# dict_results = {}

# for name, config in dict_config.items():
    
    
#     data = config["data"]
    
#     for column, multiple_comments in config["columns"].items():
        
        
#         df_anl = pd.DataFrame({
#             "Target": [],
#             "Against": [],
#             "For": [],
#             "All": [],
#             "Words": [],
#             "Comments/User": [],
#             "W/Tweet": []
#         })
        
#         for i, target in enumerate(target_list):
            
#             print(f'##### Start Running {target} ({i+1} of {len(target_list)}) #####')
            
#             df_target = data[data.target == target]
            
#             counts_target = df_target.Polarity.value_counts()
            
#             n_against = counts_target['against']
#             n_for = counts_target['for']
            
#             if multiple_comments:
#                 # separate comments and drop the duplicates (the comments that appears in more the one user)
#                 df_sep_comments = separate_comments(df_target, Texts_col = column).drop_duplicates(subset=[column])
            
#                 # create column with tokens
#                 df_sep_comments['tokens'] = df_sep_comments[column].progress_apply(lambda x: word_tokenize(x, language='portuguese'))
#                 # create column with count of tokens
#                 df_sep_comments['tokens_count'] = df_sep_comments.tokens.progress_apply(len)
                
#                 gpby_userid = df_sep_comments.groupby('User_ID')
#                 count_users = len(df_sep_comments.User_ID.unique())
            
#                 new_row = {
#                     "Target": target,
#                     "Against": n_against,
#                     "For": n_for,
#                     "All": n_against + n_for,
#                     "Words": df_sep_comments.tokens_count.sum(),
#                     "Comments/User": gpby_userid.size().sum() / count_users,
#                     "W/Tweet": df_sep_comments.tokens_count.sum()/len(df_sep_comments)
#                 }
                
#                 df_anl.loc[len(df_anl)] = new_row
                
#             else:
                
#                 # create column with tokens
#                 df_target['tokens'] = df_target[column].progress_apply(lambda x: word_tokenize(x, language='portuguese'))
#                 # create column with count of tokens
#                 df_target['tokens_count'] = df_target.tokens.progress_apply(len)
                
#                 gpby_userid = df_target.groupby('User_ID')
#                 count_users = len(df_target.User_ID.unique())
            
#                 new_row = {
#                     "Target": target,
#                     "Against": n_against,
#                     "For": n_for,
#                     "All": n_against + n_for,
#                     "Words": df_target.tokens_count.sum(),
#                     "Comments/User": "nsa",
#                     "W/Tweet": df_target.tokens_count.sum()/len(df_target)
#                 }
                
#                 df_anl.loc[len(df_anl)] = new_row
                
            
#             print(f'##### End Running {target} ({i+1} of {len(target_list)}) #####')
            
#         df_anl.Target = df_anl.Target.map(dict_cp)

        
#         counts_target = data.Polarity.value_counts()

#         n_against = counts_target['against']
#         n_for = counts_target['for']


#         if multiple_comments:
#             comment_user = df_anl["Comments/User"].sum()/len(df_anl)
#         else:
#             comment_user = "nsa"
            

#         new_row = {
#             "Target": "Overall",
#             "Against": n_against,
#             "For": n_for,
#             "All": n_against + n_for,
#             "Words": df_anl.Words.sum(),
#             "Comments/User": comment_user,
#             "W/Tweet": df_anl["W/Tweet"].sum()/len(df_anl)
#         }

#         df_anl.loc[len(df_anl)] = new_row
#         df_anl = df_anl.round(2)
        
        
#         dict_results.update({f"{name}_{column}":df_anl})

In [13]:
import pandas as pd
from nltk.tokenize import word_tokenize
from tqdm import tqdm

tqdm.pandas()

def separate_comments(df, Texts_col):
    # Assuming 'User_ID' is the column with user IDs
    df_grouped = df.groupby('User_ID')[Texts_col].apply(lambda x: ' '.join(x)).reset_index()
    return df_grouped

def process_target(target, data, column, multiple_comments):
    print(f'##### Start Running {target} #####')
    
    df_target = data[data.target == target]
    counts_target = df_target.Polarity.value_counts()
    n_against = counts_target.get('against', 0)
    n_for = counts_target.get('for', 0)
    
    if multiple_comments:
        df_sep_comments = separate_comments(df_target, column).drop_duplicates(subset=[column])
        df_sep_comments['tokens'] = df_sep_comments[column].progress_apply(lambda x: word_tokenize(x, language='portuguese'))
        df_sep_comments['tokens_count'] = df_sep_comments.tokens.progress_apply(len)
        gpby_userid = df_sep_comments.groupby('User_ID')
        count_users = len(df_sep_comments.User_ID.unique())
        
        new_row = {
            "Target": target,
            "Against": n_against,
            "For": n_for,
            "All": n_against + n_for,
            "Words": df_sep_comments.tokens_count.sum(),
            "Comments/User": gpby_userid.size().sum() / count_users,
            "W/Tweet": df_sep_comments.tokens_count.sum() / len(df_sep_comments)
        }
    else:
        df_target['tokens'] = df_target[column].progress_apply(lambda x: word_tokenize(x, language='portuguese'))
        df_target['tokens_count'] = df_target.tokens.progress_apply(len)
        gpby_userid = df_target.groupby('User_ID')
        count_users = len(df_target.User_ID.unique())
        
        new_row = {
            "Target": target,
            "Against": n_against,
            "For": n_for,
            "All": n_against + n_for,
            "Words": df_target.tokens_count.sum(),
            "Comments/User": "nsa",
            "W/Tweet": df_target.tokens_count.sum() / len(df_target)
        }
        
    print(f'##### End Running {target} #####')
    return new_row

dict_results = {}

for name, config in dict_config.items():
    data = config["data"]
    
    for column, multiple_comments in config["columns"].items():
        df_anl = pd.DataFrame(columns=["Target", "Against", "For", "All", "Words", "Comments/User", "W/Tweet"])
        target_list = data['target'].unique()
        
        # Process each target sequentially, can be optimized further with parallel processing
        results = [process_target(target, data, column, multiple_comments) for target in target_list]
        
        df_anl.loc[len(df_anl)] = results
                
        df_anl['Target'] = df_anl['Target'].map(dict_cp)
        
        counts_target = data.Polarity.value_counts()
        n_against = counts_target.get('against', 0)
        n_for = counts_target.get('for', 0)
        
        if multiple_comments:
            comment_user = df_anl["Comments/User"].sum() / len(df_anl)
        else:
            comment_user = "nsa"
        
        overall_row = {
            "Target": "Overall",
            "Against": n_against,
            "For": n_for,
            "All": n_against + n_for,
            "Words": df_anl.Words.sum(),
            "Comments/User": comment_user,
            "W/Tweet": df_anl["W/Tweet"].sum() / len(df_anl)
        }
        
        df_anl.loc[len(df_anl)] = overall_row
        df_anl = df_anl.round(2)
        
        dict_results[f"{name}_{column}"] = df_anl


NameError: name 'dict_config' is not defined

In [None]:
df_target

Unnamed: 0,User_ID,Timeline,Stance,Polarity,Tweet_Seq,target,split,tokens,tokens_count
0,r2_lu_1,Foda-se # Hey @ palmeiras é grande ? # @ você ...,Lula tem copa e Bolsonaro não,for,736,lu,train,"[Lula, tem, copa, e, Bolsonaro, não]",6
1,r2_lu_2,@ posso nem comer meu pãozin de queijo em paz ...,nao nao nao nao nao nao eu acabei de ver uma p...,for,2898,lu,train,"[nao, nao, nao, nao, nao, nao, eu, acabei, de,...",49
2,r2_lu_3,"Sai da frente !! # No papel, qual é a melhor d...",Reclama de Bolsonaro mas vota em Lula 🤦‍♂️,against,5736,lu,train,"[Reclama, de, Bolsonaro, mas, vota, em, Lula, ...",8
3,r2_lu_5,KKKKJJJJJKKKKKKKKKK a gente na aula de filosof...,Enviei minha poesia que fala de política p mar...,for,402,lu,train,"[Enviei, minha, poesia, que, fala, de, polític...",33
4,r2_lu_9,"O Finazzi jogava assim também, a bola tinha qu...",Por isso q o país está entregue a esses asnos ...,for,3415,lu,train,"[Por, isso, q, o, país, está, entregue, a, ess...",56
...,...,...,...,...,...,...,...,...,...
267,r2_lu_1086,Gostei de um vídeo @ … com Sweet Carol | The N...,Eu deveria me espelhar no Lula e ler 55 página...,for,381,lu,test,"[Eu, deveria, me, espelhar, no, Lula, e, ler, ...",14
268,r2_lu_1090,Show de bola! Que venham outros … # Essa renda...,Pqp quanta merda em um Tweet só! Pare de mistu...,against,899,lu,test,"[Pqp, quanta, merda, em, um, Tweet, só, !, Par...",28
269,r2_lu_1091,"FOOOOOOOOOOOOOOGOOOOOOOOOOOOOOO!!!!!!! # ""200 ...",nem a Venezuela respeita mais o Brasil sem o L...,for,294,lu,test,"[nem, a, Venezuela, respeita, mais, o, Brasil,...",19
270,r2_lu_1093,@ quem prejudica a imagem do Brasil não é o po...,Lula tem uma visão de mundo muito diversa de F...,for,2021,lu,test,"[Lula, tem, uma, visão, de, mundo, muito, dive...",46


In [None]:
counts_target

Polarity
against    5811
for        5453
Name: count, dtype: int64

In [None]:
n_for

5453

In [None]:
df_anl

Unnamed: 0,Target,Against,For,All,Words,Comments/User,W/Tweet
0,Church,1354,1041,2395,59148,nsa,24.7
1,Bolsonaro,649,102,751,15901,nsa,21.17
2,Hydrox.,1154,1141,2295,68961,nsa,30.05
3,Sinovac,1416,1677,3093,92079,nsa,29.77
4,Globo TV,668,974,1642,27484,nsa,16.74
5,Lula,570,518,1088,26567,nsa,24.42
6,Overall,5811,5453,11264,290140,nsa,24.47


In [None]:
n_for

5453

In [None]:
dict_results

{'top_mentioned_timelines_Texts':       Target  Against   For    All      Words  Comments/User  W/Tweet
 0     Church     1354  1041   2395   56364209        1874.95    14.98
 1  Bolsonaro      649   102    751   14608017        1908.32    14.58
 2    Hydrox.     1154  1141   2295   51770007        2172.46    22.17
 3    Sinovac     1416  1677   3093   69421120        2147.33    20.58
 4   Globo TV      668   974   1642   35004176        1932.34    15.15
 5       Lula      570   518   1088   25731928        2030.16    18.24
 6    Overall     5811  5453  11264  252899457        2010.93    17.61,
 'users_Timeline':       Target  Against   For    All      Words  Comments/User   W/Tweet
 0     Church     1354  1041   2395  140207645            1.0  58541.81
 1  Bolsonaro      649   102    751   41224425            1.0  54892.71
 2    Hydrox.     1154  1141   2295  108484625            1.0  47269.99
 3    Sinovac     1416  1677   3093  137333979            1.0  44401.55
 4   Globo TV      6

In [None]:
new_row

{'Target': 'Overall',
 'Against': 5811,
 'For': 5453,
 'All': 11264,
 'Words': 290140,
 'Comments/User': 'nsa',
 'W/Tweet': 24.474061390301298}

In [None]:
counts_target['against']

5811

In [None]:
for key, value in dict_results.items():
    
    print(key)
    
    display(value)
    
    value.to_csv(reports_path + f'describe/{key}.csv')

top_mentioned_timelines_Texts


Unnamed: 0,Target,Against,For,All,Words,Comments/User,W/Tweet
0,Church,1354,1041,2395,56364209,1874.95,14.98
1,Bolsonaro,649,102,751,14608017,1908.32,14.58
2,Hydrox.,1154,1141,2295,51770007,2172.46,22.17
3,Sinovac,1416,1677,3093,69421120,2147.33,20.58
4,Globo TV,668,974,1642,35004176,1932.34,15.15
5,Lula,570,518,1088,25731928,2030.16,18.24
6,Overall,5811,5453,11264,252899457,2010.93,17.61


users_Timeline


Unnamed: 0,Target,Against,For,All,Words,Comments/User,W/Tweet
0,Church,1354,1041,2395,140207645,1.0,58541.81
1,Bolsonaro,649,102,751,41224425,1.0,54892.71
2,Hydrox.,1154,1141,2295,108484625,1.0,47269.99
3,Sinovac,1416,1677,3093,137333979,1.0,44401.55
4,Globo TV,668,974,1642,93326479,1.0,56837.08
5,Lula,570,518,1088,67569624,1.0,62104.43
6,Overall,5811,5453,11264,588146777,1.0,54007.93


users_Stance


Unnamed: 0,Target,Against,For,All,Words,Comments/User,W/Tweet
0,Church,1354,1041,2395,59148,nsa,24.7
1,Bolsonaro,649,102,751,15901,nsa,21.17
2,Hydrox.,1154,1141,2295,68961,nsa,30.05
3,Sinovac,1416,1677,3093,92079,nsa,29.77
4,Globo TV,668,974,1642,27484,nsa,16.74
5,Lula,570,518,1088,26567,nsa,24.42
6,Overall,5811,5453,11264,290140,nsa,24.47
