## Imports

In [1]:
import pandas as pd
from tqdm import tqdm
import seaborn as sns
from nltk.tokenize import word_tokenize
import nltk

In [2]:
tqdm.pandas()

In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/semcovici/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Definitions

In [4]:
model_name = 'neuralmind/bert-base-portuguese-cased'

random_seed = 42

raw_data_path = '../data/raw/'
processed_data_path = '../data/processed/'
results_cr_path = '../reports/classification_reports/'
test_results_path = '../reports/test_results/'
reports_path = '../reports/'

target_list = ['ig','bo', 'cl', 'co', 'gl', 'lu']

## Read Data

In [5]:
data_list = []

for target in target_list:
    
    # read data
    data_temp = pd.read_csv(
        raw_data_path + f'train_r3_{target}_top_mentioned_timelines.csv', 
        sep = ';', 
        encoding='utf-8-sig'
        )
    
    data_temp['target'] = target
    
    data_list.append(data_temp)
    
data_tmt = pd.concat(data_list)

In [6]:
data_list = []

for target in target_list:
    
    # read data
    data_temp = pd.read_csv(
        raw_data_path + f'r3_{target}_test_users.csv', 
        sep = ';', 
        encoding='utf-8-sig'
        )
    
    data_temp['target'] = target
    
    data_list.append(data_temp)
    
data_users = pd.concat(data_list)

In [7]:
def separate_comments(
    data,
    Texts_col = 'Texts',
    sep = ' # '
):
    
    # Separates texts into individual lines
    df_sep_comments = data.assign(Texts=data[Texts_col].str.split(sep)).explode(Texts_col)
    
    df_sep_comments.rename({"Texts":Texts_col},axis = 1)

    # Reindex the resulting DataFrame
    df_sep_comments.reset_index(drop=True, inplace = True)

    df_sep_comments.ffill(inplace = True)
    
    return df_sep_comments

In [8]:
dict_cp = {
    'cl':'Hydrox.',
    'lu':'Lula',
    'co':'Sinovac',
    'ig':'Church',
    'gl':'Globo TV',
    'bo':'Bolsonaro',
}

In [9]:
data_users

Unnamed: 0,User_ID,Timeline,Stance,Polarity,Tweet_Seq,target
0,r2_ig_2,@ ta fzd oq # uma amiga minha tava na rua quan...,nao me diz que isso é de igreja católica pf,against,1874,ig
1,r2_ig_3,@ Ola Como ta # Vamo Seguir @ estamos querendo...,Se a igreja faz isso ela devia ser isenta mesm...,against,3988,ig
2,r2_ig_5,papai me deu um irmão lindo desse # cansada de...,"que pena então, por que se cada espírita for r...",against,4532,ig
3,r2_ig_6,Né primeiro de abril não ta # Pena que um pais...,bglh é entrar p igreja,for,2661,ig
4,r2_ig_9,já acordei nun desânimo que pqp # vontade de n...,já vou levar pra igreja pra Deus benzer pq o q...,for,1441,ig
...,...,...,...,...,...,...
267,r2_lu_1086,Gostei de um vídeo @ … com Sweet Carol | The N...,Eu deveria me espelhar no Lula e ler 55 página...,for,381,lu
268,r2_lu_1090,Show de bola! Que venham outros … # Essa renda...,Pqp quanta merda em um Tweet só! Pare de mistu...,against,899,lu
269,r2_lu_1091,"FOOOOOOOOOOOOOOGOOOOOOOOOOOOOOO!!!!!!! # ""200 ...",nem a Venezuela respeita mais o Brasil sem o L...,for,294,lu
270,r2_lu_1093,@ quem prejudica a imagem do Brasil não é o po...,Lula tem uma visão de mundo muito diversa de F...,for,2021,lu


In [20]:



dict_config = {
    'top_mentioned_timelines':{
        'data': data_tmt,
        'columns': {
            'Texts': 1
        }
    },
    'users':{
        'data': data_users,
        'columns':{
            'Timeline': 1,
            'Stance': 0
        }
    }
    
}


dict_results = {}

for name, config in dict_config.items():
    
    
    data = config["data"]
    
    for column, multiple_comments in config["columns"].items():
        
        
        df_anl = pd.DataFrame({
            "Target": [],
            "Against": [],
            "For": [],
            "All": [],
            "Words": [],
            "W/Tweet": []
        })
        
        for i, target in enumerate(target_list):
            
            print(f'##### Start Running {target} ({i+1} of {len(target_list)}) #####')
            
            df_target = data[data.target == target]
            
            counts_target = df_target.Polarity.value_counts()
            
            n_against = counts_target['against']
            n_for = counts_target['for']
            
            if multiple_comments:
                # separate comments and drop the duplicates (the comments that appears in more the one user)
                df_sep_comments = separate_comments(df_target, Texts_col = column).drop_duplicates(subset=[column])
            
            # create column with tokens
            df_sep_comments['tokens'] = df_sep_comments[column].progress_apply(lambda x: word_tokenize(x, language='portuguese'))
            # create column with count of tokens
            df_sep_comments['tokens_count'] = df_sep_comments.tokens.progress_apply(len)
            
            new_row = {
                "Target": target,
                "Against": n_against,
                "For": n_for,
                "All": n_against + n_for,
                "Words": df_sep_comments.tokens_count.sum(),
                "W/Tweet": df_sep_comments.tokens_count.sum()/len(df_sep_comments)
            }
            
            df_anl.loc[len(df_anl)] = new_row
            
            print(f'##### End Running {target} ({i+1} of {len(target_list)}) #####')
            
        df_anl.Target = df_anl.Target.map(dict_cp)

        
        counts_target = data.Polarity.value_counts()

        n_against = counts_target['against']
        n_for = counts_target['for']


        new_row = {
            "Target": "Overall",
            "Against": n_against,
            "For": n_for,
            "All": n_against + n_for,
            "Words": df_anl.Words.sum(),
            "W/Tweet": df_anl["W/Tweet"].sum()/len(df_anl)
        }

        df_anl.loc[len(df_anl)] = new_row
        df_anl = df_anl.round(2)
        
        
        dict_results.update({f"{name}_{column}":df_anl})

##### Start Running ig (1 of 6) #####


100%|██████████| 2868127/2868127 [02:57<00:00, 16131.97it/s]
100%|██████████| 2868127/2868127 [00:01<00:00, 1683840.57it/s]


##### End Running ig (1 of 6) #####
##### Start Running bo (2 of 6) #####


100%|██████████| 791136/791136 [00:47<00:00, 16575.07it/s]
100%|██████████| 791136/791136 [00:00<00:00, 1733808.44it/s]


##### End Running bo (2 of 6) #####
##### Start Running cl (3 of 6) #####


100%|██████████| 1815389/1815389 [02:43<00:00, 11124.59it/s]
100%|██████████| 1815389/1815389 [00:01<00:00, 1630304.89it/s]


##### End Running cl (3 of 6) #####
##### Start Running co (4 of 6) #####


100%|██████████| 2612606/2612606 [03:42<00:00, 11741.40it/s]
100%|██████████| 2612606/2612606 [00:01<00:00, 1747036.38it/s]


##### End Running co (4 of 6) #####
##### Start Running gl (5 of 6) #####


100%|██████████| 1779778/1779778 [01:55<00:00, 15419.14it/s]
100%|██████████| 1779778/1779778 [00:00<00:00, 1781323.06it/s]


##### End Running gl (5 of 6) #####
##### Start Running lu (6 of 6) #####


100%|██████████| 1101941/1101941 [01:25<00:00, 12857.10it/s]
100%|██████████| 1101941/1101941 [00:00<00:00, 1703596.32it/s]


##### End Running lu (6 of 6) #####
##### Start Running ig (1 of 6) #####


100%|██████████| 599/599 [01:20<00:00,  7.45it/s]
100%|██████████| 599/599 [00:00<00:00, 2298.78it/s]


##### End Running ig (1 of 6) #####
##### Start Running bo (2 of 6) #####


100%|██████████| 188/188 [00:23<00:00,  8.04it/s]
100%|██████████| 188/188 [00:00<00:00, 716193.60it/s]


##### End Running bo (2 of 6) #####
##### Start Running cl (3 of 6) #####


100%|██████████| 574/574 [01:20<00:00,  7.09it/s]
100%|██████████| 574/574 [00:00<00:00, 1201362.52it/s]


##### End Running cl (3 of 6) #####
##### Start Running co (4 of 6) #####


100%|██████████| 774/774 [01:40<00:00,  7.70it/s]
100%|██████████| 774/774 [00:00<00:00, 1168607.38it/s]


##### End Running co (4 of 6) #####
##### Start Running gl (5 of 6) #####


100%|██████████| 411/411 [00:53<00:00,  7.75it/s]
100%|██████████| 411/411 [00:00<00:00, 795504.82it/s]


##### End Running gl (5 of 6) #####
##### Start Running lu (6 of 6) #####


100%|██████████| 272/272 [00:43<00:00,  6.26it/s]
100%|██████████| 272/272 [00:00<00:00, 911950.99it/s]


##### End Running lu (6 of 6) #####
##### Start Running ig (1 of 6) #####


100%|██████████| 272/272 [00:00<00:00, 1584.59it/s]
100%|██████████| 272/272 [00:00<00:00, 728978.08it/s]


##### End Running ig (1 of 6) #####
##### Start Running bo (2 of 6) #####


100%|██████████| 272/272 [00:00<00:00, 11482.92it/s]
100%|██████████| 272/272 [00:00<00:00, 437778.47it/s]


##### End Running bo (2 of 6) #####
##### Start Running cl (3 of 6) #####


100%|██████████| 272/272 [00:00<00:00, 10766.91it/s]
100%|██████████| 272/272 [00:00<00:00, 786251.34it/s]


##### End Running cl (3 of 6) #####
##### Start Running co (4 of 6) #####


100%|██████████| 272/272 [00:00<00:00, 11677.32it/s]
100%|██████████| 272/272 [00:00<00:00, 1081375.06it/s]


##### End Running co (4 of 6) #####
##### Start Running gl (5 of 6) #####


100%|██████████| 272/272 [00:00<00:00, 7827.34it/s]
100%|██████████| 272/272 [00:00<00:00, 768249.62it/s]


##### End Running gl (5 of 6) #####
##### Start Running lu (6 of 6) #####


100%|██████████| 272/272 [00:00<00:00, 9881.86it/s]
100%|██████████| 272/272 [00:00<00:00, 581177.12it/s]

##### End Running lu (6 of 6) #####





In [21]:
dict_results

{'top_mentioned_timelines_Texts':       Target  Against   For   All      Words  W/Tweet
 0     Church     1015   781  1796   43266172    15.09
 1  Bolsonaro      487    76   563   11567146    14.62
 2    Hydrox.      865   856  1721   40312597    22.21
 3    Sinovac     1062  1257  2319   54340534    20.80
 4   Globo TV      501   730  1231   27103849    15.23
 5       Lula      427   389   816   20478148    18.58
 6    Overall     4357  4089  8446  197068446    17.75,
 'users_Timeline':       Target  Against   For   All      Words   W/Tweet
 0     Church      339   260   599   34255228  57187.36
 1  Bolsonaro      162    26   188   10552186  56128.65
 2    Hydrox.      289   285   574   27393289  47723.50
 3    Sinovac      354   420   774   35320226  45633.37
 4   Globo TV      167   244   411   23026488  56025.52
 5       Lula      143   129   272   16608766  61061.64
 6    Overall     1454  1364  2818  147156183  53960.01,
 'users_Stance':       Target  Against   For   All  Words  

In [23]:
for key, value in dict_results.items():
    
    print(key)
    
    display(value)

top_mentioned_timelines_Texts


Unnamed: 0,Target,Against,For,All,Words,W/Tweet
0,Church,1015,781,1796,43266172,15.09
1,Bolsonaro,487,76,563,11567146,14.62
2,Hydrox.,865,856,1721,40312597,22.21
3,Sinovac,1062,1257,2319,54340534,20.8
4,Globo TV,501,730,1231,27103849,15.23
5,Lula,427,389,816,20478148,18.58
6,Overall,4357,4089,8446,197068446,17.75


users_Timeline


Unnamed: 0,Target,Against,For,All,Words,W/Tweet
0,Church,339,260,599,34255228,57187.36
1,Bolsonaro,162,26,188,10552186,56128.65
2,Hydrox.,289,285,574,27393289,47723.5
3,Sinovac,354,420,774,35320226,45633.37
4,Globo TV,167,244,411,23026488,56025.52
5,Lula,143,129,272,16608766,61061.64
6,Overall,1454,1364,2818,147156183,53960.01


users_Stance


Unnamed: 0,Target,Against,For,All,Words,W/Tweet
0,Church,339,260,599,6395,23.51
1,Bolsonaro,162,26,188,6395,23.51
2,Hydrox.,289,285,574,6395,23.51
3,Sinovac,354,420,774,6395,23.51
4,Globo TV,167,244,411,6395,23.51
5,Lula,143,129,272,6395,23.51
6,Overall,1454,1364,2818,38370,23.51
