## Imports

In [1]:
import pandas as pd
from tqdm import tqdm
import seaborn as sns
from nltk.tokenize import word_tokenize
import nltk

In [2]:
tqdm.pandas()

In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/semcovici/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Definitions

In [4]:
model_name = 'neuralmind/bert-base-portuguese-cased'

random_seed = 42

raw_data_path = '../data/raw/'
processed_data_path = '../data/processed/'
results_cr_path = '../reports/classification_reports/'
test_results_path = '../reports/test_results/'
reports_path = '../reports/'

target_list = ['ig','bo', 'cl', 'co', 'gl', 'lu']

## Read Data

In [5]:
data_list = []

for target in tqdm(target_list):
    
    # read data
    data_temp_train = pd.read_csv(
        raw_data_path + f'train_r3_{target}_top_mentioned_timelines.csv', 
        sep = ';', 
        encoding='utf-8-sig'
        )
    data_temp_test = pd.read_csv(
        raw_data_path + f'test_r3_{target}_top_mentioned_timelines.csv', 
        sep = ';', 
        encoding='utf-8-sig'
        )
    
    data_temp_train['target'] = target
    data_temp_test['target'] = target
    
    data_temp_train['split'] = "train"
    data_temp_test['split'] = "test"
    
    data_list.append(data_temp_train)
    data_list.append(data_temp_test)
    
data_tmt = pd.concat(data_list)

100%|██████████| 6/6 [00:20<00:00,  3.37s/it]


In [6]:
data_list = []

for target in tqdm(target_list):
    
    # read data
    data_temp_train = pd.read_csv(
        raw_data_path + f'r3_{target}_train_users.csv', 
        sep = ';', 
        encoding='utf-8-sig'
    )
    data_temp_test = pd.read_csv(
        raw_data_path + f'r3_{target}_test_users.csv', 
        sep = ';', 
        encoding='utf-8-sig'
        )
    
    
    
    
    data_temp_train['target'] = target
    data_temp_test['target'] = target
    
    data_temp_train['split'] = "train"
    data_temp_test['split'] = "test"
    
    data_list.append(data_temp_train)
    data_list.append(data_temp_test)
    
data_users = pd.concat(data_list)

100%|██████████| 6/6 [00:29<00:00,  4.90s/it]


In [7]:
data_users.split.value_counts()/len(data_users)

split
train    0.749822
test     0.250178
Name: count, dtype: float64

In [8]:

data_tmt.split.value_counts()/len(data_tmt)

split
train    0.749822
test     0.250178
Name: count, dtype: float64

In [9]:
def separate_comments(
    data,
    Texts_col = 'Texts',
    sep = ' # '
):
    
    # Separates texts into individual lines
    df_sep_comments = data.assign(Texts=data[Texts_col].str.split(sep)).explode(Texts_col)
    
    df_sep_comments.rename({"Texts":Texts_col},axis = 1)

    # Reindex the resulting DataFrame
    df_sep_comments.reset_index(drop=True, inplace = True)

    df_sep_comments.ffill(inplace = True)
    
    return df_sep_comments

In [10]:
dict_cp = {
    'cl':'Hydrox.',
    'lu':'Lula',
    'co':'Sinovac',
    'ig':'Church',
    'gl':'Globo TV',
    'bo':'Bolsonaro',
}

In [11]:
data_users

Unnamed: 0,User_ID,Timeline,Stance,Polarity,Tweet_Seq,target,split
0,r2_ig_1,@ posso nem comer meu pãozin de queijo em paz ...,tenho pra mim que grande parte senão todas as ...,against,2953,ig,train
1,r2_ig_4,Fim de jogo ++ uma vitoria do meu Vascão # Hoj...,Cidade de Deus Alicate: quer saber vou entrar ...,for,4792,ig,train
2,r2_ig_7,"Meu chefe é todo aleatório, do nada chega com ...",Acordei já sendo removida do grupo da igreja,against,248,ig,train
3,r2_ig_8,veja a receita FILÉ COM MOLHO DE MOSTARDA # Di...,I liked a @ video culto infantil na igreja Ass...,for,45,ig,train
4,r2_ig_10,"Oq tem de gente boa, tem de irritante # Não te...",Essa turma da igreja sao tão amorzinho smp con...,for,3809,ig,train
...,...,...,...,...,...,...,...
267,r2_lu_1086,Gostei de um vídeo @ … com Sweet Carol | The N...,Eu deveria me espelhar no Lula e ler 55 página...,for,381,lu,test
268,r2_lu_1090,Show de bola! Que venham outros … # Essa renda...,Pqp quanta merda em um Tweet só! Pare de mistu...,against,899,lu,test
269,r2_lu_1091,"FOOOOOOOOOOOOOOGOOOOOOOOOOOOOOO!!!!!!! # ""200 ...",nem a Venezuela respeita mais o Brasil sem o L...,for,294,lu,test
270,r2_lu_1093,@ quem prejudica a imagem do Brasil não é o po...,Lula tem uma visão de mundo muito diversa de F...,for,2021,lu,test


In [12]:



dict_config = {
    'top_mentioned_timelines':{
        'data': data_tmt,
        'columns': {
            'Texts': 1
        }
    },
    'users':{
        'data': data_users,
        'columns':{
            'Timeline': 1,
            'Stance': 0
        }
    }
    
}


dict_results = {}

for name, config in dict_config.items():
    
    
    data = config["data"]
    
    for column, multiple_comments in config["columns"].items():
        
        
        df_anl = pd.DataFrame({
            "Target": [],
            "Against": [],
            "For": [],
            "All": [],
            "Words": [],
            "Comments/User": [],
            "W/Tweet": []
        })
        
        for i, target in enumerate(target_list):
            
            print(f'##### Start Running {target} ({i+1} of {len(target_list)}) #####')
            
            df_target = data[data.target == target]
            
            counts_target = df_target.Polarity.value_counts()
            
            n_against = counts_target['against']
            n_for = counts_target['for']
            
            if multiple_comments:
                # separate comments and drop the duplicates (the comments that appears in more the one user)
                df_sep_comments = separate_comments(df_target, Texts_col = column).drop_duplicates(subset=[column])
            
                # create column with tokens
                df_sep_comments['tokens'] = df_sep_comments[column].progress_apply(lambda x: word_tokenize(x, language='portuguese'))
                # create column with count of tokens
                df_sep_comments['tokens_count'] = df_sep_comments.tokens.progress_apply(len)
                
                gpby_userid = df_sep_comments.groupby('User_ID')
                count_users = len(df_sep_comments.User_ID.unique())
            
                new_row = {
                    "Target": target,
                    "Against": n_against,
                    "For": n_for,
                    "All": n_against + n_for,
                    "Words": df_sep_comments.tokens_count.sum(),
                    "Comments/User": gpby_userid.size().sum() / count_users,
                    "W/Tweet": df_sep_comments.tokens_count.sum()/len(df_sep_comments)
                }
                
                df_anl.loc[len(df_anl)] = new_row
                
            else:
                
                # create column with tokens
                df_target['tokens'] = df_target[column].progress_apply(lambda x: word_tokenize(x, language='portuguese'))
                # create column with count of tokens
                df_target['tokens_count'] = df_target.tokens.progress_apply(len)
            
                new_row = {
                    "Target": target,
                    "Against": n_against,
                    "For": n_for,
                    "All": n_against + n_for,
                    "Words": df_sep_comments.tokens_count.sum(),
                    "Comments/User": "nsa",
                    "W/Tweet": df_sep_comments.tokens_count.sum()/len(df_sep_comments)
                }
                
                df_anl.loc[len(df_anl)] = new_row
                
            
            print(f'##### End Running {target} ({i+1} of {len(target_list)}) #####')
            
        df_anl.Target = df_anl.Target.map(dict_cp)

        
        counts_target = data.Polarity.value_counts()

        n_against = counts_target['against']
        n_for = counts_target['for']


        new_row = {
            "Target": "Overall",
            "Against": n_against,
            "For": n_for,
            "All": n_against + n_for,
            "Words": df_anl.Words.sum(),
            "W/Tweet": df_anl["W/Tweet"].sum()/len(df_anl)
        }

        df_anl.loc[len(df_anl)] = new_row
        df_anl = df_anl.round(2)
        
        
        dict_results.update({f"{name}_{column}":df_anl})

##### Start Running ig (1 of 6) #####


100%|██████████| 3763019/3763019 [04:01<00:00, 15576.52it/s]
100%|██████████| 3763019/3763019 [00:02<00:00, 1525155.80it/s]


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (7,) + inhomogeneous part.

1874.9471848530145

Unnamed: 0_level_0,Polarity,Texts,target,split,tokens,tokens_count
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
r2_ig_1,0.436971,0.436971,0.436971,0.436971,0.436971,0.436971
r2_ig_10,0.045341,0.045341,0.045341,0.045341,0.045341,0.045341
r2_ig_100,2.314898,2.314898,2.314898,2.314898,2.314898,2.314898
r2_ig_1000,0.904335,0.904335,0.904335,0.904335,0.904335,0.904335
r2_ig_1002,0.910314,0.910314,0.910314,0.910314,0.910314,0.910314
...,...,...,...,...,...,...
r2_ig_994,0.562531,0.562531,0.562531,0.562531,0.562531,0.562531
r2_ig_995,0.882910,0.882910,0.882910,0.882910,0.882910,0.882910
r2_ig_996,0.743896,0.743896,0.743896,0.743896,0.743896,0.743896
r2_ig_998,1.073244,1.073244,1.073244,1.073244,1.073244,1.073244


In [17]:
df_anl.loc[len(df_anl)] = new_row

In [16]:
df_sep_comments.tokens_count.sum()/len(df_sep_comments)

14.978454533447746

In [None]:
for key, value in dict_results.items():
    
    print(key)
    
    display(value)
    
    value.to_csv(reports_path + f'describe/{key}.csv')

top_mentioned_timelines_Texts


Unnamed: 0,Target,Against,For,All,Words,W/Tweet
0,Church,1354,1041,2395,56364209,14.98
1,Bolsonaro,649,102,751,14608017,14.58
2,Hydrox.,1154,1141,2295,51770007,22.17
3,Sinovac,1416,1677,3093,69421120,20.58
4,Globo TV,668,974,1642,35004176,15.15
5,Lula,570,518,1088,25731928,18.24
6,Overall,5811,5453,11264,252899457,17.61


users_Timeline


Unnamed: 0,Target,Against,For,All,Words,W/Tweet
0,Church,1354,1041,2395,140207645,58541.81
1,Bolsonaro,649,102,751,41224425,54892.71
2,Hydrox.,1154,1141,2295,108484625,47269.99
3,Sinovac,1416,1677,3093,137333979,44401.55
4,Globo TV,668,974,1642,93326479,56837.08
5,Lula,570,518,1088,67569624,62104.43
6,Overall,5811,5453,11264,588146777,54007.93


users_Stance


Unnamed: 0,Target,Against,For,All,Words,W/Tweet
0,Church,1354,1041,2395,26567,24.42
1,Bolsonaro,649,102,751,26567,24.42
2,Hydrox.,1154,1141,2295,26567,24.42
3,Sinovac,1416,1677,3093,26567,24.42
4,Globo TV,668,974,1642,26567,24.42
5,Lula,570,518,1088,26567,24.42
6,Overall,5811,5453,11264,159402,24.42
