In [1]:
import pandas as pd
from tqdm import tqdm
import seaborn as sns
from nltk.tokenize import word_tokenize
import nltk
from unidecode import unidecode
import numpy as np

In [2]:
tqdm.pandas()

In [3]:
path_raw_data = '../data/raw/'


In [4]:
# given comments separated by " # " and a list of terms, 
# return all coments that have at least one of terms in the list_terms

def find_relevant_comments(comments, list_terms, L = None):    
    
    list_comments = comments.split(' # ')
    
    terms_set = set(t.casefold() for t in list_terms)
    
    list_rel_comments = [
        com for com in list_comments if any(term in com.casefold() for term in terms_set)
    ]
    
    if L is not None:
        
        list_rel_comments = create_comment_list(list_comments, list_rel_comments, L)
        
    str_rel_comments = ' # '.join(list_rel_comments) if len(list_rel_comments) > 0 else ''
    
    return str_rel_comments

def create_comment_list(A, B, L):
    """
    Creates a new comment list C of size L. If B contains more than L comments,
    randomly selects L comments from B. If B contains fewer than L comments, it
    randomly selects the remaining comments from A to fill up C.

    Parameters:
        A (list): List of comments from the social network.
        B (list): List of comments from a user on the social network.
        L (int): Size of the new comment list C.

    Returns:
        list: New comment list C of size L.
    """
    C = []

    if len(B) >= L:
        # If B has at least L comments, select L comments randomly without replacement
        C = np.random.choice(B, L, replace=False)
    else:
        # If B has fewer than L comments, add all comments from B
        C.extend(B)
        remaining_comments = L - len(B)
        if remaining_comments <= len(A):
            # If there are enough comments in A to fill up C, randomly select the remaining comments from A
            A_comments = np.random.choice(A, remaining_comments, replace=False)
            C.extend(A_comments)
        else:
            pass

    return C


In [5]:
terms_list_ig = [
    "igreja",
    "catedral",
    "capela",
    "templo",
    "paróquia",
    "basílica",
    "padre",
    "pastor",
    "bispo",
    "cardeal",
    "papa",
    "sacerdote",
    "arcebispo",
    "deão",
    "vigário",
    "altar",
    "crucifixo",
    "cálice",
    "hóstia",
    "círio",
    "batistério",
    "sacristia",
    "tabernáculo",
    "missa",
    "culto",
    "batismo",
    "comunhão",
    "confissão",
    "crisma",
    "cerimônia",
    "vaticano",
    "concílio",
    "encíclica",
    "dioceses"
]

terms_list_cl= [
    "hidroxicloroquina",
    "remédio",
    "medicamento",
    "tratamento",
    "antimalárico",
    "antimalárico sintético",
    "droga"
]

terms_list_lu = [
    "lula",
    "presidente",
    "luiz inácio lula da silva",
    "pt",
    "partido dos trabalhadores",
    "ex-presidente",
    "liderança política",
    "governo lula",
    "política"
]

terms_list_co = [
    "sinovac",
    "coronavac",
    "vacina",
    "vacina chinesa",
    "imunização",
    "vacinação",
    "biontech",
    "covid-19",
    "pandemia"
]
terms_list_gl = [
    "globo",
    "tv globo",
    "rede globo",
    "televisão",
    "emissora",
    "rede de televisão",
    "mídia",
    "jornalismo",
    "programação de tv",
    "entretenimento"
]


terms_list_bo = [
    "bolsonaro",
    "jair bolsonaro",
    "presidente",
    "presidente do brasil",
    "governo bolsonaro",
    "partido liberal",
    "política",
    "conservador",
    "ex-presidente"
]

In [6]:
target_terms_dict = {
    'ig': terms_list_ig,
    'bo': terms_list_bo, 
    'cl': terms_list_cl, 
    'co':terms_list_co, 
    'gl': terms_list_gl, 
    'lu': terms_list_lu
}

In [7]:
dict_data_users = {}
dict_data_tmt = {}

for target, terms_list in target_terms_dict.items():
    
    print(f"""
#############################
# {target}
#############################          
          """)
    
    # read data
    data_tmt = pd.read_csv(
        path_raw_data + f'train_r3_{target}_top_mentioned_timelines.csv', 
        sep = ';', 
        encoding='utf-8-sig'
        )
    
    # read data
    data_users = pd.read_csv(
        path_raw_data + f'r3_{target}_test_users.csv', 
        sep = ';', 
        encoding='utf-8-sig'
        )
    
    
    print('#### users timeline ####')
    data_users['filtered_Timeline'] = data_users.Timeline.progress_apply(lambda x: find_relevant_comments(x, terms_list))
    print('Removing 0: ', data_users[data_users.filtered_Timeline.apply(lambda x : x.count(' # ')) > 0].shape)
    print('total', data_users.shape)
    
    print('#### top mentioned timelines ####')
    data_tmt['filtered_Texts'] = data_tmt.Texts.progress_apply(lambda x: find_relevant_comments(x, terms_list))
    print('Removing 0: ', data_tmt[data_tmt.filtered_Texts.apply(lambda x : x.count(' # ')) > 0].shape)
    print('total', data_tmt.shape)
    
    
    dict_data_users.update({target:data_users})
    dict_data_tmt.update({target:data_tmt})
    
for target, data in dict_data_users.items():
    
    print(f"""
##################
# {target}
##################      
          """)
    
    data.count_filtered_Timeline = data.filtered_Timeline.apply(lambda x: x.count(' # '))
    
    print('case 0 %', len(data[data.count_filtered_Timeline == 0])/len(data))
    display(data.count_filtered_Timeline.describe())
    
for target, data in dict_data_tmt.items():
    
    print(f"""
##################
# {target}
##################      
          """)
    
    data.count_filtered_Texts = data.filtered_Texts.apply(lambda x: x.count(' # '))
    
    print('case 0 %', len(data[data.count_filtered_Texts == 0])/len(data))
    display(data.count_filtered_Texts.describe())


#############################
# ig
#############################          
          
#### users timeline ####


100%|██████████| 599/599 [00:47<00:00, 12.49it/s]


Removing 0:  (571, 6)
total (599, 6)
#### top mentioned timelines ####


100%|██████████| 1796/1796 [01:26<00:00, 20.86it/s]


Removing 0:  (1522, 4)
total (1796, 4)

#############################
# bo
#############################          
          
#### users timeline ####


100%|██████████| 188/188 [00:04<00:00, 45.64it/s]


Removing 0:  (158, 6)
total (188, 6)
#### top mentioned timelines ####


100%|██████████| 563/563 [00:07<00:00, 70.49it/s]


Removing 0:  (397, 4)
total (563, 4)

#############################
# cl
#############################          
          
#### users timeline ####


100%|██████████| 574/574 [00:08<00:00, 65.36it/s]


Removing 0:  (564, 6)
total (574, 6)
#### top mentioned timelines ####


100%|██████████| 1721/1721 [00:31<00:00, 54.99it/s]


Removing 0:  (1604, 4)
total (1721, 4)

#############################
# co
#############################          
          
#### users timeline ####


100%|██████████| 774/774 [00:13<00:00, 55.62it/s]


Removing 0:  (762, 6)
total (774, 6)
#### top mentioned timelines ####


100%|██████████| 2319/2319 [00:53<00:00, 43.20it/s]


Removing 0:  (2211, 4)
total (2319, 4)

#############################
# gl
#############################          
          
#### users timeline ####


100%|██████████| 411/411 [00:10<00:00, 40.95it/s]


Removing 0:  (333, 6)
total (411, 6)
#### top mentioned timelines ####


100%|██████████| 1231/1231 [00:22<00:00, 54.58it/s]


Removing 0:  (877, 4)
total (1231, 4)

#############################
# lu
#############################          
          
#### users timeline ####


100%|██████████| 272/272 [00:06<00:00, 40.51it/s]


Removing 0:  (272, 6)
total (272, 6)
#### top mentioned timelines ####


100%|██████████| 816/816 [00:14<00:00, 55.10it/s]

Removing 0:  (780, 4)
total (816, 4)

##################
# ig
##################      
          
case 0 % 0.04674457429048414



  data.count_filtered_Timeline = data.filtered_Timeline.apply(lambda x: x.count(' # '))


count    599.000000
mean      23.025042
std       49.223173
min        0.000000
25%        7.000000
50%       14.000000
75%       24.000000
max      870.000000
Name: filtered_Timeline, dtype: float64


##################
# bo
##################      
          
case 0 % 0.1595744680851064


  data.count_filtered_Timeline = data.filtered_Timeline.apply(lambda x: x.count(' # '))


count    188.000000
mean      35.329787
std       87.623635
min        0.000000
25%        2.000000
50%        6.000000
75%       17.000000
max      544.000000
Name: filtered_Timeline, dtype: float64


##################
# cl
##################      
          
case 0 % 0.017421602787456445


  data.count_filtered_Timeline = data.filtered_Timeline.apply(lambda x: x.count(' # '))


count    574.000000
mean      18.832753
std       23.725964
min        0.000000
25%        7.000000
50%       12.000000
75%       23.000000
max      295.000000
Name: filtered_Timeline, dtype: float64

  data.count_filtered_Timeline = data.filtered_Timeline.apply(lambda x: x.count(' # '))



##################
# co
##################      
          
case 0 % 0.015503875968992248


count    774.000000
mean      62.751938
std       96.554540
min        0.000000
25%       14.000000
50%       31.000000
75%       68.750000
max      985.000000
Name: filtered_Timeline, dtype: float64


##################
# gl
##################      
          
case 0 % 0.1897810218978102


  data.count_filtered_Timeline = data.filtered_Timeline.apply(lambda x: x.count(' # '))


count    411.000000
mean      14.883212
std       28.316004
min        0.000000
25%        1.000000
50%        5.000000
75%       15.500000
max      308.000000
Name: filtered_Timeline, dtype: float64


##################
# lu
##################      
          
case 0 % 0.0


  data.count_filtered_Timeline = data.filtered_Timeline.apply(lambda x: x.count(' # '))


count     272.00000
mean      121.62500
std       202.60559
min         1.00000
25%        28.00000
50%        53.00000
75%       127.00000
max      2353.00000
Name: filtered_Timeline, dtype: float64


##################
# ig
##################      
          
case 0 % 0.15256124721603564


  data.count_filtered_Texts = data.filtered_Texts.apply(lambda x: x.count(' # '))


count    1796.000000
mean       20.599109
std        82.057903
min         0.000000
25%         2.000000
50%         5.000000
75%        12.000000
max      1145.000000
Name: filtered_Texts, dtype: float64


##################
# bo
##################      
          
case 0 % 0.29484902309058614


  data.count_filtered_Texts = data.filtered_Texts.apply(lambda x: x.count(' # '))


count    563.000000
mean      56.216696
std      111.482676
min        0.000000
25%        0.000000
50%        6.000000
75%      121.000000
max      786.000000
Name: filtered_Texts, dtype: float64


##################
# cl
##################      
          
case 0 % 0.06798373038930854


  data.count_filtered_Texts = data.filtered_Texts.apply(lambda x: x.count(' # '))


count    1721.000000
mean       36.386984
std        45.104506
min         0.000000
25%         7.000000
50%        22.000000
75%        56.000000
max       502.000000
Name: filtered_Texts, dtype: float64


##################
# co
##################      
          


  data.count_filtered_Texts = data.filtered_Texts.apply(lambda x: x.count(' # '))


case 0 % 0.04657179818887452


count    2319.000000
mean      208.030185
std       224.687030
min         0.000000
25%        24.000000
50%       130.000000
75%       370.500000
max      1262.000000
Name: filtered_Texts, dtype: float64


##################
# gl
##################      
          
case 0 % 0.2875710804224208


  data.count_filtered_Texts = data.filtered_Texts.apply(lambda x: x.count(' # '))


count    1231.000000
mean       32.923639
std        89.068772
min         0.000000
25%         0.000000
50%         4.000000
75%        13.000000
max       500.000000
Name: filtered_Texts, dtype: float64


##################
# lu
##################      
          
case 0 % 0.04411764705882353


  data.count_filtered_Texts = data.filtered_Texts.apply(lambda x: x.count(' # '))


count    816.000000
mean     133.973039
std      212.493180
min        0.000000
25%       12.000000
50%       43.000000
75%      127.000000
max      997.000000
Name: filtered_Texts, dtype: float64

L = 30

In [8]:
L = 30

In [9]:
dict_data_users = {}
dict_data_tmt = {}

for target, terms_list in target_terms_dict.items():
    
    print(f"""
#############################
# {target}
#############################          
          """)
    
    # read data
    data_tmt = pd.read_csv(
        path_raw_data + f'train_r3_{target}_top_mentioned_timelines.csv', 
        sep = ';', 
        encoding='utf-8-sig'
        )
    
    # read data
    data_users = pd.read_csv(
        path_raw_data + f'r3_{target}_test_users.csv', 
        sep = ';', 
        encoding='utf-8-sig'
        )
    
    
    print('#### users timeline ####')
    data_users['filtered_Timeline'] = data_users.Timeline.progress_apply(lambda x: find_relevant_comments(x, terms_list, L))
    print('Removing 0: ', data_users[data_users.filtered_Timeline.apply(lambda x : x.count(' # ')) > 0].shape)
    print('total', data_users.shape)
    
    print('#### top mentioned timelines ####')
    data_tmt['filtered_Texts'] = data_tmt.Texts.progress_apply(lambda x: find_relevant_comments(x, terms_list, L))
    print('Removing 0: ', data_tmt[data_tmt.filtered_Texts.apply(lambda x : x.count(' # ')) > 0].shape)
    print('total', data_tmt.shape)
    
    
    dict_data_users.update({target:data_users})
    dict_data_tmt.update({target:data_tmt})
    
for target, data in dict_data_users.items():
    
    print(f"""
##################
# {target}
##################      
          """)
    
    data.count_filtered_Timeline = data.filtered_Timeline.apply(lambda x: x.count(' # '))
    
    print('case 0 %', len(data[data.count_filtered_Timeline == 0])/len(data))
    display(data.count_filtered_Timeline.describe())
    
for target, data in dict_data_tmt.items():
    
    print(f"""
##################
# {target}
##################      
          """)
    
    data.count_filtered_Texts = data.filtered_Texts.apply(lambda x: x.count(' # '))
    
    print('case 0 %', len(data[data.count_filtered_Texts == 0])/len(data))
    display(data.count_filtered_Texts.describe())


#############################
# ig
#############################          
          
#### users timeline ####


100%|██████████| 599/599 [00:49<00:00, 12.07it/s]


Removing 0:  (599, 6)
total (599, 6)
#### top mentioned timelines ####


100%|██████████| 1796/1796 [01:26<00:00, 20.80it/s]


Removing 0:  (1758, 4)
total (1796, 4)

#############################
# bo
#############################          
          
#### users timeline ####


100%|██████████| 188/188 [00:04<00:00, 44.06it/s]


Removing 0:  (188, 6)
total (188, 6)
#### top mentioned timelines ####


100%|██████████| 563/563 [00:07<00:00, 73.70it/s]


Removing 0:  (556, 4)
total (563, 4)

#############################
# cl
#############################          
          
#### users timeline ####


100%|██████████| 574/574 [00:08<00:00, 65.08it/s]


Removing 0:  (574, 6)
total (574, 6)
#### top mentioned timelines ####


100%|██████████| 1721/1721 [00:30<00:00, 55.80it/s]


Removing 0:  (1710, 4)
total (1721, 4)

#############################
# co
#############################          
          
#### users timeline ####


100%|██████████| 774/774 [00:13<00:00, 56.65it/s]


Removing 0:  (774, 6)
total (774, 6)
#### top mentioned timelines ####


100%|██████████| 2319/2319 [00:52<00:00, 44.42it/s]


Removing 0:  (2289, 4)
total (2319, 4)

#############################
# gl
#############################          
          
#### users timeline ####


100%|██████████| 411/411 [00:10<00:00, 39.23it/s]


Removing 0:  (411, 6)
total (411, 6)
#### top mentioned timelines ####


100%|██████████| 1231/1231 [00:22<00:00, 53.91it/s]


Removing 0:  (1210, 4)
total (1231, 4)

#############################
# lu
#############################          
          
#### users timeline ####


100%|██████████| 272/272 [00:06<00:00, 41.13it/s]


Removing 0:  (272, 6)
total (272, 6)
#### top mentioned timelines ####


100%|██████████| 816/816 [00:14<00:00, 54.68it/s]

Removing 0:  (804, 4)
total (816, 4)

##################
# ig
##################      
          
case 0 % 0.0



  data.count_filtered_Timeline = data.filtered_Timeline.apply(lambda x: x.count(' # '))


count    599.0
mean      29.0
std        0.0
min       29.0
25%       29.0
50%       29.0
75%       29.0
max       29.0
Name: filtered_Timeline, dtype: float64


##################
# bo
##################      
          
case 0 % 0.0


  data.count_filtered_Timeline = data.filtered_Timeline.apply(lambda x: x.count(' # '))


count    188.0
mean      29.0
std        0.0
min       29.0
25%       29.0
50%       29.0
75%       29.0
max       29.0
Name: filtered_Timeline, dtype: float64


##################
# cl
##################      
          
case 0 % 0.0


  data.count_filtered_Timeline = data.filtered_Timeline.apply(lambda x: x.count(' # '))


count    574.0
mean      29.0
std        0.0
min       29.0
25%       29.0
50%       29.0
75%       29.0
max       29.0
Name: filtered_Timeline, dtype: float64


##################
# co
##################      
          
case 0 % 0.0


  data.count_filtered_Timeline = data.filtered_Timeline.apply(lambda x: x.count(' # '))


count    774.0
mean      29.0
std        0.0
min       29.0
25%       29.0
50%       29.0
75%       29.0
max       29.0
Name: filtered_Timeline, dtype: float64


##################
# gl
##################      
          
case 0 % 0.0


  data.count_filtered_Timeline = data.filtered_Timeline.apply(lambda x: x.count(' # '))


count    411.0
mean      29.0
std        0.0
min       29.0
25%       29.0
50%       29.0
75%       29.0
max       29.0
Name: filtered_Timeline, dtype: float64


##################
# lu
##################      
          
case 0 % 0.0


  data.count_filtered_Timeline = data.filtered_Timeline.apply(lambda x: x.count(' # '))


count    272.0
mean      29.0
std        0.0
min       29.0
25%       29.0
50%       29.0
75%       29.0
max       29.0
Name: filtered_Timeline, dtype: float64


##################
# ig
##################      
          
case 0 % 0.021158129175946547


  data.count_filtered_Texts = data.filtered_Texts.apply(lambda x: x.count(' # '))


count    1796.000000
mean       28.386414
std         4.174590
min         0.000000
25%        29.000000
50%        29.000000
75%        29.000000
max        29.000000
Name: filtered_Texts, dtype: float64


##################
# bo
##################      
          
case 0 % 0.012433392539964476


  data.count_filtered_Texts = data.filtered_Texts.apply(lambda x: x.count(' # '))


count    563.000000
mean      28.639432
std        3.216341
min        0.000000
25%       29.000000
50%       29.000000
75%       29.000000
max       29.000000
Name: filtered_Texts, dtype: float64


##################
# cl
##################      
          
case 0 % 0.006391632771644393


  data.count_filtered_Texts = data.filtered_Texts.apply(lambda x: x.count(' # '))


count    1721.000000
mean       28.814643
std         2.311733
min         0.000000
25%        29.000000
50%        29.000000
75%        29.000000
max        29.000000
Name: filtered_Texts, dtype: float64


##################
# co
##################      
          
case 0 % 0.0129366106080207


  data.count_filtered_Texts = data.filtered_Texts.apply(lambda x: x.count(' # '))


count    2319.000000
mean       28.624838
std         3.277739
min         0.000000
25%        29.000000
50%        29.000000
75%        29.000000
max        29.000000
Name: filtered_Texts, dtype: float64


##################
# gl
##################      
          
case 0 % 0.017059301380991064


  data.count_filtered_Texts = data.filtered_Texts.apply(lambda x: x.count(' # '))


count    1231.000000
mean       28.505280
std         3.756806
min         0.000000
25%        29.000000
50%        29.000000
75%        29.000000
max        29.000000
Name: filtered_Texts, dtype: float64


##################
# lu
##################      
          
case 0 % 0.014705882352941176


  data.count_filtered_Texts = data.filtered_Texts.apply(lambda x: x.count(' # '))


count    816.000000
mean      28.573529
std        3.492953
min        0.000000
25%       29.000000
50%       29.000000
75%       29.000000
max       29.000000
Name: filtered_Texts, dtype: float64

In [10]:
data[data.count_filtered_Texts == 0]

Unnamed: 0,User_ID,Polarity,Texts,filtered_Texts
59,r2_lu_73,against,na,
248,r2_lu_322,against,na,
306,r2_lu_393,for,na,
314,r2_lu_402,against,na,
350,r2_lu_447,for,na,
419,r2_lu_543,against,na,
457,r2_lu_594,against,na,
613,r2_lu_832,against,na,
695,r2_lu_942,against,na,
699,r2_lu_946,for,na,
