In [1]:
# -*- coding: utf-8 -*-

# Bibliotecas padrão
import os
import sys
import warnings
from collections import Counter

# Bibliotecas de terceiros
import pandas as pd
from joblib import dump, load
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import (
    GridSearchCV,
    StratifiedKFold,
    cross_val_predict,
    cross_validate,
    train_test_split
)
from sklearn.pipeline import Pipeline


# Imports locais
sys.path.append('../src/')
from utils import list_to_sent
from models.classification_methods import create_test_results_df
from data.lambdas import int_to_label, label_to_int

# Global settings
# Silences every warning category for the rest of the session, so nothing
# raised by the libraries used below will be shown.
warnings.filterwarnings('ignore')


In [2]:
# Root folder under which the experiment cells write their result CSV files.
reports_path = '../reports/'


# Naming templates for the result files, kept for reference only:
# test_results_path = f"{reports_path}test_results/{estimator_name}_{target}_{exp_name}_test_results.csv"
# train_results_path = f"{reports_path}train_results/{estimator_name}_{target}_{exp_name}_train_results.csv"
# val_results_path = f"{reports_path}val_results/{estimator_name}_{target}_{exp_name}_val_results.csv"

# Conjuntos de dados

Primeiro, faremos a leitura dos conjuntos de dados.

In [3]:
# Show what sits next to this notebook in the working directory.
os.listdir(os.curdir)

['Experimentos UstanceBR V3.ipynb',
 '0.1-analysis_users.ipynb',
 'old',
 '0.2-comparison_top_mentioned_timelines.ipynb',
 '0.1-analysis_top_mentioned_timelines.ipynb',
 '3.results_analysis.ipynb',
 '0.analysis_relat.ipynb',
 'pablocosta',
 '0.2-analysis_coments_about_target.ipynb',
 'relevant_terms.ipynb']



Primeiro, leremos todos os dados textuais, na aba "Texto".

In [4]:
# Folder holding the raw input files; the loading cells build their paths from it.
data_dir = "../data/raw"

In [5]:
# Load every statements CSV under Texto_v3 into a dict keyed by "<target>_<split>".
# The key is derived from each file's own name, so a file can never be paired
# with the wrong topic, and the builtin `dir` is no longer shadowed.
texto_dir = os.path.join(data_dir, 'Texto_v3')
arquivos = os.listdir(texto_dir)

texto = {}
for arquivo in sorted(arquivos):
    if not arquivo.endswith('.csv'):
        # Skip stray entries (checkpoints, OS metadata) instead of trying to parse them.
        continue
    # e.g. 'r3_bo_train_statements.csv' -> 'bo_train'
    topico = '_'.join(arquivo.split('_')[1:3])
    if topico in texto:
        raise ValueError(f'More than one file maps to topic {topico!r}: {arquivo}')
    texto[topico] = pd.read_csv(os.path.join(texto_dir, arquivo), sep=';')

# Built from sorted file names, so loops over the topics run in the same
# order on every kernel (a set has no stable order for strings).
topicos = list(texto)

In [6]:
# Display the file names returned by the most recent directory listing.
arquivos

['r3_lu_train_statements.csv',
 'r3_gl_train_statements.csv',
 'r3_gl_test_statements.csv',
 'r3_bo_test_statements.csv',
 'r3_cl_train_statements.csv',
 'r3_co_test_statements.csv',
 'r3_ig_train_statements.csv',
 'r3_bo_train_statements.csv',
 'r3_lu_test_statements.csv',
 'r3_ig_test_statements.csv',
 'r3_cl_test_statements.csv',
 'r3_co_train_statements.csv']

Temos o texto do tweet, a polaridade (contra ou a favor), *o que é factual?*, o id do usuário, o id do tweet, *o que é tweet seq?*, as POS tags, lista de menções e substantivos.

In [7]:
# Copy each frame's index into a 'texto_id' column so it can be carried along
# when the frame is merged with other tables.
for df_texto in texto.values():
    df_texto['texto_id'] = df_texto.index

In [8]:
# Print the stance label counts for every topic that is not a test split.
for topico in topicos:
    if 'test' in topico:
        continue
    print(topico)
    print(texto[topico].Polarity.value_counts())
    print()

ig_train
Polarity
against    1015
for         781
Name: count, dtype: int64

bo_train
Polarity
against    487
for         76
Name: count, dtype: int64

cl_train
Polarity
against    865
for        856
Name: count, dtype: int64

gl_train
Polarity
for        730
against    501
Name: count, dtype: int64

lu_train
Polarity
against    427
for        389
Name: count, dtype: int64

co_train
Polarity
for        1257
against    1062
Name: count, dtype: int64



Agora, leremos os conjuntos de dados de menções.

In [9]:
# Load the mention workbooks and attach each user's stance label and text id.
mencoes_dir = os.path.join(data_dir, 'Mencoes')
arquivos = os.listdir(mencoes_dir)

# Every file is paired with two topics, matched by sorted position (the list of
# files is doubled below). zip() would silently truncate on a count mismatch,
# so fail loudly instead.
if len(topicos) != 2 * len(arquivos):
    raise ValueError(
        f'Expected two topics per file, got {len(topicos)} topics '
        f'and {len(arquivos)} files in {mencoes_dir}'
    )

# Read each workbook once instead of once per topic it is paired with.
planilhas = {
    arquivo: pd.read_excel(os.path.join(mencoes_dir, arquivo))
    for arquivo in arquivos
}

mencoes = {
    topico: pd.merge(
        planilhas[arquivo],
        texto[topico][['User_ID', 'Polarity', 'texto_id']],
        on='User_ID', how='inner'
    )
    for topico, arquivo in zip(sorted(topicos), sorted(arquivos * 2))
}

In [10]:
# Inspect the merged mentions table for 'bo_test', ordered by text id.
mencoes['bo_test'].sort_values(by='texto_id')

Unnamed: 0,User_ID,N_Contacts,Timeline,Anon_Contacts,Polarity,texto_id
4,r2_bo_18,123,"['02-Jun-2019 17:27', '03-Jun-2019 00:57', '03...","[5795.1, 2929.0, 4890.4, 1400.6, 4373.4, 4527....",against,0
15,r2_bo_69,196,"['24-Mar-2020 16:12', '24-Mar-2020 16:30', '24...","[5157.9, 4230.3, 4952.6, 2275.5, 5676.6, 454.0...",against,1
20,r2_bo_80,2,"['29-Jun-2016 19:42', '29-Jun-2016 19:47', '30...","[123.9, 2003.2]",against,2
11,r2_bo_57,40,"['24-May-2019 16:57', '24-May-2019 17:01', '24...","[5691.2, 3192.0, 2004.6, 1270.3, 5522.3, 1257....",against,3
16,r2_bo_71,18,"['23-Feb-2019 01:53', '23-Feb-2019 01:54', '23...","[2644.6, 2402.9, 4504.2, 3876.9, 5949.7, 306.0...",against,4
...,...,...,...,...,...,...
39,r2_bo_154,76,"['25-Apr-2017 15:07', '02-May-2017 22:29', '03...","[4470.4, 665.5, 1946.3, 1840.5, 2774.5, 5362.3...",against,183
60,r2_bo_248,43,"['11-Jun-2018 00:54', '11-Jun-2018 00:57', '11...","[5381.7, 1698.7, 156.8, 2102.5, 2509.7, 2203.8...",against,184
141,r2_bo_573,38,"['29-Nov-2016 21:41', '29-Nov-2016 21:43', '29...","[754.7, 2265.1, 1318.8, 4647.6, 4464.0, 2276.8...",for,185
145,r2_bo_591,43,"['14-Sep-2017 04:05', '14-Sep-2017 04:07', '14...","[754.7, 1311.5, 2777.4, 953.7, 2155.9, 1904.1,...",against,186


Temos o id do usuário, o número de contatos, sua timeline e os contatos.

Por fim, os dados de rede.

In [11]:
# Load the network workbooks and attach each user's stance label and text id.
rede_dir = os.path.join(data_dir, 'Rede')
arquivos = os.listdir(rede_dir)

# Every file is paired with two topics, matched by sorted position (the list of
# files is doubled below). zip() would silently truncate on a count mismatch,
# so fail loudly instead.
if len(topicos) != 2 * len(arquivos):
    raise ValueError(
        f'Expected two topics per file, got {len(topicos)} topics '
        f'and {len(arquivos)} files in {rede_dir}'
    )

# Read each workbook once instead of once per topic it is paired with.
planilhas_rede = {
    arquivo: pd.read_excel(os.path.join(rede_dir, arquivo))
    for arquivo in arquivos
}

rede = {
    topico: pd.merge(
        planilhas_rede[arquivo],
        texto[topico][['User_ID', 'Polarity', 'texto_id']],
        on='User_ID', how='inner'
    )
    for topico, arquivo in zip(sorted(topicos), sorted(arquivos * 2))
}

Temos o id do usuário, o número de status, amigos, seguidores, e a lista de amigos e de seguidores.

In [12]:
# Preview the first rows of the merged network table for 'bo_train'.
rede['bo_train'].head()

Unnamed: 0,User_ID,Statuses,N_Friends,N_Followers,Friends_Anon,Followers_Anon,Polarity,texto_id
0,r2_bo_1,1387,1986,438,"[3930382, 2702214, 2136127, 3959950, 1724473, ...","[3917523, 3008888, 2702214, 4059653, 2834105, ...",against,15
1,r2_bo_2,6042,494,202,"[3789053, 2319304, 2861841, 834345, 3569043, 5...","[507072, 4124188, 967649, 2584069, 1675733, 43...",against,191
2,r2_bo_5,8187,1447,261,"[3503961, 2556472, 2386283, 1625881, 4413129, ...","[2612737, 2684337, 1750113, 3197553, 2098010, ...",for,110
3,r2_bo_6,8134,505,387,"[3995969, 4126043, 3336755, 719969, 1010094, 2...","[4126043, 719969, 3216265, 2646055, 3411718, 3...",against,525
4,r2_bo_7,140060,1559,4123,"[3582639, 3817031, 2094229, 2907984, 2713010, ...","[724711, 3582639, 4378687, 2907984, 2713010, 4...",against,198


# Dados de rede

Primeiro, vamos trabalhar com o conjunto de dados de rede.

## Preparação do conjunto de dados

Vamos precisar juntar as listas numa "frase" para podermos criar o "Bag-of-friends" e "Bag-of-followers".

In [13]:
# Run list_to_sent over the friend and follower columns of every network table,
# replacing the columns in place.
for df_rede in rede.values():
    for coluna in ('Friends_Anon', 'Followers_Anon'):
        df_rede[coluna] = df_rede[coluna].apply(list_to_sent)

In [14]:
# Preview 'bo_test' after the list columns were converted.
rede['bo_test'].head()

Unnamed: 0,User_ID,Statuses,N_Friends,N_Followers,Friends_Anon,Followers_Anon,Polarity,texto_id
0,r2_bo_3,12019,881,187,4047120 916068 3586417 2096367 1584057 2002837...,920686 19557 1060163 520948 4020305 2572839 22...,against,124
1,r2_bo_4,38342,6645,7735,230097 918724 1430184 1672828 3835582 4122868 ...,2085767 283735 230097 1173216 918724 1197457 4...,against,148
2,r2_bo_10,54066,690,1379,2040675 3300177 3668805 3257783 3593536 160273...,2334462 3967745 2040675 3528948 3903925 330017...,against,97
3,r2_bo_12,7369,499,500,1386614 843924 1676100 1837160 3739472 1986840...,2843532 1386614 157197 81339 843924 1555505 16...,for,108
4,r2_bo_18,71910,776,499,3139803 978571 831940 1576431 377871 310723 41...,3139803 1202535 4154262 2144382 4396019 117356...,against,0


Agora, vamos ver se os conjuntos de dados estão balanceados.

In [15]:
# Split the topic keys in two groups, preserving their order in `rede`:
# keys containing 'test' and all the others.
train_keys, test_keys = [], []
for chave in rede:
    (test_keys if 'test' in chave else train_keys).append(chave)

In [16]:
# Separate features and labels per topic and print the class proportions of
# every training topic.
colunas_rede = ['Friends_Anon', 'Followers_Anon', 'texto_id']

X = {r: rede[r][colunas_rede] for r in train_keys}
y = {r: rede[r]['Polarity'] for r in train_keys}

X_teste = {r: rede[r][colunas_rede] for r in test_keys}
y_teste = {r: rede[r]['Polarity'] for r in test_keys}

for r in train_keys:
    print(r)
    # normalize takes a bool; the string 'True' only worked because any
    # non-empty string is truthy (so would 'False').
    print(y[r].value_counts(normalize=True))
    print()

bo_train
Polarity
against    0.865009
for        0.134991
Name: proportion, dtype: float64

cl_train
Polarity
against    0.502615
for        0.497385
Name: proportion, dtype: float64

co_train
Polarity
for        0.542044
against    0.457956
Name: proportion, dtype: float64

gl_train
Polarity
for        0.593014
against    0.406986
Name: proportion, dtype: float64

ig_train
Polarity
against    0.565145
for        0.434855
Name: proportion, dtype: float64

lu_train
Polarity
against    0.523284
for        0.476716
Name: proportion, dtype: float64



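Com exceção de `bo_train`, que é fortemente desbalanceado (cerca de 86,5% *against* e 13,5% *for*), os conjuntos de dados são aproximadamente balanceados.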

## Bag-of-Friends

O primeiro teste que faremos é com o campo "Friends_Anon".

In [17]:
# Hold out 20% of every training topic, stratified by label, with a fixed seed.
x_train, x_test, y_train, y_test = {}, {}, {}, {}

for topico in train_keys:
    partes = train_test_split(
        X[topico], y[topico], test_size=0.2, random_state=123, stratify=y[topico]
    )
    x_train[topico], x_test[topico], y_train[topico], y_test[topico] = partes

In [18]:
# Hyper-parameter grid and pipeline for the bag-of-friends logistic regression.
Cs = [i/10 for i in range(1, 21, 2)]
tols = [1/(10**i) for i in range(2, 6)]
parameters = {
    'model__penalty': ['l2', 'l1'],
    'model__C': Cs, 'model__tol': tols
}

reglog = LogisticRegression(class_weight='balanced', solver='liblinear', max_iter=500, random_state=123)

# Raw string: '\d' inside a normal literal is an invalid escape sequence,
# which newer Python versions warn about.
pipe_grid = Pipeline(steps=[
    ('tf-idf',  TfidfVectorizer(token_pattern=r'\d+')),
    ('20.000 best', SelectKBest(k=20000)),
    ('model', reglog)
])

clf = GridSearchCV(pipe_grid, parameters)

for r in train_keys:
    tfidf = TfidfVectorizer(token_pattern=r'\d+')
    X_tfidf = tfidf.fit_transform(x_train[r]['Friends_Anon'])

    # Number of TF-IDF features before SelectKBest is applied
    print(r, "Dimensionalidade do conjunto de dados antes do SelectKBest:", X_tfidf.shape[1])

    # Grid search is left disabled; uncomment to run it.
    # clf.fit(x_train[r]['Friends_Anon'], y_train[r])

    # print(r)
    # print(clf.best_params_)
    # print(clf.best_score_)
    # print()

bo_train Dimensionalidade do conjunto de dados antes do SelectKBest: 229536
cl_train Dimensionalidade do conjunto de dados antes do SelectKBest: 405090
co_train Dimensionalidade do conjunto de dados antes do SelectKBest: 562272
gl_train Dimensionalidade do conjunto de dados antes do SelectKBest: 476345
ig_train Dimensionalidade do conjunto de dados antes do SelectKBest: 612846
lu_train Dimensionalidade do conjunto de dados antes do SelectKBest: 302731


In [19]:
# Fit the final bag-of-friends model of every target on its full training
# table, report test metrics and save per-sample predictions for both splits.

# Hyper-parameters keyed by target rather than by position in test_keys, so a
# change in key order cannot silently pair a target with the wrong values.
friends_params = {
    'bo': {'C': 0.3, 'penalty': 'l2', 'tol': 0.01},
    'cl': {'C': 1.3, 'penalty': 'l2', 'tol': 0.01},
    'co': {'C': 1.7, 'penalty': 'l2', 'tol': 0.01},
    'gl': {'C': 1.9, 'penalty': 'l2', 'tol': 0.01},
    'ig': {'C': 1.9, 'penalty': 'l2', 'tol': 0.01},
    'lu': {'C': 1.9, 'penalty': 'l2', 'tol': 0.01},
}
exp_name = 'BagOfFriends'
estimator_name = 'TfidfVectorizer_SelectKBest_LogisticRegression'

for r in test_keys:
    target = r.split('_')[0]
    train_key = target + '_train'

    X_train_aux = X[train_key]['Friends_Anon']
    y_train_aux = y[train_key]
    X_test_aux = X_teste[r]['Friends_Anon']
    y_test_aux = y_teste[r]

    model = LogisticRegression(
        class_weight='balanced', max_iter=500, solver='liblinear',
        random_state=123,  # fixed seed so a refit is reproducible
        **friends_params[target]
    )
    pipe_grid = Pipeline(steps=[
        # Raw string: '\d' in a normal literal is an invalid escape sequence.
        ('tf-idf',  TfidfVectorizer(token_pattern=r'\d+')),
        ('20.000 best', SelectKBest(k=20000)),
        ('model', model)
    ])
    pipe_grid.fit(X_train_aux, y_train_aux)
    # dump(pipe_grid, r+'_friends.joblib')

    print(r)

    # Same save routine for both splits; only the test split is reported.
    splits = {
        'test': (X_test_aux, y_test_aux),
        'train': (X_train_aux, y_train_aux),
    }
    for split, (features, labels) in splits.items():
        pred = pipe_grid.predict(features)
        probs = pipe_grid.predict_proba(features)
        df_results = create_test_results_df(
            labels.tolist(), pred.tolist(),
            probs[:, 0].tolist(), probs[:, 1].tolist()
        )

        results_path = f"{reports_path}{split}_results/{estimator_name}_{target}_{exp_name}_{split}_results.csv"
        # to_csv does not create missing folders.
        os.makedirs(os.path.dirname(results_path), exist_ok=True)
        df_results.to_csv(results_path, index=False)

        if split == 'test':
            print(classification_report(labels, pred))

    print()

bo_test
              precision    recall  f1-score   support

     against       0.93      0.93      0.93       162
         for       0.58      0.58      0.58        26

    accuracy                           0.88       188
   macro avg       0.75      0.75      0.75       188
weighted avg       0.88      0.88      0.88       188


cl_test
              precision    recall  f1-score   support

     against       0.80      0.92      0.86       289
         for       0.91      0.76      0.83       285

    accuracy                           0.84       574
   macro avg       0.85      0.84      0.84       574
weighted avg       0.85      0.84      0.84       574


co_test
              precision    recall  f1-score   support

     against       0.95      0.68      0.79       354
         for       0.78      0.97      0.86       420

    accuracy                           0.84       774
   macro avg       0.86      0.82      0.83       774
weighted avg       0.86      0.84      0.83     

## Bag-of-Followers

Agora usaremos o campo "Followers_Anon".

In [20]:
# Rebuild the stratified 80/20 hold-out split of every training topic
# (same seed and arguments as used for the friends experiment).
x_train = {}
x_test = {}
y_train = {}
y_test = {}

for chave in train_keys:
    tr_x, te_x, tr_y, te_y = train_test_split(
        X[chave], y[chave], test_size=0.2, random_state=123, stratify=y[chave]
    )
    x_train[chave] = tr_x
    x_test[chave] = te_x
    y_train[chave] = tr_y
    y_test[chave] = te_y

In [21]:
# Hyper-parameter grid and pipeline for the bag-of-followers logistic regression.
Cs = [i/10 for i in range(1, 21, 2)]
tols = [1/(10**i) for i in range(2, 6)]
parameters = {
    'model__penalty': ['l2', 'l1'],
    'model__C': Cs, 'model__tol': tols
}

# Seeded like the bag-of-friends estimator so the search is repeatable.
reglog = LogisticRegression(class_weight='balanced', solver='liblinear', max_iter=500, random_state=123)

# Raw string: '\d' inside a normal literal is an invalid escape sequence,
# which newer Python versions warn about.
pipe_grid = Pipeline(steps=[
    ('tf-idf',  TfidfVectorizer(token_pattern=r'\d+')),
    ('20.000 best', SelectKBest(k=20000)),
    ('model', reglog)
])

clf = GridSearchCV(pipe_grid, parameters)

In [22]:
# Print, per training topic, how many TF-IDF features the followers field
# yields before SelectKBest is applied.
for topico in train_keys:
    tfidf = TfidfVectorizer(token_pattern=r'\d+')
    X_tfidf = tfidf.fit_transform(x_train[topico]['Followers_Anon'])
    n_termos = X_tfidf.shape[1]

    print(topico, "Dimensionalidade do conjunto de dados antes do SelectKBest:", n_termos)

    # Grid search is left disabled; uncomment to run it.
    # clf.fit(x_train[topico]['Followers_Anon'], y_train[topico])

    # print(topico)
    # print(clf.best_params_)
    # print(clf.best_score_)
    # print()

# Values kept in the cell, apparently the output of an earlier grid-search run
# (best parameters and best score per topic):

# bo_train
# {'model__C': 0.1, 'model__penalty': 'l2', 'model__tol': 0.01}
# 0.9

# cl_train
# {'model__C': 1.9, 'model__penalty': 'l2', 'model__tol': 0.01}
# 0.7935915678524375

# co_train
# {'model__C': 1.5, 'model__penalty': 'l2', 'model__tol': 0.01}
# 0.7886792452830189

# gl_train
# {'model__C': 1.7, 'model__penalty': 'l2', 'model__tol': 0.01}
# 0.6331658551745571

# ig_train
# {'model__C': 0.1, 'model__penalty': 'l2', 'model__tol': 0.01}
# 0.6525091947348045

# lu_train
# {'model__C': 0.3, 'model__penalty': 'l2', 'model__tol': 0.01}
# 0.7009982384028185

bo_train Dimensionalidade do conjunto de dados antes do SelectKBest: 347157
cl_train Dimensionalidade do conjunto de dados antes do SelectKBest: 500866
co_train Dimensionalidade do conjunto de dados antes do SelectKBest: 761433
gl_train Dimensionalidade do conjunto de dados antes do SelectKBest: 738364
ig_train Dimensionalidade do conjunto de dados antes do SelectKBest: 976610
lu_train Dimensionalidade do conjunto de dados antes do SelectKBest: 435977


In [23]:
# Fit the final bag-of-followers model of every target on its full training
# table, report test metrics and save per-sample predictions for both splits.

# Hyper-parameters keyed by target rather than by position in test_keys, so a
# change in key order cannot silently pair a target with the wrong values.
followers_params = {
    'bo': {'C': 0.1, 'penalty': 'l2', 'tol': 0.01},
    'cl': {'C': 1.9, 'penalty': 'l2', 'tol': 0.01},
    'co': {'C': 1.5, 'penalty': 'l2', 'tol': 0.01},
    'gl': {'C': 1.7, 'penalty': 'l2', 'tol': 0.01},
    'ig': {'C': 0.1, 'penalty': 'l2', 'tol': 0.01},
    'lu': {'C': 0.3, 'penalty': 'l2', 'tol': 0.01},
}
exp_name = 'BagOfFollowers'
estimator_name = 'TfidfVectorizer_SelectKBest_LogisticRegression'

for r in test_keys:
    target = r.split('_')[0]
    train_key = target + '_train'

    X_train_aux = X[train_key]['Followers_Anon']
    y_train_aux = y[train_key]
    X_test_aux = X_teste[r]['Followers_Anon']
    y_test_aux = y_teste[r]

    model = LogisticRegression(
        class_weight='balanced', max_iter=500, solver='liblinear',
        random_state=123,  # fixed seed so a refit is reproducible
        **followers_params[target]
    )
    pipe_grid = Pipeline(steps=[
        # Raw string: '\d' in a normal literal is an invalid escape sequence.
        ('tf-idf',  TfidfVectorizer(token_pattern=r'\d+')),
        ('20.000 best', SelectKBest(k=20000)),
        ('model', model)
    ])
    pipe_grid.fit(X_train_aux, y_train_aux)
    # dump(pipe_grid, r+'_followers.joblib')

    print(r)

    # Same save routine for both splits; only the test split is reported.
    splits = {
        'test': (X_test_aux, y_test_aux),
        'train': (X_train_aux, y_train_aux),
    }
    for split, (features, labels) in splits.items():
        pred = pipe_grid.predict(features)
        probs = pipe_grid.predict_proba(features)
        df_results = create_test_results_df(
            labels.tolist(), pred.tolist(),
            probs[:, 0].tolist(), probs[:, 1].tolist()
        )

        results_path = f"{reports_path}{split}_results/{estimator_name}_{target}_{exp_name}_{split}_results.csv"
        # to_csv does not create missing folders.
        os.makedirs(os.path.dirname(results_path), exist_ok=True)
        df_results.to_csv(results_path, index=False)

        if split == 'test':
            print(classification_report(labels, pred))

    print()

bo_test
              precision    recall  f1-score   support

     against       0.90      0.99      0.94       162
         for       0.89      0.31      0.46        26

    accuracy                           0.90       188
   macro avg       0.89      0.65      0.70       188
weighted avg       0.90      0.90      0.88       188


cl_test
              precision    recall  f1-score   support

     against       0.76      0.94      0.84       289
         for       0.93      0.69      0.79       285

    accuracy                           0.82       574
   macro avg       0.84      0.82      0.82       574
weighted avg       0.84      0.82      0.82       574


co_test
              precision    recall  f1-score   support

     against       0.93      0.56      0.70       354
         for       0.72      0.96      0.82       420

    accuracy                           0.78       774
   macro avg       0.82      0.76      0.76       774
weighted avg       0.82      0.78      0.77     

# Dados de menções

## Preparação do conjunto de dados

Vamos precisar juntar as listas numa "frase" para podermos criar o "Bag-of-mentions".

In [24]:
# Run list_to_sent over the contacts column of every mentions table,
# replacing the column in place.
for df_mencoes in mencoes.values():
    df_mencoes['Anon_Contacts'] = df_mencoes['Anon_Contacts'].apply(list_to_sent)

In [25]:
# Preview 'bo_train' after the contacts column was converted.
mencoes['bo_train'].head()

Unnamed: 0,User_ID,N_Contacts,Timeline,Anon_Contacts,Polarity,texto_id
0,r2_bo_1,53,"['17-Jul-2015 04:11', '17-Jul-2015 04:21', '17...",4415x7 4094x0 3595x8 3701x2 3711x4 2307x1 2842...,against,15
1,r2_bo_2,87,"['27-Jan-2015 14:49', '28-Jan-2015 01:14', '29...",5631x4 2775x3 192x0 863x8 2867x2 5882x1 5203x4...,against,191
2,r2_bo_5,389,"['06-Jun-2010 02:47', '06-Jun-2010 03:34', '06...",3575x7 358x6 988x5 4673x0 1405x6 4761x9 5835x6...,for,110
3,r2_bo_6,124,"['04-Jun-2020 01:13', '04-Jun-2020 01:43', '04...",4303x1 3949x1 4839x6 1871x3 634x4 2611x1 5481x...,against,525
4,r2_bo_7,4,"['02-Apr-2020 16:16', '02-Apr-2020 16:19', '02...",3705x7 3239x1 5888x7 284x1,against,198


Agora, vamos ver se os conjuntos de dados estão balanceados.

In [26]:
# Split the topic keys in two groups, preserving their order in `mencoes`:
# keys containing 'test' and all the others.
train_keys, test_keys = [], []
for chave in mencoes:
    (test_keys if 'test' in chave else train_keys).append(chave)

In [27]:
# Separate features and labels per topic and print the class proportions of
# every training topic.
colunas_mencoes = ['Anon_Contacts', 'texto_id']

X = {r: mencoes[r][colunas_mencoes] for r in train_keys}
y = {r: mencoes[r]['Polarity'] for r in train_keys}

X_teste = {r: mencoes[r][colunas_mencoes] for r in test_keys}
y_teste = {r: mencoes[r]['Polarity'] for r in test_keys}

for r in train_keys:
    print(r)
    # normalize takes a bool; the string 'True' only worked because any
    # non-empty string is truthy (so would 'False').
    print(y[r].value_counts(normalize=True))
    print()

bo_train
Polarity
against    0.865009
for        0.134991
Name: proportion, dtype: float64

cl_train
Polarity
against    0.502615
for        0.497385
Name: proportion, dtype: float64

co_train
Polarity
for        0.542044
against    0.457956
Name: proportion, dtype: float64

gl_train
Polarity
for        0.593014
against    0.406986
Name: proportion, dtype: float64

ig_train
Polarity
against    0.565145
for        0.434855
Name: proportion, dtype: float64

lu_train
Polarity
against    0.523284
for        0.476716
Name: proportion, dtype: float64



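Com exceção de `bo_train`, que é fortemente desbalanceado (cerca de 86,5% *against* e 13,5% *for*), os conjuntos de dados são aproximadamente balanceados.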

## Bag-of-Mentions

Usaremos o campo "Anon_Contacts".

In [28]:
# Hold out 20% of every mentions training topic, stratified by label, with a fixed seed.
x_train, x_test, y_train, y_test = {}, {}, {}, {}

for topico in train_keys:
    (x_train[topico],
     x_test[topico],
     y_train[topico],
     y_test[topico]) = train_test_split(
        X[topico], y[topico], test_size=0.2, random_state=123, stratify=y[topico]
    )

In [29]:
# Hyper-parameter grid and pipeline for the bag-of-mentions logistic regression.
Cs = [i/10 for i in range(1, 21, 2)]
tols = [1/(10**i) for i in range(2, 6)]
parameters = {
    'model__penalty': ['l2', 'l1'],
    'model__solver': ['liblinear', 'saga'],
    'model__C': Cs, 'model__tol': tols
}
# Seeded because the grid includes the stochastic 'saga' solver.
reglog = LogisticRegression(class_weight='balanced', max_iter=900, random_state=123)

# Raw strings: '\d' inside a normal literal is an invalid escape sequence,
# which newer Python versions warn about.
pipe_grid = Pipeline(steps=[
    ('tf-idf',  TfidfVectorizer(token_pattern=r'\d+x\d+')),
    ('20.000 best', SelectKBest(k=20000)),
    ('model', reglog)
])

clf = GridSearchCV(pipe_grid, parameters)

for r in train_keys:
    tfidf = TfidfVectorizer(token_pattern=r'\d+x\d+')
    X_tfidf = tfidf.fit_transform(x_train[r]['Anon_Contacts'])

    # Number of TF-IDF features before SelectKBest is applied
    print(r, "Dimensionalidade do conjunto de dados antes do SelectKBest:", X_tfidf.shape[1])

    # Grid search is left disabled; uncomment to run it.
    # clf.fit(x_train[r]['Anon_Contacts'], y_train[r])

    # print(r)
    # print(clf.best_params_)
    # print(clf.best_score_)
    # print()

# Values kept in the cell, apparently the output of an earlier grid-search run
# (best parameters and best score per topic):

# bo_train
# {'model__C': 0.1, 'model__penalty': 'l2', 'model__solver': 'saga', 'model__tol': 0.01}
# 0.8911111111111112

# cl_train
# {'model__C': 1.9, 'model__penalty': 'l2', 'model__solver': 'saga', 'model__tol': 0.001}
# 0.8524822134387351

# co_train
# {'model__C': 1.9, 'model__penalty': 'l2', 'model__solver': 'liblinear', 'model__tol': 0.01}
# 0.8485175202156334

# gl_train
# {'model__C': 0.1, 'model__penalty': 'l2', 'model__solver': 'saga', 'model__tol': 0.01}
# 0.6636693255982596

# ig_train
# {'model__C': 1.9, 'model__penalty': 'l2', 'model__solver': 'liblinear', 'model__tol': 0.01}
# 0.6754839334107626

# lu_train
# {'model__C': 0.5, 'model__penalty': 'l2', 'model__solver': 'liblinear', 'model__tol': 0.01}
# 0.6887022900763358

bo_train Dimensionalidade do conjunto de dados antes do SelectKBest: 36980
cl_train Dimensionalidade do conjunto de dados antes do SelectKBest: 282532
co_train Dimensionalidade do conjunto de dados antes do SelectKBest: 344974
gl_train Dimensionalidade do conjunto de dados antes do SelectKBest: 75316
ig_train Dimensionalidade do conjunto de dados antes do SelectKBest: 108839
lu_train Dimensionalidade do conjunto de dados antes do SelectKBest: 66804


In [30]:
# Fit the final bag-of-mentions model of every target on its full training
# table, report test metrics and save per-sample predictions for both splits.

# Hyper-parameters keyed by target rather than by position in test_keys, so a
# change in key order cannot silently pair a target with the wrong values.
mentions_params = {
    'bo': {'C': 0.1, 'tol': 0.01, 'solver': 'saga'},
    'cl': {'C': 1.9, 'tol': 0.001, 'solver': 'saga'},
    'co': {'C': 1.9, 'tol': 0.01, 'solver': 'liblinear'},
    'gl': {'C': 0.1, 'tol': 0.01, 'solver': 'saga'},
    'ig': {'C': 1.9, 'tol': 0.01, 'solver': 'liblinear'},
    'lu': {'C': 0.5, 'tol': 0.01, 'solver': 'liblinear'},
}
exp_name = 'BagOfMentions'
estimator_name = 'TfidfVectorizer_SelectKBest_LogisticRegression'

for r in test_keys:
    target = r.split('_')[0]
    train_key = target + '_train'

    X_train_aux = X[train_key]['Anon_Contacts']
    y_train_aux = y[train_key]
    X_test_aux = X_teste[r]['Anon_Contacts']
    y_test_aux = y_teste[r]

    # NOTE(review): the grid-search estimator in the previous cell uses
    # max_iter=900 while this refit uses 500 - confirm which one is intended.
    model = LogisticRegression(
        class_weight='balanced', max_iter=500, penalty='l2',
        random_state=123,  # 'saga' is stochastic; a fixed seed makes the refit reproducible
        **mentions_params[target]
    )
    pipe_grid = Pipeline(steps=[
        # Raw string: '\d' in a normal literal is an invalid escape sequence.
        ('tf-idf',  TfidfVectorizer(token_pattern=r'\d+x\d+')),
        ('20.000 best', SelectKBest(k=20000)),
        ('model', model)
    ])
    pipe_grid.fit(X_train_aux, y_train_aux)

    print(r)

    # Same save routine for both splits; only the test split is reported.
    splits = {
        'test': (X_test_aux, y_test_aux),
        'train': (X_train_aux, y_train_aux),
    }
    for split, (features, labels) in splits.items():
        pred = pipe_grid.predict(features)
        probs = pipe_grid.predict_proba(features)
        df_results = create_test_results_df(
            labels.tolist(), pred.tolist(),
            probs[:, 0].tolist(), probs[:, 1].tolist()
        )

        results_path = f"{reports_path}{split}_results/{estimator_name}_{target}_{exp_name}_{split}_results.csv"
        # to_csv does not create missing folders.
        os.makedirs(os.path.dirname(results_path), exist_ok=True)
        df_results.to_csv(results_path, index=False)

        if split == 'test':
            print(classification_report(labels, pred))

bo_test
              precision    recall  f1-score   support

     against       0.93      0.92      0.92       162
         for       0.52      0.54      0.53        26

    accuracy                           0.87       188
   macro avg       0.72      0.73      0.73       188
weighted avg       0.87      0.87      0.87       188

cl_test
              precision    recall  f1-score   support

     against       0.86      0.88      0.87       289
         for       0.87      0.85      0.86       285

    accuracy                           0.86       574
   macro avg       0.86      0.86      0.86       574
weighted avg       0.86      0.86      0.86       574

co_test
              precision    recall  f1-score   support

     against       0.88      0.76      0.82       354
         for       0.82      0.91      0.86       420

    accuracy                           0.84       774
   macro avg       0.85      0.84      0.84       774
weighted avg       0.85      0.84      0.84       