## Setup

### imports

In [1]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, MaxAbsScaler
import numpy as np
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.pipeline import Pipeline as IMBPipeline
from sklearn.feature_selection import RFE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords
from tqdm import tqdm
tqdm.pandas()
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import RFE
from tsfresh.feature_extraction import MinimalFCParameters
from tsfresh.feature_extraction import extract_features

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/semcovici/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
import sys
sys.path.append('../src/')
from models.classification_methods import process_classification 

### definitions

In [4]:
path_raw_data = '../data/raw/'
path_processed_data = '../data/processed/'
path_results_cr = '../reports/classification_reports/'
path_test_results = '../reports/test_results/'

In [5]:
list_target = ['ig','bo', 'cl', 'co', 'gl', 'lu']

In [6]:
corpus = 'ig'
model_name = 'facebook/fasttext-pt-vectors'
model_name = 'neuralmind/bert-base-portuguese-cased'

## Classification

### Top mentioned timelines

In [7]:
# create a list of tuples with (data_train, data_test, target)

list_tuples_top_ment = []

for target in tqdm(list_target):
    
    path_data_train = path_processed_data + f'train_r3_{target}_top_mentioned_timelines_{model_name.replace("/", "_")}.parquet'
    path_data_test = path_processed_data + f'test_r3_{target}_top_mentioned_timelines_{model_name.replace("/", "_")}.parquet'

    data_train = pd.read_parquet(path_data_train)
    data_test = pd.read_parquet(path_data_test)
    
    list_tuples_top_ment.append((data_train, data_test, target))

100%|██████████| 6/6 [00:13<00:00,  2.30s/it]


In [8]:
X_cols = [col for col in data_test.columns if 'emb' in col]

['Texts_emb_1',
 'Texts_emb_2',
 'Texts_emb_3',
 'Texts_emb_4',
 'Texts_emb_5',
 'Texts_emb_6',
 'Texts_emb_7',
 'Texts_emb_8',
 'Texts_emb_9',
 'Texts_emb_10',
 'Texts_emb_11',
 'Texts_emb_12',
 'Texts_emb_13',
 'Texts_emb_14',
 'Texts_emb_15',
 'Texts_emb_16',
 'Texts_emb_17',
 'Texts_emb_18',
 'Texts_emb_19',
 'Texts_emb_20',
 'Texts_emb_21',
 'Texts_emb_22',
 'Texts_emb_23',
 'Texts_emb_24',
 'Texts_emb_25',
 'Texts_emb_26',
 'Texts_emb_27',
 'Texts_emb_28',
 'Texts_emb_29',
 'Texts_emb_30',
 'Texts_emb_31',
 'Texts_emb_32',
 'Texts_emb_33',
 'Texts_emb_34',
 'Texts_emb_35',
 'Texts_emb_36',
 'Texts_emb_37',
 'Texts_emb_38',
 'Texts_emb_39',
 'Texts_emb_40',
 'Texts_emb_41',
 'Texts_emb_42',
 'Texts_emb_43',
 'Texts_emb_44',
 'Texts_emb_45',
 'Texts_emb_46',
 'Texts_emb_47',
 'Texts_emb_48',
 'Texts_emb_49',
 'Texts_emb_50',
 'Texts_emb_51',
 'Texts_emb_52',
 'Texts_emb_53',
 'Texts_emb_54',
 'Texts_emb_55',
 'Texts_emb_56',
 'Texts_emb_57',
 'Texts_emb_58',
 'Texts_emb_59',
 'Text

In [9]:
X_cols

['Texts_emb_1',
 'Texts_emb_2',
 'Texts_emb_3',
 'Texts_emb_4',
 'Texts_emb_5',
 'Texts_emb_6',
 'Texts_emb_7',
 'Texts_emb_8',
 'Texts_emb_9',
 'Texts_emb_10',
 'Texts_emb_11',
 'Texts_emb_12',
 'Texts_emb_13',
 'Texts_emb_14',
 'Texts_emb_15',
 'Texts_emb_16',
 'Texts_emb_17',
 'Texts_emb_18',
 'Texts_emb_19',
 'Texts_emb_20',
 'Texts_emb_21',
 'Texts_emb_22',
 'Texts_emb_23',
 'Texts_emb_24',
 'Texts_emb_25',
 'Texts_emb_26',
 'Texts_emb_27',
 'Texts_emb_28',
 'Texts_emb_29',
 'Texts_emb_30',
 'Texts_emb_31',
 'Texts_emb_32',
 'Texts_emb_33',
 'Texts_emb_34',
 'Texts_emb_35',
 'Texts_emb_36',
 'Texts_emb_37',
 'Texts_emb_38',
 'Texts_emb_39',
 'Texts_emb_40',
 'Texts_emb_41',
 'Texts_emb_42',
 'Texts_emb_43',
 'Texts_emb_44',
 'Texts_emb_45',
 'Texts_emb_46',
 'Texts_emb_47',
 'Texts_emb_48',
 'Texts_emb_49',
 'Texts_emb_50',
 'Texts_emb_51',
 'Texts_emb_52',
 'Texts_emb_53',
 'Texts_emb_54',
 'Texts_emb_55',
 'Texts_emb_56',
 'Texts_emb_57',
 'Texts_emb_58',
 'Texts_emb_59',
 'Text

In [38]:
# define pipeline steps 
scaling = MaxAbsScaler()
selection = None
estimator = XGBClassifier(
                random_state = 42,
                verbosity = 3,
                device = 'cuda',
                tree_method = 'hist'
                )


# get results
df_cr, df_test_results = process_classification(
        estimator = estimator,
        scaling = scaling,
        selection= selection,
        data_tuples = list_tuples_top_ment,
        X_cols = X_cols
)
df_cr.to_csv(path_results_cr + 'dummy_classifier_users_timeline_classification_report.csv')
df_cr[df_cr['class'] == 'macro avg'].sort_values('f1-score')

Training ...
[Pipeline] ........ (step 1 of 5) Processing vectorizer, total=   0.0s
[Pipeline] .......... (step 2 of 5) Processing sampling, total=   0.0s
[Pipeline] ........... (step 3 of 5) Processing scaling, total=   0.0s
[Pipeline] ......... (step 4 of 5) Processing selection, total=   0.0s
[16:54:36] AllReduce: 0.023712s, 1 calls @ 23712us

[16:54:36] MakeCuts: 0.035969s, 1 calls @ 35969us

[16:54:36] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 3
[16:54:36] DEBUG: /workspace/src/tree/updater_gpu_hist.cu:744: [GPU Hist]: Configure
[16:54:36] InitCompressedData: 0.001512s, 1 calls @ 1512us

[16:54:38] Configure: 0.009921s, 1 calls @ 9921us

[16:54:38] EvalOneIter: 0.000848s, 100 calls @ 848us

[16:54:38] GetGradient: 0.006654s, 100 calls @ 6654us

[16:54:38] PredictRaw: 0.000132s, 100 calls @ 132us

[16:54:38] UpdateOneIter: 1.72563s, 100 calls @ 1725634us

[16:54:38] BoostNewTrees: 1.706s, 100 calls @ 1705997us

[16:54:38] CommitModel: 5.8e-05s, 100 calls @ 58us

[

Unnamed: 0,class,precision,recall,f1-score,support,corpus
3,macro avg,0.682796,0.516144,0.498358,188.0,bo
3,macro avg,0.56635,0.544726,0.526831,411.0,gl
3,macro avg,0.58266,0.57985,0.578352,272.0,lu
3,macro avg,0.58429,0.57943,0.578806,599.0,ig
3,macro avg,0.604328,0.602307,0.600649,574.0,cl
3,macro avg,0.65422,0.654701,0.65439,774.0,co


### Users

In [39]:
# create a list of tuples with (data_train, data_test, target)

list_tuples_users = []

for target in tqdm(list_target):

    path_data_train = path_processed_data + f'r3_{target}_train_users_{model_name.replace("/", "_")}.parquet'
    path_data_test = path_processed_data + f'r3_{target}_test_users_{model_name.replace("/", "_")}.parquet'

    data_train = pd.read_parquet(path_data_train)
    data_test = pd.read_parquet(path_data_test)
    
    list_tuples_users.append((data_train, data_test, target))

100%|██████████| 6/6 [00:23<00:00,  3.98s/it]


In [14]:
pd.read_parquet(path_data_train, **{})

Unnamed: 0,User_ID,Polarity,Texts,Texts_emb_1,Texts_emb_2,Texts_emb_3,Texts_emb_4,Texts_emb_5,Texts_emb_6,Texts_emb_7,...,Texts_emb_759,Texts_emb_760,Texts_emb_761,Texts_emb_762,Texts_emb_763,Texts_emb_764,Texts_emb_765,Texts_emb_766,Texts_emb_767,Texts_emb_768
0,r2_lu_1,for,Bastidores do Logo mais novas palestras e cont...,-0.228853,-0.202725,0.508440,-0.006737,0.586556,0.266518,-0.067279,...,-0.078371,-0.204703,-0.780542,-0.497446,0.385078,-0.414355,0.050656,0.012453,-0.067538,-0.350661
1,r2_lu_2,for,PQP ESSE DORAMA É MUITO FOADA(Sassy GoGo(Cheer...,-0.107667,-0.151855,0.441127,0.037583,0.520407,0.679378,-0.029682,...,0.006313,0.026672,-0.765249,-0.240724,0.398015,-0.335633,0.090770,0.056602,0.063500,-0.374083
2,r2_lu_3,against,"@Gremio E que domínio, hein campeão? # @Analis...",-0.159751,-0.175113,0.287834,-0.063407,0.524479,0.426202,0.115893,...,-0.037945,-0.006513,-0.776340,-0.320336,0.408296,-0.454143,0.057517,-0.155906,-0.106136,-0.307476
3,r2_lu_5,for,a vontade de cortar o cabelo curtinho não pass...,-0.204866,-0.195960,0.342440,-0.008330,0.581180,0.559819,-0.025620,...,0.018827,0.013892,-0.643602,-0.244813,0.475716,-0.391150,0.268533,0.040979,-0.040325,-0.285664
4,r2_lu_9,for,Para mais informações sigam os perfil do @govb...,-0.161962,-0.293159,0.410162,-0.022136,0.492353,0.250965,0.029293,...,0.133500,-0.203925,-0.487742,-0.325804,0.384087,-0.418149,0.102530,-0.095404,-0.091220,-0.275685
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
811,r2_lu_1089,for,".@GuilhermeBoulos convida. É domingo, dia 13. ...",-0.195680,-0.102688,0.375545,0.028601,0.552920,0.321834,0.033052,...,0.120257,-0.086251,-0.673046,-0.309781,0.414674,-0.438636,0.014682,-0.075526,-0.013378,-0.408907
812,r2_lu_1092,for,(...) p/ conscientizar e explicar como elimina...,-0.094237,-0.217138,0.524669,0.005611,0.488059,0.370081,-0.089034,...,0.114370,-0.112064,-0.733496,-0.348119,0.266663,-0.431373,0.149339,-0.134464,-0.057407,-0.331929
813,r2_lu_1095,for,"@TVJustica @RadioJustica O min. Marco Aurélio,...",-0.198064,-0.166758,0.530498,0.133116,0.373059,0.479163,-0.018344,...,0.113587,-0.043571,-0.714946,-0.401168,0.422786,-0.384292,-0.050555,0.076332,-0.154675,-0.129023
814,r2_lu_1096,for,IBGE sugere vacinar equipes para Censo e discu...,-0.137815,-0.194538,0.517122,-0.037006,0.614969,0.288686,0.126937,...,-0.030401,-0.216349,-0.628359,-0.308201,0.408606,-0.290563,-0.011164,-0.139985,-0.089073,-0.112610


#### Timelines

In [44]:
X_cols = [col for col in data_test.columns if 'emb' in col and 'Timeline' in col]

In [46]:
# define pipeline steps 
scaling = MaxAbsScaler()
selection = None
estimator = XGBClassifier(
                random_state = 42,
                verbosity = 3,
                device = 'cuda',
                tree_method = 'hist'
                )


# get results
df_cr, df_test_results = process_classification(
        estimator = estimator,
        scaling = scaling,
        selection= selection,
        data_tuples = list_tuples_users,
        X_cols = X_cols
)

df_cr.to_csv(path_results_cr + 'dummy_classifier_users_timeline_classification_report.csv')
df_cr[df_cr['class'] == 'macro avg'].sort_values('f1-score')

Training ...
[Pipeline] ........ (step 1 of 5) Processing vectorizer, total=   0.0s
[Pipeline] .......... (step 2 of 5) Processing sampling, total=   0.0s
[Pipeline] ........... (step 3 of 5) Processing scaling, total=   0.0s
[Pipeline] ......... (step 4 of 5) Processing selection, total=   0.0s
[17:00:41] AllReduce: 0.029834s, 1 calls @ 29834us

[17:00:41] MakeCuts: 0.035246s, 1 calls @ 35246us

[17:00:41] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 3
[17:00:41] DEBUG: /workspace/src/tree/updater_gpu_hist.cu:744: [GPU Hist]: Configure
[17:00:41] InitCompressedData: 0.000158s, 1 calls @ 158us

[17:00:43] Configure: 0.018689s, 1 calls @ 18689us

[17:00:43] EvalOneIter: 0.001097s, 100 calls @ 1097us

[17:00:43] GetGradient: 0.006715s, 100 calls @ 6715us

[17:00:43] PredictRaw: 0.000138s, 100 calls @ 138us

[17:00:43] UpdateOneIter: 1.65281s, 100 calls @ 1652807us

[17:00:43] BoostNewTrees: 1.62354s, 100 calls @ 1623543us

[17:00:43] CommitModel: 5.2e-05s, 100 calls @ 52us

Unnamed: 0,class,precision,recall,f1-score,support,corpus
3,macro avg,0.613325,0.59987,0.599599,411.0,gl
3,macro avg,0.620309,0.617472,0.616969,272.0,lu
3,macro avg,0.629701,0.626611,0.627343,599.0,ig
2,macro avg,0.682637,0.681473,0.680757,574.0,cl
3,macro avg,0.712332,0.711602,0.711908,774.0,co
3,macro avg,0.853595,0.756885,0.794085,188.0,bo


#### Stance

In [47]:
X_cols = [col for col in data_test.columns if 'emb' in col and 'Stance' in col]

In [48]:
# define pipeline steps 
scaling = MaxAbsScaler()
selection = None
estimator = XGBClassifier(
                random_state = 42,
                verbosity = 3,
                device = 'cuda',
                tree_method = 'hist'
                )


# get results
df_cr, df_test_results = process_classification(
        estimator = estimator,
        scaling = scaling,
        selection= selection,
        data_tuples = list_tuples_users,
        X_cols = X_cols
)

df_cr.to_csv(path_results_cr + 'dummy_classifier_users_timeline_classification_report.csv')
df_cr[df_cr['class'] == 'macro avg'].sort_values('f1-score')

Training ...
[Pipeline] ........ (step 1 of 5) Processing vectorizer, total=   0.0s
[Pipeline] .......... (step 2 of 5) Processing sampling, total=   0.0s
[Pipeline] ........... (step 3 of 5) Processing scaling, total=   0.0s
[Pipeline] ......... (step 4 of 5) Processing selection, total=   0.0s
[17:01:52] AllReduce: 0.016668s, 1 calls @ 16668us

[17:01:52] MakeCuts: 0.024559s, 1 calls @ 24559us

[17:01:52] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 3
[17:01:52] DEBUG: /workspace/src/tree/updater_gpu_hist.cu:744: [GPU Hist]: Configure
[17:01:52] InitCompressedData: 0.000219s, 1 calls @ 219us

[17:01:54] Configure: 0.01335s, 1 calls @ 13350us

[17:01:54] EvalOneIter: 0.000814s, 100 calls @ 814us

[17:01:54] GetGradient: 0.005933s, 100 calls @ 5933us

[17:01:54] PredictRaw: 0.000124s, 100 calls @ 124us

[17:01:54] UpdateOneIter: 1.40831s, 100 calls @ 1408313us

[17:01:54] BoostNewTrees: 1.38688s, 100 calls @ 1386876us

[17:01:54] CommitModel: 5e-05s, 100 calls @ 50us

[1

Unnamed: 0,class,precision,recall,f1-score,support,corpus
3,macro avg,0.933155,0.519231,0.50122,188.0,bo
3,macro avg,0.678808,0.674879,0.675571,774.0,co
3,macro avg,0.690307,0.689977,0.690104,272.0,lu
3,macro avg,0.716314,0.715881,0.715834,574.0,cl
3,macro avg,0.765804,0.763129,0.764342,411.0,gl
3,macro avg,0.79349,0.782602,0.785899,599.0,ig
