## Setup

### Imports

In [1]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, MaxAbsScaler
import numpy as np
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.pipeline import Pipeline as IMBPipeline
from sklearn.feature_selection import RFE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords
from tqdm import tqdm
tqdm.pandas()
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import RFE
from tsfresh.feature_extraction import MinimalFCParameters
from tsfresh.feature_extraction import extract_features

In [2]:
import nltk
# Fetch the NLTK stop-word corpus that the vectorizer cells read through
# stopwords.words('portuguese'); returns True and re-downloads nothing when
# the package is already up to date.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/semcovici/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
import sys

# Make the project's ../src/ directory importable from this notebook.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
if '../src/' not in sys.path:
    sys.path.append('../src/')

# Project-local helper that runs the classification experiments below.
from models.classification_methods import process_classification

### Definitions

In [4]:
# Data directories, relative to the notebook's working directory.
# Kept as plain strings with a trailing slash because the loading cells
# build file paths by string concatenation (path_raw_data + file name).
path_raw_data = '../data/raw/'
path_processed_data = '../data/processed/'

In [5]:
# Target identifiers; each one selects a train/test pair of CSV files
# in the loading cells below.
list_target = ['ig', 'bo', 'cl', 'co', 'gl', 'lu']

In [6]:
corpus = 'ig'

# Embedding model identifier. The previous version assigned
# 'facebook/fasttext-pt-vectors' first and immediately overwrote it, so only
# the value below ever took effect; swap the string to use the other model.
model_name = 'neuralmind/bert-base-portuguese-cased'

## Classification

### Top mentioned timelines

In [7]:
# Collect one (data_train, data_test, target) tuple per target from the
# "top mentioned timelines" CSV files.
list_tuples_top_ment = []

for target in tqdm(list_target):
    # File names follow "<split>_r3_<target>_top_mentioned_timelines.csv".
    path_data_train = path_raw_data + f'train_r3_{target}_top_mentioned_timelines.csv'
    path_data_test = path_raw_data + f'test_r3_{target}_top_mentioned_timelines.csv'

    # Semicolon-separated files; 'utf-8-sig' strips a leading BOM if present.
    data_train = pd.read_csv(path_data_train, sep=';', encoding='utf-8-sig')
    data_test = pd.read_csv(path_data_test, sep=';', encoding='utf-8-sig')

    list_tuples_top_ment.append((data_train, data_test, target))

100%|██████████| 6/6 [00:22<00:00,  3.80s/it]


In [8]:
# Column name passed as X_cols to process_classification for the
# top-mentioned-timelines run; presumably the text field of those CSVs — confirm against the data.
X_cols = 'Texts'

In [9]:
# Pipeline components for this run.
# TF-IDF over lower-cased 1- to 3-grams, Portuguese stop words removed,
# vocabulary capped at 50,000 features.
text_vect = TfidfVectorizer(
    stop_words=stopwords.words('portuguese'),
    lowercase=True,
    ngram_range=(1, 3),
    max_features=50000,
)
scaling = MaxAbsScaler()

# No resampling and no feature selection in this configuration.
# NOTE(review): `sampling` is defined here but not passed to
# process_classification below — confirm that this is intended.
sampling = None
selection = None

# GPU histogram-based XGBoost; verbosity=3 emits debug-level logs.
estimator = XGBClassifier(
    random_state=42,
    verbosity=3,
    device='cuda',
    tree_method='hist',
)

# Run the project helper over every (train, test, target) tuple.
df_cr, df_test_results = process_classification(
    estimator=estimator,
    vectorizer=text_vect,
    scaling=scaling,
    selection=selection,
    data_tuples=list_tuples_top_ment,
    X_cols=X_cols,
)

# Show only the macro-average rows, sorted by ascending f1-score.
df_cr.loc[df_cr['class'] == 'macro avg'].sort_values('f1-score')

Training ...


### Users

In [None]:
# Collect one (data_train, data_test, target) tuple per target from the
# per-user CSV files.
list_tuples_users = []

for target in tqdm(list_target):
    # File names follow "r3_<target>_<split>_users.csv".
    path_data_train = path_raw_data + f'r3_{target}_train_users.csv'
    path_data_test = path_raw_data + f'r3_{target}_test_users.csv'

    # Semicolon-separated files; 'utf-8-sig' strips a leading BOM if present.
    data_train = pd.read_csv(path_data_train, sep=';', encoding='utf-8-sig')
    data_test = pd.read_csv(path_data_test, sep=';', encoding='utf-8-sig')

    list_tuples_users.append((data_train, data_test, target))

100%|██████████| 6/6 [00:36<00:00,  6.03s/it]


#### Timelines

In [None]:
# Column name passed as X_cols to process_classification for the users
# "Timelines" run; presumably a text column of the users CSVs — confirm against the data.
X_cols = 'Timeline'

In [None]:
# Pipeline components for this run.
# TF-IDF over lower-cased 1- to 3-grams, Portuguese stop words removed,
# vocabulary capped at 50,000 features.
text_vect = TfidfVectorizer(
    stop_words=stopwords.words('portuguese'),
    lowercase=True,
    ngram_range=(1, 3),
    max_features=50000,
)
scaling = MaxAbsScaler()

# No resampling and no feature selection in this configuration.
# NOTE(review): `sampling` is defined here but not passed to
# process_classification below — confirm that this is intended.
sampling = None
selection = None

# GPU histogram-based XGBoost; verbosity=3 emits debug-level logs.
estimator = XGBClassifier(
    random_state=42,
    verbosity=3,
    device='cuda',
    tree_method='hist',
)

# Run the project helper over every (train, test, target) users tuple.
df_cr, df_test_results = process_classification(
    estimator=estimator,
    vectorizer=text_vect,
    scaling=scaling,
    selection=selection,
    data_tuples=list_tuples_users,
    X_cols=X_cols,
)

# Show only the macro-average rows, sorted by ascending f1-score.
df_cr.loc[df_cr['class'] == 'macro avg'].sort_values('f1-score')

Training ...
[Pipeline] ........ (step 1 of 5) Processing vectorizer, total=   0.0s
[Pipeline] .......... (step 2 of 5) Processing sampling, total=   0.0s
[Pipeline] ........... (step 3 of 5) Processing scaling, total=   0.0s
[Pipeline] ......... (step 4 of 5) Processing selection, total=   0.0s
[17:00:41] AllReduce: 0.029834s, 1 calls @ 29834us

[17:00:41] MakeCuts: 0.035246s, 1 calls @ 35246us

[17:00:41] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 3
[17:00:41] DEBUG: /workspace/src/tree/updater_gpu_hist.cu:744: [GPU Hist]: Configure
[17:00:41] InitCompressedData: 0.000158s, 1 calls @ 158us

[17:00:43] Configure: 0.018689s, 1 calls @ 18689us

[17:00:43] EvalOneIter: 0.001097s, 100 calls @ 1097us

[17:00:43] GetGradient: 0.006715s, 100 calls @ 6715us

[17:00:43] PredictRaw: 0.000138s, 100 calls @ 138us

[17:00:43] UpdateOneIter: 1.65281s, 100 calls @ 1652807us

[17:00:43] BoostNewTrees: 1.62354s, 100 calls @ 1623543us

[17:00:43] CommitModel: 5.2e-05s, 100 calls @ 52us

Unnamed: 0,class,precision,recall,f1-score,support,corpus
3,macro avg,0.613325,0.59987,0.599599,411.0,gl
3,macro avg,0.620309,0.617472,0.616969,272.0,lu
3,macro avg,0.629701,0.626611,0.627343,599.0,ig
2,macro avg,0.682637,0.681473,0.680757,574.0,cl
3,macro avg,0.712332,0.711602,0.711908,774.0,co
3,macro avg,0.853595,0.756885,0.794085,188.0,bo


#### Stance

In [None]:
# Column name passed as X_cols to process_classification for the users
# "Stance" run; presumably a text column of the users CSVs — confirm against the data.
X_cols = 'Stance'

In [None]:
# Pipeline components for this run.
# TF-IDF over lower-cased 1- to 3-grams, Portuguese stop words removed,
# vocabulary capped at 50,000 features.
text_vect = TfidfVectorizer(
    stop_words=stopwords.words('portuguese'),
    lowercase=True,
    ngram_range=(1, 3),
    max_features=50000,
)
scaling = MaxAbsScaler()

# No resampling and no feature selection in this configuration.
# NOTE(review): `sampling` is defined here but not passed to
# process_classification below — confirm that this is intended.
sampling = None
selection = None

# GPU histogram-based XGBoost; verbosity=3 emits debug-level logs.
estimator = XGBClassifier(
    random_state=42,
    verbosity=3,
    device='cuda',
    tree_method='hist',
)

# Run the project helper over every (train, test, target) users tuple.
df_cr, df_test_results = process_classification(
    estimator=estimator,
    vectorizer=text_vect,
    scaling=scaling,
    selection=selection,
    data_tuples=list_tuples_users,
    X_cols=X_cols,
)

# Show only the macro-average rows, sorted by ascending f1-score.
df_cr.loc[df_cr['class'] == 'macro avg'].sort_values('f1-score')

Training ...
[Pipeline] ........ (step 1 of 5) Processing vectorizer, total=   0.0s
[Pipeline] .......... (step 2 of 5) Processing sampling, total=   0.0s
[Pipeline] ........... (step 3 of 5) Processing scaling, total=   0.0s
[Pipeline] ......... (step 4 of 5) Processing selection, total=   0.0s
[17:01:52] AllReduce: 0.016668s, 1 calls @ 16668us

[17:01:52] MakeCuts: 0.024559s, 1 calls @ 24559us

[17:01:52] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 3
[17:01:52] DEBUG: /workspace/src/tree/updater_gpu_hist.cu:744: [GPU Hist]: Configure
[17:01:52] InitCompressedData: 0.000219s, 1 calls @ 219us

[17:01:54] Configure: 0.01335s, 1 calls @ 13350us

[17:01:54] EvalOneIter: 0.000814s, 100 calls @ 814us

[17:01:54] GetGradient: 0.005933s, 100 calls @ 5933us

[17:01:54] PredictRaw: 0.000124s, 100 calls @ 124us

[17:01:54] UpdateOneIter: 1.40831s, 100 calls @ 1408313us

[17:01:54] BoostNewTrees: 1.38688s, 100 calls @ 1386876us

[17:01:54] CommitModel: 5e-05s, 100 calls @ 50us

[1

Unnamed: 0,class,precision,recall,f1-score,support,corpus
3,macro avg,0.933155,0.519231,0.50122,188.0,bo
3,macro avg,0.678808,0.674879,0.675571,774.0,co
3,macro avg,0.690307,0.689977,0.690104,272.0,lu
3,macro avg,0.716314,0.715881,0.715834,574.0,cl
3,macro avg,0.765804,0.763129,0.764342,411.0,gl
3,macro avg,0.79349,0.782602,0.785899,599.0,ig
