## Setup

### imports

In [1]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, MaxAbsScaler
import numpy as np
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.pipeline import Pipeline as IMBPipeline
from sklearn.feature_selection import RFE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords
from tqdm import tqdm
tqdm.pandas()
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import RFE
from tsfresh.feature_extraction import MinimalFCParameters
from tsfresh.feature_extraction import extract_features

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/semcovici/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
import sys
sys.path.append('../src/')
from models.classification_methods import process_classification 

### definitions

In [4]:
path_raw_data = '../data/raw/'
path_processed_data = '../data/processed/'

In [5]:
list_target = ['ig','bo', 'cl', 'co', 'gl', 'lu']

In [6]:
corpus = 'ig'
model_name = 'facebook/fasttext-pt-vectors'
model_name = 'neuralmind/bert-base-portuguese-cased'

## Classification

### Top mentioned timelines

In [7]:
# create a list of tuples with (data_train, data_test, target)

list_tuples_top_ment = []

for target in tqdm(list_target):
    
    path_data_train = path_raw_data + f'train_r3_{target}_top_mentioned_timelines.csv'
    path_data_test = path_raw_data + f'test_r3_{target}_top_mentioned_timelines.csv'

    data_train = pd.read_csv(
        path_data_train, 
        sep = ';', 
        encoding='utf-8-sig'
        )
    data_test = pd.read_csv(
        path_data_test, 
        sep = ';', 
        encoding='utf-8-sig'
        )
    
    list_tuples_top_ment.append((data_train, data_test, target))

100%|██████████| 6/6 [00:21<00:00,  3.54s/it]


In [8]:
X_cols = 'Texts'

In [9]:
# define pipeline steps 
text_vect = TfidfVectorizer(
            stop_words = stopwords.words('portuguese'),
            lowercase = True,
            ngram_range = (1,3),
            max_features=50000
            
            )
sampling = None
selection = None
scaling = MaxAbsScaler()
estimator = XGBClassifier(
                random_state = 42,
                verbosity = 3,
                device = 'cuda',
                tree_method = 'hist'
                )


# get results
df_cr, df_test_results = process_classification(
        estimator = estimator,
        vectorizer = text_vect,
        scaling = scaling,
        selection= selection,
        data_tuples = list_tuples_top_ment,
        X_cols = X_cols
)

df_cr[df_cr['class'] == 'macro avg'].sort_values('f1-score')

Training ...
[Pipeline] ........ (step 1 of 5) Processing vectorizer, total= 4.2min
[Pipeline] .......... (step 2 of 5) Processing sampling, total=   0.0s
[Pipeline] ........... (step 3 of 5) Processing scaling, total=   0.3s
[Pipeline] ......... (step 4 of 5) Processing selection, total=   0.0s
[14:14:45] AllReduce: 0.05373s, 1 calls @ 53730us

[14:14:45] MakeCuts: 0.121144s, 1 calls @ 121144us

[14:14:45] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 3
[14:14:45] DEBUG: /workspace/src/tree/updater_gpu_hist.cu:744: [GPU Hist]: Configure
[14:14:45] InitCompressedData: 0.000664s, 1 calls @ 664us

[14:14:45] DEBUG: /workspace/src/tree/gpu_hist/../../common/device_helpers.cuh:291: Skipping empty CUDA kernel.
[14:14:45] DEBUG: /workspace/src/tree/gpu_hist/../../common/device_helpers.cuh:291: Skipping empty CUDA kernel.
[14:14:45] DEBUG: /workspace/src/tree/gpu_hist/../../common/device_helpers.cuh:291: Skipping empty CUDA kernel.
[14:14:45] DEBUG: /workspace/src/tree/gpu_hist/

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Training ...
[Pipeline] ........ (step 1 of 5) Processing vectorizer, total= 1.1min
[Pipeline] .......... (step 2 of 5) Processing sampling, total=   0.0s
[Pipeline] ........... (step 3 of 5) Processing scaling, total=   0.2s
[Pipeline] ......... (step 4 of 5) Processing selection, total=   0.0s
[14:16:38] AllReduce: 0.008179s, 1 calls @ 8179us

[14:16:38] MakeCuts: 0.020358s, 1 calls @ 20358us

[14:16:38] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 3
[14:16:38] DEBUG: /workspace/src/tree/updater_gpu_hist.cu:744: [GPU Hist]: Configure
[14:16:38] InitCompressedData: 0.000459s, 1 calls @ 459us

[14:16:38] DEBUG: /workspace/src/tree/gpu_hist/../../common/device_helpers.cuh:291: Skipping empty CUDA kernel.
[14:16:39] Configure: 0.008361s, 1 calls @ 8361us

[14:16:39] EvalOneIter: 0.000391s, 100 calls @ 391us

[14:16:39] GetGradient: 0.003927s, 100 calls @ 3927us

[14:16:39] PredictRaw: 8.1e-05s, 100 calls @ 81us

[14:16:39] UpdateOneIter: 1.22583s, 100 calls @ 1225834us

[1

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Training ...
[Pipeline] ........ (step 1 of 5) Processing vectorizer, total= 4.7min
[Pipeline] .......... (step 2 of 5) Processing sampling, total=   0.0s
[Pipeline] ........... (step 3 of 5) Processing scaling, total=   0.5s
[Pipeline] ......... (step 4 of 5) Processing selection, total=   0.0s
[14:21:33] AllReduce: 0.053086s, 1 calls @ 53086us

[14:21:33] MakeCuts: 0.141634s, 1 calls @ 141634us

[14:21:33] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 3
[14:21:33] DEBUG: /workspace/src/tree/updater_gpu_hist.cu:744: [GPU Hist]: Configure
[14:21:33] InitCompressedData: 0.001005s, 1 calls @ 1005us

[14:21:33] DEBUG: /workspace/src/tree/gpu_hist/../../common/device_helpers.cuh:291: Skipping empty CUDA kernel.
[14:21:34] DEBUG: /workspace/src/tree/gpu_hist/../../common/device_helpers.cuh:291: Skipping empty CUDA kernel.
[14:21:34] DEBUG: /workspace/src/tree/gpu_hist/../../common/device_helpers.cuh:291: Skipping empty CUDA kernel.
[14:21:34] DEBUG: /workspace/src/tree/gpu_his

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Training ...
[Pipeline] ........ (step 1 of 5) Processing vectorizer, total= 2.0min
[Pipeline] .......... (step 2 of 5) Processing sampling, total=   0.0s
[Pipeline] ........... (step 3 of 5) Processing scaling, total=   0.2s
[Pipeline] ......... (step 4 of 5) Processing selection, total=   0.0s
[14:37:09] AllReduce: 0.019305s, 1 calls @ 19305us

[14:37:09] MakeCuts: 0.036242s, 1 calls @ 36242us

[14:37:09] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 3
[14:37:09] DEBUG: /workspace/src/tree/updater_gpu_hist.cu:744: [GPU Hist]: Configure
[14:37:09] InitCompressedData: 0.00059s, 1 calls @ 590us

[14:37:09] DEBUG: /workspace/src/tree/gpu_hist/../../common/device_helpers.cuh:291: Skipping empty CUDA kernel.
[14:37:10] Configure: 0.008639s, 1 calls @ 8639us

[14:37:10] EvalOneIter: 0.000464s, 100 calls @ 464us

[14:37:10] GetGradient: 0.004233s, 100 calls @ 4233us

[14:37:10] PredictRaw: 8.8e-05s, 100 calls @ 88us

[14:37:10] UpdateOneIter: 1.61189s, 100 calls @ 1611893us

[1

Unnamed: 0,class,precision,recall,f1-score,support,corpus
2,macro avg,0.498208,0.499806,0.354818,574.0,cl
3,macro avg,0.282972,0.5,0.361407,599.0,ig
3,macro avg,0.296837,0.5,0.372519,411.0,gl
3,macro avg,0.536691,0.518838,0.460963,272.0,lu
3,macro avg,0.430851,0.5,0.462857,188.0,bo
2,macro avg,0.648479,0.644532,0.634779,774.0,co


### Users

In [7]:
# create a list of tuples with (data_train, data_test, target)

list_tuples_users = []

for target in tqdm(list_target):

    path_data_train = path_raw_data + f'r3_{target}_train_users.csv'
    path_data_test = path_raw_data + f'r3_{target}_test_users.csv'

    data_train = pd.read_csv(
        path_data_train, 
        sep = ';', 
        encoding='utf-8-sig'
        )
    data_test = pd.read_csv(
        path_data_test, 
        sep = ';', 
        encoding='utf-8-sig'
        )
    
    list_tuples_users.append((data_train, data_test, target))

100%|██████████| 6/6 [00:32<00:00,  5.48s/it]


#### Timelines

In [11]:
X_cols = 'Timeline'

In [12]:
# define pipeline steps 
text_vect = TfidfVectorizer(
            stop_words = stopwords.words('portuguese'),
            lowercase = True,
            ngram_range = (1,3),
            max_features=50000
            
            )
sampling = None
selection = None
scaling = MaxAbsScaler()
estimator = XGBClassifier(
                random_state = 42,
                verbosity = 3,
                device = 'cuda',
                tree_method = 'hist'
                )


# get results
df_cr, df_test_results = process_classification(
        estimator = estimator,
        vectorizer= text_vect,
        scaling = scaling,
        selection= selection,
        data_tuples = list_tuples_users,
        X_cols = X_cols
)

df_cr[df_cr['class'] == 'macro avg'].sort_values('f1-score')

Training ...


#### Stance

In [8]:
X_cols = 'Stance'

In [10]:
# define pipeline steps 
text_vect = TfidfVectorizer(
            stop_words = stopwords.words('portuguese'),
            lowercase = True,
            ngram_range = (1,3)
            
            )
sampling = None
selection = None
scaling = MaxAbsScaler()
estimator = XGBClassifier(
                random_state = 42,
                verbosity = 3,
                device = 'cuda',
                tree_method = 'hist'
                )


# get results
df_cr, df_test_results = process_classification(
        estimator = estimator,
        vectorizer= text_vect,
        scaling = scaling,
        selection= selection,
        data_tuples = list_tuples_users,
        X_cols = X_cols
)

df_cr[df_cr['class'] == 'macro avg'].sort_values('f1-score')

Training ...
[Pipeline] ........ (step 1 of 5) Processing vectorizer, total=   0.1s
[Pipeline] .......... (step 2 of 5) Processing sampling, total=   0.0s
[Pipeline] ........... (step 3 of 5) Processing scaling, total=   0.0s
[Pipeline] ......... (step 4 of 5) Processing selection, total=   0.0s
[14:22:15] AllReduce: 0.000964s, 1 calls @ 964us

[14:22:15] MakeCuts: 0.002772s, 1 calls @ 2772us

[14:22:15] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 3
[14:22:15] DEBUG: /workspace/src/tree/updater_gpu_hist.cu:744: [GPU Hist]: Configure
[14:22:15] InitCompressedData: 9.2e-05s, 1 calls @ 92us

[14:22:15] DEBUG: /workspace/src/tree/gpu_hist/../../common/device_helpers.cuh:291: Skipping empty CUDA kernel.
[14:22:15] DEBUG: /workspace/src/tree/gpu_hist/../../common/device_helpers.cuh:291: Skipping empty CUDA kernel.
[14:22:15] DEBUG: /workspace/src/tree/gpu_hist/../../common/device_helpers.cuh:291: Skipping empty CUDA kernel.
[14:22:15] DEBUG: /workspace/src/tree/gpu_hist/../..

Unnamed: 0,class,precision,recall,f1-score,support,corpus
3,macro avg,0.512963,0.500379,0.351459,272.0,lu
3,macro avg,0.642871,0.50882,0.359875,574.0,cl
2,macro avg,0.516899,0.504944,0.380169,774.0,co
3,macro avg,0.466654,0.494157,0.388301,599.0,ig
3,macro avg,0.675248,0.513866,0.40832,411.0,gl
3,macro avg,0.937838,0.557692,0.570307,188.0,bo
