In [19]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, MaxAbsScaler
import numpy as np
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.pipeline import Pipeline as IMBPipeline
from sklearn.feature_selection import RFE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords
from tqdm import tqdm
tqdm.pandas()
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import RFE
from tsfresh.feature_extraction import MinimalFCParameters
from tsfresh.feature_extraction import extract_features

In [20]:
def get_classification_report(y_test, y_pred):
    '''Source: https://stackoverflow.com/questions/39662398/scikit-learn-output-metrics-classification-report-into-csv-tab-delimited-format'''
    report = classification_report(y_test, y_pred, output_dict=True)
    df_classification_report = pd.DataFrame(report).transpose()
    df_classification_report = df_classification_report.sort_values(by=['f1-score'], ascending=False)
    return df_classification_report

In [21]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/semcovici/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [22]:
path_raw_data = '../data/raw/'
path_processed_data = '../data/processed/'

In [37]:
list_corpus = ['ig','bo', 'cl', 'co', 'gl', 'lu']

In [23]:
corpus = 'ig'
model_name = 'facebook/fasttext-pt-vectors'
model_name = 'neuralmind/bert-base-portuguese-cased'

In [39]:
df_results = pd.DataFrame({})


for corpus in list_corpus:


    path_data = path_processed_data + f'train_r3_{corpus}_{model_name.replace("/", "_")}.parquet'

    data = pd.read_parquet(path_data)

    X = data[[col for col in data.columns if 'emb' in col]]
    y = data.Polarity

    y_encoded = y.map({'against': 0, 'for': 1})


    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

    pipe = IMBPipeline(
        steps = [
            ('scaling', MaxAbsScaler()),
            ('selection', None),
            ('estimator', XGBClassifier(
                random_state = 42,
                verbosity = 3,
                device = 'cuda',
                tree_method = 'hist'
                ))
        ],
        verbose = True
        )

    print('Training ...')
    pipe_trained = pipe.fit(X_train, y_train)

    y_pred = pipe_trained.predict(X_test)
    y_pred_proba = pipe_trained.predict_proba(X_test)
    
    df_classification_report = get_classification_report(y_test, y_pred)
    
    df_classification_report = df_classification_report.reset_index().rename(columns = {"index": "class"})
    
    df_classification_report['corpus'] = corpus

    df_results = pd.concat([df_results, df_classification_report])

Training ...
[Pipeline] ........... (step 1 of 3) Processing scaling, total=   0.0s
[Pipeline] ......... (step 2 of 3) Processing selection, total=   0.0s
[09:19:58] AllReduce: 0.008482s, 1 calls @ 8482us

[09:19:58] MakeCuts: 0.011222s, 1 calls @ 11222us

[09:19:58] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 3
[09:19:58] DEBUG: /workspace/src/tree/updater_gpu_hist.cu:744: [GPU Hist]: Configure
[09:19:58] InitCompressedData: 0.000117s, 1 calls @ 117us

[09:20:00] Configure: 0.0794s, 1 calls @ 79400us

[09:20:00] EvalOneIter: 0.000885s, 100 calls @ 885us

[09:20:00] GetGradient: 0.006604s, 100 calls @ 6604us

[09:20:00] PredictRaw: 0.00021s, 100 calls @ 210us

[09:20:00] UpdateOneIter: 1.68782s, 100 calls @ 1687819us

[09:20:00] BoostNewTrees: 1.59067s, 100 calls @ 1590671us

[09:20:00] CommitModel: 5.3e-05s, 100 calls @ 53us

[09:20:00] Peak memory usage: 387MiB
[09:20:00] Number of allocations: 5531
[09:20:00] InitData: 0.000515s, 100 calls @ 515us

[09:20:00] InitDat

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Training ...
[Pipeline] ........... (step 1 of 3) Processing scaling, total=   0.0s
[Pipeline] ......... (step 2 of 3) Processing selection, total=   0.0s
[09:20:00] AllReduce: 0.004221s, 1 calls @ 4221us

[09:20:00] MakeCuts: 0.005464s, 1 calls @ 5464us

[09:20:00] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 3
[09:20:00] DEBUG: /workspace/src/tree/updater_gpu_hist.cu:744: [GPU Hist]: Configure
[09:20:00] InitCompressedData: 2.2e-05s, 1 calls @ 22us

[09:20:01] Configure: 0.000713s, 1 calls @ 713us

[09:20:01] EvalOneIter: 0.000566s, 100 calls @ 566us

[09:20:01] GetGradient: 0.00435s, 100 calls @ 4350us

[09:20:01] PredictRaw: 0.0001s, 100 calls @ 100us

[09:20:01] UpdateOneIter: 0.52857s, 100 calls @ 528570us

[09:20:01] BoostNewTrees: 0.521946s, 100 calls @ 521946us

[09:20:01] CommitModel: 3.8e-05s, 100 calls @ 38us

[09:20:01] Peak memory usage: 387MiB
[09:20:01] Number of allocations: 10339
[09:20:01] InitData: 0.000353s, 100 calls @ 353us

[09:20:01] InitDataOnce

In [40]:
print(display(df_results[df_results['class'] == 'macro avg'].sort_values('f1-score')))

Unnamed: 0,class,precision,recall,f1-score,support,corpus
3,macro avg,0.686937,0.528231,0.522938,113.0,bo
3,macro avg,0.539817,0.538909,0.537716,164.0,lu
3,macro avg,0.555382,0.548081,0.544272,247.0,gl
3,macro avg,0.625186,0.618587,0.618925,360.0,ig
3,macro avg,0.64418,0.637367,0.633232,345.0,cl
3,macro avg,0.684989,0.68385,0.68426,464.0,co


None
