In [12]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, MaxAbsScaler
import numpy as np
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.pipeline import Pipeline as IMBPipeline
from sklearn.feature_selection import RFE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords
from tqdm import tqdm
tqdm.pandas()
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import RFE
from tsfresh.feature_extraction import MinimalFCParameters
from tsfresh.feature_extraction import extract_features

In [13]:
def get_classification_report(y_test, y_pred):
    '''Source: https://stackoverflow.com/questions/39662398/scikit-learn-output-metrics-classification-report-into-csv-tab-delimited-format'''
    report = classification_report(y_test, y_pred, output_dict=True)
    df_classification_report = pd.DataFrame(report).transpose()
    df_classification_report = df_classification_report.sort_values(by=['f1-score'], ascending=False)
    return df_classification_report

In [14]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/semcovici/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
path_raw_data = '../data/raw/'
path_processed_data = '../data/processed/'

In [16]:
corpus = 'ig'
model_name = 'facebook/fasttext-pt-vectors'
model_name = 'neuralmind/bert-base-portuguese-cased'

In [17]:
path_data = path_processed_data + f'train_r3_{corpus}_top_mentioned_timelines_separated_comments_{model_name.replace("/", "_")}_tsfresh.parquet'

In [18]:
data = pd.read_parquet(path_data)

In [19]:
X = data[[col for col in data.columns if 'emb' in col]]
y = data.Polarity

In [20]:
y_encoded = y.map({'against': 0, 'for': 1})
y_encoded

0    0
0    1
0    0
0    1
0    1
    ..
0    0
0    0
0    0
0    0
0    0
Name: Polarity, Length: 1522, dtype: int64

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

In [22]:
pipe = IMBPipeline(
    steps = [
        ('scaling', MaxAbsScaler()),
        # ('selection', RFE(
        #     estimator = RandomForestClassifier(),
        #     step = 50,
        #     verbose = 4
        #     )),
        ('estimator', XGBClassifier(
            random_state = 42,
            verbosity = 3,
            device = 'cuda',
            tree_method = 'hist'
            ))
    ],
    verbose = True
    )

print('Training ...')
pipe_trained = pipe.fit(X_train, y_train)

y_pred = pipe_trained.predict(X_test)
y_pred_proba = pipe_trained.predict_proba(X_test)

Training ...
[Pipeline] ........... (step 1 of 2) Processing scaling, total=   0.1s
[15:46:52] AllReduce: 0.071451s, 1 calls @ 71451us

[15:46:52] MakeCuts: 0.085878s, 1 calls @ 85878us

[15:46:52] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 3
[15:46:52] DEBUG: /workspace/src/tree/updater_gpu_hist.cu:744: [GPU Hist]: Configure
[15:46:52] InitCompressedData: 0.000262s, 1 calls @ 262us

[15:47:01] Configure: 0.026183s, 1 calls @ 26183us

[15:47:01] EvalOneIter: 0.00086s, 100 calls @ 860us

[15:47:01] GetGradient: 0.004765s, 100 calls @ 4765us

[15:47:01] PredictRaw: 0.00013s, 100 calls @ 130us

[15:47:01] UpdateOneIter: 9.55167s, 100 calls @ 9551666us

[15:47:01] BoostNewTrees: 9.51033s, 100 calls @ 9510333us

[15:47:01] CommitModel: 6.3e-05s, 100 calls @ 63us

[15:47:01] Peak memory usage: 2542MiB
[15:47:01] Number of allocations: 5173
[15:47:01] InitData: 0.000478s, 100 calls @ 478us

[15:47:01] InitDataOnce: 0.000464s, 1 calls @ 464us

[15:47:01] Update: 9.50337s, 100 

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [23]:
df_classification_report = get_classification_report(y_test, y_pred)

In [24]:
y_test

0    0
0    1
0    0
0    1
0    0
    ..
0    1
0    0
0    0
0    0
0    0
Name: Polarity, Length: 305, dtype: int64

In [25]:
y_test.to_numpy()

array([0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1,
       0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1,

In [26]:
y_pred

array([0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1,

In [27]:
df_classification_report

Unnamed: 0,precision,recall,f1-score,support
0,0.618497,0.644578,0.631268,166.0
accuracy,0.590164,0.590164,0.590164,0.590164
weighted avg,0.588661,0.590164,0.589102,305.0
macro avg,0.585764,0.584879,0.585007,305.0
1,0.55303,0.52518,0.538745,139.0


In [28]:
y_encoded.value_counts()/len(y_encoded)

Polarity
0    0.544021
1    0.455979
Name: count, dtype: float64