In [1]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, MaxAbsScaler
import numpy as np
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.pipeline import Pipeline as IMBPipeline
from sklearn.feature_selection import RFE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords
from tqdm import tqdm
tqdm.pandas()
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import RFE
from tsfresh.feature_extraction import MinimalFCParameters
from tsfresh.feature_extraction import extract_features

In [2]:
def get_classification_report(y_test, y_pred):
    """Return sklearn's classification report as a DataFrame sorted by F1.

    Adapted from:
    https://stackoverflow.com/questions/39662398/scikit-learn-output-metrics-classification-report-into-csv-tab-delimited-format

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_test``.

    Returns:
        A DataFrame with one row per entry of the report dictionary and the
        metrics as columns, ordered by descending ``f1-score``.
    """
    metrics_by_label = classification_report(y_test, y_pred, output_dict=True)
    # The dict is keyed by label, so transpose to get labels as rows.
    return (
        pd.DataFrame(metrics_by_label)
        .transpose()
        .sort_values(by=['f1-score'], ascending=False)
    )

In [3]:
# Fetch the NLTK stopwords corpus; the download call reports when the package
# is already present locally.
# NOTE(review): `stopwords` is not referenced in the cells visible here —
# confirm it is still needed.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/semcovici/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Data directories, relative to the notebook's working directory.
# Kept as plain strings because later cells build file paths by concatenation.
path_processed_data = '../data/processed/'
path_raw_data = '../data/raw/'

In [5]:
# Experiment configuration: corpus identifier and the embedding model whose
# pre-computed features are loaded.
corpus = 'ig'
# Alternative embedding model. It used to be assigned here and was immediately
# overwritten by the next line, so it is kept only as a toggle:
# model_name = 'facebook/fasttext-pt-vectors'
model_name = 'neuralmind/bert-base-portuguese-cased'

In [6]:
# Location of the processed feature file for the selected corpus and model.
# "/" in the model id is swapped for "_" so it can be part of a file name.
model_slug = model_name.replace("/", "_")
path_data = path_processed_data + f'train_r3_{corpus}_separated_comments_{model_slug}_tsfresh.parquet'

In [7]:
# Load the processed feature table from the parquet file selected above.
data = pd.read_parquet(path_data)

In [8]:
# Sanity check: (rows, columns) of the loaded table.
data.shape

(1800, 7682)

In [9]:
# Preview the first rows to inspect the column layout.
data.head()

Unnamed: 0,User_ID,emb_1__sum_values,emb_1__median,emb_1__mean,emb_1__length,emb_1__standard_deviation,emb_1__variance,emb_1__root_mean_square,emb_1__maximum,emb_1__absolute_maximum,...,emb_768__median,emb_768__mean,emb_768__length,emb_768__standard_deviation,emb_768__variance,emb_768__root_mean_square,emb_768__maximum,emb_768__absolute_maximum,emb_768__minimum,Polarity
0,r2_ig_1,-21.394181,-0.029887,-0.024367,878.0,0.134303,0.018037,0.136496,0.67144,0.67144,...,-0.224691,-0.222255,878.0,0.129926,0.016881,0.257446,0.250205,0.686336,-0.686336,against
1,r2_ig_10,1.394637,0.01659,0.015326,91.0,0.132188,0.017474,0.133074,0.345923,0.345923,...,-0.178725,-0.195514,91.0,0.137983,0.019039,0.239301,0.044307,0.725221,-0.725221,for
2,r2_ig_100,129.244744,0.026289,0.025766,5016.0,0.126746,0.016065,0.129339,0.523251,0.523251,...,-0.228821,-0.232339,5016.0,0.143872,0.020699,0.273277,0.264378,0.745202,-0.745202,against
3,r2_ig_101,13.903916,-0.000133,0.010753,1293.0,0.144235,0.020804,0.144636,0.520573,0.592314,...,-0.260851,-0.260714,1293.0,0.140515,0.019744,0.29617,0.243582,0.732912,-0.732912,for
4,r2_ig_102,-0.538206,-0.012598,-0.000312,1723.0,0.139171,0.019369,0.139172,0.576313,0.576313,...,-0.244546,-0.241903,1723.0,0.134479,0.018085,0.27677,0.267556,0.804279,-0.804279,for


In [10]:
# Feature matrix: every column whose name contains 'emb'.
embedding_columns = [name for name in data.columns if 'emb' in name]
X = data[embedding_columns]

# Target: the raw string stance label.
y = data['Polarity']

In [11]:
# Encode the string stance labels as integers for the classifier.
label_to_id = {'against': 0, 'for': 1}
y_encoded = y.map(label_to_id)

# Series.map turns any label missing from the mapping into NaN without warning;
# fail here with a clear message instead of deep inside the split or the model.
assert y_encoded.notna().all(), (
    f'Unmapped Polarity labels: {sorted(set(y.dropna().unique()) - set(label_to_id))}'
)
y_encoded

0      0
1      1
2      0
3      1
4      1
      ..
175    0
176    0
177    0
178    0
179    0
Name: Polarity, Length: 1800, dtype: int64

In [12]:
# Stratified 80/20 hold-out split with a fixed seed so the partition is repeatable.
# NOTE(review): the displayed index of y_encoded ends at 179 for 1800 rows, so
# index labels repeat. Verify that rows sharing an index are not the same user
# ending up in both train and test (possible leakage) — TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42,
)

In [13]:
# Three-stage pipeline: scaling -> recursive feature elimination -> XGBoost.
# It is built with imblearn's Pipeline, although no resampling step is included.
pipe = IMBPipeline(
    steps = [
        ('scaling', MaxAbsScaler()),
        ('selection', RFE(
            # Seed the ranking forest: with an unseeded RandomForestClassifier
            # the selected feature subset, and therefore every downstream
            # metric, changes from one run to the next.
            estimator = RandomForestClassifier(random_state = 42),
            # Drop 50 features per elimination round. n_features_to_select is
            # left at its default — presumably half of the features, per the
            # scikit-learn docs; confirm for the installed version.
            step = 50,
            verbose = 4
            )),
        ('estimator', XGBClassifier(
            random_state = 42,
            verbosity = 3,
            # device = 'cuda',
            # tree_method = 'hist'
            ))
    ],
    verbose = True
    )

print('Training ...')
pipe_trained = pipe.fit(X_train, y_train)

# Hard labels and class probabilities for the held-out split.
y_pred = pipe_trained.predict(X_test)
y_pred_proba = pipe_trained.predict_proba(X_test)

Training ...
[Pipeline] ........... (step 1 of 3) Processing scaling, total=   0.1s
Fitting estimator with 7680 features.
Fitting estimator with 7630 features.
Fitting estimator with 7580 features.
Fitting estimator with 7530 features.
Fitting estimator with 7480 features.
Fitting estimator with 7430 features.
Fitting estimator with 7380 features.
Fitting estimator with 7330 features.
Fitting estimator with 7280 features.
Fitting estimator with 7230 features.
Fitting estimator with 7180 features.
Fitting estimator with 7130 features.
Fitting estimator with 7080 features.
Fitting estimator with 7030 features.
Fitting estimator with 6980 features.
Fitting estimator with 6930 features.
Fitting estimator with 6880 features.
Fitting estimator with 6830 features.
Fitting estimator with 6780 features.
Fitting estimator with 6730 features.
Fitting estimator with 6680 features.
Fitting estimator with 6630 features.
Fitting estimator with 6580 features.
Fitting estimator with 6530 features.
Fitt

In [14]:
# Metrics table for the held-out split, sorted by F1 score.
df_classification_report = get_classification_report(y_test, y_pred)

In [15]:
# Display the encoded held-out labels.
y_test

50     1
37     0
97     1
45     0
161    0
      ..
40     1
173    1
130    0
128    0
79     1
Name: Polarity, Length: 360, dtype: int64

In [16]:
# Held-out labels as a plain NumPy array — presumably for eyeballing against y_pred.
# NOTE(review): this prints every element; a slice or a confusion matrix would
# be easier to read.
y_test.to_numpy()

array([1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,

In [17]:
# Predicted labels for the held-out split.
# NOTE(review): this prints every element; consider showing only a slice.
y_pred

array([0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0,

In [18]:
# Show the metrics table computed from y_test and y_pred.
df_classification_report

Unnamed: 0,precision,recall,f1-score,support
0,0.668142,0.743842,0.703963,203.0
accuracy,0.647222,0.647222,0.647222,0.647222
weighted avg,0.643632,0.647222,0.642738,360.0
macro avg,0.640041,0.633068,0.633768,360.0
1,0.61194,0.522293,0.563574,157.0


In [None]:
# Class balance of the encoded target, as a fraction of rows per class.
# normalize=True replaces the manual division by len(); the two agree as long
# as y_encoded has no missing values.
y_encoded.value_counts(normalize=True)