## Imports

In [1]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.preprocessing import MaxAbsScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from tqdm import tqdm
import nltk

In [2]:
# Enable tqdm's pandas integration so pandas operations can report progress.
# NOTE(review): no progress-enabled pandas call is visible in this section — confirm it is still needed.
tqdm.pandas()

In [3]:
import sys

# Expose the project's source tree so the local `models` package can be imported
# from the notebook. The membership check keeps the cell idempotent: re-running
# it no longer appends a duplicate entry to sys.path each time.
if '../src/' not in sys.path:
    sys.path.append('../src/')

from models.ClassificationPipeline import ClassificationPipeline

In [4]:
# Fetch the NLTK stop-word corpus required by stopwords.words('portuguese') further down.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/semcovici/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
def get_classification_report(y_test, y_pred):
    '''Return scikit-learn's classification report as a DataFrame ranked by F1.

    Source: https://stackoverflow.com/questions/39662398/scikit-learn-output-metrics-classification-report-into-csv-tab-delimited-format

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels, forwarded to ``classification_report``.
    y_pred : array-like
        Predicted labels, forwarded to ``classification_report``.

    Returns
    -------
    pandas.DataFrame
        One row per entry of the report dict (class labels plus the summary
        entries), sorted by ``f1-score`` in descending order. Summary rows are
        sorted together with the per-class rows, and a scalar entry such as
        ``accuracy`` is repeated in every column, ``support`` included.
    '''
    metrics_by_label = classification_report(y_test, y_pred, output_dict=True)
    # Transpose so that each label/summary entry becomes a row, then rank by F1.
    return (
        pd.DataFrame(metrics_by_label)
        .T
        .sort_values(by=['f1-score'], ascending=False)
    )

## Definitions

In [6]:
# Shared seed for the stochastic steps (e.g. the random over-sampler) so runs are reproducible.
random_seed = 42

In [7]:
# Data directories, given relative to the notebook's working directory.
path_raw_data = '../data/raw/'  # not referenced in the cells visible here
path_processed_data = '../data/processed/'  # location of the filtered training CSV

In [8]:
# Corpus identifier; it is interpolated into the training file name (train_r3_<corpus>_filtered.csv).
corpus = 'ig'

In [9]:
# Text features: TF-IDF over lower-cased word 1- to 3-grams with Portuguese
# stop words removed, capped at the 30,000 highest-ranked features.
text_vect = TfidfVectorizer(
    stop_words=stopwords.words('portuguese'),
    lowercase=True,
    ngram_range=(1, 3),
    max_features=30000,
)

# Class balancing and feature scaling steps handed to the pipeline.
sampling = RandomOverSampler(random_state=random_seed)
scaling = MaxAbsScaler()

# Use the shared seed (instead of a second hard-coded 42) so every stochastic
# component follows `random_seed`.
# verbosity=3 makes XGBoost emit its DEBUG/timing trace while fitting.
# To train on GPU, additionally pass device='cuda' and tree_method='hist'.
classifier = XGBClassifier(
    random_state=random_seed,
    verbosity=3,
)

## Data

In [10]:
# Load the filtered training set for the selected corpus; the first CSV column
# is used as the DataFrame index.
data = pd.read_csv(
    path_processed_data + f'train_r3_{corpus}_filtered.csv',
    index_col=0,
)

In [11]:
# Preview the first rows to sanity-check the loaded columns.
data.head()

Unnamed: 0,User_ID,Polarity,Texts
0,r2_ig_1,against,PQP ESSE DORAMA É MUITO FOADA(Sassy GoGo(Cheer...
1,r2_ig_4,for,Golaço!!!!!!!!! # Manda geral do time principa...
2,r2_ig_7,against,"@gabycunha86 Amanhã vou aí, deixa pra terça # ..."
3,r2_ig_8,for,3.4- O Centro de Coordenação da Operação está ...
4,r2_ig_10,for,"Me arrependi de excluir meu outro tt, agora ti..."


In [12]:
# Model input: the raw text column; target: the stance label column.
X = data['Texts']
y = data['Polarity']

In [13]:
# Preview the raw texts that will be vectorized.
X.head()

0    PQP ESSE DORAMA É MUITO FOADA(Sassy GoGo(Cheer...
1    Golaço!!!!!!!!! # Manda geral do time principa...
2    @gabycunha86 Amanhã vou aí, deixa pra terça # ...
3    3.4- O Centro de Coordenação da Operação está ...
4    Me arrependi de excluir meu outro tt, agora ti...
Name: Texts, dtype: object

In [14]:
# Encode the stance labels as integers: 'against' -> 0, 'for' -> 1.
y_encoded = y.map({'against': 0, 'for': 1})

# Series.map yields NaN for any value missing from the mapping; fail here with a
# clear message instead of letting NaN targets reach the split or the classifier.
assert y_encoded.notna().all(), "Polarity contains labels other than 'against'/'for'"

y_encoded

0       0
1       1
2       0
3       1
4       1
       ..
1790    0
1792    0
1793    0
1794    0
1795    0
Name: Polarity, Length: 1522, dtype: int64

In [15]:
# Hold out 20% of the samples for evaluation, stratified on the encoded label so
# both splits keep the same class proportions. The split is seeded with the
# shared `random_seed` rather than a separately hard-coded value.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=random_seed,
    stratify=y_encoded,
)

## Classification

In [16]:
# Assemble the project's pipeline from the components configured earlier
# (vectorizer, over-sampler, scaler and XGBoost estimator).
clf_pipe = ClassificationPipeline(
    vectorizer=text_vect,
    sampling=sampling,
    scaling=scaling,
    estimator=classifier,
)

# Fit on the training split only.
clf_pipe.train(X_train, y_train)

# predict() is unpacked into two values: the predicted labels and, presumably,
# the class probabilities — TODO confirm against ClassificationPipeline.
y_pred, y_pred_proba = clf_pipe.predict(X_test)

[Pipeline] ........ (step 1 of 4) Processing vectorizer, total= 3.4min
[Pipeline] .......... (step 2 of 4) Processing sampling, total=   0.0s
[Pipeline] ........... (step 3 of 4) Processing scaling, total=   0.2s
[01:30:11] AllReduce: 0.037871s, 1 calls @ 37871us

[01:30:11] MakeCuts: 0.072468s, 1 calls @ 72468us

[01:30:11] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 0
[01:31:24] Configure: 0.002297s, 1 calls @ 2297us

[01:31:24] EvalOneIter: 0.00143s, 100 calls @ 1430us

[01:31:24] GetGradient: 0.299058s, 100 calls @ 299058us

[01:31:24] PredictRaw: 0.000151s, 100 calls @ 151us

[01:31:24] UpdateOneIter: 73.752s, 100 calls @ 73752041us

[01:31:24] BoostNewTrees: 73.4149s, 100 calls @ 73414899us

[01:31:24] CommitModel: 8e-05s, 100 calls @ 80us

[01:31:24] BuildHistogram: 30.1989s, 483 calls @ 30198864us

[01:31:24] EvaluateSplits: 28.9595s, 583 calls @ 28959452us

[01:31:24] InitData: 0.318034s, 100 calls @ 318034us

[01:31:24] InitRoot: 12.4795s, 100 calls @ 12479469

## Results

In [17]:
# Score the held-out test predictions: per-class and averaged precision/recall/F1.
df_classification_report = get_classification_report(y_test, y_pred)

In [18]:
# Display the report; rows are ordered by f1-score, highest first.
df_classification_report

Unnamed: 0,precision,recall,f1-score,support
0,0.674556,0.686747,0.680597,166.0
accuracy,0.64918,0.64918,0.64918,0.64918
weighted avg,0.648621,0.64918,0.648838,305.0
macro avg,0.646102,0.645532,0.645753,305.0
1,0.617647,0.604317,0.610909,139.0
