In [29]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, MaxAbsScaler
import numpy as np
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.pipeline import Pipeline as IMBPipeline
from sklearn.feature_selection import RFE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords
from tqdm import tqdm
tqdm.pandas()
from sklearn.compose import ColumnTransformer

In [30]:
def get_classification_report(y_test, y_pred):
    """Build a metrics table from true and predicted labels.

    Adapted from: https://stackoverflow.com/questions/39662398/scikit-learn-output-metrics-classification-report-into-csv-tab-delimited-format

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_test``.

    Returns
    -------
    pandas.DataFrame
        The dict produced by ``classification_report(..., output_dict=True)``
        turned into a frame with one row per report entry and one column per
        metric, ordered by descending ``f1-score``.
    """
    metrics_by_label = classification_report(y_test, y_pred, output_dict=True)
    return (
        pd.DataFrame(metrics_by_label)
        .T
        .sort_values(by=['f1-score'], ascending=False)
    )

In [31]:
# Make sure the NLTK stopwords corpus is available locally; the vectorizer
# cell reads it through stopwords.words('portuguese').
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/semcovici/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [32]:
# Data directories, relative to the notebook's working directory.
# The trailing slash matters: file names are appended by plain string concatenation.
path_raw_data = '../data/raw/'
path_processed_data = '../data/processed/'

In [33]:
# Load the filtered dataset, using the first CSV column as the index.
filtered_csv = path_processed_data + 'data_filtered.csv'
data = pd.read_csv(filtered_csv, index_col=0)

In [34]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,User_ID,Polarity,Texts
0,r2_bo_1,for,Saudadinhaa dl :/ # Gente que beija com mordid...
3,r2_bo_3,for,"@Gremio E que domínio, hein campeão? # @Analis..."
24,r2_bo_5,for,"@This_IsK Hahahaha olha, salada caesar é fácil..."
30,r2_bo_7,against,@Chiico_Sant vai passar aqui ou quer que eu vá...
32,r2_bo_8,against,"@gabycunha86 Amanhã vou aí, deixa pra terça # ..."


In [35]:
# Number of ' # '-separated segments in each Texts entry
# (presumably one segment per comment — confirm against the data preparation step).
data['n_comments'] = [len(text.split(' # ')) for text in data['Texts']]
data

Unnamed: 0,User_ID,Polarity,Texts,n_comments
0,r2_bo_1,for,Saudadinhaa dl :/ # Gente que beija com mordid...,1370
3,r2_bo_3,for,"@Gremio E que domínio, hein campeão? # @Analis...",2969
24,r2_bo_5,for,"@This_IsK Hahahaha olha, salada caesar é fácil...",2250
30,r2_bo_7,against,@Chiico_Sant vai passar aqui ou quer que eu vá...,1574
32,r2_bo_8,against,"@gabycunha86 Amanhã vou aí, deixa pra terça # ...",956
...,...,...,...,...
5984,r2_bo_720,against,"Pesquisa mostra que, além do Brasil, apenas Rú...",3207
6820,r2_bo_740,for,"eu odeio trânsito com todas minhas forças, vai...",1862
6825,r2_bo_743,against,A rainha do pop está de volta. Madonna lançou ...,2667
6983,r2_bo_751,for,@toko_tkd Indo # @toko_tkd Demorou pra respond...,2139


In [36]:
# Features: the raw text only. (A two-column variant, data[['Texts', 'n_comments']],
# was tried and is currently disabled.)
X = data['Texts']
# Target: the stance label.
y = data['Polarity']

In [37]:
# Preview the first feature values.
X.head()

0     Saudadinhaa dl :/ # Gente que beija com mordid...
3     @Gremio E que domínio, hein campeão? # @Analis...
24    @This_IsK Hahahaha olha, salada caesar é fácil...
30    @Chiico_Sant vai passar aqui ou quer que eu vá...
32    @gabycunha86 Amanhã vou aí, deixa pra terça # ...
Name: Texts, dtype: object

In [38]:
# Encode the stance labels as integers: 'against' -> 0, 'for' -> 1.
polarity_codes = {'against': 0, 'for': 1}
y_encoded = y.map(polarity_codes)

# Series.map with a dict turns every value missing from the dict into NaN.
# Fail loudly here instead of letting NaN targets reach the split/model.
unmapped_labels = y[y_encoded.isna()].unique()
assert len(unmapped_labels) == 0, f'Unexpected Polarity labels: {unmapped_labels}'

y_encoded

0       1
3       1
24      1
30      0
32      0
       ..
5984    0
6820    1
6825    0
6983    1
7039    0
Name: Polarity, Length: 471, dtype: int64

In [39]:
# Hold out 20% of the rows for evaluation. The split is stratified on the
# encoded label and seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

In [40]:
# TF-IDF over lower-cased unigrams, bigrams and trigrams, with NLTK's
# Portuguese stop words removed.
portuguese_stopwords = stopwords.words('portuguese')
text_vect = TfidfVectorizer(
    stop_words=portuguese_stopwords,
    lowercase=True,
    ngram_range=(1, 3),
)

In [41]:
# Column-wise preprocessing for a DataFrame input: TF-IDF on the 'Texts'
# column, with 'n_comments' passed through unchanged.
# NOTE(review): the pipeline in the next cell plugs text_vect in directly and is
# fitted on the Texts series, so this transformer is not used there — confirm
# whether it is still needed.
column_transformers = [
    ('Texts', text_vect, 'Texts'),
    ('n_comments', 'passthrough', ['n_comments']),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [42]:
# Classification pipeline:
#   TF-IDF features -> random oversampling of the minority class
#   -> max-abs scaling -> XGBoost classifier.
xgb_estimator = XGBClassifier(
    random_state=42,
    eval_metric='aucpr',
    verbosity=3,  # debug-level XGBoost logging; this is what prints the timing dump
    # GPU training is currently disabled: device='cuda', tree_method='hist'
)
pipeline_steps = [
    ('vectorizer', text_vect),
    ('sampling', RandomOverSampler(random_state=42, sampling_strategy='minority')),
    ('scaling', MaxAbsScaler()),
    ('estimator', xgb_estimator),
]
pipe = IMBPipeline(steps=pipeline_steps, verbose=True)

print('Training ...')
pipe_trained = pipe.fit(X_train, y_train)

# Hard class predictions and class probabilities for the held-out split.
y_pred = pipe_trained.predict(X_test)
y_pred_proba = pipe_trained.predict_proba(X_test)

Training ...
[Pipeline] ........ (step 1 of 4) Processing vectorizer, total=  60.0s
[Pipeline] .......... (step 2 of 4) Processing sampling, total=   0.1s
[Pipeline] ........... (step 3 of 4) Processing scaling, total=   2.8s
[00:18:29] AllReduce: 0.400936s, 1 calls @ 400936us

[00:18:29] MakeCuts: 1.02167s, 1 calls @ 1021674us

[00:18:30] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 0
[00:20:46] Configure: 0.015379s, 1 calls @ 15379us

[00:20:46] EvalOneIter: 0.001043s, 100 calls @ 1043us

[00:20:46] GetGradient: 0.443093s, 100 calls @ 443093us

[00:20:46] PredictRaw: 0.000142s, 100 calls @ 142us

[00:20:46] UpdateOneIter: 135.871s, 100 calls @ 135871048us

[00:20:46] BoostNewTrees: 135.378s, 100 calls @ 135378469us

[00:20:46] CommitModel: 0.000122s, 100 calls @ 122us

[00:20:46] BuildHistogram: 44.9213s, 190 calls @ 44921259us

[00:20:46] EvaluateSplits: 52.8257s, 290 calls @ 52825748us

[00:20:46] InitData: 2.70716s, 100 calls @ 2707155us

[00:20:46] InitRoot: 43.236

In [43]:
# Summarise test-set performance as a DataFrame sorted by f1-score.
df_classification_report = get_classification_report(y_test, y_pred)

In [44]:
# Show the true labels of the test split.
y_test

4160    0
2883    0
2933    0
4210    0
24      1
       ..
3227    0
1283    0
629     0
2996    0
1923    1
Name: Polarity, Length: 95, dtype: int64

In [45]:
# True test labels as a plain NumPy array, the same form in which y_pred is displayed.
y_test.to_numpy()

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1])

In [46]:
# Predicted labels for the test split, as returned by pipe_trained.predict(X_test).
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0])

In [47]:
# Show the metrics table built by get_classification_report.
df_classification_report

Unnamed: 0,precision,recall,f1-score,support
0,0.913978,0.977011,0.944444,87.0
accuracy,0.894737,0.894737,0.894737,0.894737
weighted avg,0.837012,0.894737,0.864912,95.0
macro avg,0.456989,0.488506,0.472222,95.0
1,0.0,0.0,0.0,8.0
