## Imports

In [1]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.preprocessing import MaxAbsScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from tqdm import tqdm
import nltk
from sklearn.compose import ColumnTransformer

In [2]:
tqdm.pandas()

In [3]:
import sys
sys.path.append('../src/')
from models.ClassificationPipeline import ClassificationPipeline 

In [4]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/semcovici/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
def get_classification_report(y_test, y_pred):
    '''Source: https://stackoverflow.com/questions/39662398/scikit-learn-output-metrics-classification-report-into-csv-tab-delimited-format'''
    report = classification_report(y_test, y_pred, output_dict=True)
    df_classification_report = pd.DataFrame(report).transpose()
    df_classification_report = df_classification_report.sort_values(by=['f1-score'], ascending=False)
    return df_classification_report

## definitions

In [6]:
random_seed = 42

In [7]:
path_raw_data = '../data/raw/'
path_processed_data = '../data/processed/'

In [8]:
corpus = 'ig'

## data 

In [9]:
### read data
data = pd.read_csv(path_processed_data + f'train_r3_{corpus}_filtered.csv', index_col = 0)

In [10]:
data.head()

Unnamed: 0,User_ID,Polarity,Texts
0,r2_ig_1,against,PQP ESSE DORAMA É MUITO FOADA(Sassy GoGo(Cheer...
1,r2_ig_4,for,Golaço!!!!!!!!! # Manda geral do time principa...
2,r2_ig_7,against,"@gabycunha86 Amanhã vou aí, deixa pra terça # ..."
3,r2_ig_8,for,3.4- O Centro de Coordenação da Operação está ...
4,r2_ig_10,for,"Me arrependi de excluir meu outro tt, agora ti..."


In [11]:
X = data['Texts']
y = data.Polarity

In [12]:
X.head()

0    PQP ESSE DORAMA É MUITO FOADA(Sassy GoGo(Cheer...
1    Golaço!!!!!!!!! # Manda geral do time principa...
2    @gabycunha86 Amanhã vou aí, deixa pra terça # ...
3    3.4- O Centro de Coordenação da Operação está ...
4    Me arrependi de excluir meu outro tt, agora ti...
Name: Texts, dtype: object

In [13]:
# encode label
y_encoded = y.map({'against': 0, 'for': 1})
y_encoded

0       0
1       1
2       0
3       1
4       1
       ..
1790    0
1792    0
1793    0
1794    0
1795    0
Name: Polarity, Length: 1522, dtype: int64

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

## Classification

### sem oversampling

In [15]:
text_vect = TfidfVectorizer(
            stop_words = stopwords.words('portuguese'),
            lowercase = True,
            ngram_range = (1,3),
            max_features=30000
            
            )
sampling = None
scaling = MaxAbsScaler()

classifier = XGBClassifier(
            random_state = 42,
            verbosity = 3,
            # device = 'cuda',
            # tree_method = 'hist'
            )

In [16]:
clf_pipe = ClassificationPipeline(
    vectorizer=text_vect,
    sampling = sampling,
    scaling =scaling, 
    estimator= classifier
)

clf_pipe.train(X_train, y_train)

y_pred, y_pred_proba = clf_pipe.predict(X_test)

df_classification_report = get_classification_report(y_test, y_pred)

df_classification_report

[Pipeline] ........ (step 1 of 4) Processing vectorizer, total= 3.3min
[Pipeline] .......... (step 2 of 4) Processing sampling, total=   0.0s
[Pipeline] ........... (step 3 of 4) Processing scaling, total=   0.2s
[19:05:25] AllReduce: 0.031764s, 1 calls @ 31764us

[19:05:25] MakeCuts: 0.065356s, 1 calls @ 65356us

[19:05:25] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 0
[19:06:31] Configure: 0.00252s, 1 calls @ 2520us

[19:06:31] EvalOneIter: 0.001215s, 100 calls @ 1215us

[19:06:31] GetGradient: 0.115392s, 100 calls @ 115392us

[19:06:31] PredictRaw: 0.000142s, 100 calls @ 142us

[19:06:31] UpdateOneIter: 65.9791s, 100 calls @ 65979132us

[19:06:31] BoostNewTrees: 65.8596s, 100 calls @ 65859645us

[19:06:31] CommitModel: 7.9e-05s, 100 calls @ 79us

[19:06:31] BuildHistogram: 27.7277s, 483 calls @ 27727652us

[19:06:31] EvaluateSplits: 27.5139s, 583 calls @ 27513896us

[19:06:31] InitData: 0.11024s, 100 calls @ 110240us

[19:06:31] InitRoot: 11.0042s, 100 calls @ 110041

Unnamed: 0,precision,recall,f1-score,support
0,0.660819,0.680723,0.670623,166.0
accuracy,0.636066,0.636066,0.636066,0.636066
weighted avg,0.635142,0.636066,0.635433,305.0
macro avg,0.632648,0.631728,0.632015,305.0
1,0.604478,0.582734,0.593407,139.0


### com oversampling

In [17]:
text_vect = TfidfVectorizer(
            stop_words = stopwords.words('portuguese'),
            lowercase = True,
            ngram_range = (1,3),
            max_features=30000
            
            )
sampling = RandomOverSampler(random_state=random_seed)
scaling = MaxAbsScaler()

classifier = XGBClassifier(
            random_state = 42,
            verbosity = 3,
            # device = 'cuda',
            # tree_method = 'hist'
            )

In [18]:
clf_pipe = ClassificationPipeline(
    vectorizer=text_vect,
    sampling = sampling,
    scaling =scaling, 
    estimator= classifier
)

clf_pipe.train(X_train, y_train)

y_pred, y_pred_proba = clf_pipe.predict(X_test)

df_classification_report = get_classification_report(y_test, y_pred)

df_classification_report

[Pipeline] ........ (step 1 of 4) Processing vectorizer, total= 3.2min
[Pipeline] .......... (step 2 of 4) Processing sampling, total=   0.0s
[Pipeline] ........... (step 3 of 4) Processing scaling, total=   0.2s
[19:10:05] AllReduce: 0.04035s, 1 calls @ 40350us

[19:10:05] MakeCuts: 0.066544s, 1 calls @ 66544us

[19:10:05] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 0
[19:11:12] Configure: 0.000331s, 1 calls @ 331us

[19:11:12] EvalOneIter: 0.001106s, 100 calls @ 1106us

[19:11:12] GetGradient: 0.121384s, 100 calls @ 121384us

[19:11:12] PredictRaw: 0.000148s, 100 calls @ 148us

[19:11:12] UpdateOneIter: 67.1137s, 100 calls @ 67113677us

[19:11:12] BoostNewTrees: 66.9907s, 100 calls @ 66990683us

[19:11:12] CommitModel: 8.8e-05s, 100 calls @ 88us

[19:11:12] BuildHistogram: 27.8145s, 483 calls @ 27814487us

[19:11:12] EvaluateSplits: 27.8319s, 583 calls @ 27831948us

[19:11:12] InitData: 0.137498s, 100 calls @ 137498us

[19:11:12] InitRoot: 12.0574s, 100 calls @ 120573

Unnamed: 0,precision,recall,f1-score,support
0,0.674556,0.686747,0.680597,166.0
accuracy,0.64918,0.64918,0.64918,0.64918
weighted avg,0.648621,0.64918,0.648838,305.0
macro avg,0.646102,0.645532,0.645753,305.0
1,0.617647,0.604317,0.610909,139.0


### Com feature indicando a quantidade de textos

In [28]:
data['n_comments'] = data.Texts.apply(lambda x: len(x.split(' # ')))
data

Unnamed: 0,User_ID,Polarity,Texts,n_comments
0,r2_ig_1,against,PQP ESSE DORAMA É MUITO FOADA(Sassy GoGo(Cheer...,878
1,r2_ig_4,for,Golaço!!!!!!!!! # Manda geral do time principa...,533
2,r2_ig_7,against,"@gabycunha86 Amanhã vou aí, deixa pra terça # ...",956
3,r2_ig_8,for,3.4- O Centro de Coordenação da Operação está ...,1153
4,r2_ig_10,for,"Me arrependi de excluir meu outro tt, agora ti...",91
...,...,...,...,...
1790,r2_ig_2395,against,ontem a rafaela me abandonou e eu fui pro omeg...,312
1792,r2_ig_2398,against,Em todos os 0 estados dos EUA a partir de hoje...,94
1793,r2_ig_2399,against,isso daqui so eu jogando prime 0 # se rolar ví...,2579
1794,r2_ig_2400,against,@amndwz_ nao acredito! # @amndwz_ MUDANÇA # @a...,2129


In [29]:
X = data[['Texts','n_comments']]
y = data.Polarity
y_encoded = y.map({'against': 0, 'for': 1})
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

In [34]:
text_vect = TfidfVectorizer(
            stop_words = stopwords.words('portuguese'),
            lowercase = True,
            ngram_range = (1,3),
            max_features=30000
            
            )
sampling = RandomOverSampler(random_state=random_seed)
scaling = MaxAbsScaler()

classifier = XGBClassifier(
            random_state = 42,
            verbosity = 3,
            # device = 'cuda',
            # tree_method = 'hist'
            )

In [31]:
preprocessor = ColumnTransformer(
    transformers=[
        ('Texts', text_vect , 'Texts'),
        ('n_comments', 'passthrough',['n_comments'])
    ]
)

In [32]:
clf_pipe = ClassificationPipeline(
    vectorizer=preprocessor,
    sampling = sampling,
    scaling =scaling, 
    estimator= classifier
)

clf_pipe.train(X_train, y_train)

y_pred, y_pred_proba = clf_pipe.predict(X_test)

df_classification_report = get_classification_report(y_test, y_pred)

df_classification_report

[Pipeline] ........ (step 1 of 4) Processing vectorizer, total= 3.2min
[Pipeline] .......... (step 2 of 4) Processing sampling, total=   0.0s
[Pipeline] ........... (step 3 of 4) Processing scaling, total=   0.2s
[19:19:57] AllReduce: 0.05322s, 1 calls @ 53220us

[19:19:57] MakeCuts: 0.092076s, 1 calls @ 92076us

[19:19:57] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 0
[19:21:07] Configure: 0.000706s, 1 calls @ 706us

[19:21:07] EvalOneIter: 0.001146s, 100 calls @ 1146us

[19:21:07] GetGradient: 0.092159s, 100 calls @ 92159us

[19:21:07] PredictRaw: 0.000135s, 100 calls @ 135us

[19:21:07] UpdateOneIter: 69.4355s, 100 calls @ 69435540us

[19:21:07] BoostNewTrees: 69.341s, 100 calls @ 69340959us

[19:21:07] CommitModel: 8.2e-05s, 100 calls @ 82us

[19:21:07] BuildHistogram: 27.6426s, 483 calls @ 27642623us

[19:21:07] EvaluateSplits: 30.1437s, 583 calls @ 30143700us

[19:21:07] InitData: 0.078161s, 100 calls @ 78161us

[19:21:07] InitRoot: 12.371s, 100 calls @ 12370955us

Unnamed: 0,precision,recall,f1-score,support
0,0.674556,0.686747,0.680597,166.0
accuracy,0.64918,0.64918,0.64918,0.64918
weighted avg,0.648621,0.64918,0.648838,305.0
macro avg,0.646102,0.645532,0.645753,305.0
1,0.617647,0.604317,0.610909,139.0
