In [1]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, MaxAbsScaler
import numpy as np
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.pipeline import Pipeline as IMBPipeline
from sklearn.feature_selection import RFE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords
from tqdm import tqdm
tqdm.pandas()
from sklearn.compose import ColumnTransformer

from tsfresh.feature_extraction import MinimalFCParameters
from tsfresh.feature_extraction import extract_features

In [2]:
def get_classification_report(y_test, y_pred):
    '''Source: https://stackoverflow.com/questions/39662398/scikit-learn-output-metrics-classification-report-into-csv-tab-delimited-format'''
    report = classification_report(y_test, y_pred, output_dict=True)
    df_classification_report = pd.DataFrame(report).transpose()
    df_classification_report = df_classification_report.sort_values(by=['f1-score'], ascending=False)
    return df_classification_report

In [3]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/semcovici/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
path_raw_data = '../data/raw/'
path_processed_data = '../data/processed/'

In [5]:
data = pd.read_parquet(path_processed_data + 'data_filtered_separated_fasttext_emb.parquet')

In [6]:
data.head()

Unnamed: 0,User_ID,comment_id,comment,Polarity,embeddings
0,r2_bo_1,0,Saudadinhaa dl :/,for,"[0.0063374573, 0.014696721, -0.10064547, -0.07..."
1,r2_bo_1,1,Gente que beija com mordidinha no final,for,"[-0.009004079, -0.06354138, 0.026768943, -0.02..."
2,r2_bo_1,2,Fiz quase tudo certo. Errei quando coloquei se...,for,"[-0.0038966113, -0.050558098, 0.070876375, 0.0..."
3,r2_bo_1,3,Meu primo é muito sarna mds,for,"[0.044733923, -0.042570617, 0.062495235, -0.01..."
4,r2_bo_1,4,Vontade de falar com ela,for,"[-0.023665277, -0.04709114, 0.06338957, -0.018..."


In [7]:
df_emb = data.embeddings.progress_apply(pd.Series)

100%|██████████| 920681/920681 [00:54<00:00, 16756.15it/s]


In [8]:
df_emb.columns = [f'emb_{i}' for i in range(300)]

In [9]:
df_emb.head()

Unnamed: 0,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,emb_9,...,emb_290,emb_291,emb_292,emb_293,emb_294,emb_295,emb_296,emb_297,emb_298,emb_299
0,0.006337,0.014697,-0.100645,-0.071696,0.077942,0.15326,-0.008381,-0.037087,0.108373,0.110981,...,0.057867,0.086457,-0.097246,-0.138224,-0.070663,-0.013261,0.116717,-0.062122,0.14016,0.148228
1,-0.009004,-0.063541,0.026769,-0.02003,0.001166,-0.068906,-0.00516,-0.007385,-0.006171,-0.06738,...,-0.024035,-0.089168,-0.007937,0.026329,-0.035562,0.01047,0.00247,0.000997,-0.045128,0.068493
2,-0.003897,-0.050558,0.070876,0.017981,-0.025209,-0.043963,-0.024445,0.003771,-0.045305,-0.041071,...,0.003181,-0.03958,-0.013224,0.021,0.02175,-0.009772,0.02161,0.021908,-0.000911,0.046222
3,0.044734,-0.042571,0.062495,-0.01683,0.030426,-0.044967,-0.052269,-0.03881,0.015974,-0.078171,...,0.026244,0.049217,0.024683,-0.019835,0.06096,-0.07766,0.032868,-0.054822,-0.021258,-0.030915
4,-0.023665,-0.047091,0.06339,-0.018911,-0.079447,-0.078596,0.020408,0.000289,-0.036697,-0.02442,...,-0.013555,0.000597,0.008922,0.006682,-0.000225,0.027159,0.048046,-0.052628,-0.034904,0.087749


In [10]:
data = pd.concat([data,df_emb], axis =1)

In [11]:
data = data.drop(['comment_id', 'comment', 'embeddings'], axis = 1)

In [12]:
data

Unnamed: 0,User_ID,Polarity,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,...,emb_290,emb_291,emb_292,emb_293,emb_294,emb_295,emb_296,emb_297,emb_298,emb_299
0,r2_bo_1,for,0.006337,0.014697,-0.100645,-0.071696,0.077942,0.153260,-0.008381,-0.037087,...,0.057867,0.086457,-0.097246,-0.138224,-0.070663,-0.013261,0.116717,-0.062122,0.140160,0.148228
1,r2_bo_1,for,-0.009004,-0.063541,0.026769,-0.020030,0.001166,-0.068906,-0.005160,-0.007385,...,-0.024035,-0.089168,-0.007937,0.026329,-0.035562,0.010470,0.002470,0.000997,-0.045128,0.068493
2,r2_bo_1,for,-0.003897,-0.050558,0.070876,0.017981,-0.025209,-0.043963,-0.024445,0.003771,...,0.003181,-0.039580,-0.013224,0.021000,0.021750,-0.009772,0.021610,0.021908,-0.000911,0.046222
3,r2_bo_1,for,0.044734,-0.042571,0.062495,-0.016830,0.030426,-0.044967,-0.052269,-0.038810,...,0.026244,0.049217,0.024683,-0.019835,0.060960,-0.077660,0.032868,-0.054822,-0.021258,-0.030915
4,r2_bo_1,for,-0.023665,-0.047091,0.063390,-0.018911,-0.079447,-0.078596,0.020408,0.000289,...,-0.013555,0.000597,0.008922,0.006682,-0.000225,0.027159,0.048046,-0.052628,-0.034904,0.087749
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
920676,r2_bo_756,against,0.036233,0.027431,0.048374,0.004733,-0.053605,0.041728,-0.090050,-0.020994,...,-0.041192,-0.033724,0.036530,0.014907,-0.007710,0.008400,-0.011096,-0.038817,0.017063,0.070087
920677,r2_bo_756,against,0.041809,-0.060842,0.012836,-0.022509,-0.031469,0.011581,0.027040,-0.049265,...,0.008967,-0.025660,0.052434,-0.033161,0.013729,0.020330,-0.019272,-0.114484,-0.015707,0.059878
920678,r2_bo_756,against,0.036286,-0.030287,0.103372,0.023286,0.043280,-0.002571,-0.021945,-0.001568,...,-0.013526,-0.019172,-0.008919,-0.010848,0.004138,-0.006908,-0.001383,-0.057218,-0.016914,0.133511
920679,r2_bo_756,against,-0.011904,0.017070,0.036247,-0.015507,-0.042479,-0.073684,0.054537,-0.020854,...,0.019198,-0.024881,0.027292,0.010557,0.033339,-0.033322,-0.015329,-0.002178,0.007338,0.074364


In [13]:

settings = MinimalFCParameters()

In [14]:
df_featured = extract_features(
    data.drop('Polarity',axis =1), 
    column_id = 'User_ID',
    default_fc_parameters=settings,
    
    )

Feature Extraction: 100%|██████████| 30/30 [00:17<00:00,  1.67it/s]


In [25]:
X = df_featured
y = data[['User_ID','Polarity']].drop_duplicates().Polarity

In [27]:
y_encoded = y.map({'against': 0, 'for': 1})
y_encoded

0         1
1370      1
4339      1
6589      0
8163      0
         ..
907899    0
911106    1
912968    0
915635    1
917774    0
Name: Polarity, Length: 471, dtype: int64

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

In [29]:
pipe = IMBPipeline(
    steps = [
        #('sampling', RandomOverSampler(random_state=42)),
        ('scaling', MaxAbsScaler()),
        ('estimator', XGBClassifier(
            random_state = 42,
            verbosity = 3,
            # device = 'cuda',
            # tree_method = 'hist'
            ))
    ],
    verbose = True
    )

print('Training ...')
pipe_trained = pipe.fit(X_train, y_train)

y_pred = pipe_trained.predict(X_test)
y_pred_proba = pipe_trained.predict_proba(X_test)

Training ...
[Pipeline] ........... (step 1 of 2) Processing scaling, total=   0.0s
[00:45:05] AllReduce: 0.00666s, 1 calls @ 6660us

[00:45:05] MakeCuts: 0.011864s, 1 calls @ 11864us

[00:45:05] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 0
[00:45:10] Configure: 0.001767s, 1 calls @ 1767us

[00:45:10] EvalOneIter: 0.001086s, 100 calls @ 1086us

[00:45:10] GetGradient: 0.042724s, 100 calls @ 42724us

[00:45:10] PredictRaw: 0.000177s, 100 calls @ 177us

[00:45:10] UpdateOneIter: 4.9215s, 100 calls @ 4921499us

[00:45:10] BoostNewTrees: 4.87078s, 100 calls @ 4870784us

[00:45:10] CommitModel: 7.3e-05s, 100 calls @ 73us

[00:45:10] BuildHistogram: 2.10941s, 200 calls @ 2109410us

[00:45:10] EvaluateSplits: 1.26333s, 300 calls @ 1263332us

[00:45:10] InitData: 0.037991s, 100 calls @ 37991us

[00:45:10] InitRoot: 1.29838s, 100 calls @ 1298385us

[00:45:10] LeafPartition: 4.5e-05s, 100 calls @ 45us

[00:45:10] UpdatePosition: 0.335711s, 205 calls @ 335711us

[00:45:10] Update

In [30]:
df_classification_report = get_classification_report(y_test, y_pred)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [31]:
y_test

725357    0
405724    0
408960    0
754697    0
4339      1
         ..
522103    0
100097    0
82048     0
438976    0
145832    1
Name: Polarity, Length: 95, dtype: int64

In [32]:
y_test.to_numpy()

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1])

In [33]:
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0])

In [34]:
df_classification_report

Unnamed: 0,precision,recall,f1-score,support
0,0.915789,1.0,0.956044,87.0
accuracy,0.915789,0.915789,0.915789,0.915789
weighted avg,0.83867,0.915789,0.875535,95.0
macro avg,0.457895,0.5,0.478022,95.0
1,0.0,0.0,0.0,8.0


In [None]:
y_encoded.value_counts()/len(y_encoded)