In [2]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, MaxAbsScaler
import numpy as np
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.pipeline import Pipeline as IMBPipeline
from sklearn.feature_selection import RFE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords
from tqdm import tqdm
tqdm.pandas()
from sklearn.compose import ColumnTransformer

from tsfresh.feature_extraction import MinimalFCParameters
from tsfresh.feature_extraction import extract_features

In [3]:
def get_classification_report(y_test, y_pred):
    '''Source: https://stackoverflow.com/questions/39662398/scikit-learn-output-metrics-classification-report-into-csv-tab-delimited-format'''
    report = classification_report(y_test, y_pred, output_dict=True)
    df_classification_report = pd.DataFrame(report).transpose()
    df_classification_report = df_classification_report.sort_values(by=['f1-score'], ascending=False)
    return df_classification_report

In [4]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/semcovici/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
path_raw_data = '../data/raw/'
path_processed_data = '../data/processed/'

In [6]:
corpus = 'ig'
model_name = 'facebook/fasttext-pt-vectors'

In [7]:
path_data = path_processed_data + f'train_r3_{corpus}_separated_comments_{model_name.replace("/", "_")}_tsfresh.parquet'

In [8]:
data = pd.read_parquet(path_data)

In [9]:
data.head()

Unnamed: 0,User_ID,emb_0__sum_values,emb_0__median,emb_0__mean,emb_0__length,emb_0__standard_deviation,emb_0__variance,emb_0__root_mean_square,emb_0__maximum,emb_0__absolute_maximum,...,emb_299__median,emb_299__mean,emb_299__length,emb_299__standard_deviation,emb_299__variance,emb_299__root_mean_square,emb_299__maximum,emb_299__absolute_maximum,emb_299__minimum,Polarity
0,r2_ig_1,18.705984,0.016341,0.021305,878.0,0.034774,0.001209,0.040782,0.161842,0.161842,...,0.023159,0.018238,878.0,0.061676,0.003804,0.064316,0.221551,0.22258,-0.22258,against
1,r2_ig_10,-0.694554,-0.007306,-0.007632,91.0,0.025381,0.000644,0.026504,0.077938,0.094406,...,-0.000196,0.0028,91.0,0.06365,0.004051,0.063712,0.14063,0.245795,-0.245795,for
2,r2_ig_100,134.651776,0.021312,0.026844,5016.0,0.034851,0.001215,0.043991,0.277412,0.277412,...,0.046704,0.042799,5016.0,0.063111,0.003983,0.076254,0.236323,0.331171,-0.331171,against
3,r2_ig_1003,8.989551,0.004784,0.007261,1238.0,0.029329,0.00086,0.030214,0.167369,0.167369,...,0.009481,0.004476,1238.0,0.060376,0.003645,0.060541,0.163734,0.239796,-0.239796,for
4,r2_ig_1005,-0.023906,-0.023906,-0.023906,1.0,0.0,0.0,0.023906,-0.023906,0.023906,...,0.385951,0.385951,1.0,0.0,0.0,0.385951,0.385951,0.385951,0.385951,for


In [11]:
X = data[[col for col in data.columns if 'emb' in col]]
y = data.Polarity

In [12]:
y_encoded = y.map({'against': 0, 'for': 1})
y_encoded

0       0
1       1
2       0
3       1
4       1
       ..
1791    1
1792    1
1793    0
1794    0
1795    0
Name: Polarity, Length: 1796, dtype: int64

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

In [14]:
pipe = IMBPipeline(
    steps = [
        #('sampling', RandomOverSampler(random_state=42)),
        ('scaling', MaxAbsScaler()),
        ('estimator', XGBClassifier(
            random_state = 42,
            verbosity = 3,
            # device = 'cuda',
            # tree_method = 'hist'
            ))
    ],
    verbose = True
    )

print('Training ...')
pipe_trained = pipe.fit(X_train, y_train)

y_pred = pipe_trained.predict(X_test)
y_pred_proba = pipe_trained.predict_proba(X_test)

Training ...
[Pipeline] ........... (step 1 of 2) Processing scaling, total=   0.1s
[13:31:50] AllReduce: 0.033447s, 1 calls @ 33447us

[13:31:50] MakeCuts: 0.040282s, 1 calls @ 40282us

[13:31:50] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 0
[13:32:06] Configure: 0.000795s, 1 calls @ 795us

[13:32:06] EvalOneIter: 0.001045s, 100 calls @ 1045us

[13:32:06] GetGradient: 0.003723s, 100 calls @ 3723us

[13:32:06] PredictRaw: 0.000288s, 100 calls @ 288us

[13:32:06] UpdateOneIter: 16.2232s, 100 calls @ 16223198us

[13:32:06] BoostNewTrees: 16.217s, 100 calls @ 16217033us

[13:32:06] CommitModel: 8.8e-05s, 100 calls @ 88us

[13:32:06] BuildHistogram: 7.40995s, 500 calls @ 7409948us

[13:32:06] EvaluateSplits: 6.71078s, 600 calls @ 6710782us

[13:32:06] InitData: 0.003426s, 100 calls @ 3426us

[13:32:06] InitRoot: 2.09015s, 100 calls @ 2090145us

[13:32:06] LeafPartition: 3.8e-05s, 100 calls @ 38us

[13:32:06] UpdatePosition: 0.300496s, 600 calls @ 300496us

[13:32:06] Updat

In [15]:
df_classification_report = get_classification_report(y_test, y_pred)

In [16]:
y_test

729     1
282     0
222     0
111     0
104     1
       ..
1659    1
998     0
1779    0
1719    1
1373    1
Name: Polarity, Length: 360, dtype: int64

In [17]:
y_test.to_numpy()

array([1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,

In [18]:
y_pred

array([1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0,
       0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,

In [19]:
df_classification_report

Unnamed: 0,precision,recall,f1-score,support
0,0.666667,0.729064,0.696471,203.0
accuracy,0.641667,0.641667,0.641667,0.641667
weighted avg,0.638225,0.641667,0.638137,360.0
macro avg,0.634058,0.628863,0.629591,360.0
1,0.601449,0.528662,0.562712,157.0


In [20]:
y_encoded.value_counts()/len(y_encoded)

Polarity
0    0.565145
1    0.434855
Name: count, dtype: float64