# Importação das bibliotecas necessárias

In [1]:
import pandas as pd 

from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
#from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler


from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn import set_config
set_config(display='diagram')
import datetime

# Importação dos dados

In [2]:
# Read the processed articles parquet and discard the 'Conteudo', 'Partido'
# and 'URL' columns in a single chained expression (no in-place mutation).
rem_cols = ['Conteudo', 'Partido', 'URL']
data = pd.read_parquet(
    '../dataset/processed/artigos_tratados/bertimbau/artigos_tratados_bert_lg.parquet'
).drop(columns=rem_cols)

In [3]:
data.head() # show the first 5 rows of the dataframe

Unnamed: 0,Vies,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,emb_9,...,emb_1015,emb_1016,emb_1017,emb_1018,emb_1019,emb_1020,emb_1021,emb_1022,emb_1023,emb_1024
0,direita,0.401686,-0.142705,0.023679,0.192193,0.323548,-0.067125,-0.598261,-0.355111,0.14975,...,0.186607,-0.147967,-0.487217,0.110254,-0.125077,-0.159934,0.125114,0.134895,-0.527953,0.196096
1,direita,-0.128541,0.215314,-0.069348,0.088915,0.408865,-0.052515,-0.267552,-0.15783,-0.072159,...,0.552647,0.091742,-0.605148,0.11292,0.09964,-0.174978,0.192274,0.250751,-0.157322,0.310022
2,direita,0.016604,0.137099,-0.1135,0.109841,0.290769,0.015612,-0.27526,-0.178999,-0.17713,...,0.497988,0.113761,-0.157607,0.001557,0.053836,0.133553,-0.021814,0.095863,-0.13713,0.318082
3,direita,-0.032087,-0.067949,-0.03627,0.208884,-0.090851,0.005983,-0.093461,-0.463273,-0.08833,...,0.601618,0.132886,-0.403809,0.213245,-0.007711,-0.157867,-0.053459,0.401732,-0.279196,0.16506
4,direita,0.181898,0.011968,-0.062858,0.162305,0.247084,0.055331,-0.51853,-0.171481,0.040109,...,0.447449,-0.02097,-0.419976,0.198067,-0.142836,-0.334448,-0.083704,0.290215,-0.176763,0.452248


In [4]:
# Convert the categorical labels in 'Vies' into numeric class ids.
# A plain Series.map(dict) turns every value missing from the dict into NaN,
# which would also wipe the column if this cell were executed a second time.
# Values that are not keys are therefore passed through unchanged (making the
# cell safe to re-run) and the result is validated before being stored.
label_to_id = {
    'direita': 2,
    'centro': 1,
    'esquerda': 0,
}
vies_encoded = data['Vies'].map(lambda label: label_to_id.get(label, label))

# Anything that is not one of the known class ids (including NaN) is an error.
unexpected_labels = set(vies_encoded.unique()) - set(label_to_id.values())
if unexpected_labels:
    raise ValueError(f"Unexpected values in column 'Vies': {unexpected_labels!r}")

data['Vies'] = vies_encoded.astype('int64')

In [5]:
# Preview the dataframe again after the 'Vies' label conversion.
data.head()

Unnamed: 0,Vies,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,emb_9,...,emb_1015,emb_1016,emb_1017,emb_1018,emb_1019,emb_1020,emb_1021,emb_1022,emb_1023,emb_1024
0,2,0.401686,-0.142705,0.023679,0.192193,0.323548,-0.067125,-0.598261,-0.355111,0.14975,...,0.186607,-0.147967,-0.487217,0.110254,-0.125077,-0.159934,0.125114,0.134895,-0.527953,0.196096
1,2,-0.128541,0.215314,-0.069348,0.088915,0.408865,-0.052515,-0.267552,-0.15783,-0.072159,...,0.552647,0.091742,-0.605148,0.11292,0.09964,-0.174978,0.192274,0.250751,-0.157322,0.310022
2,2,0.016604,0.137099,-0.1135,0.109841,0.290769,0.015612,-0.27526,-0.178999,-0.17713,...,0.497988,0.113761,-0.157607,0.001557,0.053836,0.133553,-0.021814,0.095863,-0.13713,0.318082
3,2,-0.032087,-0.067949,-0.03627,0.208884,-0.090851,0.005983,-0.093461,-0.463273,-0.08833,...,0.601618,0.132886,-0.403809,0.213245,-0.007711,-0.157867,-0.053459,0.401732,-0.279196,0.16506
4,2,0.181898,0.011968,-0.062858,0.162305,0.247084,0.055331,-0.51853,-0.171481,0.040109,...,0.447449,-0.02097,-0.419976,0.198067,-0.142836,-0.334448,-0.083704,0.290215,-0.176763,0.452248


# Divisão dos dados

In [6]:
# Split the data into features (X) and label (y): every column other than
# 'Vies' is kept as a feature.
X_columns = list(data.columns[data.columns != 'Vies'])
X = data.loc[:, X_columns]
X.head()  # features

Unnamed: 0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,emb_9,emb_10,...,emb_1015,emb_1016,emb_1017,emb_1018,emb_1019,emb_1020,emb_1021,emb_1022,emb_1023,emb_1024
0,0.401686,-0.142705,0.023679,0.192193,0.323548,-0.067125,-0.598261,-0.355111,0.14975,-0.180951,...,0.186607,-0.147967,-0.487217,0.110254,-0.125077,-0.159934,0.125114,0.134895,-0.527953,0.196096
1,-0.128541,0.215314,-0.069348,0.088915,0.408865,-0.052515,-0.267552,-0.15783,-0.072159,0.033104,...,0.552647,0.091742,-0.605148,0.11292,0.09964,-0.174978,0.192274,0.250751,-0.157322,0.310022
2,0.016604,0.137099,-0.1135,0.109841,0.290769,0.015612,-0.27526,-0.178999,-0.17713,0.065621,...,0.497988,0.113761,-0.157607,0.001557,0.053836,0.133553,-0.021814,0.095863,-0.13713,0.318082
3,-0.032087,-0.067949,-0.03627,0.208884,-0.090851,0.005983,-0.093461,-0.463273,-0.08833,0.23826,...,0.601618,0.132886,-0.403809,0.213245,-0.007711,-0.157867,-0.053459,0.401732,-0.279196,0.16506
4,0.181898,0.011968,-0.062858,0.162305,0.247084,0.055331,-0.51853,-0.171481,0.040109,0.020867,...,0.447449,-0.02097,-0.419976,0.198067,-0.142836,-0.334448,-0.083704,0.290215,-0.176763,0.452248


In [7]:
# Label vector: the 'Vies' column.
y = data.loc[:, 'Vies']
y.head()

0    2
1    2
2    2
3    2
4    2
Name: Vies, dtype: int64

In [8]:
# Hold out 20% of the rows as the test set; the split is stratified on y and
# seeded so it is reproducible.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

# Comparação

In [11]:
def _scaler_for(model):
    """Return the feature scaler to put in front of ``model``."""
    # MultinomialNB needs non-negative feature values, so it gets MinMaxScaler;
    # every other model gets MaxAbsScaler.
    if isinstance(model, MultinomialNB):
        return MinMaxScaler()
    return MaxAbsScaler()


def _param_grid_for(model):
    """Return the randomized-search parameter grid for ``model``.

    Raises
    ------
    TypeError
        If ``model`` is not one of the supported estimator types.
    """
    k_candidates = [200, 400, 600, 800, 1024]

    if isinstance(model, MultinomialNB):
        return {
            "selection__k": k_candidates,
            "estimator__alpha": [50, 15, 10, 5, 1, 0.5, 0.3, 0.1, 0.05, 0.03, 0.02, 0.01, 0.001],
            "estimator__fit_prior": [True, False],
        }

    if isinstance(model, SVC):
        return {
            "selection__k": k_candidates,
            "estimator__gamma": [1, 0.1, 0.01, 0.001],
            "estimator__kernel": ['linear', 'sigmoid'],
            "estimator__C": [0.1, 1, 10, 100],
        }

    if isinstance(model, RandomForestClassifier):
        return {
            "selection__k": k_candidates,
            "estimator__n_estimators": np.arange(20, 150),
            "estimator__max_features": ['log2', 'sqrt'],
            "estimator__max_depth": np.arange(10, 110),
            "estimator__min_samples_split": np.arange(2, 11),
            "estimator__min_samples_leaf": np.arange(1, 5),
            "estimator__bootstrap": [True, False],
        }

    if isinstance(model, XGBClassifier):
        return {
            "selection__k": k_candidates,
            # dtype=np.int64 truncates these candidates to whole numbers.
            "estimator__gamma": np.linspace(0, 9, 100, dtype=np.int64),
            "estimator__alpha": np.linspace(0, 40, 100, dtype=np.int64),
            "estimator__lambda": np.linspace(0, 3, 10, dtype=np.int64),
            # colsample_bytree is a fraction: with an integer dtype the grid
            # collapsed to nine 0s and a single 1, so it is kept as floats.
            "estimator__colsample_bytree": np.linspace(0.2, 1, 10),
        }

    # Fail with a clear message instead of handing param_distributions=None
    # to RandomizedSearchCV.
    raise TypeError(
        f'No hyper-parameter grid is defined for {type(model).__name__}'
    )


def compara(iteracoes, modelos, sampler=None):
    """Tune and evaluate every model with a randomized hyper-parameter search.

    For each (model, sampler) combination a pipeline of
    scaling -> SelectKBest -> resampling step -> estimator is tuned with
    ``RandomizedSearchCV`` (5-fold ``StratifiedKFold``) on the module-level
    ``X_train`` / ``y_train`` and then scored on the module-level
    ``X_test`` / ``y_test``. The results are printed and appended to
    'compara-nb-svc-bert.txt', which is emptied at the start of every call.

    Parameters
    ----------
    iteracoes : int
        Number of parameter settings sampled per search (``n_iter``).
    modelos : iterable
        Estimators to compare. Supported types: MultinomialNB, SVC,
        RandomForestClassifier and XGBClassifier.
    sampler : resampler or None, default=None
        When None, every model is evaluated twice: with
        ``RandomOverSampler(random_state=42)`` and with no resampling.
        When given, only that resampler is used.

    Raises
    ------
    TypeError
        If one of ``modelos`` has no parameter grid defined.
    """
    nome_arquivo = 'compara-nb-svc-bert.txt'

    # The loop variable below used to shadow the `sampler` argument, so the
    # argument was never read; it is now honoured when supplied.
    if sampler is None:
        samplers = [RandomOverSampler(random_state=42), None]
    else:
        samplers = [sampler]

    # Start from an empty results file; each run below appends to it.
    # The report contains non-ASCII characters, so the encoding is explicit.
    with open(nome_arquivo, "w", encoding="utf-8"):
        pass

    for model in modelos:
        # The grid depends only on the model, so build it once per model.
        param_grid = _param_grid_for(model)

        for current_sampler in samplers:
            pipeline = Pipeline([
                ('scaling', _scaler_for(model)),
                ('selection', SelectKBest()),
                ('ros', current_sampler),  # None means "no resampling step"
                ('estimator', model),
            ])

            # Identify the configuration being evaluated.
            print(f'Modelo: {model}')
            print(f'Sampler: {current_sampler}')

            # Random Search
            comeco_random_search = datetime.datetime.now()
            print(f'Começo da Random Search: {comeco_random_search}')

            random_search = RandomizedSearchCV(
                pipeline,
                param_distributions=param_grid,
                cv=StratifiedKFold(n_splits=5),
                n_iter=iteracoes,
                n_jobs=2,
                random_state=42,
            )
            model_trained = random_search.fit(X_train, y_train)

            final_random_search = datetime.datetime.now()
            print(f'Final da Random Search: {final_random_search}')

            # Best mean cross-validation score, as a percentage.
            score_random_search = round(model_trained.best_score_ * 100, 2)
            print(f'Melhor resultado na Random Search: {score_random_search}%')

            print('Melhores parâmetros encontrados:')
            print(model_trained.best_params_)

            # Evaluation on the held-out test set.
            y_pred = model_trained.predict(X_test)
            acc_pred = round(accuracy_score(y_test, y_pred) * 100, 2)
            print(f'Acurácia predita = {acc_pred}%')

            report = classification_report(y_test, y_pred)
            print(report)

            print('----------------------------------------------')

            # Append the same information to the results file.
            with open(nome_arquivo, "a", encoding="utf-8") as arquivo:
                arquivo.write(f'Modelo: {model}\n')
                arquivo.write(f'Sampler: {current_sampler}\n')
                arquivo.write(f'Começo da Random Search: {comeco_random_search}\n')
                arquivo.write(f'Final da Random Search: {final_random_search}\n')
                arquivo.write(f'Melhor resultado na Random Search: {score_random_search}\n')
                arquivo.write('Melhores parâmetros encontrados:\n')
                arquivo.write(str(model_trained.best_params_))
                arquivo.write('\n')
                arquivo.write(f'Acurácia predita = {acc_pred}%\n')
                arquivo.write(f'Classification report: \n')
                arquivo.write(report)
                arquivo.write('----------------------------------------------\n')

In [12]:
# Models to compare in this run.
modelos = [MultinomialNB(), SVC(random_state=42)]

# All three arguments are passed by keyword so the call matches compara's
# (iteracoes, modelos, sampler) signature; iteracoes=1 samples a single
# parameter setting per search.
compara(iteracoes=1, modelos=modelos, sampler=None)

Modelo: MultinomialNB()
Começo da Random Search: 2023-11-22 22:03:38.621749


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Final da Random Search: 2023-11-22 22:03:41.568295
Melhor resultado na Random Search: 79.44%
Melhores parâmetros encontrados:
{'selection__k': 600, 'estimator__fit_prior': True, 'estimator__alpha': 0.02}
Acurácia predita = 80.7%
              precision    recall  f1-score   support

           0       0.80      0.43      0.56       494
           1       0.69      0.88      0.77       716
           2       0.91      0.94      0.92      1039

    accuracy                           0.81      2249
   macro avg       0.80      0.75      0.75      2249
weighted avg       0.81      0.81      0.79      2249

----------------------------------------------
Modelo: SVC(random_state=42)
Começo da Random Search: 2023-11-22 22:03:41.645310


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Final da Random Search: 2023-11-22 22:05:15.427861
Melhor resultado na Random Search: 90.36%
Melhores parâmetros encontrados:
{'selection__k': 600, 'estimator__kernel': 'linear', 'estimator__gamma': 0.01, 'estimator__C': 10}


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Acurácia predita = 89.6%
              precision    recall  f1-score   support

           0       0.80      0.85      0.82       494
           1       0.86      0.85      0.86       716
           2       0.97      0.95      0.96      1039

    accuracy                           0.90      2249
   macro avg       0.88      0.88      0.88      2249
weighted avg       0.90      0.90      0.90      2249

----------------------------------------------
