# Importação das bibliotecas necessárias

In [1]:
import pandas as pd 

from sklearn.preprocessing import MaxAbsScaler

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
#from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report


from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn import set_config
set_config(display='diagram')
import datetime

# Importação dos dados

In [5]:
data = pd.read_parquet('../dataset/processed/artigos_tratados/bertimbau/artigos_tratados_bert_lg.parquet')
rem_cols = ['Conteudo', 'URL']
data.drop(rem_cols, axis=1, inplace=True)

In [6]:
data.head() # visualização das primeiras 5 linhas do dataframe

Unnamed: 0,Partido,Vies,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,...,emb_1015,emb_1016,emb_1017,emb_1018,emb_1019,emb_1020,emb_1021,emb_1022,emb_1023,emb_1024
0,Novo,direita,0.401686,-0.142705,0.023679,0.192193,0.323548,-0.067125,-0.598261,-0.355111,...,0.186607,-0.147967,-0.487217,0.110254,-0.125077,-0.159934,0.125114,0.134895,-0.527953,0.196096
1,Novo,direita,-0.128541,0.215314,-0.069348,0.088915,0.408865,-0.052515,-0.267552,-0.15783,...,0.552647,0.091742,-0.605148,0.11292,0.09964,-0.174978,0.192274,0.250751,-0.157322,0.310022
2,Novo,direita,0.016604,0.137099,-0.1135,0.109841,0.290769,0.015612,-0.27526,-0.178999,...,0.497988,0.113761,-0.157607,0.001557,0.053836,0.133553,-0.021814,0.095863,-0.13713,0.318082
3,Novo,direita,-0.032087,-0.067949,-0.03627,0.208884,-0.090851,0.005983,-0.093461,-0.463273,...,0.601618,0.132886,-0.403809,0.213245,-0.007711,-0.157867,-0.053459,0.401732,-0.279196,0.16506
4,Novo,direita,0.181898,0.011968,-0.062858,0.162305,0.247084,0.055331,-0.51853,-0.171481,...,0.447449,-0.02097,-0.419976,0.198067,-0.142836,-0.334448,-0.083704,0.290215,-0.176763,0.452248


In [7]:
# conversao dos rotulos categoricos para numericos
data['Vies'] = data['Vies'].map({'direita':2,
                                'centro': 1,
                                'esquerda': 0})

In [8]:
data.head()

Unnamed: 0,Partido,Vies,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,...,emb_1015,emb_1016,emb_1017,emb_1018,emb_1019,emb_1020,emb_1021,emb_1022,emb_1023,emb_1024
0,Novo,2,0.401686,-0.142705,0.023679,0.192193,0.323548,-0.067125,-0.598261,-0.355111,...,0.186607,-0.147967,-0.487217,0.110254,-0.125077,-0.159934,0.125114,0.134895,-0.527953,0.196096
1,Novo,2,-0.128541,0.215314,-0.069348,0.088915,0.408865,-0.052515,-0.267552,-0.15783,...,0.552647,0.091742,-0.605148,0.11292,0.09964,-0.174978,0.192274,0.250751,-0.157322,0.310022
2,Novo,2,0.016604,0.137099,-0.1135,0.109841,0.290769,0.015612,-0.27526,-0.178999,...,0.497988,0.113761,-0.157607,0.001557,0.053836,0.133553,-0.021814,0.095863,-0.13713,0.318082
3,Novo,2,-0.032087,-0.067949,-0.03627,0.208884,-0.090851,0.005983,-0.093461,-0.463273,...,0.601618,0.132886,-0.403809,0.213245,-0.007711,-0.157867,-0.053459,0.401732,-0.279196,0.16506
4,Novo,2,0.181898,0.011968,-0.062858,0.162305,0.247084,0.055331,-0.51853,-0.171481,...,0.447449,-0.02097,-0.419976,0.198067,-0.142836,-0.334448,-0.083704,0.290215,-0.176763,0.452248


# Divisão dos dados

In [10]:
# a seguir os dados serão divididos entre features (X) e label (y)
X_columns = [column for column in data.columns if column != 'Vies' and column != 'Partido']
X = data[X_columns]
X.head() # features

Unnamed: 0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,emb_9,emb_10,...,emb_1015,emb_1016,emb_1017,emb_1018,emb_1019,emb_1020,emb_1021,emb_1022,emb_1023,emb_1024
0,0.401686,-0.142705,0.023679,0.192193,0.323548,-0.067125,-0.598261,-0.355111,0.14975,-0.180951,...,0.186607,-0.147967,-0.487217,0.110254,-0.125077,-0.159934,0.125114,0.134895,-0.527953,0.196096
1,-0.128541,0.215314,-0.069348,0.088915,0.408865,-0.052515,-0.267552,-0.15783,-0.072159,0.033104,...,0.552647,0.091742,-0.605148,0.11292,0.09964,-0.174978,0.192274,0.250751,-0.157322,0.310022
2,0.016604,0.137099,-0.1135,0.109841,0.290769,0.015612,-0.27526,-0.178999,-0.17713,0.065621,...,0.497988,0.113761,-0.157607,0.001557,0.053836,0.133553,-0.021814,0.095863,-0.13713,0.318082
3,-0.032087,-0.067949,-0.03627,0.208884,-0.090851,0.005983,-0.093461,-0.463273,-0.08833,0.23826,...,0.601618,0.132886,-0.403809,0.213245,-0.007711,-0.157867,-0.053459,0.401732,-0.279196,0.16506
4,0.181898,0.011968,-0.062858,0.162305,0.247084,0.055331,-0.51853,-0.171481,0.040109,0.020867,...,0.447449,-0.02097,-0.419976,0.198067,-0.142836,-0.334448,-0.083704,0.290215,-0.176763,0.452248


In [11]:
y = data['Vies'] # label
y.head()

0    2
1    2
2    2
3    2
4    2
Name: Vies, dtype: int64

In [8]:
X_train_strat_label, X_test_strat_label,
y_train_strat_label, y_test_strat_label = train_test_split(X, y, test_size=0.2, random_state=42,
                                                   stratify=y)

In [None]:
X_train_strat_part, X_test_strat_part,
y_train_strat_part, y_test_strat_part = train_test_split(X, y, test_size=0.2, random_state=42,
                                                   stratify=data['Partido'])

# Comparação

In [None]:
def seleciona_grid(model):

    param_grid = None

    if isinstance(model, MultinomialNB):
            param_grid = {
            "selection__k": [200,400,600,800,1024],
            "estimator__alpha": [50, 15, 10, 5, 1, 0.5, 0.3, 0.1, 0.05, 0.03, 0.02, 0.01,  0.001],
            "estimator__fit_prior": [True, False],
            }

    if isinstance(model, SVC):
            param_grid = {
            "selection__k": [200,400,600,800,1024],
            "estimator__gamma": [1, 0.1, 0.01, 0.001],
            "estimator__kernel": ['linear', 'sigmoid'],
            "estimator__C": [0.1, 1, 10, 100]
            }


    if isinstance(model, RandomForestClassifier):
        param_grid = {
        "selection__k": [200,400,600,800,1024],
        "estimator__n_estimators": np.arange(20,150), 
        "estimator__max_features": ['log2', 'sqrt'],
        "estimator__max_depth": np.arange(10,110),
        "estimator__min_samples_split": np.arange(2,11),
        "estimator__min_samples_leaf": np.arange(1,5),
        "estimator__bootstrap": [True, False]
        }
        
    if isinstance(model, XGBClassifier):
        param_grid = {
        "selection__k": [200,400,600,800,1024],
        "estimator__gamma": np.linspace(0,9,100, dtype=np.int64),
        "estimator__alpha": np.linspace(0,40,100, dtype=np.int64),
        "estimator__lambda": np.linspace(0,3,10, dtype=np.int64),
        "estimator__colsample_bytree": np.linspace(0.2,1,10, dtype=np.int64)
        }

    return param_grid

In [9]:
def compara(iteracoes, modelos):
    
    # Escrita no arquivo
    nome_arquivo = 'compara-xg-rf-bert.txt'
    with open(nome_arquivo, "w") as arquivo:
        pass

    selection = SelectKBest() # pritnar kbest

    samplers = [RandomOverSampler(random_state=42), None]

    splits = ['strat_label', 'strat_partido', 'pred_partido_novo'] # adicionar a logica de separacao para o pred_part

    # dataframe em que sera inserido os dados do modelo testado
    df_resultados = pd.DataFrame(columns=['modelo', 'split', 'comeco_random_search',
                                          'final_random_search', 'qnt_iteracoes',
                                          'melhor_result_randsearch',
                                          'melhores_parametros', 'acuracia',
                                         'class_report'])
    
    for model in modelos:

        for sampler in samplers:

            for split in splits:
    
                param_grid = seleciona_grid(model)
                
                scaler = MaxAbsScaler()
        
                pipeline = Pipeline([
                        ('scaling', scaler), 
                        ('selection', selection),
                        ('ros', sampler),
                        ('estimator', model)
                        ])
        
                
                #  --- Prints das configurações ---
                print(f'Modelo: {model}')
                    
        
                # definicao da randomized search
                random_search = RandomizedSearchCV(pipeline, param_distributions=param_grid,cv=StratifiedKFold(n_splits=5),
                                                    n_iter=iteracoes, n_jobs=2, random_state=42)


                # Random Search
                comeco_random_search = datetime.datetime.now()
                print(f'Começo da Random Search: {comeco_random_search}')
                
                if split == 'strat_label': # estratificacao pela label
                    model_trained = random_search.fit(X_train_strat_label, y_train_strat_label) 
                    
                    final_random_search = datetime.datetime.now()
                    print(f'Final da Random Search: {final_random_search}')
                    
                    y_pred = model_trained.predict(X_test_strat_label) # predicao
                    acc_pred = accuracy_score(y_test_strat_label, y_pred) # acuracia
                    report = classification_report(y_test_strat_label, y_pred) # class report
                    
                elif split == 'strat_partido': # estratificacao pelos partidos 
                    model_trained = random_search.fit(X_train_strat_part, y_train_strat_part)

                    final_random_search = datetime.datetime.now()
                    print(f'Final da Random Search: {final_random_search}')
                    
                    y_pred = model_trained.predict(X_test_strat_part) # predicao
                    acc_pred = accuracy_score(y_test_strat_part, y_pred) # acuracia
                    report = classification_report(y_test_strat_part, y_pred) # class report
                    
                elif split == 'pred_partido_novo': # predicao de partidos nao vistos no teste
                    model_trained = random_search.fit(, )
                    
                    final_random_search = datetime.datetime.now()
                    print(f'Final da Random Search: {final_random_search}')
                    
                    y_pred = model_trained.predict() # predicao
                    acc_pred = accuracy_score(, y_pred) # acuracia
                    report = classification_report(, y_pred) # class report
                    
                
                
                
                score_random_search = model_trained.best_score_
                score_random_search *= 100
                score_random_search = round(score_random_search,2)
                print(f'Melhor resultado na Random Search: {score_random_search}%')
                
                print('Melhores parâmetros encontrados:')
                print(model_trained.best_params_)
                
                    
                acc_pred *= 100
                acc_pred = round(acc_pred,2)
                print(f'Acurácia predita = {acc_pred}%')
        
                print(report)
                        
                
                print('----------------------------------------------')
                
                # Escrita em memória secundária


                # Nova linha que se deseja adicionar
                nova_linha = {'modelo': model, 'split': split,
                              'comeco_random_search': comeco_random_search,
                              'final_random_search': final_random_search, 'qnt_iteracoes': iteracoes,
                              'melhor_result_randsearch': score_random_search,
                              'melhores_parametros': str(model_trained.best_params_),
                              'acuracia': acc_pred, 'class_report': report}
            
                # Cria um novo DataFrame com a nova linha
                nova_linha_resultados = pd.DataFrame([nova_linha])
            
                # Concatena o novo DataFrame com o DataFrame existente
                df_resultados = pd.concat([df_resultados, nova_linha_resultados], ignore_index=True)
    
    df_resultados.to_csv('compara-xg-rf-bert.csv', index=False)
                
                
                

                with open(nome_arquivo, "a") as arquivo:
                    
                    arquivo.write(f'Modelo: {model}\n')
                    #arquivo.write(f'Vetorizador utilizado: {vectorizer}\n')
                    arquivo.write(f'Começo da Random Search: {comeco_random_search}\n')
                    arquivo.write(f'Final da Random Search: {final_random_search}\n')
                    arquivo.write(f'Melhor resultado na Random Search: {score_random_search}\n')
                    arquivo.write('Melhores parâmetros encontrados:\n')
                    arquivo.write(str(model_trained.best_params_))
                    arquivo.write('\n')
                    arquivo.write(f'Acurácia predita = {acc_pred}%\n')
                    arquivo.write(f'Classification report: \n')
                    arquivo.write(report)
                    arquivo.write('----------------------------------------------\n')

In [16]:
df = pd.read_csv('compara-xg-rf-bert.csv')

for index, row in df.iterrows():
    print(f"{row['modelo']}:")
    print(row['parametros'])
    print('\n')

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...):
Começo da Random Search: 2023-11-22 23:31:31.807720
Final da Random Search: 2023-11-22 23:31:52.152470
Melhores parâmetros encontrados:
{'selection__k': 800, 'estimator__lambda': 2, 'estimator__gamma': 4, 'estimator__colsample_bytree': 0, 'estimat

In [10]:
modelos = [XGBClassifier(seed=42),RandomForestClassifier(random_state=42)]

compara(1, modelos)

Modelo: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)
Começo da Random Search: 2023-11-22 23:31:31.807720


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(data):


Final da Random Search: 2023-11-22 23:31:52.152470
Melhor resultado na Random Search: 85.26%
Melhores parâmetros encontrados:
{'selection__k': 800, 'estimator__lambda': 2, 'estimator__gamma': 4, 'estimator__colsample_bytree': 0, 'estimator__alpha': 13}
Acurácia predita = 86.44%
              precision    recall  f1-score   support

           0       0.76      0.79      0.77       494
           1       0.81      0.82      0.82       716
           2       0.95      0.93      0.94      1039

    accuracy                           0.86      2249
   macro avg       0.84      0.85      0.84      2249
weighted avg       0.87      0.86      0.87      2249

----------------------------------------------


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  df_resultados = pd.concat([df_resultados, nova_linha_resultados], ignore_index=True)


Modelo: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)
Começo da Random Search: 2023-11-22 23:31:52.419495


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(data):


Final da Random Search: 2023-11-22 23:32:05.319498
Melhor resultado na Random Search: 85.31%
Melhores parâmetros encontrados:
{'selection__k': 800, 'estimator__lambda': 2, 'estimator__gamma': 4, 'estimator__colsample_bytree': 0, 'estimator__alpha': 13}
Acurácia predita = 85.9%
              precision    recall  f1-score   support

           0       0.83      0.70      0.76       494
           1       0.79      0.85      0.82       716
           2       0.92      0.94      0.93      1039

    accuracy                           0.86      2249
   macro avg       0.85      0.83      0.84      2249
weighted avg       0.86      0.86      0.86      2249

----------------------------------------------
Modelo: RandomForestClassifier(random_state=42)
Começo da Random Search: 2023-11-22 23:32:05.431493


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Final da Random Search: 2023-11-22 23:32:29.213951
Melhor resultado na Random Search: 89.31%
Melhores parâmetros encontrados:
{'selection__k': 800, 'estimator__n_estimators': 37, 'estimator__min_samples_split': 2, 'estimator__min_samples_leaf': 3, 'estimator__max_features': 'log2', 'estimator__max_depth': 47, 'estimator__bootstrap': False}
Acurácia predita = 90.35%
              precision    recall  f1-score   support

           0       0.88      0.74      0.81       494
           1       0.85      0.90      0.88       716
           2       0.95      0.98      0.96      1039

    accuracy                           0.90      2249
   macro avg       0.89      0.88      0.88      2249
weighted avg       0.90      0.90      0.90      2249

----------------------------------------------
Modelo: RandomForestClassifier(random_state=42)
Começo da Random Search: 2023-11-22 23:32:29.333953


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Final da Random Search: 2023-11-22 23:32:48.000398
Melhor resultado na Random Search: 89.13%
Melhores parâmetros encontrados:
{'selection__k': 800, 'estimator__n_estimators': 37, 'estimator__min_samples_split': 2, 'estimator__min_samples_leaf': 3, 'estimator__max_features': 'log2', 'estimator__max_depth': 47, 'estimator__bootstrap': False}
Acurácia predita = 89.86%
              precision    recall  f1-score   support

           0       0.90      0.71      0.79       494
           1       0.84      0.91      0.87       716
           2       0.94      0.98      0.96      1039

    accuracy                           0.90      2249
   macro avg       0.89      0.87      0.88      2249
weighted avg       0.90      0.90      0.90      2249

----------------------------------------------


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


In [11]:
print(None)

None
