# Importação das bibliotecas necessárias

In [1]:
import pandas as pd 

from sklearn.preprocessing import MaxAbsScaler

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report


from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn import set_config
set_config(display='diagram')
import datetime

# Importação dos dados

In [2]:
data = pd.read_csv('../dataset/processed/artigos_de_partidos/artigos_partidos.csv')
rem_cols = ['Partido', 'URL']
data.drop(rem_cols, axis=1, inplace=True)

In [3]:
data.head() # visualização das primeiras 5 linhas do dataframe

Unnamed: 0,Conteudo,Vies
0,Multa imposta ao candidato na condenação foi...,direita
1,Cadastro será usado como identificação junt...,direita
2,A Bancada do NOVO na Câmara considera temerá...,direita
3,Um ambiente com ausência de segurança juríd...,direita
4,"Segundo o MP, o estado do RJ sequer utiliza os...",direita


In [4]:
# conversao dos rotulos categoricos para numericos
data['Vies'] = data['Vies'].map({
                                    'direita':2,
                                    'centro': 1,
                                    'esquerda': 0})

In [5]:
data.head()

Unnamed: 0,Conteudo,Vies
0,Multa imposta ao candidato na condenação foi...,2
1,Cadastro será usado como identificação junt...,2
2,A Bancada do NOVO na Câmara considera temerá...,2
3,Um ambiente com ausência de segurança juríd...,2
4,"Segundo o MP, o estado do RJ sequer utiliza os...",2


# Divisão dos dados

In [6]:
# a seguir os dados serão divididos entre features (X) e label (y)
X_columns = [column for column in data.columns if column != 'Vies']
X = data[X_columns]
X.head() # features

Unnamed: 0,Conteudo
0,Multa imposta ao candidato na condenação foi...
1,Cadastro será usado como identificação junt...
2,A Bancada do NOVO na Câmara considera temerá...
3,Um ambiente com ausência de segurança juríd...
4,"Segundo o MP, o estado do RJ sequer utiliza os..."


In [7]:
y = data['Vies'] # label
y.head()

0    2
1    2
2    2
3    2
4    2
Name: Vies, dtype: int64

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,
                                                   stratify=y)

# Comparação

In [9]:
def compara(iteracoes, modelos):
    
    # Escrita no arquivo
    nome_arquivo = 'compara-nb-svc-tfidf.txt'
    with open(nome_arquivo, "w") as arquivo:
        pass
    
    vectorizer = TfidfVectorizer()

    selection = SelectKBest()
    
    for model in modelos:
        
        #for vectorizer in vect:
    
        param_grid = None
        
        scaler = MaxAbsScaler()

        #preprocessor = ColumnTransformer(transformers=[
        #                                ('vect_resp_text', vectorizer,'resp_text'), 
        #                                ('vect_pos', vectorizer,'pos')])

        pipeline = Pipeline([
                ('vect', vectorizer), 
                ('scaling', scaler), 
                ('selection', selection),
                ('estimator', model)
                ])

        if isinstance(model, MultinomialNB):
                param_grid = {
                "vect__ngram_range": [(1,2), (1,3), (1,4), (2,3), (2,4), (3,4)],
                "vect__analyzer": ['word','char'],
                "selection__k": [200,400,600,800,1024],
                "estimator__alpha": [50, 15, 10, 5, 1, 0.5, 0.3, 0.1, 0.05, 0.03, 0.02, 0.01,  0.001],
                "estimator__fit_prior": [True, False],
                }

        if isinstance(model, SVC):
                param_grid = {
                "vect__ngram_range": [(1,2), (1,3), (1,4), (2,3), (2,4), (3,4)],
                "vect__analyzer": ['word','char'],
                "selection__k": [200,400,600,800,1024],
                "estimator__gamma": [1, 0.1, 0.01, 0.001],
                "estimator__kernel": ['linear', 'sigmoid'],
                "estimator__C": [0.1, 1, 10, 100]
                }


        if isinstance(model, RandomForestClassifier):
            param_grid = {
            "vect__ngram_range": [(1,2), (1,3), (1,4), (2,3), (2,4), (3,4)],
            "vect__analyzer": ['word','char'],
            "selection__k": [200,400,600,800,1024],
            "estimator__n_estimators": np.arange(20,150), 
            "estimator__max_features": ['log2', 'sqrt'],
            "estimator__max_depth": np.arange(10,110),
            "estimator__min_samples_split": np.arange(2,11),
            "estimator__min_samples_leaf": np.arange(1,5),
            "estimator__bootstrap": [True, False]
            }
            
        if isinstance(model, XGBClassifier):
            param_grid = {
            "vect__ngram_range": [(1,2), (1,3), (1,4), (2,3), (2,4), (3,4)],
            "vect__analyzer": ['word','char'],
            "selection__k": [200,400,600,800,1024],
            "estimator__gamma": np.linspace(0,9,100, dtype=np.int64),
            "estimator__alpha": np.linspace(0,40,100, dtype=np.int64),
            "estimator__lambda": np.linspace(0,3,10, dtype=np.int64),
            "estimator__colsample_bytree": np.linspace(0.2,1,10, dtype=np.int64)
            }

        
        # Prints do modelo e da vetorização
        print(f'Modelo: {model}')
            
        print(f'Vetorizador utilizado: {vectorizer}')
        
        # Random Search
        comeco_random_search = datetime.datetime.now()
        print(f'Começo da Random Search: {comeco_random_search}')
            
        random_search = RandomizedSearchCV(pipeline, param_distributions=param_grid,cv=StratifiedKFold(n_splits=5),
                                            n_iter=iteracoes, n_jobs=2, random_state=42)
        
        model_trained = random_search.fit(X_train, y_train)
        
        final_random_search = datetime.datetime.now()
        print(f'Final da Random Search: {final_random_search}')
        
        score_random_search = model_trained.best_score_
        score_random_search *= 100
        score_random_search = round(score_random_search,2)
        print(f'Melhor resultado na Random Search: {score_random_search}%')
        
        print('Melhores parâmetros encontrados:')
        print(model_trained.best_params_)
        
        # Predição
        y_pred = model_trained.predict(X_test)
        acc_pred = accuracy_score(y_test, y_pred)
        acc_pred *= 100
        acc_pred = round(acc_pred,2)
        print(f'Acurácia predita = {acc_pred}%')

        report = classification_report(y_test, y_pred)
        print(report)
                
        
        print('----------------------------------------------')
        
        # Escrita no arquivo
        with open(nome_arquivo, "a") as arquivo:
            
            arquivo.write(f'Modelo: {model}\n')
            arquivo.write(f'Vetorizador utilizado: {vectorizer}\n')
            arquivo.write(f'Começo da Random Search: {comeco_random_search}\n')
            arquivo.write(f'Final da Random Search: {final_random_search}\n')
            arquivo.write(f'Melhor resultado na Random Search: {score_random_search}\n')
            arquivo.write('Melhores parâmetros encontrados:\n')
            arquivo.write(str(model_trained.best_params_))
            arquivo.write('\n')
            arquivo.write(f'Acurácia predita = {acc_pred}%\n')
            arquivo.write(f'Classification report: \n')
            arquivo.write(report)
            arquivo.write('----------------------------------------------\n')

In [10]:
modelos = [MultinomialNB(), SVC(random_state=42)]
compara(1, modelos)

Modelo: MultinomialNB()
Vetorizador utilizado: TfidfVectorizer()
Começo da Random Search: 2023-11-22 19:37:09.814938


5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "D:\Downloads2\Anaconda\download\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\Downloads2\Anaconda\download\lib\site-packages\sklearn\pipeline.py", line 390, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "D:\Downloads2\Anaconda\download\lib\site-packages\sklearn\pipeline.py", line 348, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "D:\Downloads2\Anaconda\download\lib\site-packages\joblib\memory.py", line 349, in __call__
    return self.fun

ValueError: Found input variables with inconsistent numbers of samples: [1, 9370]