# Predição Final

In [1]:
import pandas as pd 

from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime
from scipy.stats import uniform

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
#from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score


from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn import set_config
set_config(display='diagram')
import datetime
import time



In [3]:
data = pd.read_parquet('../dataset/processed/artigos_tratados/bertimbau/artigos_tratados_bert_lg.parquet')

In [4]:
# remocao de dados nulos
data = data[data['Conteudo'] != '']

In [5]:
# remocao de colunas desnecessarias
rem_cols = ['Conteudo', 'URL']
data.drop(rem_cols, axis=1, inplace=True)

In [6]:
# conversao dos rotulos categoricos para numericos
data['Vies'] = data['Vies'].map({'direita':2,
                                'centro': 1,
                                'esquerda': 0})

In [7]:
# a seguir os dados serão divididos entre features (X) e label (y)

X_columns = [column for column in data.columns if column != 'Vies']
X = data[X_columns] # features
X.head() 

Unnamed: 0,Partido,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,emb_9,...,emb_1015,emb_1016,emb_1017,emb_1018,emb_1019,emb_1020,emb_1021,emb_1022,emb_1023,emb_1024
0,Novo,0.401686,-0.142705,0.023679,0.192193,0.323548,-0.067125,-0.598261,-0.355111,0.14975,...,0.186607,-0.147967,-0.487217,0.110254,-0.125077,-0.159934,0.125114,0.134895,-0.527953,0.196096
1,Novo,-0.128541,0.215314,-0.069348,0.088915,0.408865,-0.052515,-0.267552,-0.15783,-0.072159,...,0.552647,0.091742,-0.605148,0.11292,0.09964,-0.174978,0.192274,0.250751,-0.157322,0.310022
2,Novo,0.016604,0.137099,-0.1135,0.109841,0.290769,0.015612,-0.27526,-0.178999,-0.17713,...,0.497988,0.113761,-0.157607,0.001557,0.053836,0.133553,-0.021814,0.095863,-0.13713,0.318082
3,Novo,-0.032087,-0.067949,-0.03627,0.208884,-0.090851,0.005983,-0.093461,-0.463273,-0.08833,...,0.601618,0.132886,-0.403809,0.213245,-0.007711,-0.157867,-0.053459,0.401732,-0.279196,0.16506
4,Novo,0.181898,0.011968,-0.062858,0.162305,0.247084,0.055331,-0.51853,-0.171481,0.040109,...,0.447449,-0.02097,-0.419976,0.198067,-0.142836,-0.334448,-0.083704,0.290215,-0.176763,0.452248


In [8]:
y = data['Vies'] # label
y.head()

0    2
1    2
2    2
3    2
4    2
Name: Vies, dtype: int64

In [None]:
X_train_strat_vies, X_test_strat_vies, y_train_strat_vies, y_test_strat_vies = train_test_split(X, y,
                                                                                                test_size=0.2,
                                                                                                random_state=42,
                                                                                                stratify=y)

X_train_strat_vies.drop('Partido', axis=1, inplace=True) # remocao da coluna partido
X_test_strat_vies.drop('Partido', axis=1, inplace=True) # remocao da coluna partido

In [None]:
X_train_strat_part, X_test_strat_part, y_train_strat_part, y_test_strat_part = train_test_split(X, y,
                                                                                                test_size=0.2,
                                                                                                random_state=42,
                                                                                                stratify=data['Partido'])

X_train_strat_part.drop('Partido', axis=1, inplace=True) # remocao da coluna partido
X_test_strat_part.drop('Partido', axis=1, inplace=True) # remocao da coluna partido

In [None]:
part_teste = ['PSTU', 'PV', 'Novo'] # partidos do conjunto de teste

test = data[data['Partido'].isin(part_teste)].copy() # selecao dos dados de teste
test.drop('Partido', axis=1, inplace=True) # remocao da coluna partido

train = data[~data['Partido'].isin(part_teste)].copy() # selecao dos dados de treino
train.drop('Partido', axis=1, inplace=True) # remocao da coluna partido

In [None]:
X_train_part_novos = train.drop('Vies', axis=1) # X_train
y_train_part_novos = train['Vies'] # y_train

X_test_part_novos = test.drop('Vies', axis=1) # X_test
y_test_part_novos = test['Vies'] # y_test

In [11]:
best_params = {'selection__k': 800,
               'estimator__lambda': 0,
               'estimator__gamma': 0,
               'estimator__colsample_bytree': 1,
               'estimator__alpha': 22}

In [12]:
pipeline = Pipeline([
                        ('scaling', MaxAbsScaler()), 
                        ('selection', SelectKBest()),
                        ('ros', RandomOverSampler(random_state=42)),
                        ('estimator', XGBClassifier(seed=42, tree_method='gpu_hist', gpu_id=0))
                        ])
    
best_xgb = pipeline.set_params(**best_params)

In [13]:
best_xgb.get_params

<bound method Pipeline.get_params of Pipeline(steps=[('scaling', MaxAbsScaler()), ('selection', SelectKBest(k=800)),
                ('ros', RandomOverSampler(random_state=42)),
                ('estimator',
                 XGBClassifier(alpha=22, base_score=None, booster=None,
                               callbacks=None, colsample_bylevel=None,
                               colsample_bynode=None, colsample_bytree=1,
                               early_stopping_rounds=None,
                               enable_categorical=False, eval_metric=None,
                               feature_types=None, gamma=0, gpu_id=0,
                               grow_policy=None, importance_type=None,
                               interaction_constraints=None, lambda=0,
                               learning_rate=None, max_bin=None,
                               max_cat_threshold=None, max_cat_to_onehot=None,
                               max_delta_step=None, max_depth=None,
                 

In [15]:
best_xgb.fit(X.drop('Partido', axis=1), y)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(data):


In [23]:
novas_noticias = pd.read_csv('noticias-teste-embedding.csv')
novas_noticias.drop(['Unnamed: 0', 'Conteudo'], axis=1, inplace=True)
novas_noticias

Unnamed: 0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,emb_9,emb_10,...,emb_1015,emb_1016,emb_1017,emb_1018,emb_1019,emb_1020,emb_1021,emb_1022,emb_1023,emb_1024
0,0.177256,-0.246991,0.087106,0.224288,0.162842,-0.261285,-0.354171,-0.329556,-0.042913,0.22194,...,0.237059,0.029648,-0.535245,-0.173133,-0.050813,-0.389454,0.441995,0.096808,-0.216968,0.300969
1,0.188755,-0.23697,0.26489,0.210608,-0.04907,-0.323651,-0.560016,-0.170308,0.03044,0.160717,...,0.16515,0.010903,-0.423455,-0.169438,-0.006396,-0.319956,0.625356,0.090047,-0.179706,0.056563
2,0.339368,-0.105536,0.235968,0.152529,0.164733,-0.254474,-0.333641,-0.31716,0.050852,0.216817,...,0.24004,-0.079102,-0.451281,-0.070833,0.014464,-0.143781,0.436501,-0.048927,0.018316,0.124147
3,0.336466,-0.262542,0.326849,0.218233,-0.019247,-0.266457,-0.444466,-0.092072,0.032738,0.223704,...,0.369901,-0.11204,-0.526602,-0.055398,0.039789,-0.476769,0.351495,0.198413,0.090005,0.069517
4,0.320413,-0.237129,0.442062,0.159104,-0.074181,-0.106775,-0.570315,-0.043499,0.011053,0.187507,...,0.336812,-0.094797,-0.526176,0.075011,-0.15833,-0.192057,0.340394,0.141768,-0.139985,0.31252
5,0.14061,-0.295143,0.277217,0.131121,0.110425,-0.179603,-0.639545,-0.218153,0.016547,0.125998,...,0.357463,-0.365226,-0.441883,0.008965,-0.277259,-0.289251,0.306093,0.002809,-0.019155,0.237498
6,0.1904,-0.210374,0.312346,0.308537,0.177782,-0.17949,-0.458267,-0.100351,0.052892,0.054652,...,0.225507,-0.022758,-0.503361,0.109931,-0.063078,-0.254466,0.205095,0.206967,-0.192988,0.324998
7,0.166542,-0.326735,0.206906,0.286478,0.130018,-0.261664,-0.526148,-0.090842,0.113488,0.04632,...,0.409236,-0.038461,-0.49583,-0.058001,-0.09696,-0.320547,0.273821,0.215213,-0.179342,0.408732
8,0.172165,-0.265606,0.262698,0.111486,0.060641,-0.178284,-0.464203,-0.243654,0.020358,0.341577,...,0.238188,0.053228,-0.653927,0.067177,-0.084639,-0.411395,0.412065,0.210706,-0.168072,0.123498
9,0.175475,-0.1616,0.165423,0.326432,0.035058,-0.228445,-0.537136,-0.082007,0.011754,0.324337,...,0.265662,-0.221337,-0.526943,0.008523,-0.057346,-0.390026,0.413752,0.152417,-0.139329,0.277802


In [24]:
pred = best_xgb.predict(novas_noticias)
pred

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


array([1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1],
      dtype=int64)

In [None]:
data['Vies'] = data['Vies'].map({'direita':2,
                                'centro': 1,
                                'esquerda': 0})