In [67]:
# == importando bibliotecas == 

import pandas as pd
import numpy as np
from tqdm import tqdm
import string
import textblob
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection, metrics
from sklearn import preprocessing
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from tensorflow import keras
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras import models, optimizers
# from tensorflow.keras.layers import LSTM, Convolution1D, GRU, Dense, Dropout, Input, Embedding, SpatialDropout1D, Bidirectional
from tensorflow.keras.layers import *
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings('ignore')

In [2]:
# # == download do modelo pré-treinado de word embedding == 

# inglês
# !wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz
# !gunzip cc.en.300.vec.gz
# !mv cc.en.300.vec ../../dataset/fasttext_word_embedding/en_word_embedding.vec

# português
# !wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pt.300.vec.gz
# !gunzip cc.pt.300.vec.gz
# !mv cc.pt.300.vec ../../fasttext_word_embedding/pt_word_embedding.vec

In [3]:
# == variáveis == 

path_pt = 'dados_treino_ingles'
path_en = 'dados_treino_pt_google_trad'
path = path_pt
word_embedding_en = 'en_word_embedding.vec'
word_embedding_pt = 'pt_word_embedding.vec'
word_embedding = word_embedding_pt

In [4]:
# == importar dados ==

df = pd.read_parquet(f'../../dataset/processed/artigos_de_partidos/artigos_partidos.parquet')

In [81]:
# == contagem de artigos por viés ==

df[df.Vies == 'direita'].Partido.value_counts() / df[df.Vies == 'direita'].shape[0]
df[df.Vies == 'esquerda'].Partido.value_counts() / df[df.Vies == 'esquerda'].shape[0]
df[df.Vies == 'centro'].Partido.value_counts() / df[df.Vies == 'centro'].shape[0]

'direita'

PL              0.620759
Novo            0.219512
PP              0.107647
União Brasil    0.052081
Name: Partido, dtype: float64

PSTU     0.242835
PCB      0.238962
PCDoB    0.233153
PT       0.231216
Rede     0.048025
PSOL     0.005809
Name: Partido, dtype: float64

PSB    0.479065
PV     0.235998
MDB    0.163404
PDT    0.121533
Name: Partido, dtype: float64

In [6]:
# == contagem de artigos por viés ==

df[df.Vies == 'direita'].Partido.value_counts() / df.shape[0]
df[df.Vies == 'esquerda'].Partido.value_counts() / df.shape[0]
df[df.Vies == 'centro'].Partido.value_counts() / df.shape[0]

PL              0.288995
Novo            0.102194
PP              0.050115
União Brasil    0.024247
Name: Partido, dtype: float64

PSTU     0.053530
PCB      0.052677
PCDoB    0.051396
PT       0.050969
Rede     0.010587
PSOL     0.001281
Name: Partido, dtype: float64

PSB    0.150431
PV     0.074106
MDB    0.051311
PDT    0.038163
Name: Partido, dtype: float64

In [7]:
# == separar aproximadamente 5% de cada viés para usar como validação ==

train = df[~df.Partido.isin(['PP', 'PT', 'MDB'])]
valid = df[df.Partido.isin(['PP', 'PT', 'MDB'])]

In [86]:
# == conjunto treino e teste == 

train[['Partido', 'Vies']].value_counts().to_frame()
valid[['Partido', 'Vies']].value_counts().to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Partido,Vies,Unnamed: 2_level_1
PL,direita,3385
PSB,centro,1762
Novo,direita,1197
PV,centro,868
PSTU,esquerda,627
PCB,esquerda,617
PCDoB,esquerda,602
PDT,centro,447
União Brasil,direita,284
Rede,esquerda,124


Unnamed: 0_level_0,Unnamed: 1_level_0,0
Partido,Vies,Unnamed: 2_level_1
MDB,centro,601
PT,esquerda,597
PP,direita,587


In [9]:
# == train & test split ==

X_train, X_valid, y_train, y_valid = train.Conteudo, valid.Conteudo, train.Vies, valid.Vies

In [10]:
# == label encoding do rótulo == 

encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_valid = encoder.fit_transform(y_valid)

In [53]:
# == label das classes == 

encoder.classes_
encoder.inverse_transform([0,1,2])

array(['centro', 'direita', 'esquerda'], dtype=object)

array(['centro', 'direita', 'esquerda'], dtype=object)

In [11]:
# == count vectorizer ==

count_vect = CountVectorizer(
    analyzer='word', 
    token_pattern=r'\w{1,}'
)
count_vect.fit(df.Conteudo)
X_train_count =  count_vect.transform(X_train)
X_valid_count =  count_vect.transform(X_valid)

CountVectorizer(token_pattern='\\w{1,}')

In [12]:
# == word level tf-idf ==

tfidf_vect = TfidfVectorizer(
    analyzer='word', 
    max_features=100
)
tfidf_vect.fit(df.Conteudo)
X_train_tfidf = tfidf_vect.transform(X_train)
X_valid_tfidf = tfidf_vect.transform(X_valid)

TfidfVectorizer(max_features=100)

In [13]:
# == ngram level tf-idf ==

tfidf_vect_ngram = TfidfVectorizer(
    analyzer='word', 
    ngram_range=(1,3), 
    max_features=100
)
tfidf_vect_ngram.fit(df.Conteudo)
X_train_tfidf_ngram =  tfidf_vect_ngram.transform(X_train)
X_valid_tfidf_ngram =  tfidf_vect_ngram.transform(X_valid)

TfidfVectorizer(max_features=100, ngram_range=(1, 3))

In [14]:
# == characters level tf-idf ==

tfidf_vect_ngram_chars = TfidfVectorizer(
    analyzer='char', 
    ngram_range=(1,3), 
    max_features=100
)
tfidf_vect_ngram_chars.fit(df.Conteudo)
X_train_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(X_train) 
X_valid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(X_valid) 

TfidfVectorizer(analyzer='char', max_features=100, ngram_range=(1, 3))

In [17]:
# == fazer load do vetor pré-treinado de word embedding ==  

embedding_idx = {}
for i, line in tqdm(enumerate(open(f'../../dataset/fasttext_word_embedding/{word_embedding}'))):
    values = line.split()
    embedding_idx[values[0]] = np.asarray(values[1:] , dtype='float32')

2000001it [02:01, 16424.59it/s]


In [18]:
# == criar um tokenizador ==

token = text.Tokenizer()
token.fit_on_texts(df.Conteudo)
word_index = token.word_index

In [19]:
# == converter texto para sequência de tokens e preenchê-los para ter o mesmo tamanho == 

X_train_seq = sequence.pad_sequences(token.texts_to_sequences(X_train), maxlen=150)
X_valid_seq = sequence.pad_sequences(token.texts_to_sequences(X_valid), maxlen=150)

In [20]:
# == criar map de token-embedding ==

embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embedding_idx.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [58]:
encoder.inverse_transform([0,1,2])

array(['centro', 'direita', 'esquerda'], dtype=object)

In [95]:
# == método para trieinar o modelo == 

def train_model(model, X_train, y_train, X_valid, is_neural):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    
    print(y_pred)
    if is_neural: 
        y_pred = np.argmax(y_pred, axis=1)
        
    cr = pd.DataFrame(
        classification_report(
            y_pred, 
            y_valid, 
            digits=2,
            output_dict=True,
            target_names=encoder.inverse_transform([0,1,2])
        )
    ).T
    cr['support'] = cr.support.apply(int)
    
    cm = pd.DataFrame(
        confusion_matrix(
            y_valid,
            y_pred
        )
    )
    cm.columns = ['centro', 'direita', 'esquerda']
    cm.index = ['centro', 'direita', 'esquerda']
    
    return cr, cm


In [78]:
# == logistic regression == 

model = LogisticRegression(solver='liblinear')

print('Logistic Regression, Count Vectors:')
cr, cm = train_model(
    model, 
    X_train_count, 
    y_train, 
    X_valid_count, 
    False
)
cm.style.background_gradient(
    cmap='viridis'
)
cr.style.background_gradient(
    cmap='viridis',
    subset=pd.IndexSlice[:, :'f1-score']
)

print ("Logistic Regression, WordLevel TF-IDF: ")
cr, cm = train_model(
    model, 
    X_train_tfidf, 
    y_train, 
    X_valid_tfidf, 
    False
)
cm.style.background_gradient(
    cmap='viridis'
)
cr.style.background_gradient(
    cmap='viridis',
    subset=pd.IndexSlice[:, :'f1-score']
)

print ("Logistic Regression, N-Gram Vectors: ")
cr, cm = train_model(
    model, 
    X_train_tfidf_ngram, 
    y_train, 
    X_valid_tfidf_ngram, 
    False
)
cm.style.background_gradient(
    cmap='viridis'
)
cr.style.background_gradient(
    cmap='viridis',
    subset=pd.IndexSlice[:, :'f1-score']
)

print ("Logistic Regression, CharLevel Vectors: ")
cr, cm = train_model(
    model, 
    X_train_tfidf_ngram_chars, 
    y_train, 
    X_valid_tfidf_ngram_chars, 
    False
)
cm.style.background_gradient(
    cmap='viridis'
)
cr.style.background_gradient(
    cmap='viridis',
    subset=pd.IndexSlice[:, :'f1-score']
)


Logistic Regression, Count Vectors:


Unnamed: 0,centro,direita,esquerda
centro,595,6,0
direita,6,578,3
esquerda,473,107,17


Unnamed: 0,precision,recall,f1-score,support
centro,0.990017,0.554004,0.710448,1074
direita,0.984668,0.836469,0.904538,691
esquerda,0.028476,0.85,0.055105,20
accuracy,0.666667,0.666667,0.666667,0
macro avg,0.66772,0.746824,0.556697,1785
weighted avg,0.977172,0.666667,0.77824,1785


Logistic Regression, WordLevel TF-IDF: 


Unnamed: 0,centro,direita,esquerda
centro,557,12,32
direita,28,517,42
esquerda,447,114,36


Unnamed: 0,precision,recall,f1-score,support
centro,0.926789,0.539729,0.68218,1032
direita,0.88075,0.804044,0.84065,643
esquerda,0.060302,0.327273,0.101839,110
accuracy,0.621849,0.621849,0.621849,0
macro avg,0.622613,0.557015,0.541556,1785
weighted avg,0.856807,0.621849,0.703502,1785


Logistic Regression, N-Gram Vectors: 


Unnamed: 0,centro,direita,esquerda
centro,556,12,33
direita,28,518,41
esquerda,451,112,34


Unnamed: 0,precision,recall,f1-score,support
centro,0.925125,0.537198,0.679707,1035
direita,0.882453,0.806854,0.842962,642
esquerda,0.056951,0.314815,0.096454,108
accuracy,0.620728,0.620728,0.620728,0
macro avg,0.62151,0.552955,0.539707,1785
weighted avg,0.857249,0.620728,0.703134,1785


Logistic Regression, CharLevel Vectors: 


Unnamed: 0,centro,direita,esquerda
centro,540,42,19
direita,6,578,3
esquerda,559,2,36


Unnamed: 0,precision,recall,f1-score,support
centro,0.898502,0.488688,0.63306,1105
direita,0.984668,0.92926,0.956162,622
esquerda,0.060302,0.62069,0.109924,58
accuracy,0.646499,0.646499,0.646499,0
macro avg,0.647824,0.679546,0.566382,1785
weighted avg,0.901292,0.646499,0.72865,1785


In [79]:
# == support vector machine (svm)

print ("Suport Vector Machine, N-Gram Vectors: ")
cr, cm = train_model(
    SVC(), 
    X_train_tfidf_ngram, 
    y_train, 
    X_valid_tfidf_ngram,
    False
)
cm.style.background_gradient(
    cmap='viridis'
)
cr.style.background_gradient(
    cmap='viridis',
    subset=pd.IndexSlice[:, :'f1-score']
)

Suport Vector Machine, N-Gram Vectors: 


Unnamed: 0,centro,direita,esquerda
centro,600,0,1
direita,42,525,20
esquerda,430,156,11


Unnamed: 0,precision,recall,f1-score,support
centro,0.998336,0.559701,0.717274,1072
direita,0.894378,0.770925,0.828076,681
esquerda,0.018425,0.34375,0.034976,32
accuracy,0.636415,0.636415,0.636415,0
macro avg,0.637047,0.558126,0.526775,1785
weighted avg,0.941108,0.636415,0.747315,1785


In [80]:
# == random forest classifier == 

print ("Random Forest, Count Vectors: ")
cr, cm = train_model(
    RandomForestClassifier(
        n_estimators=500
    ), 
    X_train_count, 
    y_train, 
    X_valid_count,
    False
)
cm.style.background_gradient(
    cmap='viridis'
)
cr.style.background_gradient(
    cmap='viridis',
    subset=pd.IndexSlice[:, :'f1-score']
)

print ("Random Forest, WordLevel TF-IDF: ")
cr, cm = train_model(
    RandomForestClassifier(
        n_estimators=500
    ), 
    X_train_tfidf, 
    y_train, 
    X_valid_tfidf,
    False
)
cm.style.background_gradient(
    cmap='viridis'
)
cr.style.background_gradient(
    cmap='viridis',
    subset=pd.IndexSlice[:, :'f1-score']
)

Random Forest, Count Vectors: 


Unnamed: 0,centro,direita,esquerda
centro,598,2,1
direita,0,584,3
esquerda,563,34,0


Unnamed: 0,precision,recall,f1-score,support
centro,0.995008,0.515073,0.678774,1161
direita,0.994889,0.941935,0.967688,620
esquerda,0.0,0.0,0.0,4
accuracy,0.662185,0.662185,0.662185,0
macro avg,0.663299,0.48567,0.548821,1785
weighted avg,0.992737,0.662185,0.777604,1785


Random Forest, WordLevel TF-IDF: 


Unnamed: 0,centro,direita,esquerda
centro,598,3,0
direita,2,580,5
esquerda,184,405,8


Unnamed: 0,precision,recall,f1-score,support
centro,0.995008,0.762755,0.863538,784
direita,0.988075,0.587045,0.736508,988
esquerda,0.0134,0.615385,0.02623,13
accuracy,0.664426,0.664426,0.664426,0
macro avg,0.665495,0.655061,0.542092,1785
weighted avg,0.984022,0.664426,0.787129,1785


In [87]:
# == gradient boost == 

model = XGBClassifier(
    use_label_encoder=False, 
    eval_metric='mlogloss',
    learning_rate=0.01,
    n_estimators=500
)

print ("XGBClassifier, Count Vectors: ")
cr, cm = train_model(
    model, 
    X_train_count.tocsc(), 
    y_train, 
    X_valid_count.tocsc(), 
    False
)
cm.style.background_gradient(
    cmap='viridis'
)
cr.style.background_gradient(
    cmap='viridis',
    subset=pd.IndexSlice[:, :'f1-score']
)

print ("XGBClassifier, WordLevel TF-IDF: ")
cr, cm = train_model(
    model, 
    X_train_tfidf.tocsc(), 
    y_train, 
    X_valid_tfidf.tocsc(), 
    False
)
cm.style.background_gradient(
    cmap='viridis'
)
cr.style.background_gradient(
    cmap='viridis',
    subset=pd.IndexSlice[:, :'f1-score']
)

print ("XGBClassifier, CharLevel Vectors: ")
cr, cm = train_model(
    model, 
    X_train_tfidf_ngram_chars.tocsc(), 
    y_train, 
    X_valid_tfidf_ngram_chars.tocsc(), 
    False
)
cm.style.background_gradient(
    cmap='viridis'
)
cr.style.background_gradient(
    cmap='viridis',
    subset=pd.IndexSlice[:, :'f1-score']
)


XGBClassifier, Count Vectors: 


Unnamed: 0,centro,direita,esquerda
centro,402,68,131
direita,20,560,7
esquerda,79,418,100


Unnamed: 0,precision,recall,f1-score,support
centro,0.668885,0.802395,0.729583,501
direita,0.954003,0.535373,0.685854,1046
esquerda,0.167504,0.420168,0.239521,238
accuracy,0.594958,0.594958,0.594958,0
macro avg,0.596798,0.585979,0.551653,1785
weighted avg,0.769112,0.594958,0.638616,1785


XGBClassifier, WordLevel TF-IDF: 


Unnamed: 0,centro,direita,esquerda
centro,579,1,21
direita,83,473,31
esquerda,43,458,96


Unnamed: 0,precision,recall,f1-score,support
centro,0.963394,0.821277,0.886677,705
direita,0.805792,0.507511,0.622778,932
esquerda,0.160804,0.648649,0.257718,148
accuracy,0.643137,0.643137,0.643137,0
macro avg,0.64333,0.659145,0.589058,1785
weighted avg,0.81456,0.643137,0.696739,1785


XGBClassifier, CharLevel Vectors: 


Unnamed: 0,centro,direita,esquerda
centro,573,18,10
direita,3,575,9
esquerda,582,8,7


Unnamed: 0,precision,recall,f1-score,support
centro,0.953411,0.494819,0.651507,1158
direita,0.979557,0.956739,0.968013,601
esquerda,0.011725,0.269231,0.022472,26
accuracy,0.647059,0.647059,0.647059,0
macro avg,0.648231,0.573596,0.547331,1785
weighted avg,0.948498,0.647059,0.74891,1785


In [89]:
# == arquitetura lstm == 

def lstm():
    # limpar a sessão
    keras.backend.clear_session()
    # iniciar o modelo 
    model = keras.Sequential()
    # camada de entrada
    model.add(Input((150, )))
    # camada de word embedding
    model.add(Embedding(
        len(word_index) + 1, 
        300, 
        weights=[embedding_matrix], 
        trainable=True
    ))
    model.add(Convolution1D(
        128, 
        3, 
        activation="relu"
    ))
    model.add(GlobalMaxPool1D())
    # model.add(SpatialDropout1D(0.2))
    # camada LSTM
    model.add(Dropout(0.2))
    model.add(LSTM(128, return_sequences=True, activation='relu'))
    model.add(Dropout(0.2))
    model.add(LSTM(128))
    model.add(Dropout(0.2))
    # camadas de saída
    model.add(Dense(1, activation="sigmoid"))
    # compilar o modelo
    model.compile(optimizer=optimizers.Adam(), metrics='accuracy', loss='binary_crossentropy')
    model.summary()
    return model

print ("RNN-LSTM, Word Embeddings")
cr, cm = train_model(
    lstm(), 
    X_train_seq, 
    y_train, 
    X_valid_seq, 
    True
)
cm.style.background_gradient(
    cmap='viridis'
)
cr.style.background_gradient(
    cmap='viridis',
    subset=pd.IndexSlice[:, :'f1-score']
)


RNN-LSTM, Word Embeddings


ValueError: Input 0 of layer "lstm" is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (None, 128)

In [98]:
# == arquitetura recurrent convolutional neural network (RCNN) == 

def rcnn():
    # limpar a sessão
    keras.backend.clear_session()
    # iniciar o modelo 
    model = keras.Sequential()
    # camada de entrada 
    model.add(Input((150, )))
    # camada de word embedding 
    model.add(Embedding(
        len(word_index) + 1, 
        300, 
        weights=[embedding_matrix], 
        trainable=True
    ))
    model.add(SpatialDropout1D(0.3))
    # camada recurrent 
    model.add(Bidirectional(
        GRU(
            50, 
            return_sequences=True
        )
    ))
    # camada convolucional
    model.add(Convolution1D(
        100, 
        3, 
        activation="relu"
    ))
    # camada pooling 
    model.add(GlobalMaxPool1D())
    # camada denso 
    model.add(Dense(
        50, 
        activation="relu"
    ))
    model.add(Dropout(0.25))
    model.add(Dense(
        1, 
        activation="sigmoid"
    ))
    # model compile
    model.compile(
        optimizer=optimizers.Adam(), 
        loss='binary_crossentropy'
    )
    model.summary()
    return model

print ("CNN, Word Embeddings")
cr, cm = train_model(
    rcnn(), 
    X_train_seq, 
    y_train, 
    X_valid_seq, 
    True
)
cm.style.background_gradient(
    cmap='viridis'
)
cr.style.background_gradient(
    cmap='viridis',
    subset=pd.IndexSlice[:, :'f1-score']
)


CNN, Word Embeddings
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 150, 300)          33610800  
                                                                 
 spatial_dropout1d (SpatialD  (None, 150, 300)         0         
 ropout1D)                                                       
                                                                 
 bidirectional (Bidirectiona  (None, 150, 100)         105600    
 l)                                                              
                                                                 
 conv1d (Conv1D)             (None, 148, 100)          30100     
                                                                 
 global_max_pooling1d (Globa  (None, 100)              0         
 lMaxPooling1D)                                                  
                                   

Unnamed: 0,centro,direita,esquerda
centro,601,0,0
direita,587,0,0
esquerda,597,0,0


Unnamed: 0,precision,recall,f1-score,support
centro,1.0,0.336695,0.503772,1785
direita,0.0,0.0,0.0,0
esquerda,0.0,0.0,0.0,0
accuracy,0.336695,0.336695,0.336695,0
macro avg,0.333333,0.112232,0.167924,1785
weighted avg,1.0,0.336695,0.503772,1785
