In [1]:
# == importando bibliotecas == 

import pandas as pd
import numpy as np
from tqdm import tqdm
import string
import textblob
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection, metrics
from sklearn import preprocessing
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from tensorflow import keras
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras import models, optimizers
# from tensorflow.keras.layers import LSTM, Convolution1D, GRU, Dense, Dropout, Input, Embedding, SpatialDropout1D, Bidirectional
from tensorflow.keras.layers import *
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings('ignore')

In [2]:
# # == download do modelo pré-treinado de word embedding == 

# inglês
# !wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz
# !gunzip cc.en.300.vec.gz
# !mv cc.en.300.vec ../../dataset/fasttext_word_embedding/en_word_embedding.vec

# português
# !wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pt.300.vec.gz
# !gunzip cc.pt.300.vec.gz
# !mv cc.pt.300.vec ../../fasttext_word_embedding/pt_word_embedding.vec

In [3]:
# == variáveis == 

path_pt = 'dados_treino_ingles'
path_en = 'dados_treino_pt_google_trad'
path = path_pt
word_embedding_en = 'en_word_embedding.vec'
word_embedding_pt = 'pt_word_embedding.vec'
word_embedding = word_embedding_pt

In [4]:
# == importar dados ==

df = pd.read_parquet(f'../../dataset/processed/artigos_tratados/artigos_tratados.parquet')

In [5]:
# == remover partidos de centro == 

df = df[df.Vies != 'centro']

In [6]:
# == contagem de artigos por viés ==

df[df.Vies == 'direita'].Partido.value_counts() / df[df.Vies == 'direita'].shape[0]
df[df.Vies == 'esquerda'].Partido.value_counts() / df[df.Vies == 'esquerda'].shape[0]

PL              0.613593
Novo            0.223720
PP              0.111090
União Brasil    0.051598
Name: Partido, dtype: float64

PSTU     0.252128
PCDoB    0.244021
PCB      0.225780
PT       0.224970
Rede     0.047021
PSOL     0.006080
Name: Partido, dtype: float64

In [7]:
# == contagem de artigos por viés ==

df[df.Vies == 'direita'].Partido.value_counts() / df.shape[0]
df[df.Vies == 'esquerda'].Partido.value_counts() / df.shape[0]

PL              0.416003
Novo            0.151677
PP              0.075317
União Brasil    0.034982
Name: Partido, dtype: float64

PSTU     0.081190
PCDoB    0.078580
PCB      0.072706
PT       0.072445
Rede     0.015142
PSOL     0.001958
Name: Partido, dtype: float64

In [9]:
# == separar aproximadamente 8% de cada viés para usar como validação ==

train = df[~df.Partido.isin(['PP', 'PCB'])]
valid = df[df.Partido.isin(['PP', 'PCB'])]

In [10]:
# == conjunto treino e teste == 

train[['Partido', 'Vies']].value_counts().to_frame()
valid[['Partido', 'Vies']].value_counts().to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Partido,Vies,Unnamed: 2_level_1
PL,direita,3385
Novo,direita,1197
PSTU,esquerda,627
PCDoB,esquerda,602
PT,esquerda,597
União Brasil,direita,284
Rede,esquerda,124
PSOL,esquerda,15


Unnamed: 0_level_0,Unnamed: 1_level_0,0
Partido,Vies,Unnamed: 2_level_1
PCB,esquerda,617
PP,direita,587


In [11]:
# == train & test split ==

X_train, X_valid, y_train, y_valid = train.Conteudo, valid.Conteudo, train.Vies, valid.Vies

In [12]:
# == label encoding do rótulo == 

encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_valid = encoder.fit_transform(y_valid)

In [14]:
# == label das classes == 

encoder.classes_
encoder.inverse_transform([0,1])

array(['direita', 'esquerda'], dtype=object)

array(['direita', 'esquerda'], dtype=object)

In [15]:
# == count vectorizer ==

count_vect = CountVectorizer(
    analyzer='word', 
    token_pattern=r'\w{1,}'
)
count_vect.fit(df.Conteudo)
X_train_count =  count_vect.transform(X_train)
X_valid_count =  count_vect.transform(X_valid)

CountVectorizer(token_pattern='\\w{1,}')

In [46]:
# == word level tf-idf ==

tfidf_vect = TfidfVectorizer(
    analyzer='word', 
    max_features=300
)
tfidf_vect.fit(df.Conteudo)
X_train_tfidf = tfidf_vect.transform(X_train)
X_valid_tfidf = tfidf_vect.transform(X_valid)

TfidfVectorizer(max_features=300)

In [17]:
# == ngram level tf-idf ==

tfidf_vect_ngram = TfidfVectorizer(
    analyzer='word', 
    ngram_range=(1,3), 
    max_features=100
)
tfidf_vect_ngram.fit(df.Conteudo)
X_train_tfidf_ngram =  tfidf_vect_ngram.transform(X_train)
X_valid_tfidf_ngram =  tfidf_vect_ngram.transform(X_valid)

TfidfVectorizer(max_features=100, ngram_range=(1, 3))

In [18]:
# == characters level tf-idf ==

tfidf_vect_ngram_chars = TfidfVectorizer(
    analyzer='char', 
    ngram_range=(1,3), 
    max_features=100
)
tfidf_vect_ngram_chars.fit(df.Conteudo)
X_train_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(X_train) 
X_valid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(X_valid) 

TfidfVectorizer(analyzer='char', max_features=100, ngram_range=(1, 3))

In [19]:
# == fazer load do vetor pré-treinado de word embedding ==  

embedding_idx = {}
for i, line in tqdm(enumerate(open(f'../../dataset/fasttext_word_embedding/{word_embedding}'))):
    values = line.split()
    embedding_idx[values[0]] = np.asarray(values[1:] , dtype='float32')

2000001it [01:45, 19013.31it/s]


In [20]:
# == criar um tokenizador ==

token = text.Tokenizer()
token.fit_on_texts(df.Conteudo)
word_index = token.word_index

In [21]:
# == converter texto para sequência de tokens e preenchê-los para ter o mesmo tamanho == 

X_train_seq = sequence.pad_sequences(token.texts_to_sequences(X_train), maxlen=150)
X_valid_seq = sequence.pad_sequences(token.texts_to_sequences(X_valid), maxlen=150)

In [22]:
# == criar map de token-embedding ==

embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embedding_idx.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [37]:
# == método para trieinar o modelo == 

def train_model(model, X_train, y_train, X_valid, is_neural, return_model = False):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    
    print(y_pred)
    if is_neural: 
        y_pred = np.round(y_pred)
        
    cr = pd.DataFrame(
        classification_report(
            y_pred, 
            y_valid, 
            digits=2,
            output_dict=True,
            target_names=encoder.inverse_transform([0,1])
        )
    ).T
    cr['support'] = cr.support.apply(int)
    
    cm = pd.DataFrame(
        confusion_matrix(
            y_valid,
            y_pred
        )
    )
    cm.columns = ['direita', 'esquerda']
    cm.index = ['direita', 'esquerda']
    
    if return_model:
        return cr, cm, model
    return cr, cm


In [28]:
# == logistic regression == 

model = LogisticRegression(solver='liblinear')

print('Logistic Regression, Count Vectors:')
cr, cm = train_model(
    model, 
    X_train_count, 
    y_train, 
    X_valid_count, 
    False
)
cm.style.background_gradient(
    cmap='viridis'
)
cr.style.background_gradient(
    cmap='viridis',
    subset=pd.IndexSlice[:, :'f1-score']
)

print ("Logistic Regression, WordLevel TF-IDF: ")
cr, cm = train_model(
    model, 
    X_train_tfidf, 
    y_train, 
    X_valid_tfidf, 
    False
)
cm.style.background_gradient(
    cmap='viridis'
)
cr.style.background_gradient(
    cmap='viridis',
    subset=pd.IndexSlice[:, :'f1-score']
)

print ("Logistic Regression, N-Gram Vectors: ")
cr, cm = train_model(
    model, 
    X_train_tfidf_ngram, 
    y_train, 
    X_valid_tfidf_ngram, 
    False
)
cm.style.background_gradient(
    cmap='viridis'
)
cr.style.background_gradient(
    cmap='viridis',
    subset=pd.IndexSlice[:, :'f1-score']
)

print ("Logistic Regression, CharLevel Vectors: ")
cr, cm = train_model(
    model, 
    X_train_tfidf_ngram_chars, 
    y_train, 
    X_valid_tfidf_ngram_chars, 
    False
)
cm.style.background_gradient(
    cmap='viridis'
)
cr.style.background_gradient(
    cmap='viridis',
    subset=pd.IndexSlice[:, :'f1-score']
)


Logistic Regression, Count Vectors:
[1 1 1 ... 0 0 0]


Unnamed: 0,direita,esquerda
direita,584,3
esquerda,129,488


Unnamed: 0,precision,recall,f1-score,support
direita,0.994889,0.819074,0.898462,713
esquerda,0.790924,0.99389,0.880866,491
accuracy,0.890365,0.890365,0.890365,0
macro avg,0.892907,0.906482,0.889664,1204
weighted avg,0.911711,0.890365,0.891286,1204


Logistic Regression, WordLevel TF-IDF: 
[1 1 1 ... 0 0 0]


Unnamed: 0,direita,esquerda
direita,576,11
esquerda,184,433


Unnamed: 0,precision,recall,f1-score,support
direita,0.981261,0.757895,0.855234,760
esquerda,0.701783,0.975225,0.816211,444
accuracy,0.83804,0.83804,0.83804,0
macro avg,0.841522,0.86656,0.835722,1204
weighted avg,0.878197,0.83804,0.840843,1204


Logistic Regression, N-Gram Vectors: 
[1 1 1 ... 0 0 0]


Unnamed: 0,direita,esquerda
direita,577,10
esquerda,188,429


Unnamed: 0,precision,recall,f1-score,support
direita,0.982964,0.754248,0.85355,765
esquerda,0.6953,0.977221,0.8125,439
accuracy,0.835548,0.835548,0.835548,0
macro avg,0.839132,0.865735,0.833025,1204
weighted avg,0.878077,0.835548,0.838583,1204


Logistic Regression, CharLevel Vectors: 
[0 0 0 ... 0 0 0]


Unnamed: 0,direita,esquerda
direita,586,1
esquerda,609,8


Unnamed: 0,precision,recall,f1-score,support
direita,0.998296,0.490377,0.657688,1195
esquerda,0.012966,0.888889,0.025559,9
accuracy,0.493355,0.493355,0.493355,0
macro avg,0.505631,0.689633,0.341624,1204
weighted avg,0.990931,0.493355,0.652963,1204


In [29]:
# == support vector machine (svm)

print ("Suport Vector Machine, N-Gram Vectors: ")
cr, cm = train_model(
    SVC(), 
    X_train_tfidf_ngram, 
    y_train, 
    X_valid_tfidf_ngram,
    False
)
cm.style.background_gradient(
    cmap='viridis'
)
cr.style.background_gradient(
    cmap='viridis',
    subset=pd.IndexSlice[:, :'f1-score']
)

Suport Vector Machine, N-Gram Vectors: 
[1 1 1 ... 0 0 0]


Unnamed: 0,direita,esquerda
direita,575,12
esquerda,138,479


Unnamed: 0,precision,recall,f1-score,support
direita,0.979557,0.806452,0.884615,713
esquerda,0.776337,0.97556,0.864621,491
accuracy,0.875415,0.875415,0.875415,0
macro avg,0.877947,0.891006,0.874618,1204
weighted avg,0.896682,0.875415,0.876462,1204


In [30]:
# == random forest classifier == 

print ("Random Forest, Count Vectors: ")
cr, cm = train_model(
    RandomForestClassifier(
        n_estimators=500
    ), 
    X_train_count, 
    y_train, 
    X_valid_count,
    False
)
cm.style.background_gradient(
    cmap='viridis'
)
cr.style.background_gradient(
    cmap='viridis',
    subset=pd.IndexSlice[:, :'f1-score']
)

print ("Random Forest, WordLevel TF-IDF: ")
cr, cm = train_model(
    RandomForestClassifier(
        n_estimators=500
    ), 
    X_train_tfidf, 
    y_train, 
    X_valid_tfidf,
    False
)
cm.style.background_gradient(
    cmap='viridis'
)
cr.style.background_gradient(
    cmap='viridis',
    subset=pd.IndexSlice[:, :'f1-score']
)

Random Forest, Count Vectors: 
[0 1 1 ... 0 0 0]


Unnamed: 0,direita,esquerda
direita,585,2
esquerda,299,318


Unnamed: 0,precision,recall,f1-score,support
direita,0.996593,0.661765,0.795377,884
esquerda,0.515397,0.99375,0.678762,320
accuracy,0.75,0.75,0.75,0
macro avg,0.755995,0.827757,0.73707,1204
weighted avg,0.8687,0.75,0.764383,1204


Random Forest, WordLevel TF-IDF: 
[1 1 1 ... 0 0 0]


Unnamed: 0,direita,esquerda
direita,584,3
esquerda,204,413


Unnamed: 0,precision,recall,f1-score,support
direita,0.994889,0.741117,0.849455,788
esquerda,0.669368,0.992788,0.799613,416
accuracy,0.828073,0.828073,0.828073,0
macro avg,0.832129,0.866953,0.824534,1204
weighted avg,0.882417,0.828073,0.832233,1204


In [31]:
# == gradient boost == 

model = XGBClassifier(
    use_label_encoder=False, 
    eval_metric='mlogloss',
    learning_rate=0.01,
    n_estimators=500
)

print ("XGBClassifier, Count Vectors: ")
cr, cm = train_model(
    model, 
    X_train_count.tocsc(), 
    y_train, 
    X_valid_count.tocsc(), 
    False
)
cm.style.background_gradient(
    cmap='viridis'
)
cr.style.background_gradient(
    cmap='viridis',
    subset=pd.IndexSlice[:, :'f1-score']
)

print ("XGBClassifier, WordLevel TF-IDF: ")
cr, cm = train_model(
    model, 
    X_train_tfidf.tocsc(), 
    y_train, 
    X_valid_tfidf.tocsc(), 
    False
)
cm.style.background_gradient(
    cmap='viridis'
)
cr.style.background_gradient(
    cmap='viridis',
    subset=pd.IndexSlice[:, :'f1-score']
)

print ("XGBClassifier, CharLevel Vectors: ")
cr, cm = train_model(
    model, 
    X_train_tfidf_ngram_chars.tocsc(), 
    y_train, 
    X_valid_tfidf_ngram_chars.tocsc(), 
    False
)
cm.style.background_gradient(
    cmap='viridis'
)
cr.style.background_gradient(
    cmap='viridis',
    subset=pd.IndexSlice[:, :'f1-score']
)


XGBClassifier, Count Vectors: 
[1 1 1 ... 0 0 0]


Unnamed: 0,direita,esquerda
direita,580,7
esquerda,209,408


Unnamed: 0,precision,recall,f1-score,support
direita,0.988075,0.735108,0.843023,789
esquerda,0.661264,0.983133,0.790698,415
accuracy,0.820598,0.820598,0.820598,0
macro avg,0.82467,0.85912,0.81686,1204
weighted avg,0.875428,0.820598,0.824987,1204


XGBClassifier, WordLevel TF-IDF: 
[1 1 1 ... 0 0 0]


Unnamed: 0,direita,esquerda
direita,563,24
esquerda,113,504


Unnamed: 0,precision,recall,f1-score,support
direita,0.959114,0.83284,0.891528,676
esquerda,0.816856,0.954545,0.880349,528
accuracy,0.886213,0.886213,0.886213,0
macro avg,0.887985,0.893693,0.885939,1204
weighted avg,0.896728,0.886213,0.886626,1204


XGBClassifier, CharLevel Vectors: 
[1 0 0 ... 0 0 0]


Unnamed: 0,direita,esquerda
direita,587,0
esquerda,521,96


Unnamed: 0,precision,recall,f1-score,support
direita,1.0,0.529783,0.692625,1108
esquerda,0.155592,1.0,0.269285,96
accuracy,0.567276,0.567276,0.567276,0
macro avg,0.577796,0.764892,0.480955,1204
weighted avg,0.932672,0.567276,0.658871,1204


In [47]:
# == testando esse modelo e obtendo FI == 

print ("XGBClassifier, WordLevel TF-IDF: ")
cr, cm, model = train_model(
    model, 
    X_train_tfidf.tocsc(), 
    y_train, 
    X_valid_tfidf.tocsc(), 
    False,
    True
)
cm.style.background_gradient(
    cmap='viridis'
)
cr.style.background_gradient(
    cmap='viridis',
    subset=pd.IndexSlice[:, :'f1-score']
)

XGBClassifier, WordLevel TF-IDF: 
[1 1 1 ... 0 0 0]


Unnamed: 0,direita,esquerda
direita,565,22
esquerda,108,509


Unnamed: 0,precision,recall,f1-score,support
direita,0.962521,0.839525,0.896825,673
esquerda,0.824959,0.958569,0.88676,531
accuracy,0.892027,0.892027,0.892027,0
macro avg,0.89374,0.899047,0.891792,1204
weighted avg,0.901852,0.892027,0.892386,1204


In [54]:
sorted_idx = np.argsort(-model.feature_importances_)
feature_name = [tfidf_vect.get_feature_names()[i] for i in sorted_idx]
feature_imp = [round(x * 100, 1) for x in model.feature_importances_[sorted_idx]]

In [58]:
fi_dict = {
    'feature': feature_name,
    'importance': feature_imp
}
pd.DataFrame(fi_dict).to_excel('../../tmp/feature_imp.xlsx')

In [89]:
# == arquitetura lstm == 

def lstm():
    # limpar a sessão
    keras.backend.clear_session()
    # iniciar o modelo 
    model = keras.Sequential()
    # camada de entrada
    model.add(Input((150, )))
    # camada de word embedding
    model.add(Embedding(
        len(word_index) + 1, 
        300, 
        weights=[embedding_matrix], 
        trainable=True
    ))
    model.add(Convolution1D(
        128, 
        3, 
        activation="relu"
    ))
    model.add(GlobalMaxPool1D())
    # model.add(SpatialDropout1D(0.2))
    # camada LSTM
    model.add(Dropout(0.2))
    model.add(LSTM(128, return_sequences=True, activation='relu'))
    model.add(Dropout(0.2))
    model.add(LSTM(128))
    model.add(Dropout(0.2))
    # camadas de saída
    model.add(Dense(1, activation="sigmoid"))
    # compilar o modelo
    model.compile(optimizer=optimizers.Adam(), metrics='accuracy', loss='binary_crossentropy')
    model.summary()
    return model

print ("RNN-LSTM, Word Embeddings")
cr, cm = train_model(
    lstm(), 
    X_train_seq, 
    y_train, 
    X_valid_seq, 
    True
)
cm.style.background_gradient(
    cmap='viridis'
)
cr.style.background_gradient(
    cmap='viridis',
    subset=pd.IndexSlice[:, :'f1-score']
)


RNN-LSTM, Word Embeddings


ValueError: Input 0 of layer "lstm" is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (None, 128)

In [34]:
# == arquitetura recurrent convolutional neural network (RCNN) == 

def rcnn():
    # limpar a sessão
    keras.backend.clear_session()
    # iniciar o modelo 
    model = keras.Sequential()
    # camada de entrada 
    model.add(Input((150, )))
    # camada de word embedding 
    model.add(Embedding(
        len(word_index) + 1, 
        300, 
        weights=[embedding_matrix], 
        trainable=True
    ))
    model.add(SpatialDropout1D(0.3))
    # camada recurrent 
    model.add(Bidirectional(
        GRU(
            50, 
            return_sequences=True
        )
    ))
    # camada convolucional
    model.add(Convolution1D(
        100, 
        3, 
        activation="relu"
    ))
    # camada pooling 
    model.add(GlobalMaxPool1D())
    # camada denso 
    model.add(Dense(
        50, 
        activation="relu"
    ))
    model.add(Dropout(0.25))
    model.add(Dense(
        1, 
        activation="sigmoid"
    ))
    # model compile
    model.compile(
        optimizer=optimizers.Adam(), 
        loss='binary_crossentropy'
    )
    model.summary()
    return model

print ("CNN, Word Embeddings")
cr, cm = train_model(
    rcnn(), 
    X_train_seq, 
    y_train, 
    X_valid_seq, 
    True
)
cm.style.background_gradient(
    cmap='viridis'
)
cr.style.background_gradient(
    cmap='viridis',
    subset=pd.IndexSlice[:, :'f1-score']
)


CNN, Word Embeddings
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 150, 300)          24552600  
                                                                 
 spatial_dropout1d (SpatialD  (None, 150, 300)         0         
 ropout1D)                                                       
                                                                 
 bidirectional (Bidirectiona  (None, 150, 100)         105600    
 l)                                                              
                                                                 
 conv1d (Conv1D)             (None, 148, 100)          30100     
                                                                 
 global_max_pooling1d (Globa  (None, 100)              0         
 lMaxPooling1D)                                                  
                                   

Unnamed: 0,direita,esquerda
direita,587,0
esquerda,103,514


Unnamed: 0,precision,recall,f1-score,support
direita,1.0,0.850725,0.919342,690
esquerda,0.833063,1.0,0.90893,514
accuracy,0.914452,0.914452,0.914452,0
macro avg,0.916532,0.925362,0.914136,1204
weighted avg,0.928733,0.914452,0.914897,1204
