In [37]:
# == importando bibliotecas == 

import pandas as pd
import numpy as np
from tqdm import tqdm
import string
import textblob
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection, metrics
from sklearn import preprocessing
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from tensorflow import keras
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras import models, optimizers
# from tensorflow.keras.layers import LSTM, Convolution1D, GRU, Dense, Dropout, Input, Embedding, SpatialDropout1D, Bidirectional
from tensorflow.keras.layers import *
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings('ignore')

In [2]:
# # == download do modelo pré-treinado de word embedding == 

# inglês
# !wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz
# !gunzip cc.en.300.vec.gz
# !mv cc.en.300.vec ../../dataset/fasttext_word_embedding/en_word_embedding.vec

# português
# !wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pt.300.vec.gz
# !gunzip cc.pt.300.vec.gz
# !mv cc.pt.300.vec ../../fasttext_word_embedding/pt_word_embedding.vec

In [3]:
# == variáveis == 

path_pt = 'dados_treino_ingles'
path_en = 'dados_treino_pt_google_trad'
path = path_pt
word_embedding_en = 'en_word_embedding.vec'
word_embedding_pt = 'pt_word_embedding.vec'
word_embedding = word_embedding_pt

In [4]:
# == importar dados ==

df = pd.read_parquet(f'../../dataset/processed/artigos_de_partidos/artigos_partidos.parquet')

In [5]:
# == contagem de artigos por viés ==

df[df.Vies == 'direita'].Partido.value_counts() / df[df.Vies == 'direita'].shape[0]
df[df.Vies == 'esquerda'].Partido.value_counts() / df[df.Vies == 'esquerda'].shape[0]
df[df.Vies == 'centro'].Partido.value_counts() / df[df.Vies == 'centro'].shape[0]

PL              0.620759
Novo            0.219512
PP              0.107647
União Brasil    0.052081
Name: Partido, dtype: float64

PSTU     0.242835
PCB      0.238962
PCDoB    0.233153
PT       0.231216
Rede     0.048025
PSOL     0.005809
Name: Partido, dtype: float64

PSB    0.479065
PV     0.235998
MDB    0.163404
PDT    0.121533
Name: Partido, dtype: float64

In [6]:
# == contagem de artigos por viés ==

df[df.Vies == 'direita'].Partido.value_counts() / df.shape[0]
df[df.Vies == 'esquerda'].Partido.value_counts() / df.shape[0]
df[df.Vies == 'centro'].Partido.value_counts() / df.shape[0]

PL              0.288995
Novo            0.102194
PP              0.050115
União Brasil    0.024247
Name: Partido, dtype: float64

PSTU     0.053530
PCB      0.052677
PCDoB    0.051396
PT       0.050969
Rede     0.010587
PSOL     0.001281
Name: Partido, dtype: float64

PSB    0.150431
PV     0.074106
MDB    0.051311
PDT    0.038163
Name: Partido, dtype: float64

In [7]:
# == separar aproximadamente 5% de cada viés para usar como validação ==

train = df[~df.Partido.isin(['PP', 'PT', 'MDB'])]
valid = df[df.Partido.isin(['PP', 'PT', 'MDB'])]

In [9]:
# == train & test split ==

X_train, X_valid, y_train, y_valid = train.Conteudo, valid.Conteudo, train.Vies, valid.Vies

In [10]:
# == label encoding do rótulo == 

encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_valid = encoder.fit_transform(y_valid)

In [53]:
# == label das classes == 

encoder.classes_
encoder.inverse_transform([0,1,2])

array(['centro', 'direita', 'esquerda'], dtype=object)

array(['centro', 'direita', 'esquerda'], dtype=object)

In [11]:
# == count vectorizer ==

count_vect = CountVectorizer(
    analyzer='word', 
    token_pattern=r'\w{1,}'
)
count_vect.fit(df.Conteudo)
X_train_count =  count_vect.transform(X_train)
X_valid_count =  count_vect.transform(X_valid)

CountVectorizer(token_pattern='\\w{1,}')

In [12]:
# == word level tf-idf ==

tfidf_vect = TfidfVectorizer(
    analyzer='word', 
    max_features=100
)
tfidf_vect.fit(df.Conteudo)
X_train_tfidf = tfidf_vect.transform(X_train)
X_valid_tfidf = tfidf_vect.transform(X_valid)

TfidfVectorizer(max_features=100)

In [13]:
# == ngram level tf-idf ==

tfidf_vect_ngram = TfidfVectorizer(
    analyzer='word', 
    ngram_range=(1,3), 
    max_features=100
)
tfidf_vect_ngram.fit(df.Conteudo)
X_train_tfidf_ngram =  tfidf_vect_ngram.transform(X_train)
X_valid_tfidf_ngram =  tfidf_vect_ngram.transform(X_valid)

TfidfVectorizer(max_features=100, ngram_range=(1, 3))

In [14]:
# == characters level tf-idf ==

tfidf_vect_ngram_chars = TfidfVectorizer(
    analyzer='char', 
    ngram_range=(1,3), 
    max_features=100
)
tfidf_vect_ngram_chars.fit(df.Conteudo)
X_train_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(X_train) 
X_valid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(X_valid) 

TfidfVectorizer(analyzer='char', max_features=100, ngram_range=(1, 3))

In [17]:
# == fazer load do vetor pré-treinado de word embedding ==  

embedding_idx = {}
for i, line in tqdm(enumerate(open(f'../../dataset/fasttext_word_embedding/{word_embedding}'))):
    values = line.split()
    embedding_idx[values[0]] = np.asarray(values[1:] , dtype='float32')

2000001it [02:01, 16424.59it/s]


In [18]:
# == criar um tokenizador ==

token = text.Tokenizer()
token.fit_on_texts(df.Conteudo)
word_index = token.word_index

In [19]:
# == converter texto para sequência de tokens e preenchê-los para ter o mesmo tamanho == 

X_train_seq = sequence.pad_sequences(token.texts_to_sequences(X_train), maxlen=150)
X_valid_seq = sequence.pad_sequences(token.texts_to_sequences(X_valid), maxlen=150)

In [20]:
# == criar map de token-embedding ==

embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embedding_idx.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [54]:
# == método para trieinar o modelo == 

def train_model(model, X_train, y_train, X_valid, is_neural):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    res = pd.DataFrame(
        classification_report(
            y_pred, 
            y_valid, 
            digits=2,
            output_dict=True,
            labels=encoder.inverse_transform([0,1,2])
        )
    ).T
    res['support'] = res.support.apply(int)
    if is_neural:
        return res
    else:
        return res

In [57]:
# == logistic regression == 

model = LogisticRegression(solver='liblinear')

accuracy = train_model(
    model, 
    X_train_count, 
    y_train, 
    X_valid_count, 
    False
)
print ("LR, Count Vectors: ", accuracy)
accuracy.style.background_gradient(
        cmap='viridis',
        subset=pd.IndexSlice['centro':'esquerda', :'f1-score']
    )

accuracy = train_model(
    model, 
    X_train_tfidf, 
    y_train, 
    X_valid_tfidf, 
    False
)
print ("LR, WordLevel TF-IDF: ", accuracy)

accuracy = train_model(
    model, 
    X_train_tfidf_ngram, 
    y_train, 
    X_valid_tfidf_ngram, 
    False
)
print ("LR, N-Gram Vectors: ", accuracy)

accuracy = train_model(
    model, 
    X_train_tfidf_ngram_chars, 
    y_train, 
    X_valid_tfidf_ngram_chars, 
    False
)
print ("LR, CharLevel Vectors: ", accuracy)

TypeError: Encoders require their input to be uniformly strings or numbers. Got ['int', 'str']

In [28]:
# == support vector machine (svm)

accuracy = train_model(
    SVC(), 
    X_train_tfidf_ngram, 
    y_train, 
    X_valid_tfidf_ngram,
    False
)
print ("SVM, N-Gram Vectors: ", accuracy)

SVM, N-Gram Vectors:  0.6364145658263305


In [29]:
# == random forest classifier == 

accuracy = train_model(
    RandomForestClassifier(
        n_estimators=500
    ), 
    X_train_count, 
    y_train, 
    X_valid_count,
    False
)
print ("RF, Count Vectors: ", accuracy)

accuracy = train_model(
    RandomForestClassifier(
        n_estimators=500
    ), 
    X_train_tfidf, 
    y_train, 
    X_valid_tfidf,
    False
)
print ("RF, WordLevel TF-IDF: ", accuracy)

RF, Count Vectors:  0.661624649859944
RF, WordLevel TF-IDF:  0.6649859943977591


In [34]:
# == gradient boost == 

model = XGBClassifier(
    use_label_encoder=False, 
    eval_metric='mlogloss',
    learning_rate=0.01,
    n_estimators=500
)

accuracy = train_model(
    model, 
    X_train_count.tocsc(), 
    y_train, 
    X_valid_count.tocsc(), 
    False
)
print ("Xgb, Count Vectors: ", accuracy)

accuracy = train_model(
    model, 
    X_train_tfidf.tocsc(), 
    y_train, 
    X_valid_tfidf.tocsc(), 
    False
)
print ("Xgb, WordLevel TF-IDF: ", accuracy)

accuracy = train_model(
    model, 
    X_train_tfidf_ngram_chars.tocsc(), 
    y_train, 
    X_valid_tfidf_ngram_chars.tocsc(), 
    False
)
print ("Xgb, CharLevel Vectors: ", accuracy)

Xgb, Count Vectors:  0.5949579831932773
Xgb, WordLevel TF-IDF:  0.6431372549019608
Xgb, CharLevel Vectors:  0.6470588235294118


In [30]:
# == arquitetura lstm == 

def lstm():
    # limpar a sessão
    keras.backend.clear_session()
    # iniciar o modelo 
    model = keras.Sequential()
    # camada de entrada
    model.add(Input((150, )))
    # camada de word embedding
    model.add(Embedding(
        len(word_index) + 1, 
        300, 
        weights=[embedding_matrix], 
        trainable=True
    ))
    model.add(Convolution1D(
        128, 
        3, 
        activation="relu"
    ))
    model.add(GlobalMaxPool1D())
    # model.add(SpatialDropout1D(0.2))
    # camada LSTM
    model.add(Dropout(0.2))
    model.add(LSTM(128, return_sequences=True, activation='relu'))
    model.add(Dropout(0.2))
    model.add(LSTM(128))
    model.add(Dropout(0.2))
    # camadas de saída
    model.add(Dense(1, activation="sigmoid"))
    # compilar o modelo
    model.compile(optimizer=optimizers.Adam(), metrics='accuracy', loss='binary_crossentropy')
    model.summary()
    return model

accuracy = train_model(
    lstm(), 
    X_train_seq, 
    y_train, 
    X_valid_seq, 
    True
)
print ("RNN-LSTM, Word Embeddings",  accuracy)

ValueError: Input 0 of layer "lstm" is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (None, 128)

In [134]:
# == arquitetura recurrent convolutional neural network (RCNN) == 

def rcnn():
    # limpar a sessão
    keras.backend.clear_session()
    # iniciar o modelo 
    model = keras.Sequential()
    # camada de entrada 
    model.add(Input((150, )))
    # camada de word embedding 
    model.add(Embedding(
        len(word_index) + 1, 
        300, 
        weights=[embedding_matrix], 
        trainable=True
    ))
    model.add(SpatialDropout1D(0.3))
    # camada recurrent 
    model.add(Bidirectional(
        GRU(
            50, 
            return_sequences=True
        )
    ))
    # camada convolucional
    model.add(Convolution1D(
        100, 
        3, 
        activation="relu"
    ))
    # camada pooling 
    model.add(GlobalMaxPool1D())
    # camada denso 
    model.add(Dense(
        50, 
        activation="relu"
    ))
    model.add(Dropout(0.25))
    model.add(Dense(
        1, 
        activation="sigmoid"
    ))
    # model compile
    model.compile(
        optimizer=optimizers.Adam(), 
        loss='binary_crossentropy'
    )
    model.summary()
    return model

accuracy = train_model(
    rcnn(), 
    X_train_seq, 
    y_train, 
    X_valid_seq, 
    True
)
print ("CNN, Word Embeddings",  accuracy)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 150, 300)          46842600  
                                                                 
 spatial_dropout1d (SpatialD  (None, 150, 300)         0         
 ropout1D)                                                       
                                                                 
 bidirectional (Bidirectiona  (None, 150, 100)         105600    
 l)                                                              
                                                                 
 conv1d (Conv1D)             (None, 148, 100)          30100     
                                                                 
 global_max_pooling1d (Globa  (None, 100)              0         
 lMaxPooling1D)                                                  
                                                        