In [68]:
# == importando bibliotecas == 

import pandas as pd
import numpy as np
from tqdm import tqdm
import string
import textblob
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection, metrics
from sklearn import preprocessing
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from tensorflow import keras
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras import models, optimizers
#from tensorflow.keras.layers import LSTM, Convolution1D, GRU, Dense, Dropout, Input, Embedding, SpatialDropout1D, Bidirectional
from tensorflow.keras.layers import *

In [17]:
# # == download do modelo pré-treinado de word embedding == 

# inglês
# !wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz
# !gunzip cc.en.300.vec.gz
# !mv cc.en.300.vec ../dataset/fasttext_word_embedding/en_word_embedding.vec

# português
# !wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pt.300.vec.gz
# !gunzip cc.pt.300.vec.gz
# !mv cc.pt.300.vec ../dataset/fasttext_word_embedding/pt_word_embedding.vec

In [None]:
# == variáveis == 

path_pt = 'dados_treino_ingles'
path_en = 'dados_treino_pt_google_trad'
path = path_en
word_embedding_en = 'en_word_embedding.vec'
word_embedding_pt = 'pt_word_embedding.vec'
word_embedding = word_embedding_en

In [4]:
# == importar dados ==

df = pd.DataFrame()
for n in range(5):
    df = pd.concat([
        df,
        pd.read_parquet(f'../dataset/{path}/parte_{n+1}.parquet')
    ])

In [5]:
# == train & test split ==

X_train, X_valid, y_train, y_valid = model_selection.train_test_split(
                                         df.conteudo, 
                                         df.rotulo
                                     )

In [6]:
# == label encoding do rótulo == 

encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_valid = encoder.fit_transform(y_valid)

In [7]:
# == count vectorizer ==

count_vect = CountVectorizer(
    analyzer='word', 
    token_pattern=r'\w{1,}'
)
count_vect.fit(df.conteudo)
X_train_count =  count_vect.transform(X_train)
X_valid_count =  count_vect.transform(X_valid)

In [8]:
# == word level tf-idf ==

tfidf_vect = TfidfVectorizer(
    analyzer='word', 
    max_features=100
)
tfidf_vect.fit(df.conteudo)
X_train_tfidf = tfidf_vect.transform(X_train)
X_valid_tfidf = tfidf_vect.transform(X_valid)

In [9]:
# == ngram level tf-idf ==

tfidf_vect_ngram = TfidfVectorizer(
    analyzer='word', 
    ngram_range=(1,3), 
    max_features=100
)
tfidf_vect_ngram.fit(df.conteudo)
X_train_tfidf_ngram =  tfidf_vect_ngram.transform(X_train)
X_valid_tfidf_ngram =  tfidf_vect_ngram.transform(X_valid)

In [10]:
# == characters level tf-idf ==

tfidf_vect_ngram_chars = TfidfVectorizer(
    analyzer='char', 
    ngram_range=(1,3), 
    max_features=100
)
tfidf_vect_ngram_chars.fit(df.conteudo)
X_train_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(X_train) 
X_valid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(X_valid) 

In [19]:
# == fazer load do vetor pré-treinado de word embedding ==  

embedding_idx = {}
for i, line in tqdm(enumerate(open(f'../dataset/fasttext_word_embedding/{word_embedding}.vec'))):
    values = line.split()
    embedding_idx[values[0]] = np.asarray(values[1:] , dtype='float32')

2000001it [01:42, 19573.23it/s]


In [20]:
# == criar um tokenizador ==

token = text.Tokenizer()
token.fit_on_texts(df.conteudo)
word_index = token.word_index

In [21]:
# == converter texto para sequência de tokens e preenchê-los para ter o mesmo tamanho == 

X_train_seq = sequence.pad_sequences(token.texts_to_sequences(X_train), maxlen=150)
X_valid_seq = sequence.pad_sequences(token.texts_to_sequences(X_valid), maxlen=150)

In [55]:
# == criar map de token-embedding ==

embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embedding_idx.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [23]:
# == método para trieinar o modelo == 

def train_model(model, X_train, y_train, X_valid, is_neural_net):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    if is_neural_net:
        y_pred = y_pred.argmax(axis=-1)
    return metrics.accuracy_score(y_pred, y_valid)

In [27]:
# == logistic regression == 

model = LogisticRegression(solver='liblinear')

accuracy = train_model(
    model, 
    X_train_count, 
    y_train, 
    X_valid_count, 
    False
)
print ("LR, Count Vectors: ", accuracy)

accuracy = train_model(
    model, 
    X_train_tfidf, 
    y_train, 
    X_valid_tfidf, 
    False
)
print ("LR, WordLevel TF-IDF: ", accuracy)

accuracy = train_model(
    model, 
    X_train_tfidf_ngram, 
    y_train, 
    X_valid_tfidf_ngram, 
    False
)
print ("LR, N-Gram Vectors: ", accuracy)

accuracy = train_model(
    model, 
    X_train_tfidf_ngram_chars, 
    y_train, 
    X_valid_tfidf_ngram_chars, 
    False
)
print ("LR, CharLevel Vectors: ", accuracy)

LR, Count Vectors:  0.7712789827973074
LR, WordLevel TF-IDF:  0.6448765893792072
LR, N-Gram Vectors:  0.6457741211667913
LR, CharLevel Vectors:  0.5949139865370232


In [60]:
# == support vector machine (svm)

accuracy = train_model(
    SVC(), 
    X_train_tfidf_ngram, 
    y_train, 
    X_valid_tfidf_ngram,
    False
)
print ("SVM, N-Gram Vectors: ", accuracy)

SVM, N-Gram Vectors:  0.6851159311892296


In [72]:
# == random forest classifier == 

accuracy = train_model(
    RandomForestClassifier(
        n_estimators=500
    ), 
    X_train_count, 
    y_train, 
    X_valid_count,
    False
)
print ("RF, Count Vectors: ", accuracy)

accuracy = train_model(
    RandomForestClassifier(
        n_estimators=500
    ), 
    X_train_tfidf, 
    y_train, 
    X_valid_tfidf,
    False
)
print ("RF, WordLevel TF-IDF: ", accuracy)

RF, Count Vectors:  0.7096484667165296
RF, WordLevel TF-IDF:  0.6830216903515333


In [71]:
# == gradient boost == 

model = XGBClassifier(
    use_label_encoder=False, 
    eval_metric='mlogloss',
    learning_rate=0.01,
    n_estimators=500
)

accuracy = train_model(
    model, 
    X_train_count.tocsc(), 
    y_train, 
    X_valid_count.tocsc(), 
    False
)
print ("Xgb, Count Vectors: ", accuracy)

accuracy = train_model(
    model, 
    X_train_tfidf.tocsc(), 
    y_train, 
    X_valid_tfidf.tocsc(), 
    False
)
print ("Xgb, WordLevel TF-IDF: ", accuracy)

accuracy = train_model(
    model, 
    X_train_tfidf_ngram_chars.tocsc(), 
    y_train, 
    X_valid_tfidf_ngram_chars.tocsc(), 
    False
)
print ("Xgb, CharLevel Vectors: ", accuracy)

Xgb, Count Vectors:  0.7842931937172775
Xgb, WordLevel TF-IDF:  0.6900523560209424
Xgb, CharLevel Vectors:  0.656095736724009


In [79]:
# == arquitetura lstm == 

def lstm():
    # limpar a sessão
    keras.backend.clear_session()
    # iniciar o modelo 
    model = keras.Sequential()
    # camada de entrada
    model.add(Input((150, )))
    # camada de word embedding
    model.add(Embedding(
        len(word_index) + 1, 
        300, 
        weights=[embedding_matrix], 
        trainable=True
    ))
    # model.add(SpatialDropout1D(0.3))
    # camada LSTM
    model.add(Dropout(0.2))
    model.add(LSTM(128, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(128, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(128))
    model.add(Dropout(0.2))
    # camadas de saída
    model.add(Dense(1, activation="sigmoid"))
    # compilar o modelo
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    model.summary()
    return model

accuracy = train_model(
    lstm(), 
    X_train_seq, 
    y_train, 
    X_valid_seq, 
    True
)
print ("RNN-LSTM, Word Embeddings",  accuracy)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 150, 300)          46842600  
                                                                 
 dropout (Dropout)           (None, 150, 300)          0         
                                                                 
 lstm (LSTM)                 (None, 150, 128)          219648    
                                                                 
 dropout_1 (Dropout)         (None, 150, 128)          0         
                                                                 
 lstm_1 (LSTM)               (None, 150, 128)          131584    
                                                                 
 dropout_2 (Dropout)         (None, 150, 128)          0         
                                                                 
 lstm_2 (LSTM)               (None, 128)               1

In [None]:
# == arquitetura recurrent convolutional neural network (RCNN) == 

def rcnn():
    # limpar a sessão
    keras.backend.clear_session()
    # iniciar o modelo 
    model = keras.Sequential()
    # camada de entrada 
    model.add(Input((150, )))
    # camada de word embedding 
    model.add(Embedding(
        len(word_index) + 1, 
        300, 
        weights=[embedding_matrix], 
        trainable=True
    ))
    model.add(SpatialDropout1D(0.3))
    # camada recurrent 
    model.add(Bidirectional(
        GRU(
            50, 
            return_sequences=True
        )
    ))
    # camada convolucional
    model.add(Convolution1D(
        100, 
        3, 
        activation="relu"
    ))
    # camada pooling 
    model.add(GlobalMaxPool1D())
    # camada denso 
    model.add(Dense(
        50, 
        activation="relu"
    ))
    model.add(Dropout(0.25))
    model.add(Dense(
        1, 
        activation="sigmoid"
    ))
    # model compile
    model.compile(
        optimizer=optimizers.Adam(), 
        loss='binary_crossentropy'
    )
    model.summary()
    return model

accuracy = train_model(
    rcnn(), 
    X_train_seq, 
    y_train, 
    X_valid_seq, 
    True
)
print ("CNN, Word Embeddings",  accuracy)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 150, 300)          46842600  
                                                                 
 spatial_dropout1d (SpatialD  (None, 150, 300)         0         
 ropout1D)                                                       
                                                                 
 bidirectional (Bidirectiona  (None, 150, 100)         105600    
 l)                                                              
                                                                 
 conv1d (Conv1D)             (None, 148, 100)          30100     
                                                                 
 global_max_pooling1d (Globa  (None, 100)              0         
 lMaxPooling1D)                                                  
                                                        