In [6]:
# == importando bibliotecas == 

import pandas as pd
import numpy as np
from tqdm import tqdm
import string
import textblob
from sklearn.liner_model import LogisticRegression
from sklearn import model_selection, metrics
from sklearn import preprocessing
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from tensorflow import keras
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras import layers, models, optimizers

2023-10-08 17:17:34.643063: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-10-08 17:17:41.072365: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64
2023-10-08 17:17:41.088303: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/loca

In [None]:
# # == download do modelo pré-treinado de word embedding == 

# !wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz
# !gunzip cc.en.300.vec.gz
# !mv cc.en.300.vec ../dataset/fasttest_word_embedding/en_word_embedding.vec

In [23]:
# == importar dados ==

df = pd.DataFrame()
for n in range(5):
    df = pd.concat([
        df,
        pd.read_parquet(f'../dataset/dados_treino_ingles/parte_{n+1}.parquet')
    ])

In [33]:
# == train & test split ==

X_train, X_valid, y_train, y_valid = model_selection.train_test_split(df.conteudo, df.rotulo)

In [68]:
# == label encoding do rótulo == 

encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_valid = encoder.fit_transform(y_valid)

In [26]:
# == count vectorizer ==

count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(df.conteudo)
X_train_count =  count_vect.transform(X_train)
X_valid_count =  count_vect.transform(X_valid)

In [27]:
# == word level tf-idf ==

tfidf_vect = TfidfVectorizer(analyzer='word', max_features=5000)
tfidf_vect.fit(df.conteudo)
X_train_tfidf = tfidf_vect.transform(X_train)
X_valid_tfidf = tfidf_vect.transform(X_valid)

In [28]:
# == ngram level tf-idf ==

tfidf_vect_ngram = TfidfVectorizer(analyzer='word', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(df.conteudo)
X_train_tfidf_ngram =  tfidf_vect_ngram.transform(X_train)
X_valid_tfidf_ngram =  tfidf_vect_ngram.transform(X_valid)

In [29]:
# == characters level tf-idf ==

tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(df.conteudo)
X_train_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(X_train) 
X_valid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(X_valid) 

In [None]:
# == fazer load do vetor pré-treinado de word embedding ==  

embedding_idx = {}
for i, line in tqdm(enumerate(open('../dataset/en_word_embedding.vec'))):
    values = line.split()
    embedding_idx[values[0]] = np.asarray(values[1:], dtype='float32')

254622it [00:17, 14528.16it/s]

In [19]:
# == criar um tokenizador ==

token = text.Tokenizer()
token.fit_on_texts(df.conteudo)
word_index = token.word_index

In [42]:
# == converter texto para sequência de tokens e preenchê-los para ter o mesmo tamanho == 

X_train_seq = sequence.pad_sequences(token.texts_to_sequences(X_train), maxlen=150)
X_valid_seq = sequence.pad_sequences(token.texts_to_sequences(X_valid), maxlen=150)

In [31]:
# == criar map de token-embedding ==

embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embedding_idx.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [64]:
# == método para trieinar o modelo == 

def train_model(model, X_train, y_train, X_valid, is_neural_net):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    if is_neural_net:
        y_pred = y_pred.argmax(axis=-1)
    return metrics.accuracy_score(y_pred, y_valid)

In [74]:
# == logistic regression == 

model = linear_model.LogisticRegression(solver='liblinear')

accuracy = train_model(model, X_train_count, y_train, X_valid_count, False)
print ("LR, Count Vectors: ", accuracy)

accuracy = train_model(model, X_train_tfidf, y_train, X_valid_tfidf, False)
print ("LR, WordLevel TF-IDF: ", accuracy)

accuracy = train_model(model, X_train_tfidf_ngram, y_train, X_valid_tfidf_ngram, False)
print ("LR, N-Gram Vectors: ", accuracy)

accuracy = train_model(model, X_train_tfidf_ngram_chars, y_train, X_valid_tfidf_ngram_chars, False)
print ("LR, CharLevel Vectors: ", accuracy)

LR, Count Vectors:  0.5117427075542259
LR, WordLevel TF-IDF:  0.5053103964098729
LR, N-Gram Vectors:  0.5136873597606582
LR, CharLevel Vectors:  0.49857890800299176


In [None]:
# == gradient boost == 

model = XGBClassifier(
    use_label_encoder=False, 
    eval_metric='mlogloss',
    learning_rate=0.01
)

accuracy = train_model(model, X_train_count.tocsc(), y_train, X_valid_count.tocsc(), False)
print ("Xgb, Count Vectors: ", accuracy)

accuracy = train_model(model, X_train_tfidf.tocsc(), y_train, X_valid_tfidf.tocsc(), False)
print ("Xgb, WordLevel TF-IDF: ", accuracy)

accuracy = train_model(model, X_train_tfidf_ngram_chars.tocsc(), y_train, X_valid_tfidf_ngram_chars.tocsc(), False)
print ("Xgb, CharLevel Vectors: ", accuracy)

Xgb, Count Vectors:  0.5228122662677637
Xgb, WordLevel TF-IDF:  0.5265519820493643


In [None]:
# == arquitetura lstm == 

def lstm():
    keras.backend.clear_session()
    # camada de entrada
    input_layer = layers.Input((150, ))
    # camada de word embedding
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)
    # camada LSTM
    lstm_layer = layers.LSTM(128, activation='relu')(embedding_layer)
    # camadas de saída
    output_layer1 = layers.Dense(50, activation="relu")(lstm_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)
    # compilar o modelo
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    return model

accuracy = train_model(lstm(), X_train_seq, y_train, X_valid_seq, True)
print ("RNN-LSTM, Word Embeddings",  accuracy)