In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf

import spacy
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import re,string,unicodedata
import pickle

from tqdm import tqdm
import seaborn as sns

import gensim

In [None]:
# Record the TensorFlow version in the notebook output.
print(tf.__version__)

In [None]:
# Load articles.csv (UTF-8) into the working DataFrame.
# NOTE(review): the absolute /kaggle/input path presumably only resolves inside a Kaggle kernel with this dataset attached.
df=pd.read_csv("/kaggle/input/news-of-the-site-folhauol/articles.csv",encoding="utf8")

In [None]:
# Column labels of the freshly loaded frame.
df.columns

In [None]:
# Peek at the first three rows.
df.head(3)

In [None]:
# (rows, columns) of the freshly loaded frame.
df.shape

In [None]:
# Show the title and body of the article stored under row label 0.
print(df.loc[0, "title"], "\n", df.loc[0, "text"])

In [None]:
# Clean the frame step by step, reporting the row count after each filter.
print('Linhas:',len(df))

# Keep only the first occurrence of each article body.
df = df.drop_duplicates(subset=["text"])
print('Removendo duplicadas em "text":',len(df))

# Discard the three columns that are not used further down.
df = df.drop(columns=['subcategory','link','date'])
print('Removendo subcategory')

# Drop every row that still has a missing value in any remaining column.
df = df.dropna()
print('Removendo nulos:',len(df))

# Renumber the rows 0..n-1 so that label lookups such as df["text"][0] work again.
df = df.reset_index(drop=True)
print('Resetando index')
df.head()

In [None]:
# Tokens to discard: NLTK's Portuguese stopword list plus every character of string.punctuation.
punctuation = list(string.punctuation)
stop = set(stopwords.words('portuguese')).union(punctuation)

In [None]:
# Removing URL's
def remove_urls(text):
    """Return ``text`` with every ``http`` + non-whitespace run deleted.

    Surrounding whitespace is left untouched, so a removed URL in the middle
    of a sentence leaves two adjacent spaces behind.
    """
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', text)

#Removing the stopwords from text
def remove_stopwords(text):
    """Drop whitespace-separated tokens whose lowercase form is in the module-level ``stop`` set.

    Surviving tokens keep their original casing and are re-joined with single spaces.
    """
    # str.split() with no argument already yields tokens free of surrounding whitespace.
    kept_tokens = [token for token in text.split() if token.lower() not in stop]
    return " ".join(kept_tokens)

# https://stackoverflow.com/questions/34293875/how-to-remove-punctuation-marks-from-a-string-in-python-3-x-using-translate/34294022
def remove_punct(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted.

    Characters are removed, not replaced, so words joined only by punctuation
    end up concatenated.
    """
    # A translation table that maps a code point to None deletes that character.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)

#Removing the noisy text
def denoise_text(text):
    """Clean one article body: strip URLs, then punctuation, then stopwords.

    Args:
        text: raw article text.

    Returns:
        The cleaned text, tokens separated by single spaces.

    The order matters:
    * URLs go first, while their ``://`` and ``/`` are still intact for the
      ``http\\S+`` pattern to match.
    * Punctuation goes before stopwords; otherwise a stopword glued to
      punctuation (e.g. ``"que,"``) is not found in the stop set and survives
      as ``"que"`` once the comma is stripped.
    """
    text = remove_urls(text)
    text = remove_punct(text)
    text = remove_stopwords(text)
    return text

# Replace every article body in the 'text' column with its cleaned version.
df['text'] = df['text'].map(denoise_text)

In [None]:
# Show the title and the cleaned body of the article stored under row label 0.
print(df.loc[0, "title"], "\n", df.loc[0, "text"])

In [None]:
# First three rows after text cleaning.
df.head(3)

In [None]:
# Horizontal bar chart with the number of articles per category.
sns.set_style("dark")
sns.countplot(y='category', data=df)

In [None]:
# Count how often each category occurs (most frequent first)
categoria_counts = df['category'].value_counts()

# Labels of the 7 most frequent categories
top_7_categorias = categoria_counts.head(7).index

# Every row whose category is not among those 7 is relabelled as 'outros'
is_top_category = df['category'].isin(top_7_categorias)
df['category'] = df['category'].where(is_top_category, 'outros')

In [None]:
# Same per-category bar chart, now after the rare categories were merged into 'outros'.
sns.set_style("dark")
sns.countplot(y='category', data=df)

In [None]:
# Columns remaining after cleaning.
df.columns

In [None]:
# (rows, columns) after cleaning.
df.shape

In [None]:
# Number of whitespace-separated words in each text
word_counts = list(map(lambda text: len(str(text).split()), df['text']))

# Largest and smallest word count across all texts
max_words, min_words = max(word_counts), min(word_counts)

print('Número máximo de palavras:', max_words)
print('Número mínimo de palavras:', min_words)

## WORD2VEC MODEL USING GENSIM

In [None]:
# Make sure the rows are labelled 0..n-1 before walking over the texts.
df.reset_index(drop=True, inplace=True)

# One token list per article: lowercase the text, run NLTK's word_tokenize,
# and keep only tokens longer than two characters.
articles_tokens = [
    [token for token in word_tokenize(str(article.lower())) if len(token) > 2]
    for article in df["text"]
]

In [None]:
# First ten tokens of the first article.
articles_tokens[0][0:10]

In [None]:
import os

# Cache switches: `save` writes the in-memory token lists to disk,
# `load` replaces them with the copy stored on disk.
save, load =  False, True
TOKENS_CACHE = 'articles_tokens.pkl'

if save:
    # Persist articles_tokens as a pickle file
    with open(TOKENS_CACHE, 'wb') as f:
        pickle.dump(articles_tokens, f)

if load:
    if os.path.exists(TOKENS_CACHE):
        # Restore articles_tokens from the pickle file.
        # NOTE: unpickling can execute arbitrary code; only load a cache that
        # this notebook produced itself.
        with open(TOKENS_CACHE, 'rb') as f:
            articles_tokens = pickle.load(f)
    else:
        # On a fresh kernel the cache may not exist yet; keep the tokens that
        # were just computed instead of failing with FileNotFoundError.
        print(f"Cache '{TOKENS_CACHE}' não encontrado; usando os tokens calculados nesta sessão.")

In [None]:
# Dimensionality of the word vectors; passed as vector_size to Word2Vec and
# used as the column count of the embedding matrix.
EMBEDDING_DIM = 100

In [None]:
# Build a gensim Word2Vec model from the tokenised articles, producing
# EMBEDDING_DIM-dimensional vectors.
# NOTE(review): no seed is passed and workers=4, so results are presumably not reproducible run to run — confirm.
wv_model = gensim.models.Word2Vec(sentences=articles_tokens, 
                                  min_count=5, 
                                  vector_size=EMBEDDING_DIM, 
                                  workers=4)

In [None]:
# Persist the Word2Vec model to the working directory.
wv_model.save('word2vec_v2.model')

In [None]:
# Reload the model from the file written by the previous cell.
wv_model = gensim.models.Word2Vec.load('word2vec_v2.model')

In [None]:
# Sanity check: nearest neighbours of "lula" in the embedding space.
wv_model.wv.most_similar("lula")

In [None]:
# Sanity check: nearest neighbours of "esporte" in the embedding space.
wv_model.wv.most_similar("esporte")

In [None]:
# Number of entries in the Word2Vec vocabulary.
print(len(wv_model.wv.key_to_index))

## Model

In [None]:
# Create vocabulary and embedding matrix
max_len = max(map(len, articles_tokens))  # Length of the longest tokenised article
vocab_size = 1 + len(wv_model.wv.key_to_index)  # One extra row for the padding token

# Row 0 stays all-zero for padding; row i + 1 receives the vector of the word
# whose Word2Vec index is i.
# NOTE(review): rows follow Word2Vec's own key_to_index order, not the word
# index of any other tokenizer.
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
embedding_matrix[1:] = wv_model.wv.vectors

In [None]:
# (vocab_size, EMBEDDING_DIM)
embedding_matrix.shape

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Bidirectional, Conv1D, Dropout, MaxPooling1D, GRU
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.initializers import Constant
from tensorflow.keras import regularizers

In [None]:
# Next step: tokenize the text
MAX_VOCAB = 3000  # with num_words=MAX_VOCAB, texts_to_sequences only emits indices 1 .. MAX_VOCAB - 1
tokenizer = Tokenizer(num_words=MAX_VOCAB, split=" ")
tokenizer.fit_on_texts(df['text'].values)

# Rebuild the embedding matrix in the tokenizer's index space.
# The sequences fed to the network use tokenizer.word_index, whereas the matrix
# built from wv_model.wv.key_to_index is ordered by Word2Vec's own vocabulary.
# Using that matrix directly would hand each token another word's vector, so
# row `token_id` must hold the Word2Vec vector of the word the tokenizer maps
# to `token_id`.
vocab_size = min(MAX_VOCAB, len(tokenizer.word_index) + 1)  # index 0 is the padding id
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for word, token_id in tokenizer.word_index.items():
    # Skip ids the tokenizer will never emit, and words Word2Vec dropped
    # (e.g. below min_count); those rows stay zero and are learned in training.
    if token_id < vocab_size and word in wv_model.wv.key_to_index:
        embedding_matrix[token_id] = wv_model.wv[word]

In [None]:
# Convert each text into a sequence of integer token ids (padding to equal length happens in the next step)
X_seq = tokenizer.texts_to_sequences(df['text'].values)

In [None]:
# padding our text vector so they all have the same length
# NOTE(review): no maxlen is passed, so presumably every row is padded to the longest sequence — check the memory footprint.
X_pad = pad_sequences(X_seq, padding="post", truncating="post")

In [None]:
# One-hot encode the category labels
# NOTE(review): the class index of each label is its column position in the get_dummies output; reuse that same order when naming classes later.
Y = pd.get_dummies(df['category']).values

In [None]:
# Shapes of the padded inputs and of the one-hot targets.
print(X_pad.shape)
print(Y.shape)

In [None]:
word_to_check = 'condenação'  # replace with any word you want to look up

# Index the tokenizer assigned to the word, or None when it is not in word_index
index = tokenizer.word_index.get(word_to_check)

if index is None:
    print(f"A palavra '{word_to_check}' não está no vocabulário do tokenizer.")
else:
    print(f"A palavra '{word_to_check}' está mapeada para o índice {index} no tokenizer.")

In [None]:
# Reverse lookup table: token id -> word
index_to_word = dict(zip(tokenizer.word_index.values(), tokenizer.word_index.keys()))

In [None]:
# Pick one example to display
example_index = 0

# Map the example's token ids back to words, skipping ids with no entry in the lookup table
sequence_to_translate = X_seq[example_index]
known_words = [index_to_word[token_id] for token_id in sequence_to_translate if token_id in index_to_word]
translated_text = ' '.join(known_words)

print("Texto original: ", df['text'].values[example_index])

# The matching sequence of token ids
print("Sequência correspondente: ", X_seq[example_index])

# The same sequence after padding
print("Sequência após o padding: ", X_pad[example_index])

print("Texto traduzido: ", translated_text)

In [None]:
# Split the data into training (70%) and test (30%) sets with a fixed random_state
X_train, X_test, Y_train, Y_test = train_test_split(X_pad, Y, test_size = 0.3, random_state = 42)

In [None]:
# (training rows, number of classes)
Y_train.shape

In [None]:
# Drop the names of large intermediates that the remaining cells do not reference (presumably to free memory before training).
# NOTE(review): once `tokenizer` is deleted, new text can no longer be encoded in this session.
del articles_tokens, wv_model, tokenizer, Y, X_seq

In [None]:
# Embedding layer initialised from the pre-computed matrix and left trainable
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=EMBEDDING_DIM,
    embeddings_initializer=Constant(embedding_matrix),
    trainable=True,
)

# Embedding -> Conv1D/MaxPooling -> two stacked bidirectional LSTMs (each
# followed by dropout) -> softmax with one unit per class.
model = Sequential([
    embedding_layer,
    Conv1D(filters=64, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=3),
    Bidirectional(LSTM(32, return_sequences=True, kernel_regularizer=regularizers.l2(0.01))),
    Dropout(0.5),
    Bidirectional(LSTM(32, return_sequences=False, kernel_regularizer=regularizers.l2(0.01))),
    Dropout(0.5),
    Dense(Y_train.shape[1], activation='softmax'),
])

In [None]:
# Categorical cross-entropy loss, Adam with default settings, macro-averaged F1 as the tracked metric.
macro_f1 = tf.keras.metrics.F1Score(average="macro")
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=[macro_f1],
)

In [None]:
# Early stopping on val_loss (mode='min') with patience=3; restore_best_weights=True is requested.
earlystop = EarlyStopping(monitor='val_loss', patience=3, mode='min', restore_best_weights=True)

In [None]:
# Train the model; validation_split=0.2 asks Keras to hold out part of the training data for validation.
batch_size = 512
history = model.fit(X_train, Y_train, epochs = 15, batch_size=batch_size, callbacks=[earlystop], validation_split=0.2)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Plot the training and validation loss per epoch
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(history.history['loss'], label='Train Loss')
ax.plot(history.history['val_loss'], label='Validation Loss')
ax.set_title('Model loss')
ax.set_ylabel('Loss')
ax.set_xlabel('Epochs')
ax.legend(loc='upper right')
plt.show()

In [None]:
# Plot the training and validation macro-F1 per epoch
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(history.history['f1_score'], label='Train Score')
ax.plot(history.history['val_f1_score'], label='Validation Score')
ax.set_title('Model F1 Score')
ax.set_ylabel('F1')
ax.set_xlabel('Epochs')
ax.legend(loc='upper right')
plt.show()

In [None]:
# Predictions used by the classification report
Y_test_pred = model.predict(X_test)
# argmax over axis 1 turns each row of model outputs / one-hot targets into a class index
Y_test_pred_classes = np.argmax(Y_test_pred, axis=1)
Y_test_classes = np.argmax(Y_test, axis=1)

In [None]:
# Class index k is the k-th column of pd.get_dummies(df['category']), because
# that call produced the one-hot targets. df['category'].unique() returns
# labels in order of first appearance, which would attach the wrong name to
# each row of the report, so the names are taken from the same get_dummies
# column order instead.
target_names = list(pd.get_dummies(df['category']).columns)
print(classification_report(Y_test_classes, Y_test_pred_classes, target_names=target_names))

In [None]:
# Build the confusion matrix from true and predicted class indices
cm = confusion_matrix(Y_test_classes, Y_test_pred_classes)

# Class names in class-index order. Index k is the k-th column of
# pd.get_dummies(df['category']) (the encoding used for the targets);
# df['category'].unique() is in order of appearance and would mislabel the axes.
class_names = list(pd.get_dummies(df['category']).columns)

# No triangular mask is applied: a confusion matrix is not symmetric, so both
# halves carry information.

# Draw the confusion matrix
plt.figure(figsize=(10,7))
sns.heatmap(cm, annot=True, fmt='d', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predito')
plt.ylabel('Verdadeiro')
plt.show()

In [None]:
# Print the model summary.
model.summary()