In [2]:
#%pip install nltk

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting click (from nltk)
  Using cached click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.7.24-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tqdm (from nltk)
  Downloading tqdm-4.66.5-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading regex-2024.7.24-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (775 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m775.

In [3]:
import pandas as pd
import numpy as np
import re
import random
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [4]:
# Baixando stopwords do nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/romildopaiter/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Função para adicionar ruído ao texto
def add_noise(text, noise_level=0.1):
    words = text.split()
    num_noisy_words = int(len(words) * noise_level)

    # Adicionando palavras aleatórias como ruído
    noisy_words = ['abc', 'xyz', '123', 'noise']
    for _ in range(num_noisy_words):
        idx = np.random.randint(len(words))
        words[idx] = random.choice(noisy_words)  # Alteração para usar random.choice

    # Adicionando erros de digitação
    for _ in range(num_noisy_words):
        idx = np.random.randint(len(words))
        if len(words[idx]) > 2:
            pos = np.random.randint(len(words[idx]))
            # Substituindo caractere por um caractere aleatório
            words[idx] = words[idx][:pos] + random.choice('abcdefghijklmnopqrstuvwxyz') + words[idx][pos+1:]

    return ' '.join(words)

In [None]:
# Exemplo de conjunto de dados
data = {
    'Message': [
        'I love machine learning!',
        'Deep learning is amazing.',
        'Natural Language Processing is a subset of AI.',
        'How to build a neural network?',
        'Machine learning models are powerful.'
    ],
    'Label': [1, 1, 0, 0, 1]  # 1 para positivo, 0 para negativo (exemplo fictício)
}

df = pd.DataFrame(data)
display(df)

Unnamed: 0,Message,Label
0,I love machine learning!,1
1,Deep learning is amazing.,1
2,Natural Language Processing is a subset of AI.,0
3,How to build a neural network?,0
4,Machine learning models are powerful.,1


In [None]:
# Adicionando ruído ao conjunto de dados
df['Noisy_Message'] = df['Message'].apply(lambda x: add_noise(x, noise_level=0.5))

In [None]:
# Exibindo o DataFrame com ruído adicionado
print("\nDataFrame com Ruído Adicionado:")
display(df[['Noisy_Message']])



DataFrame com Ruído Adicionado:


Unnamed: 0,Noisy_Message
0,aoc ljve machine learning!
1,Deep byz is amazmng.
2,Natural abc ahc h23 a hubset of AI.
3,How to noism abc neural xyz
4,xyz learning models ore atc


In [None]:
# Funções de pré-processamento
def preprocess_text(text):
    # Convertendo para minúsculas
    text = text.lower()

    # Removendo URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Removendo caracteres não-alfanuméricos
    text = re.sub(r'\W', ' ', text)

    # Removendo números
    text = re.sub(r'\d', '', text)

    # Tokenização
    tokens = text.split()

    # Remoção de stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    # Juntando tokens de volta em uma string
    text = ' '.join(tokens)

    return text

# Aplicando o pré-processamento
df['Processed_Message'] = df['Noisy_Message'].apply(preprocess_text)
display(df[['Processed_Message']])


Unnamed: 0,Processed_Message
0,aoc ljve machin learn
1,deep byz amazmng
2,natur abc ahc h hubset ai
3,noism abc neural xyz
4,xyz learn model ore atc


In [None]:
# Vetorização do texto
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['Processed_Message'])
y = df['Label']


In [None]:
# Exibindo a matriz de características X
print("\nMatriz de Características (X):")
print(X.toarray())

# Exibindo os nomes das características
print("\nNomes das Características:")
print(vectorizer.get_feature_names_out())


Matriz de Características (X):
[[0 0 0 0 1 0 0 0 0 1 1 1 0 0 0 0 0 0]
 [0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0]
 [1 1 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1]
 [0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 1 1]]

Nomes das Características:
['abc' 'ahc' 'ai' 'amazmng' 'aoc' 'atc' 'byz' 'deep' 'hubset' 'learn'
 'ljve' 'machin' 'model' 'natur' 'neural' 'noism' 'ore' 'xyz']


In [None]:
# Dividindo os dados em conjuntos de treinamento e teste
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(X, df['Label'], df.index, test_size=0.3, random_state=42)

# Treinando o modelo MLP
model = MLPClassifier(hidden_layer_sizes=(10,), max_iter=500, random_state=42)
model.fit(X_train, y_train)

# Fazendo previsões
y_pred = model.predict(X_test)





In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Avaliação do modelo
accuracy = accuracy_score(y_test, y_pred)
print(f"Acurácia: {accuracy:.2f}")

# Identificar as classes presentes no conjunto de teste
classes_present = np.unique(y_test)

# Ajustar o relatório de classificação e a matriz de confusão com base nas classes presentes
report = classification_report(y_test, y_pred, target_names=[f'Classe {c}' for c in classes_present], labels=classes_present)
conf_matrix = confusion_matrix(y_test, y_pred, labels=classes_present)

# Exibindo o relatório de classificação
print("Relatório de Classificação:")
print(report)

Acurácia: 1.00
Relatório de Classificação:
              precision    recall  f1-score   support

    Classe 1       1.00      1.00      1.00         2

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2



In [None]:
# Criar um DataFrame para o conjunto de teste
df_test = df.loc[indices_test].copy()
df_test['Predicted_Label'] = y_pred

# Adicionar coluna com textos preditos
df_test['Predicted_Message'] = df_test['Noisy_Message']  # Adicionar coluna com o texto original

# Exibindo o DataFrame com mensagens reais, preditas e rótulos
print("\nDataFrame com Mensagens Reais, Previsões e Rótulos:")
print(df_test[['Message', 'Noisy_Message', 'Processed_Message', 'Label', 'Predicted_Label','Predicted_Message']])


DataFrame com Mensagens Reais, Previsões e Rótulos:
                                 Message                Noisy_Message  \
1              Deep learning is amazing.         Deep byz is amazmng.   
4  Machine learning models are powerful.  xyz learning models ore atc   

         Processed_Message  Label  Predicted_Label  \
1         deep byz amazmng      1                1   
4  xyz learn model ore atc      1                1   

             Predicted_Message  
1         Deep byz is amazmng.  
4  xyz learning models ore atc  


In [None]:
#
import nltk
nltk.download('averaged_perceptron_tagger')
import nltk
nltk.download('wordnet')

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import pandas as pd

# initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# define function to lemmatize tokens
def lemmatize_tokens(tokens):
    # convert POS tag to WordNet format
    def get_wordnet_pos(word):
        tag = nltk.pos_tag([word])[0][1][0].upper()
        tag_dict = {"J": wordnet.ADJ,
                    "N": wordnet.NOUN,
                    "V": wordnet.VERB,
                    "R": wordnet.ADV}
        return tag_dict.get(tag, wordnet.NOUN)

    # lemmatize tokens
    lemmas = [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens]

    # return lemmatized tokens as a list
    return lemmas

# apply lemmatization function to column of dataframe
df['lemmatized_messages'] = df['Noisy_Message'].apply(lemmatize_tokens)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
#Preenchimento e Truncamento
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Sample texts
texts = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Create a tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

# Convert texts to sequences
sequences = tokenizer.texts_to_sequences(texts)

# Define a maximum sequence length
max_length = 10

# Perform padding or truncation
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')

# Display the results
print("Original Sequences:")
print(sequences)
print("\nPadded/Truncated Sequences:")
print(padded_sequences)

Original Sequences:
[[1, 2, 3, 5, 4], [1, 4, 2, 3, 6, 4], [7, 1, 2, 3, 8, 9], [2, 1, 3, 5, 4]]

Padded/Truncated Sequences:
[[1 2 3 5 4 0 0 0 0 0]
 [1 4 2 3 6 4 0 0 0 0]
 [7 1 2 3 8 9 0 0 0 0]
 [2 1 3 5 4 0 0 0 0 0]]
