<a href="https://colab.research.google.com/github/spdr-lily/Descomplica-Data-Science/blob/main/linguagem_natural_com_nltk.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
import nltk
import random
import numpy as np
from nltk.corpus import movie_reviews
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk import pos_tag
from sklearn.metrics import confusion_matrix, classification_report

In [24]:
# Fetch the NLTK data packages used in this notebook. The names are kept in
# one tuple so the list is easy to audit, and the loop avoids echoing the
# return value of the final download call as cell output.
NLTK_RESOURCES = (
    'movie_reviews',  # corpus read through nltk.corpus.movie_reviews
    'punkt',          # tokenizer data (presumably needed by word_tokenize; confirm for your NLTK version)
    'punkt_tab',
    'stopwords',      # word lists read through nltk.corpus.stopwords
    'averaged_perceptron_tagger_eng',  # tagger model (presumably what pos_tag loads)
)
for resource in NLTK_RESOURCES:
    nltk.download(resource)

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [25]:
# Tokenize a text and derive part-of-speech tags and token frequencies from it
def process_text(text):
    """Tokenize ``text`` and analyse the resulting tokens.

    Args:
        text: The string handed to ``word_tokenize``.

    Returns:
        A 3-tuple ``(tokens, pos_tags, freq_dist)``: the tokens produced by
        ``word_tokenize``, the result of ``pos_tag`` on those tokens, and a
        ``FreqDist`` built from the same tokens (no case folding or
        punctuation filtering is applied).
    """
    word_list = word_tokenize(text)
    tagged_words = pos_tag(word_list)
    return word_list, tagged_words, FreqDist(word_list)

In [26]:
# Usage example: run process_text on a short two-sentence review
sample_text = "The movie was fantastic! The performances were breathtaking."
# Unpack the three values returned by process_text for the cells that follow
tokens, pos_tags, freq_dist = process_text(sample_text)

In [27]:
# Print each result of process_text next to its label; the frequency
# distribution is limited to its 10 most common entries
for heading, result in (
    ("Tokens:", tokens),
    ("Partes do discurso:", pos_tags),
    ("Frequência de palavras:", freq_dist.most_common(10)),
):
    print(heading, result)

Tokens: ['The', 'movie', 'was', 'fantastic', '!', 'The', 'performances', 'were', 'breathtaking', '.']
Partes do discurso: [('The', 'DT'), ('movie', 'NN'), ('was', 'VBD'), ('fantastic', 'JJ'), ('!', '.'), ('The', 'DT'), ('performances', 'NNS'), ('were', 'VBD'), ('breathtaking', 'VBG'), ('.', '.')]
Frequência de palavras: [('The', 2), ('movie', 1), ('was', 1), ('fantastic', 1), ('!', 1), ('performances', 1), ('were', 1), ('breathtaking', 1), ('.', 1)]


In [29]:
# Build one (word list, category) pair per file of the movie_reviews corpus,
# walking the corpus category by category
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        review_words = list(movie_reviews.words(fileid))
        documents.append((review_words, category))

In [30]:
# Shuffle the documents so the positional train/test split further down does
# not follow the corpus order (the list is built category by category).
# A dedicated, seeded generator makes the order, and therefore the split and
# every metric derived from it, reproducible across kernel restarts without
# touching the state of the global `random` module.
# NOTE: the shuffle is in place; re-run the cell that builds `documents`
# before re-running this one to obtain the same order again.
random.Random(42).shuffle(documents)

In [31]:
# Bag-of-words feature extractor
def extract_features(words, stop_words=None):
    """Turn a sequence of tokens into a bag-of-words feature dict.

    Args:
        words: Iterable of string tokens.
        stop_words: Optional collection of lowercase words to ignore. When
            omitted, NLTK's English stop-word list is loaded on each call;
            pass a precomputed set when extracting features for many
            documents to avoid reloading it.

    Returns:
        A dict mapping every kept token, lowercased, to ``True``. Tokens that
        are not purely alphabetic (punctuation, numbers) are dropped.
    """
    if stop_words is None:
        stop_words = set(stopwords.words('english'))

    features = {}
    for word in words:
        if not word.isalpha():
            continue
        lowered = word.lower()
        # Compare the lowercased form so that capitalised stop words
        # (e.g. a sentence-initial "The") are filtered out as well.
        if lowered not in stop_words:
            features[lowered] = True
    return features

In [32]:
# Pair every document's bag-of-words feature dict with its category label
feature_sets = [
    (extract_features(doc_words), doc_label)
    for doc_words, doc_label in documents
]

In [33]:
# Positional split: the first 80% of the feature sets are used for training,
# the remaining 20% are held out for testing
train_size = int(0.8 * len(feature_sets))
train_set = feature_sets[:train_size]
test_set = feature_sets[train_size:]

In [34]:
# Train a Naive Bayes classifier on the training portion of the feature sets
classifier = NaiveBayesClassifier.train(train_set)

In [35]:
# Evaluate the model: print its accuracy on the held-out test set as a
# percentage with two decimal places
print(f"Acurácia: {accuracy(classifier, test_set) * 100:.2f}%")

Acurácia: 70.75%


In [36]:
# Show the 10 most informative features of the trained classifier
classifier.show_most_informative_features(10)

Most Informative Features
               insulting = True              neg : pos    =     16.7 : 1.0
                  turkey = True              neg : pos    =     10.8 : 1.0
             outstanding = True              pos : neg    =      9.9 : 1.0
               strongest = True              pos : neg    =      9.8 : 1.0
               stupidity = True              neg : pos    =      9.7 : 1.0
             magnificent = True              pos : neg    =      9.5 : 1.0
               atrocious = True              neg : pos    =      9.5 : 1.0
                headache = True              neg : pos    =      9.5 : 1.0
               ludicrous = True              neg : pos    =      9.4 : 1.0
              astounding = True              pos : neg    =      9.1 : 1.0


In [37]:
# Classify the sample text defined earlier: tokenize it, build its feature
# dict with extract_features, and ask the trained classifier for a label
tokens = word_tokenize(sample_text)
features = extract_features(tokens)
predicted_label = classifier.classify(features)
print(f"Classificação do texto de teste: {predicted_label}")

Classificação do texto de teste: pos


In [38]:
# Detailed evaluation: collect, for every test example, the gold label and
# the label predicted by the classifier (two parallel lists)
y_true, y_pred = [], []
for featureset, gold_label in test_set:
    y_true.append(gold_label)
    y_pred.append(classifier.classify(featureset))

In [39]:
# Classification report for the test set, printed under a header line
print(
    "\nRelatório de Classificação:",
    classification_report(y_true, y_pred),
    sep="\n",
)


Relatório de Classificação:
              precision    recall  f1-score   support

         neg       0.94      0.42      0.58       194
         pos       0.64      0.98      0.77       206

    accuracy                           0.71       400
   macro avg       0.79      0.70      0.68       400
weighted avg       0.79      0.71      0.68       400



In [40]:
# Confusion matrix of gold labels versus predicted labels on the test set,
# printed under a header line
conf_matrix = confusion_matrix(y_true, y_pred)
print("\nMatriz de Confusão:", conf_matrix, sep="\n")


Matriz de Confusão:
[[ 82 112]
 [  5 201]]
