<a href="https://colab.research.google.com/github/raulc66/AI-Learning/blob/main/Movie_Review_Classification_with_NLTK.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1 : Importing the Libraries

In [1]:
import nltk
import random

In [2]:
# import the dataset
from nltk.corpus import movie_reviews
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [3]:
! ls /root/nltk_data/corpora/movie_reviews/ # location of the downloaded file

neg  pos  README


# Step 2 : Data Processing

In [4]:
# create a list of tuples
documents= []
for category in movie_reviews.categories(): # divides into positive and negative reviews
  for fileid in movie_reviews.fileids(category): # for each file id in the given category
    documents.append((list(movie_reviews.words(fileid)), category)) # appends the list containing the files divided into the negative / positive reviews

In [5]:
# Shuffle the documents
random.shuffle(documents)
print(documents[0]) # first tuple of the list

(['the', 'swirling', 'sick', 'feeling', 'hit', 'me', 'just', 'a', 'few', 'minutes', 'into', '"', 'heartbreakers', '.', '"', 'ray', 'liotta', "'", 's', 'character', 'was', 'making', 'out', 'with', 'his', 'secretary', 'when', 'his', 'new', 'wife', 'knocked', 'on', 'the', 'door', 'of', 'his', 'office', '.', 'while', 'scrambling', 'to', 'collect', 'himself', ',', 'he', 'frantically', 'shouted', 'to', 'her', ',', '"', 'just', 'wait', 'a', 'sex', '?', 'er', ',', 'i', 'mean', 'sec', '!', '"', 'i', 'was', 'struck', 'by', 'a', 'wave', 'of', 'revulsion', ',', 'thinking', ',', '"', 'geez', ',', 'didn', "'", 't', 'lines', 'like', 'that', 'die', 'when', "'", 'three', "'", 's', 'company', "'", 'was', 'canceled', '?', '"', 'over', 'the', 'next', 'few', 'minutes', ',', 'as', 'the', 'barely', 'double', 'entendres', 'and', 'lingering', 'cleavage', 'shots', 'grew', 'more', 'numerous', ',', 'i', 'realized', 'that', 'the', 'mindset', 'behind', '"', 'heartbreakers', '"', 'predated', '"', 'three', "'", 's', 

In [6]:
# Normalize the dataset
all_words = []
for w in movie_reviews.words():
  all_words.append(w.lower())

In [7]:
# NLTK frequency distribution
all_words = nltk.FreqDist(all_words)
print(all_words.most_common(15))
print(all_words['love'])

[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), ("'", 30585), ('is', 25195), ('in', 21822), ('s', 18513), ('"', 17612), ('it', 16107), ('that', 15924), ('-', 15595)]
1119


In [8]:
# Limit the words
word_features = list(all_words.keys())[:3000]

In [9]:
# Find features within the documents
def find_features(document):
  words = set(document) # unique values
  features = {} # dictionary containing the existence of a feature within the document
  for w in word_features:
    features[w] : (w in words) # if the given word is found within the documents, returns True
  return features # Provided the file, it will return the existence of each word

In [10]:
print((find_features(movie_reviews.words('/root/nltk_data/corpora/movie_reviews/pos/cv000_29590.txt'))))

{}


In [11]:
feature_sets = [(find_features(rev), category) for (rev, category) in documents] # returns a list for all the words found in the documents list if the feature exists

In [12]:
train_set = feature_sets[:1900]
test_set = feature_sets[1900:]

In [13]:
# Training the Classifier
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [14]:
# Test Accuracy
print('Accuracy :', (nltk.classify.accuracy(classifier, test_set)) * 100)

Accuracy : 48.0


In [15]:
classifier.show_most_informative_features()

Most Informative Features
