In [None]:
# Importing the NLTK library and Random module
import nltk
import random

In [None]:
# Downloading all the NLTK files
nltk.download_shell()

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> all


    Downloading collection 'all'
       | 
       | Downloading package abc to /root/nltk_data...
       |   Unzipping corpora/abc.zip.
       | Downloading package alpino to /root/nltk_data...
       |   Unzipping corpora/alpino.zip.
       | Downloading package averaged_perceptron_tagger to
       |     /root/nltk_data...
       |   Unzipping taggers/averaged_perceptron_tagger.zip.
       | Downloading package averaged_perceptron_tagger_ru to
       |     /root/nltk_data...
       |   Unzipping taggers/averaged_perceptron_tagger_ru.zip.
       | Downloading package basque_grammars to /root/nltk_data...
       |   Unzipping grammars/basque_grammars.zip.
       | Downloading package biocreative_ppi to /root/nltk_data...
       |   Unzipping corpora/biocreative_ppi.zip.
       | Downloading package bllip_wsj_no_aux to /root/nltk_data...
       |   Unzipping models/bllip_wsj_no_aux.zip.
       | Downloading package book_grammars to /root/nltk_data...
       |   Unzipping grammars/book_gr


---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


In [None]:
# importing the Movie Review corpus
from nltk.corpus import movie_reviews

In [None]:
# Creating an empty list for storing the documents
documents = []

# For each category in the movie_reviews corpus
for category in movie_reviews.categories():

  # for each file identifiers in the each category
  for fileid in movie_reviews.fileids(category):
    
    # appending the file identifiers and cateories in a list. (the fileids and categories are stored in a set)
    documents.append((list(movie_reviews.words(fileid)), category))

In [None]:
# Using the shuffle function of the random module to shuffle the elements of the documents list.
random.shuffle(documents)

**Creating a list of all word tokens**

In [None]:
# Let's get all the word tokens of the movie_review corpus and stored it in a list (all_words)
all_words_tokens = []
for w in movie_reviews.words():
    all_words_tokens.append(w.lower()) # lowring the words

**Frequency Distribution of all the word tokens**

In [None]:
# Get the frequency distribition of all the words
all_words_freq = nltk.FreqDist(all_words_tokens)

In [None]:
# Print out the length of the all frequent words list
print(len(all_words_freq))

39768


**Removing Stopwords and Punctuations**

In [None]:
# Importing stopwords and punctuations
from nltk.corpus import stopwords
import string               # for punctuations

In [None]:
# Getting the English stopwords
stopwords_eng = stopwords.words("english")

In [None]:
# Creating a function to remove stopwords and punctuations
def remove_punc_stopwords(txt):
    """
        1. First we will remove punctutations
        2. Then, we will remove stopwords
        3. Lastly, we will return the clean word tokens
    """
    nopunc = [char for char in txt if char not in string.punctuation]
    no_stops = [word for word in nopunc if word.lower() not in stopwords_eng]
    return no_stops

In [None]:
# Running the function on all word tokens
all_words_tokens_cleaned = remove_punc_stopwords(all_words_tokens)

In [None]:
# Let's see the lengths of the all word tokens list prior and after removing stopwords and punctuations
print("Original len of all word tokens = ", len(all_words_tokens))
print("After removal of stopwords and punctuations,  len of all word tokens = ", len(all_words_tokens_cleaned))

Original len of all word tokens =  1583820
After removal of stopwords and punctuations,  len of all word tokens =  710578


In [None]:
# Frequency Distribution of all the word tokens after removing punctuations and stopwords
all_words_tokens_cleaned_freq = nltk.FreqDist(all_words_tokens_cleaned)

In [None]:
# Now let's see the top 15 most common words 
all_words_tokens_cleaned_freq.most_common(15)

[('film', 1),
 ('one', 1),
 ('movie', 1),
 ('like', 1),
 ('even', 1),
 ('good', 1),
 ('time', 1),
 ('story', 1),
 ('would', 1),
 ('much', 1),
 ('character', 1),
 ('also', 1),
 ('get', 1),
 ('two', 1),
 ('well', 1)]

In [None]:
# Let's check the length of all freq words
print(len(all_words_tokens_cleaned_freq))

39586


In [None]:
# Most common words (2000 freq words)
most_common_word_tokens = all_words_tokens_cleaned_freq.most_common(2000)

# print top 10 most common words
print(most_common_word_tokens[:10])

[('film', 1), ('one', 1), ('movie', 1), ('like', 1), ('even', 1), ('good', 1), ('time', 1), ('story', 1), ('would', 1), ('much', 1)]


In [None]:
# Least 10 freq words (botton 10 common words)
print(most_common_word_tokens[1990:])

[('remain', 1), ('anna', 1), ('moved', 1), ('asking', 1), ('genuinely', 1), ('rain', 1), ('path', 1), ('aware', 1), ('causes', 1), ('international', 1)]


In [None]:
# Since the elements of the most_common_word_tokens list are in the form of tuples,
# we need to extract the first element of each tuple to get the words as word features
word_features = [token[0] for token in most_common_word_tokens]

# Print out the top 10 word features
print(word_features[:10])

['film', 'one', 'movie', 'like', 'even', 'good', 'time', 'story', 'would', 'much']


**Creating a Feature Set**

In [None]:
# Creating a function to get the features (words) in a dictionary
def doc_features(doc):
    
    # creating a set for all the unique words present in a document
    doc_words = set(doc)
    
    # creating an empty features list
    features = {}
    
    # Will iterate through all the words present in the word_features list
    for word in word_features:
        
        # Get that word and see its presence in the document (will return a bollean value)
        features[word] = (word in doc_words)
    
    return features

In [None]:
# Now, we are going to create a feature set which will contain the word features of the review and its correspoding category
feature_sets = [(doc_features(review), category) for (review, category) in documents]

In [None]:
# Print out the first element of the feature set
print(feature_sets[0])

({'film': True, 'one': True, 'movie': False, 'like': True, 'even': False, 'good': True, 'time': True, 'story': True, 'would': True, 'much': True, 'character': False, 'also': True, 'get': True, 'two': True, 'well': True, 'characters': True, 'first': True, '--': False, 'see': False, 'way': False, 'make': False, 'life': False, 'really': True, 'films': True, 'plot': False, 'little': False, 'people': True, 'could': False, 'scene': True, 'man': False, 'bad': False, 'never': True, 'best': False, 'new': False, 'scenes': False, 'many': True, 'director': True, 'know': False, 'movies': False, 'action': False, 'great': False, 'another': False, 'love': False, 'go': False, 'made': False, 'us': False, 'big': False, 'end': False, 'something': False, 'back': False, 'still': False, 'world': False, 'seems': True, 'work': False, 'makes': False, 'however': False, 'every': True, 'though': False, 'better': False, 'real': False, 'audience': False, 'enough': False, 'seen': False, 'take': False, 'around': True,