#### Importing the Libraries

The cell below imports the required libraries and packages.

In [1]:
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.probability import ConditionalFreqDist

#### Tokenizing the text

The function below tokenizes the review text and filters out stop words, words tagged as nouns (`NN`) or verbs (`VB`), and words that start with a special (non-alphanumeric) character.

In [1]:
   # Superseded earlier version of tokenize, kept only as an inert string literal (it is never executed).
   # Unlike the active definition that follows, it applies no part-of-speech filtering.
   '''This is the backup
   def tokenize(data):
    lst = data['review']
    pattern = re.compile('^[^A-Za-z0-9]')
    s2w = word_tokenize(lst)
    stop_words = set(stopwords.words("english"))
    return  [word.casefold()  for word in s2w if word.casefold() not in stop_words and len(re.findall(pattern, word.casefold())) == 0]'''

# Invariants shared by every tokenize() call; built once instead of once per row.
_LEADING_NON_ALNUM = re.compile('^[^A-Za-z0-9]')  # token starts with a non-alphanumeric character
_EXCLUDED_POS_TAGS = frozenset(['NN', 'VB'])  # NN = noun, VB = verb (exact tag match only)
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it from the corpus only on first use."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words("english"))
    return _STOP_WORDS_CACHE['english']


def tokenize(data):
    """Tokenize one review and return its filtered, case-folded words.

    Parameters
    ----------
    data : mapping or pandas row
        Must provide the review text under the key ``'review'``.

    Returns
    -------
    list of str
        The case-folded tokens of the review, in their original order, after
        dropping every token that
        * is tagged exactly ``'NN'`` or ``'VB'`` by ``nltk.pos_tag``,
        * is an English stop word (compared after case-folding), or
        * starts with a character other than an ASCII letter or digit.

    Notes
    -----
    Only the exact tags ``'NN'`` and ``'VB'`` are excluded; tokens carrying any
    other tag are kept.
    NOTE(review): confirm whether other noun/verb tag variants should be
    excluded as well.
    """
    stop_words = _english_stop_words()
    # pos_tag yields (token, tag) pairs for the tokenized review text.
    tagged_tokens = nltk.pos_tag(word_tokenize(data['review']))

    kept_words = []
    for token, tag in tagged_tokens:
        if tag in _EXCLUDED_POS_TAGS:
            continue
        folded = token.casefold()  # fold once and reuse for every check
        if folded in stop_words or _LEADING_NON_ALNUM.match(folded):
            continue
        kept_words.append(folded)
    return kept_words

#### Frequency Distribution
The function below calculates the frequency distribution of the words and their conditional frequency distribution per sentiment.

In [3]:
def freq_dist(data):
    """Count token occurrences overall and per sentiment label.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'review_token'`` column (an iterable of words per row)
        and a ``'sentiment'`` column (the label of that row).

    Returns
    -------
    list
        ``[review_freq_dist, review_cond_freq_dist]`` where the first item is a
        ``FreqDist`` of every word and the second is a ``ConditionalFreqDist``
        keyed by sentiment label and then by word.
    """
    review_freq_dist = FreqDist()
    review_cond_freq_dist = ConditionalFreqDist()
    # Walk the two columns in lockstep by position. Looking rows up with
    # data[col][i] is label-based and only works when the index is exactly
    # 0..n-1; it also re-fetches the column for every single word.
    for tokens, sentiment in zip(data['review_token'], data['sentiment']):
        for word in tokens:
            review_freq_dist[word] += 1
            review_cond_freq_dist[sentiment][word] += 1
    return [review_freq_dist, review_cond_freq_dist]

Reading the raw data from the dataset

In [7]:
# Forward slashes avoid the invalid "\I" escape sequences of the backslash form
# and resolve on Windows as well as on POSIX systems.
raw_data = pd.read_csv(filepath_or_buffer = "Data/IMDB - Cleansed data from Kaggle/IMDB Dataset.csv", header = 'infer')

Replacing the HTML line-break tags (`<br /><br />`) with a space

In [9]:
# Swap every doubled HTML line-break tag for a single space, across the whole frame.
raw_data_tag_rem = raw_data.replace(to_replace=['<br /><br />'], value=' ', regex=True)

Separating the positive-sentiment and negative-sentiment reviews so that each group can be split into training and test sets

In [11]:
# Partition the cleaned reviews by their sentiment label.
positive = raw_data_tag_rem.loc[raw_data_tag_rem['sentiment'].eq('positive')]
negative = raw_data_tag_rem.loc[raw_data_tag_rem['sentiment'].eq('negative')]

In [13]:
# Hold out a quarter of each sentiment group for testing; the fixed seed keeps the split repeatable.
x_pos_train, x_pos_test, y_pos_train, y_pos_test = train_test_split(
    positive['review'],
    positive['sentiment'],
    test_size=0.25,
    random_state=42,
)
x_neg_train, x_neg_test, y_neg_train, y_neg_test = train_test_split(
    negative['review'],
    negative['sentiment'],
    test_size=0.25,
    random_state=42,
)

Appending the positive and negative training sets and applying the `tokenize` function to create the `review_token` field

In [15]:
# Stack the positive split on top of the negative split using the public
# pd.concat API rather than the private Series._append method.
x_train = pd.concat([x_pos_train, x_neg_train])
y_train = pd.concat([y_pos_train, y_neg_train])
# Side-by-side join on the shared index: one 'review' and one 'sentiment' column.
training_data = pd.concat([x_train, y_train], axis=1)
# tokenize receives each row and reads its 'review' field.
training_data['review_token'] = training_data.apply(tokenize, axis=1)
# Renumber the rows 0..n-1, discarding the row labels inherited from the raw data.
training_data = training_data.reset_index(drop=True)

In [17]:
# Preview the assembled training frame: review text, sentiment label and token list.
training_data

Unnamed: 0,review,sentiment,review_token
0,I was not expecting the powerful filmmaking ex...,positive,"[expecting, powerful, filmmaking, girlfight, i..."
1,Somewhere in the dark recesses of my brain cel...,positive,"[somewhere, dark, recesses, cells, plays, ca, ..."
2,"Surprisingly good ""Mean Streets""-type crime dr...",positive,"[surprisingly, good, mean, streets, foreshadow..."
3,It plays like your usual teenage-audience T&A ...,positive,"[plays, like, usual, incredibly, bleak, made, ..."
4,Fans of Gerry Anderson's productions will reco...,positive,"[fans, gerry, anderson, productions, several, ..."
...,...,...,...
37495,The operative rule in the making of this film ...,negative,"[operative, seems, never, 1, 10, minute, set, ..."
37496,The plot outline of this movie is similar to t...,negative,"[similar, original, someone, gets, kidnapped, ..."
37497,"Obviously, the comments above that fawn over t...",negative,"[obviously, comments, made, n't, ever, seeing,..."
37498,This movie had the potential to be far more th...,negative,"[potential, far, fails, brings, nauseous, righ..."


Creating the frequency distribution

In [19]:
# freq_dist returns a two-item list: a[0] is the overall word-frequency distribution,
# a[1] is the conditional frequency distribution keyed by sentiment.
a = freq_dist(training_data)

Obtaining the 10,000 most common words in the negative reviews, in the positive reviews, and overall

In [21]:
# Top 10000 (word, count) pairs per sentiment label, then across all reviews.
negative_most_common_words, positive_most_common_words = (
    a[1][label].most_common(10000) for label in ('negative', 'positive')
)
all_most_common_words = a[0].most_common(10000)

In [25]:
# NOTE(review): this displays the entire list (up to 10000 pairs); consider a slice such as all_most_common_words[:20].
all_most_common_words

[("n't", 49886),
 ('one', 34645),
 ('like', 27056),
 ('good', 21817),
 ('would', 19789),
 ('even', 18517),
 ('really', 17404),
 ('much', 14377),
 ('well', 14355),
 ('could', 13858),
 ('people', 13552),
 ('bad', 13499),
 ('great', 13458),
 ('also', 13275),
 ('first', 12881),
 ('made', 11680),
 ('movies', 11348),
 ('characters', 10630),
 ('seen', 10065),
 ('many', 10053),
 ('films', 10004),
 ('two', 9835),
 ('never', 9766),
 ('little', 9291),
 ('best', 9170),
 ('ever', 8871),
 ('better', 8444),
 ('still', 8071),
 ('scenes', 7757),
 ('real', 6730),
 ('actors', 6601),
 ('think', 6587),
 ('watching', 6507),
 ('though', 6461),
 ('years', 6439),
 ('another', 6399),
 ('back', 6383),
 ('actually', 6351),
 ('makes', 6200),
 ('new', 6032),
 ('going', 5996),
 ('funny', 5962),
 ('every', 5910),
 ('old', 5824),
 ('us', 5505),
 ('things', 5465),
 ('seems', 5390),
 ('got', 5373),
 ('quite', 5279),
 ('around', 5276),
 ('ca', 5258),
 ('pretty', 5202),
 ('young', 5171),
 ('however', 5155),
 ('enough', 50