## Import Modules

In [1]:
import time
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from gensim.models import Phrases
from collections import defaultdict, Counter, OrderedDict

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from gensim.models import Word2Vec

## Load Dataset

In [2]:
# Read the raw IMDB review table from disk; the file is decoded as latin-1.
df = pd.read_csv('Datasets/imdb_master.csv', encoding="latin-1")
# Preview the first rows to check the columns that were loaded.
df.head()

Unnamed: 0.1,Unnamed: 0,type,review,label,file
0,0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt


## Drop Unnecessary Columns

In [4]:
# Drop the columns that are not needed downstream and rename the label column.
# Renaming by name (instead of overwriting df.columns positionally) keeps the
# labels correct even if the column order of the CSV ever changes.
df = (
    df.drop(['Unnamed: 0', 'file'], axis=1)
      .rename(columns={"label": "sentiment"})
)

# Discard the rows whose label is 'unsup'. The explicit copy makes the result
# an independent frame, so the column assignment below does not raise
# SettingWithCopyWarning.
df = df[df.sentiment != 'unsup'].copy()

# Encode the sentiment labels as integers: 'pos' -> 1, 'neg' -> 0.
df['sentiment'] = df['sentiment'].map({'pos': 1, 'neg': 0})

print("Dimension of the data frame: ", df.shape)

df.head()

Dimension of the data frame:  (50000, 3)


Unnamed: 0,type,review,sentiment
0,test,Once again Mr. Costner has dragged out a movie...,0
1,test,This is an example of why the majority of acti...,0
2,test,"First of all I hate those moronic rappers, who...",0
3,test,Not even the Beatles could write songs everyon...,0
4,test,Brass pictures (movies is not a fitting word f...,0


## Data Preprocessing

In [5]:
# Function for converting a list of sentences to a list of lists containing tokenized words
def docs_preprocessor(inputDocs, remove_digits=False):
    """Convert raw text documents into lists of cleaned, lemmatized tokens.

    Every document is lower-cased, split into runs of word characters,
    stripped of one-character tokens and finally lemmatized with the
    WordNet lemmatizer.

    Parameters
    ----------
    inputDocs : iterable of str
        The raw documents, e.g. a list or a pandas Series. The input is only
        read, never modified.
    remove_digits : bool, optional
        When True, tokens that consist solely of digits are dropped (tokens
        that merely contain digits are kept). Defaults to False, which keeps
        all numeric tokens.

    Returns
    -------
    list of list of str
        One token list per input document, in input order.
    """
    tokenizer = RegexpTokenizer(r'\w+')  # Tokenize the words.
    lemmatizer = WordNetLemmatizer()

    docs = []
    # Iterate over the values themselves rather than over range(len(...)):
    # integer subscripts on a pandas Series are looked up by index label, so
    # positional access breaks as soon as the index is not exactly 0..n-1
    # (for example after rows have been filtered out).
    for raw_doc in inputDocs:
        tokens = tokenizer.tokenize(raw_doc.lower())

        if remove_digits:
            # Remove numbers, but not words that contain numbers.
            tokens = [token for token in tokens if not token.isdigit()]

        # Remove one-character tokens, then lemmatize what is left.
        docs.append([lemmatizer.lemmatize(token) for token in tokens if len(token) > 1])

    return docs

In [6]:
%%time
# Convert the raw reviews into a list of token lists (one list per review).
# The reviews are handed over as a plain Python list so that any positional
# access inside docs_preprocessor cannot be confused by the DataFrame's index
# labels, which are not guaranteed to be 0..n-1 after rows were filtered out.
texts_tokenized = docs_preprocessor(df["review"].tolist())

Wall time: 1min 36s


## Compute Bigrams/Trigrams

In [6]:
%%time

# Add bigrams to the documents (only ones that appear 10 times or more).
# Candidate pairs are scored with NPMI against a threshold of 0.5.
# Trigram detection is not performed.
bigram = Phrases(texts_tokenized, min_count=10, threshold=0.5, scoring='npmi')

for doc in texts_tokenized:
    # A token containing '_' is treated as a detected bigram and appended to
    # the end of its document. The list is built completely before extend()
    # runs, so the document is not modified while it is being scanned.
    doc.extend([token for token in bigram[doc] if '_' in token])

Wall time: 3min 14s


## Vocabulary Word Count

In [7]:
# Tally how often each token (single word or phrase) occurs across all reviews.
texts_tokenized_counts = defaultdict(int)
for doc_tokens in texts_tokenized:
    for token in doc_tokens:
        texts_tokenized_counts[token] += 1

# The distinct tokens, in order of first appearance, form the vocabulary.
vocabulary = list(texts_tokenized_counts)

print("\nVocabulary size: ", len(vocabulary))


Vocabulary size:  98722


## Phrases Count

We can investigate the phrases (by uncommenting the print statement below) and, if needed, increase/decrease the number of phrases by varying the "min_count" and "threshold" parameters of the Phrases object (above).

In [8]:
# Count the vocabulary entries that contain an underscore (_), i.e. the phrases
numOfPhrases = 0
for entry in vocabulary:
    if "_" not in entry:
        continue
    numOfPhrases += 1
    #print(entry)

print("\nTotal Number of Phrases: ", numOfPhrases)


Total Number of Phrases:  5401


## Total Tokens Count
The Keras tokenizer needs us to set the number of high frequency tokens/words. These top k tokens will be used to define the length of the feature vectors.

Thus, we need to understand how many words have high frequency. Analyzing the frequency of the tokens we can decide the threshold for the high frequency tokens.

In [9]:
# Flatten all reviews into one list that holds every token occurrence.
tokens = [token for text in texts_tokenized for token in text]

print("Total number of tokens (including repetition): ", len(tokens))


# Count the tokens and order them from high to low frequency.
# most_common() sorts by count in descending order and keeps the original
# relative order of tokens that share the same count.
token_frequency = Counter(tokens)
token_frequency_ordered = OrderedDict(token_frequency.most_common())


# We may view the most frequent tokens (by uncommenting the print statement below).
# By varying "top_k_tokens" we can see the top k tokens.
# This helps to determine the threshold for the top k frequent tokens that are
# used to create the vectors.
top_k_tokens = 10000
numOfTokens = 0
for k, v in token_frequency_ordered.items():
    # Stop once the requested number of tokens has been visited. No membership
    # test is needed: every key comes from the dictionary being iterated.
    if numOfTokens == top_k_tokens:
        break
    numOfTokens += 1
    #print(k, v)

Total number of tokens (including repetition):  11633232


## Remove underscores from Phrases

The bigrams are created by adding an underscore between two words.

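We remove the underscore from all tokens so that each phrase becomes a single word (e.g. `once_again` becomes `onceagain`). Otherwise the Keras tokenizer, whose default filter characters include the underscore, would later split the phrases back into separate words.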

In [10]:
# Strip the underscore out of every token. Each review's list is updated in
# place, so texts_tokenized keeps referring to the same list objects.
for doc_tokens in texts_tokenized:
    doc_tokens[:] = [token.replace("_", "") for token in doc_tokens]

## Create text corpus by adding Phrases

Combine the words, including the phrases to create a full text corpus

In [11]:
# Rebuild every review as a single string of its tokens separated by spaces.
texts_processed = [" ".join(doc_tokens) for doc_tokens in texts_tokenized]

In [12]:
# View first two reviews in the text corpus.
# The phrases appended earlier show up at the end of each review, with their underscores removed.
texts_processed[:2]

['once again mr costner ha dragged out movie for far longer than necessary aside from the terrific sea rescue sequence of which there are very few just did not care about any of the character most of u have ghost in the closet and costner character are realized early on and then forgotten until much later by which time did not care the character we should really care about is very cocky overconfident ashton kutcher the problem is he come off a kid who think he better than anyone else around him and show no sign of cluttered closet his only obstacle appears to be winning over costner finally when we are well past the half way point of this stinker costner tell u all about kutcher ghost we are told why kutcher is driven to be the best with no prior inkling or foreshadowing no magic here it wa all could do to keep from turning it off an hour in onceagain ashtonkutcher betterthan anyoneelse',
 'this is an example of why the majority of action film are the same generic and boring there real

## Add processed Reviews from Corpus into Dataframe

In [13]:
# Attach the processed review text as a new column. assign() returns a new
# frame instead of writing into the filtered one, which avoids the
# SettingWithCopyWarning raised by in-place column assignment. The list is
# matched to the rows by position, so it must hold one entry per row of df.
df = df.assign(Processed_Reviews=texts_processed)
df.head()

Unnamed: 0,type,review,sentiment,Processed_Reviews
0,test,Once again Mr. Costner has dragged out a movie...,0,once again mr costner ha dragged out movie for...
1,test,This is an example of why the majority of acti...,0,this is an example of why the majority of acti...
2,test,"First of all I hate those moronic rappers, who...",0,first of all hate those moronic rapper who cou...
3,test,Not even the Beatles could write songs everyon...,0,not even the beatles could write song everyone...
4,test,Brass pictures (movies is not a fitting word f...,0,brass picture movie is not fitting word for th...


## Create Train and Test set

In [14]:
# Split the reviews using the 'type' column that ships with the dataset.
df_test = df.loc[df['type'] == 'test']
# The message names the subset explicitly so it cannot be mistaken for the full frame.
print("Dimension of the test data: ", df_test.shape)

df_train = df.loc[df['type'] == 'train']
print("Dimension of the train data: ", df_train.shape)

Dimension of the data:  (25000, 4)
Dimension of the train data:  (25000, 4)


## Export Tokenized Text as a Pickle File

In [15]:
import pickle

# Serialize the token lists with pickle. Despite the .txt extension the file
# is binary, which is why it is opened in 'wb' mode.
with open('Datasets/texts_tokenized.txt', 'wb') as pickle_file:
    pickle.dump(texts_tokenized, pickle_file)

## Export Dataframe into CSV

In [16]:
# Write the train and test subsets to separate CSV files.
# NOTE(review): to_csv is called with its defaults, so the row index is presumably
# written as an extra unnamed first column — confirm that downstream readers expect it.
df_train.to_csv('Datasets/imdb_master_train.csv')
df_test.to_csv('Datasets/imdb_master_test.csv')