## Import Modules

In [3]:
import os
import pickle
import time
from collections import defaultdict, Counter, OrderedDict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from gensim.models import Phrases
from gensim.models import Word2Vec

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

## Load Dataset

In [4]:
# Training split: tab-separated, no header row, columns named (label, review).
df = pd.read_csv(
    'Datasets/sst_train.txt',
    sep='\t',
    header=None,
    names=['label', 'review'],
    encoding="latin-1",
)
# Strip the '__label__' prefix, then store the numeric label as a categorical.
df['label'] = (
    df['label']
    .str.replace('__label__', '')
    .astype(int)
    .astype('category')
)
df.head()

Unnamed: 0,label,review
0,4,The Rock is destined to be the 21st Century 's...
1,5,The gorgeously elaborate continuation of `` Th...
2,4,Singer/composer Bryan Adams contributes a slew...
3,3,You 'd think by now America would have had eno...
4,4,Yet the act is still charming here .


In [5]:
# Validation (dev) split: tab-separated, no header row, columns named (label, review).
df_val = pd.read_csv(
    'Datasets/sst_dev.txt',
    sep='\t',
    header=None,
    names=['label', 'review'],
    encoding="latin-1",
)
# Strip the '__label__' prefix, then store the numeric label as a categorical.
df_val['label'] = (
    df_val['label']
    .str.replace('__label__', '')
    .astype(int)
    .astype('category')
)
df_val.head()

Unnamed: 0,label,review
0,4,It 's a lovely film with lovely performances b...
1,3,"No one goes unindicted here , which is probabl..."
2,4,And if you 're not nearly moved to tears by a ...
3,5,"A warm , funny , engaging film ."
4,5,Uses sharp humor and insight into human nature...


In [6]:
# Test split: tab-separated, no header row, columns named (label, review).
df_test = pd.read_csv(
    'Datasets/sst_test.txt',
    sep='\t',
    header=None,
    names=['label', 'review'],
    encoding="latin-1",
)
# Strip the '__label__' prefix, then store the numeric label as a categorical.
df_test['label'] = (
    df_test['label']
    .str.replace('__label__', '')
    .astype(int)
    .astype('category')
)
df_test.head()

Unnamed: 0,label,review
0,3,Effective but too-tepid biopic
1,4,If you sometimes like to go to the movies to h...
2,5,"Emerges as something rare , an issue movie tha..."
3,3,The film provides some great insight into the ...
4,5,Offers that rare combination of entertainment ...


## Data Preprocessing

In [7]:
# Function for converting a list of sentences to a list of lists containing tokenized words
def docs_preprocessor(inputDocs):
    """Tokenize, filter and lemmatize a collection of raw text documents.

    Each document is lower-cased, split into ``\\w+`` tokens, stripped of
    one-character tokens and lemmatized with WordNet. Purely numeric tokens
    are kept.

    Parameters
    ----------
    inputDocs : iterable of str
        Raw documents, e.g. a list or a pandas Series of strings. The input
        is only read, never modified.

    Returns
    -------
    list of list of str
        One token list per input document, in input order.
    """
    tokenizer = RegexpTokenizer(r'\w+')  # Tokenize the words.
    lemmatizer = WordNetLemmatizer()

    docs = []
    # Iterate over the values themselves instead of indexing with
    # range(len(...)): positional integers are treated as *labels* by a
    # pandas Series, which raises KeyError whenever the Series does not have
    # a default 0..n-1 index (for example after filtering rows).
    for doc in inputDocs:
        tokens = tokenizer.tokenize(doc.lower())  # Lowercase, then split into words.
        docs.append([
            lemmatizer.lemmatize(token)
            for token in tokens
            if len(token) > 1  # Drop words that are only one character.
        ])

    return docs

In [8]:
%%time
# Run the same preprocessing over the review column of the train, validation
# and test frames; each result is a list of token lists.
texts_tokenized, texts_tokenized_val, texts_tokenized_test = (
    docs_preprocessor(frame["review"]) for frame in (df, df_val, df_test)
)

Wall time: 4.71 s


## Compute Bigrams/Trigrams

In [None]:
%%time

# Learn bigram phrases from the training documents: only pairs that appear
# 10 times or more are considered, scored with NPMI against a 0.5 threshold.
bigram = Phrases(texts_tokenized, min_count=10, threshold=0.5, scoring='npmi')

# Append every detected bigram (a token containing '_') to the end of its
# document; the original single-word tokens stay where they are.
# Trigram extraction is currently disabled (sketch: Phrases(bigram[docs])).
for doc in texts_tokenized:
    doc.extend([token for token in bigram[doc] if '_' in token])

## Vocabulary Word Count

In [None]:
# Occurrence count for every token across all training documents, kept as a
# defaultdict(int) keyed in order of first appearance.
texts_tokenized_counts = defaultdict(
    int,
    Counter(token for doc in texts_tokenized for token in doc),
)

# The distinct tokens form the vocabulary.
vocabulary = list(texts_tokenized_counts)

print("\nVocabulary size: ", len(vocabulary))

## Phrases Count

We can investigate the phrases (by uncommenting the print statement below) and, if needed, increase/decrease the number of phrases by varying the "min_count" and "threshold" parameters of the Phrases object (above).

In [None]:
# Vocabulary entries containing an underscore (_) are counted as phrases
phrase_tokens = [word for word in vocabulary if "_" in word]
numOfPhrases = len(phrase_tokens)
#print(phrase_tokens)

print("\nTotal Number of Phrases: ", numOfPhrases)

## Total Tokens Count
The Keras tokenizer needs us to set the number of high frequency tokens/words. These top k tokens will be used to define the length of the feature vectors.

Thus, we need to understand how many words have high frequency. Analyzing the frequency of the tokens we can decide the threshold for the high frequency tokens.

In [None]:
# Flatten the tokenized documents into one list holding every token occurrence
tokens = [token for text in texts_tokenized for token in text]

print("Total number of tokens (including repetition): ", len(tokens))


# Token -> count mapping, ordered from the most to the least frequent token
token_frequency = Counter(tokens)
token_frequency_ordered = OrderedDict(token_frequency.most_common())


# We may view the top frequent tokens (by uncommenting the print statement below).
# By varying "top_k_tokens" we can see the top k tokens.
# This will help to determine the threshold for top k frequent tokens to create the vectors.
numOfTokens = 0
top_k_tokens = 10000
for k, v in token_frequency_ordered.items():
    numOfTokens += 1
    #print(k, v)
    if numOfTokens == top_k_tokens:
        break

## Remove underscores from Phrases

The bigrams are created by adding an underscore between two words.

We remove the underscore from all words; otherwise, the Keras tokenizer would later split the phrases at the underscore.

In [None]:
# Strip every underscore from every token, updating each document's list in place.
for doc in texts_tokenized:
    doc[:] = [token.replace("_", "") for token in doc]

## Create text corpus by adding Phrases

Combine the words, including the phrases, to create a full text corpus.

In [None]:
# One space-joined string per tokenized document, in the original document order.
texts_processed = [" ".join(doc) for doc in texts_tokenized]

In [None]:
# Preview the first two entries of the text corpus (each is one review's tokens joined by spaces)
texts_processed[:2]

## Add processed Reviews from Corpus into Dataframe

In [None]:
# Attach the processed text as a 'Processed_Reviews' column alongside the raw reviews.
df = df.assign(Processed_Reviews=texts_processed)
df.head()

## Create Train and Test set

In [None]:
# The data was loaded from separate train/dev/test files, so `df` has no
# 'type' column to filter on (df['type'] raises KeyError). `df` already is the
# train split and `df_test` already holds the rows read from sst_test.txt, so
# it must not be overwritten with a slice of the training frame.
# NOTE(review): only the train frame carries 'Processed_Reviews'; the test
# frame is still the raw load - confirm whether it should be processed too.
print("Dimension of the data: ", df_test.shape)

# Independent copy, so later edits to df_train do not leak back into df.
df_train = df.copy()
print("Dimension of the train data: ", df_train.shape)

## Export Tokenized text into a Pickle file

In [None]:
# Serialize the tokenized training documents to disk with pickle.
# `pickle` is imported in the notebook's import cell at the top.
# NOTE: despite the .txt extension the file is a binary pickle, hence mode 'wb'.
with open('Datasets/texts_tokenized.txt', 'wb') as fp:
    pickle.dump(texts_tokenized, fp)

## Export Dataframe into CSV

In [None]:
# Write the train and test frames to their CSV files (pandas default options).
for frame, csv_path in (
    (df_train, 'Datasets/imdb_master_train.csv'),
    (df_test, 'Datasets/imdb_master_test.csv'),
):
    frame.to_csv(csv_path)