## Import Modules

In [1]:
import time
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from gensim.models import Phrases
from collections import defaultdict, Counter, OrderedDict

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from gensim.models import Word2Vec

## Load Dataset

In [2]:
# Read train-split data
df_train = pd.read_csv('Datasets/sst_train.txt', sep='\t', header=None, names=['label', 'review'], encoding="latin-1")
df_train['label'] = df_train['label'].str.replace('__label__', '')
df_train['label'] = df_train['label'].astype(int).astype('category')
df_train['type'] = "train"
df_train.head()

Unnamed: 0,label,review,type
0,4,The Rock is destined to be the 21st Century 's...,train
1,5,The gorgeously elaborate continuation of `` Th...,train
2,4,Singer/composer Bryan Adams contributes a slew...,train
3,3,You 'd think by now America would have had eno...,train
4,4,Yet the act is still charming here .,train


In [3]:
# Read validation-split data
df_val = pd.read_csv('Datasets/sst_dev.txt', sep='\t', header=None, names=['label', 'review'], encoding="latin-1")
df_val['label'] = df_val['label'].str.replace('__label__', '')
df_val['label'] = df_val['label'].astype(int).astype('category')
df_val['type'] = "train"
df_val.head()

Unnamed: 0,label,review,type
0,4,It 's a lovely film with lovely performances b...,train
1,3,"No one goes unindicted here , which is probabl...",train
2,4,And if you 're not nearly moved to tears by a ...,train
3,5,"A warm , funny , engaging film .",train
4,5,Uses sharp humor and insight into human nature...,train


In [4]:
# Read test-split data
df_test = pd.read_csv('Datasets/sst_dev.txt', sep='\t', header=None, names=['label', 'review'], encoding="latin-1")
df_test['label'] = df_test['label'].str.replace('__label__', '')
df_test['label'] = df_test['label'].astype(int).astype('category')
df_test['type'] = "test"
df_test.head()

Unnamed: 0,label,review,type
0,4,It 's a lovely film with lovely performances b...,test
1,3,"No one goes unindicted here , which is probabl...",test
2,4,And if you 're not nearly moved to tears by a ...,test
3,5,"A warm , funny , engaging film .",test
4,5,Uses sharp humor and insight into human nature...,test


In [5]:
df = df_train.append([df_val, df_test], ignore_index=True)
df.head()

Unnamed: 0,label,review,type
0,4,The Rock is destined to be the 21st Century 's...,train
1,5,The gorgeously elaborate continuation of `` Th...,train
2,4,Singer/composer Bryan Adams contributes a slew...,train
3,3,You 'd think by now America would have had eno...,train
4,4,Yet the act is still charming here .,train


## Data Preprocessing

In [7]:
# Function for converting a list of sentences to a list of lists containing tokenized words
def docs_preprocessor(inputDocs):
    docs = inputDocs.copy()
    tokenizer = RegexpTokenizer(r'\w+') # Tokenize the words.
    
    for idx in range(len(docs)):
        docs[idx] = docs[idx].lower()  # Convert to lowercase.
        docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

    # Remove numbers, but not words that contain numbers.
    #docs = [[token for token in doc if not token.isdigit()] for doc in docs]
    
    # Remove words that are only one character.
    docs = [[token for token in doc if len(token) > 1] for doc in docs]
    
    # Lemmatize all words in documents.
    lemmatizer = WordNetLemmatizer()
    docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]
  
    return docs

In [8]:
%%time
# Convert a list of sentences to a list of lists containing tokenized words
texts_tokenized = docs_preprocessor(df["review"])

Wall time: 3.92 s


## Compute Bigrams/Trigrams

In [9]:
%%time

'''
Add bigrams to docs (only ones that appear 10 times or more).
'''
bigram = Phrases(texts_tokenized, min_count=10, threshold=0.5, scoring='npmi')
#trigram = Phrases(bigram[docs])

for idx in range(len(texts_tokenized)):
    for token in bigram[texts_tokenized[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            texts_tokenized[idx].append(token)

Wall time: 1.74 s


## Vocabulary Word Count

In [10]:
texts_tokenized_counts = defaultdict(int)

for row in texts_tokenized:
    for word in row:
            texts_tokenized_counts[word] += 1
vocabulary = list(texts_tokenized_counts.keys())

print("\nVocabulary size: ", len(vocabulary))


Vocabulary size:  14801


## Phrases Count

We can investigate the phrases (by uncommenting the print statement below) and, if needed, increase/decrease the number of phrases by varying the "min_count" and "threshold" parameters of the Phrases object (above).

In [11]:
# Find phrases that are joined by an underscore (_)
numOfPhrases = 0
for i in range(len(vocabulary)):
    if(vocabulary[i].find("_") > -1):
        numOfPhrases += 1
        #print(vocabulary[i])
    
print("\nTotal Number of Phrases: ", numOfPhrases)


Total Number of Phrases:  131


## Total Tokens Count
The Keras tokenizer needs us to set the number of high frequency tokens/words. These top k tokens will be used to define the length of the feature vectors.

Thus, we need to understand how many words have high frequency. Analyzing the frequency of the tokens we can decide the threshold for the high frequency tokens.

In [12]:
#Create a list of ALL tokens
tokens = []
for text in texts_tokenized:
    for token in text:
        tokens.append(token)
        
print("Total number of tokens (including repetition): ", len(tokens))


# Sort tokens from high to low frequency
token_frequency = Counter(tokens)
token_frequency_ordered = OrderedDict(sorted(token_frequency.items(), key=lambda t: t[1], reverse = True))


'''
We may view the top frequent tokens (by uncommenting the print statement below).
By varying the "top_k_tokens" we can see the top k tokens.
This will help to determine the threshold for top k frequent tokens to create the vectors.
'''
numOfTokens = 0
top_k_tokens = 10000
for k, v in token_frequency_ordered.items(): 
    if k in token_frequency_ordered.keys():
        numOfTokens += 1
        #print(k, v)
        if(numOfTokens == top_k_tokens):
            break# 

Total number of tokens (including repetition):  177092


## Remove underscores from Phrases

The bigrams are created by adding an underscore between two words.

We remove the underscore from all words. Otherwise later the Keras tokenizer will split the phrases.

In [13]:
for i in range(len(texts_tokenized)):
    for j in range(len(texts_tokenized[i])):
        texts_tokenized[i][j] = texts_tokenized[i][j].replace("_", "")

## Create text corpus by adding Phrases

Combine the words, including the phrases to create a full text corpus

In [14]:
texts_processed = []

for i in range(len(texts_tokenized)):
    text = " ".join(texts_tokenized[i])
    texts_processed.append(text)

In [15]:
# View first two reviews in the text corpus
texts_processed[:2]

['the rock is destined to be the 21st century new conan and that he going to make splash even greater than arnold schwarzenegger jean claud van damme or steven segal',
 'the gorgeously elaborate continuation of the lord of the ring trilogy is so huge that column of word can not adequately describe co writer director peter jackson expanded vision of tolkien middle earth cowriter']

## Add processed Reviews from Corpus into Dataframe

In [16]:
df['Processed_Reviews'] = texts_processed
df.head()

Unnamed: 0,label,review,type,Processed_Reviews
0,4,The Rock is destined to be the 21st Century 's...,train,the rock is destined to be the 21st century ne...
1,5,The gorgeously elaborate continuation of `` Th...,train,the gorgeously elaborate continuation of the l...
2,4,Singer/composer Bryan Adams contributes a slew...,train,singer composer bryan adam contributes slew of...
3,3,You 'd think by now America would have had eno...,train,you think by now america would have had enough...
4,4,Yet the act is still charming here .,train,yet the act is still charming here


## Create Train and Test set

In [17]:
df_train = df.loc[df['type'] == 'train']
print("Dimension of the train data: ", df_train.shape)

df_test = df.loc[df['type'] == 'test']
print("Dimension of the test data: ", df_test.shape)

Dimension of the train data:  (9645, 4)
Dimension of the test data:  (1101, 4)


## Export Tokenized text into CSV

In [18]:
import pickle
with open('Datasets/SST5_texts_tokenized.txt', 'wb') as fp:
    pickle.dump(texts_tokenized, fp)

## Export Dataframe into CSV

In [19]:
df_train.to_csv('Datasets/SST5_master_train.csv')
df_test.to_csv('Datasets/SST5_master_test.csv')