In [1]:
# Generic Libraries
import os.path
import os
import re
import pandas as pd
import numpy as np
from pprint import pprint

# Text Processing
from bs4 import BeautifulSoup  # HTML to text
from nltk.corpus import stopwords  # String cleaning
import nltk.data  # To load sentence tokenizer

# Metrics
from sklearn.metrics import roc_auc_score

# joblib is used to store/load passage tokenizers and/or models
from sklearn.externals import joblib

# Keras
from keras.models import Sequential
from keras.layers import Dense, GRU, Dropout, Convolution1D, MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras import backend
from keras.models import load_model

# Folder where models and tokenizers are stored
HOME_DIR = '/Users/ram/Desktop/BagPopcorn'
os.chdir(HOME_DIR)

# Random Seed
SEED = 42
np.random.seed(SEED)

# Model and Tokenizer
KERAS_NN_MODEL = "keras_nn_model.h5"
KERAS_TOKENIZER = "keras_tokenizer.pkl"
MAX_REVIEW_LENGTH_FOR_KERAS_RNN = 500

# Text Preprocessing
def review_to_words( raw_review ):
    # Function to convert a raw review to a string of words
    # The input is a single string (a raw movie review), and 
    # the output is a single string (a preprocessed movie review)
    #
    # 1. Remove HTML
    # print ("Length of the Review-",len(raw_review))
    review_text = BeautifulSoup(raw_review,"html.parser").get_text() 
    #
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", review_text) 
    #
    # 3. Tokenization : Convert to lower case, split into individual words
    words = letters_only.lower().split()                             
    #
    # 4. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))                  
    # 
    # 5. Remove stop words
    words_M = [w for w in words if not w in stops]   
    #
    # 6. WordNetLemmatizer should better be used with POS tagging
    wl = nltk.WordNetLemmatizer()
    words_L = [wl.lemmatize(word) for word in words_M]
    #
    # 7. PorterStemmer(), LancasterStemmer(), SnowballStemmer()
    # stemming can often create non-existent words, whereas lemmas are actual words.
    ps = nltk.PorterStemmer()
    words_S = [ps.stem(word) for word in words_L]
    # 8. Join the words back into one string separated by space, 
    # and return the result.
    return( " ".join( words_M ))     


# Train and save Keras tokenizer and Keras CNN model
def keras_tokenizer_cnn_model(x_train, y_train, x_test, use_cnn=True):
    
    #---------------------------------- Tokenization ----------------------------------#
    # 1. The Tokenizer class in Keras has various methods which help to prepare text so it can be used in neural network models.
    # 2. The top-n words num_words will not truncate the words found in the input but it will truncate the usage. 
    # 3. Assigns each word in the reviews an ID corresponding to its frequency rank.
    # (Top 5000 most frequent words - This is the size of the vocabulary in the text data.)
    
    num_most_freq_words_to_include = 5000
    tokenizer = Tokenizer(num_words=num_most_freq_words_to_include)

    # Convert unicode strings into ascii to avoid tokenization errors
    train_reviews_list = [s.encode('ascii') for s in x_train.tolist()]
    
    # The Tokenizer stores everything in the word_index during fit_on_texts. 
    # Then, when calling the texts_to_sequences method, only the top num_words are considered.
    tokenizer.fit_on_texts(all_reviews_list)
    #pprint(tokenizer.word_index)
    print('Found {} unique tokens.'.format(len(tokenizer.word_index)))
    
    # The train_reviews_list is converted into a list of numbers which are the high frequency ranked 
    # from tokenizer.word_index. Ex: HOW ARE U => [10 11 98] where 10,11 and 98 are from tokenizer.word_index
    train_reviews_tokenized = tokenizer.texts_to_sequences(train_reviews_list)
    
    #print type(train_reviews_tokenized)
    #print type(tokenizer.word_counts)
    # Number of times a particular word appeared in the sentence
    #pprint(tokenizer.word_counts)

    # Truncate and pad input sequences, so that we only cover up to the first 500 tokens per review
    # This ensures all reviews have a representation of the same size, which is needed for 
    # the Keras NN to process them.
    x_train = sequence.pad_sequences(train_reviews_tokenized, maxlen=MAX_REVIEW_LENGTH_FOR_KERAS_RNN)
    
    #---------------------------------- CNN ----------------------------------#
   
    # Indicate it's a sequential type of model - linear stack of layers
    model = Sequential()

    # Embedding dropout value
    initial_dropout = 0.2  

    # Embedding layer
    # Keras offers an Embedding layer that can be used for neural networks on text data.It requires that
    # the input data be integer encoded, so that each word is represented by a unique integer. This data 
    # preparation step can be performed using the Tokenizer API also provided with Keras. The Embedding layer 
    # is initialized with random weights and will learn an embedding for all of the words in the training dataset.
    
    # This is the size of the vector space in which words will be embedded. 
    # It defines the size of the output vectors from this layer for each word. 
    # For example, it could be 32 or 100 or even larger. 
    embedding_vector_length = 32  
    model.add(Embedding(num_most_freq_words_to_include, 
                        embedding_vector_length,
                        input_length=MAX_REVIEW_LENGTH_FOR_KERAS_RNN, 
                        dropout=initial_dropout))   
    model.add(Dropout(0.5))

    # CNN and MaxPool layer
    if use_cnn:
        model.add(Convolution1D(nb_filter=32, filter_length=3, border_mode='same', activation='relu'))
        model.add(MaxPooling1D(pool_size=2))

    # GRU layer
    model.add(GRU(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dropout(0.2))

    # Dense layer to get final probability prediction
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    #---------------------------------- CNN ----------------------------------#
    
    print ("Model Summary")
    print(model.summary())

    print ("Model Fit")
    model.fit(x_train, y_train, epochs=3, batch_size=64)  # , validation_split=0.2)

    print ("Saving Model")
    model.save(KERAS_NN_MODEL)
    _ = joblib.dump(tokenizer, KERAS_TOKENIZER, compress=9)

def get_train_test_data(review_to_words):
    
    df = pd.read_csv('labeledTrainData.tsv', header=0, quotechar='"', sep='\t')
    #df = df.head(50)
    
    # Shuffle data frame rows
    df = df.iloc[np.random.permutation(len(df))]

    x = df["review"].map(review_to_words)
    y = df["sentiment"]
    
    # Split 80/20
    test_start_index = int(df.shape[0] * .8)
    x_train = x[0:test_start_index]
    y_train = y[0:test_start_index]
    x_test = x[test_start_index:]
    y_test = y[test_start_index:]

    return x_train, y_train, x_test, y_test

# Keras NN -> Embeddings Layer + GRU Layer + Dense Layer only
if __name__ == "__main__":
     
    print ("Preprocessing and Splitting data")
    x_train, y_train, x_test, y_test = get_train_test_data(review_to_words)
    
    print ("Building Tokenizer and NN Model, if they dont exist")
    if not os.path.isfile(KERAS_NN_MODEL) or not os.path.isfile(KERAS_TOKENIZER):
        print("Building Tokenizer and NN Model")
        keras_tokenizer_cnn_model(x_train, y_train, x_test)
    else:
        print("Loading existing Keras NN Model")
    
    print ("Loading Classifier and Tokenizer")    
    classifier = load_model(KERAS_NN_MODEL)
    tokenizer  = joblib.load(KERAS_TOKENIZER)

    print ("Converting reviews to ascii, tokenize and pad reviews ")
    test_reviews_list = [s.encode('ascii') for s in x_test.tolist()]
    test_reviews_tokenized = tokenizer.texts_to_sequences(test_reviews_list)
    test_reviews_tokenized_padded = sequence.pad_sequences(test_reviews_tokenized,maxlen=MAX_REVIEW_LENGTH_FOR_KERAS_RNN)
    
    print("Making predictions...")
    y_predict = classifier.predict(test_reviews_tokenized_padded)
    
    print("Printing AUC Scores...")
    print("AUC: {}".format(roc_auc_score(y_test, y_predict, average='macro')))

Using TensorFlow backend.


Preprocessing and Splitting data
Building Tokenizer and NN Model, if they dont exist
Building Tokenizer and NN Model
Found 74065 unique tokens.




Model Summary
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
dropout_1 (Dropout)          (None, 500, 32)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 500, 32)           3104      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 250, 32)           0         
_________________________________________________________________
gru_1 (GRU)                  (None, 100)               39900     
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101    