In [13]:
# Importing the relevant libraries
import numpy as np
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import string
import re
import pandas as pd
import collections
import os
import time


In [2]:
# Helpers and entry point for loading the IMDB / Yelp sentiment datasets.

# Matches any single ASCII punctuation character so it can be stripped from a review.
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))


def _tokenize(text):
    '''Strip punctuation from `text`, lowercase it and split it on whitespace.'''
    return _PUNCTUATION_RE.sub('', text).lower().split()


def _read_reviews(directory):
    '''Read every file in `directory` (UTF-8) and return one token list per file.'''
    reviews = []
    # The order is whatever os.listdir yields; it is deliberately left unsorted so
    # the document order seen by downstream code is the same as before.
    for filename in os.listdir(directory):
        # `with` closes the handle even if reading fails; the previous
        # open(...).read() pattern left every file handle to the garbage collector.
        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as handle:
            reviews.append(_tokenize(handle.read()))
    return reviews


def _read_yelp_csv(csv_path):
    '''Load one Yelp CSV and return (token lists, 0/1 labels), skipping 3-star rows.'''
    frame = pd.read_csv(csv_path)
    # 3-star reviews are treated as neutral and excluded.
    frame = frame[frame['stars'] != 3]
    texts = [_tokenize(text) for text in frame['text']]
    # With the 3-star rows gone, <= 3 means 1-2 stars (negative); 4-5 stars are positive.
    labels = [0 if stars <= 3 else 1 for stars in frame['stars']]
    return texts, labels


def load_data(path_to_dir, dataset):

    '''
    Load a sentiment dataset and return tokenized reviews with binary labels.

    The loading procedure is different for the two datasets, therefore the
    `dataset` parameter specifies which one to load.

    Parameters
    ----------
    path_to_dir : str
        Directory holding the dataset. For "imdb" it must contain
        train/pos, train/neg, test/pos and test/neg sub-directories with one
        review per file. For "yelp" it must contain Yelp_train_data.csv and
        Yelp_test_data.csv, each with 'stars' and 'text' columns.
        A trailing path separator is optional.
    dataset : str
        Either "imdb" or "yelp".

    Returns
    -------
    (X_train, Y_train, X_test, Y_test)
        X_* are lists of token lists (punctuation removed, lowercased,
        whitespace-split); Y_* are lists of ints, 1 = positive, 0 = negative.

    Raises
    ------
    ValueError
        If `dataset` is not one of the supported names (previously the
        function silently returned None).
    '''

    if dataset == "imdb":

        # Read order matches the original: train/pos, train/neg, test/pos, test/neg.
        train_pos = _read_reviews(os.path.join(path_to_dir, "train", "pos"))
        train_neg = _read_reviews(os.path.join(path_to_dir, "train", "neg"))
        test_pos = _read_reviews(os.path.join(path_to_dir, "test", "pos"))
        test_neg = _read_reviews(os.path.join(path_to_dir, "test", "neg"))

        # Positive reviews come first in each split, so the labels are built to match.
        X_train = train_pos + train_neg
        X_test = test_pos + test_neg

        Y_train = [1]*len(train_pos) + [0]*len(train_neg)
        Y_test = [1]*len(test_pos) + [0]*len(test_neg)

        return X_train, Y_train, X_test, Y_test

    if dataset == "yelp":

        # os.path.join makes the trailing separator on path_to_dir optional;
        # plain string concatenation produced a wrong file name without it.
        X_train, Y_train = _read_yelp_csv(os.path.join(path_to_dir, "Yelp_train_data.csv"))
        X_test, Y_test = _read_yelp_csv(os.path.join(path_to_dir, "Yelp_test_data.csv"))

        return X_train, Y_train, X_test, Y_test

    raise ValueError('Unknown dataset %r: expected "imdb" or "yelp"' % (dataset,))


In [4]:
# Load the IMDB reviews. X_* are lists of token lists and Y_* are lists of
# 0/1 labels (1 = positive review). The path is relative to the notebook's
# working directory.
X_train, Y_train, X_test, Y_test = load_data('../../../IMDB Stanford/', "imdb")

In [5]:
def stopwords(corpus, proportion = 0.5):

    ''' Identify stopwords specific to the dataset in question.

    A word counts as a stopword when it appears in at least
    `proportion * len(corpus)` of the documents. Each document contributes
    at most once per word, however often the word repeats inside it.

    corpus     : iterable of documents, each document a list of words
    proportion : minimum fraction of documents a word must appear in
    returns    : set of stopwords
    '''
    document_frequency = collections.Counter()
    for document in corpus:
        # set() collapses repeats so the count is per-document, not per-occurrence
        document_frequency.update(set(document))

    threshold = proportion * len(corpus)

    return {term for term, n_docs in document_frequency.items() if n_docs >= threshold}


In [6]:
# Dataset-specific stopwords: words that occur in at least 40% of the training
# reviews. Despite the name, `stopwords_list` is a set (stopwords() returns one).
stopwords_list = stopwords(X_train, proportion=0.4)

In [7]:
# Drop the dataset-specific stopwords from every training review.
# NOTE(review): only X_train is filtered; X_test is left as loaded. This is
# presumably harmless because the tokenizer is fitted on the filtered X_train
# and, without an oov_token, should skip words it has never seen - confirm.

X_train = [
    [token for token in review if token not in stopwords_list]
    for review in X_train
]

In [8]:
# Parameters for tokenization and the word embeddings.
# NOTE(review): in the cells visible here only Max_Sequence_length is used (for
# padding); Max_Words and Embedding_dimension are not referenced - presumably
# they mirror the training notebook's settings; confirm.

Max_Words = 20000 # Maximum number of words in the vocabulary
Max_Sequence_length = 1000 # Maximum number of tokens per review (padding/truncation length)
Embedding_dimension = 100 # Dimensionality of the GloVe word vectors


In [14]:
# Map every word to an integer id, fitting the vocabulary on the training reviews only.
# NOTE(review): Tokenizer() is created without num_words, so Max_Words is not
# applied here - confirm this matches how CNN_model.h5 was trained.

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

# Encode both splits with the same fitted tokenizer (train first, then test).
data_sequence_train, data_sequence_test = map(
    tokenizer.texts_to_sequences, (X_train, X_test)
)

In [15]:
# Bring every encoded review to exactly Max_Sequence_length entries
# (train first, then test), using the same settings for both splits.

Train_data, Test_data = (
    pad_sequences(encoded_reviews, maxlen = Max_Sequence_length)
    for encoded_reviews in (data_sequence_train, data_sequence_test)
)

# Labels as numpy arrays, in the same order as the rows above
Train_labels, Test_labels = np.array(Y_train), np.array(Y_test)

In [16]:
# Loading the trained model and measuring accuracy on test data.
# 'CNN_model.h5' is resolved relative to the current working directory.
CNN_model = load_model('CNN_model.h5')

# Evaluating accuracy on Test data.
# Entry 1 of both evaluate()'s result and metrics_names is reported; in the
# recorded run that entry is named 'acc' (presumably entry 0 is the loss - confirm).
Test_score = CNN_model.evaluate(Test_data, Test_labels, verbose=1)
# The score is scaled by 100 and printed as a percentage with two decimals.
print("%s: %.2f%%" % (CNN_model.metrics_names[1], Test_score[1]*100))


acc: 81.86%
