In [1]:
import os
import io
import re
import matplotlib.pyplot as plt
import gensim
from six.moves import cPickle as pickle
import numpy as np
import scipy.stats as stats
import pandas as pd
from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
truthful_pos = 'op_spam_v1.4/positive_polarity/truthful_from_TripAdvisor/'
truthful_neg = 'op_spam_v1.4/negative_polarity/truthful_from_Web/'

deceptive_pos = 'op_spam_v1.4/positive_polarity/deceptive_from_MTurk/'
deceptive_neg = 'op_spam_v1.4/negative_polarity/deceptive_from_MTurk/'

truthful_reviews_link = []
for fold in os.listdir(truthful_pos):
    foldLink = os.path.join(truthful_pos, fold)
    if os.path.isdir(foldLink):
        for f in os.listdir(foldLink):
            fileLink = os.path.join(foldLink, f)
            truthful_reviews_link.append(fileLink)

for fold in os.listdir(truthful_neg):
    foldLink = os.path.join(truthful_neg, fold)
    if os.path.isdir(foldLink):
        for f in os.listdir(foldLink):
            fileLink = os.path.join(foldLink, f)
            truthful_reviews_link.append(fileLink)

deceptive_reviews_link = []

for fold in os.listdir(deceptive_pos):
    foldLink = os.path.join(deceptive_pos, fold)
    if os.path.isdir(foldLink):
        for f in os.listdir(foldLink):
            fileLink = os.path.join(foldLink, f)
            deceptive_reviews_link.append(fileLink)

for fold in os.listdir(deceptive_neg):
    foldLink = os.path.join(deceptive_neg, fold)
    if os.path.isdir(foldLink):
        for f in os.listdir(foldLink):
            fileLink = os.path.join(foldLink, f)
            deceptive_reviews_link.append(fileLink)
        
print('Number of truthfuls reviews ', len(truthful_reviews_link))
print('Number of deceptives reviews ', len(deceptive_reviews_link))


Number of truthfuls reviews  800
Number of deceptives reviews  800


In [3]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

def handleFile(filePath):
    with open(filePath, "r") as f:
        lines=f.readlines()
        file_voc = []
        file_numWords = 0
        for line in lines:
            cleanedLine = clean_str(line)
            cleanedLine = cleanedLine.strip()
            cleanedLine = cleanedLine.lower()
            words = cleanedLine.split(' ')
            file_numWords = file_numWords + len(words)
            file_voc.extend(words)
    return file_voc, file_numWords


allFilesLinks = truthful_reviews_link + deceptive_reviews_link
vocabulary = []
numWords = []
for fileLink in allFilesLinks:
    file_voc, file_numWords = handleFile(fileLink)
    vocabulary.extend(file_voc)
    numWords.append(file_numWords)

vocabulary = set(vocabulary)
vocabulary = list(vocabulary)

print('The total number of files is ', len(numWords))
print('The total number of words in the files is ', sum(numWords))
print('Vocabulary size is ', len(vocabulary))
print('The average number of words in the files is', sum(numWords)/len(numWords))

The total number of files is  1600
The total number of words in the files is  253157
Vocabulary size is  9687
The average number of words in the files is 158.223125


In [4]:
w2v_model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

In [5]:
MAX_SEQ_LENGTH = 200

def convertFileToArray(filePath):
    s = ""
    with open(filePath, "r") as f:
        lines=f.readlines()
        for line in lines:
            cleanedLine = clean_str(line)
            cleanedLine = cleanedLine.strip()
            cleanedLine = cleanedLine.lower()
            s += cleanedLine
    return s

totalFiles = len(truthful_reviews_link) + len(deceptive_reviews_link)
idsMatrix = np.ndarray(shape=(totalFiles, MAX_SEQ_LENGTH), dtype='int32')
#dataMatrix = np.ndarray(shape=(totalFiles,1),dtype='object')
dataMatrix = []
#labels = np.ndarray(shape=(totalFiles, 2), dtype='int32')
labelsMatrix = []
counter = 0

for filePath in truthful_reviews_link:
    dataMatrix.append(convertFileToArray(filePath))
    labelsMatrix.append(1)
  

for filePath in deceptive_reviews_link:
    dataMatrix.append(convertFileToArray(filePath))
    labelsMatrix.append(0)

dict_reviewLabels = {'review': dataMatrix,'labels': labelsMatrix}
df_reviewLabels = pd.DataFrame(dict_reviewLabels)
df_reviewLabels.head(2)



macronum=sorted(set(df_reviewLabels['labels']))
macro_to_id = dict((note, number) for number, note in enumerate(macronum))


def fun(i):
    return macro_to_id[i]

pd.set_option('mode.chained_assignment',None)
df_reviewLabels.iloc[:,0]=df_reviewLabels.iloc[:,0].apply(fun)
df_reviewLabels.head(2)



print(df_reviewLabels.shape)
# a list contains each review as a list 
balanced_texts = []
balanced_labels = []

for i in range(len(df_reviewLabels)):
    balanced_texts.append(df_reviewLabels.iloc[i,1])
    balanced_labels.append(df_reviewLabels.iloc[i,0])
 


tokenizer = Tokenizer(num_words=2000, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=" ")#20000
tokenizer.fit_on_texts(balanced_texts)
sequences = tokenizer.texts_to_sequences(balanced_texts)
x = pad_sequences(sequences, maxlen=200)#300
from keras.utils import to_categorical
labels = to_categorical(np.asarray(balanced_labels))
#y = df_reviewLabels['labels'].values
word_index = tokenizer.word_index

(1600, 2)


In [6]:
from keras.layers import Input, GlobalMaxPooling1D, Conv1D, Dropout, MaxPooling1D, Dense, Embedding, LSTM, Activation
from keras.models import Model, Sequential
from keras import optimizers
# Build embedding layers with weights initialized from each model
googlenews_w2v_size = 300
googlenews_w2v_matrix = np.zeros((len(word_index) + 1, googlenews_w2v_size))
for word,i in word_index.items():
    try:
        if word in w2v_model.vocab:
            googlenews_w2v_matrix[i] = w2v_model[word]
    except:
        pass

googlenews_w2v_emb = Embedding(len(word_index)+1,

                            googlenews_w2v_size,

                            weights=[googlenews_w2v_matrix],

                            input_length=200)


In [7]:
indices = np.arange(x.shape[0])
np.random.shuffle(indices)
x = x[indices]
labels = labels[indices]
nb_validation_samples = int(0.2 * x.shape[0])

x_train = x[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = x[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

In [10]:
from keras.layers import GRU

#GRU

model = Sequential()
model.add(googlenews_w2v_emb)
model.add(GRU(units=20,activation='tanh',recurrent_activation='hard_sigmoid'))
model.add(Dropout(0.25))
model.add(Dense(len(macronum)))
model.add(Activation('sigmoid'))

model.summary()
opt = optimizers.adam(lr=0.0008)
model.compile(loss='categorical_crossentropy',optimizer=opt,metrics=['accuracy'])
history = model.fit(x_train, y_train,batch_size=200,epochs=45,validation_split=0.4,shuffle=True)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 300)          2904900   
_________________________________________________________________
gru_1 (GRU)                  (None, 20)                19260     
_________________________________________________________________
dropout_2 (Dropout)          (None, 20)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 42        
_________________________________________________________________
activation_2 (Activation)    (None, 2)                 0         
Total params: 2,924,202
Trainable params: 2,924,202
Non-trainable params: 0
_________________________________________________________________
Train on 768 samples, validate on 512 samples
Epoch 1/45
Epoch 2/45
Epoch 3/45
Epoch 4/45
Epoch 5/45
Epoch 6/45
Epoch 7/45
Epoch 8/4

In [11]:
loss, accuracy = model.evaluate(x_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(x_val, y_val, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.8820
Testing Accuracy:  0.7031


In [14]:
from keras.layers import LSTM, Bidirectional
#Bi directional LSTM
model = Sequential()
model.add(googlenews_w2v_emb)
model.add(Bidirectional(LSTM(units=20,activation='tanh',recurrent_activation='hard_sigmoid')))
model.add(Dropout(0.25))
model.add(Dense(len(macronum)))
model.add(Activation('sigmoid'))
model.summary()
opt = optimizers.adam(lr=0.0008)
model.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])


model.fit(x_train, y_train, batch_size=200, nb_epoch=45,validation_split=0.4,shuffle=True)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 300)          2904900   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 40)                51360     
_________________________________________________________________
dropout_4 (Dropout)          (None, 40)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 82        
_________________________________________________________________
activation_4 (Activation)    (None, 2)                 0         
Total params: 2,956,342
Trainable params: 2,956,342
Non-trainable params: 0
_________________________________________________________________


  app.launch_new_instance()


Train on 768 samples, validate on 512 samples
Epoch 1/45
Epoch 2/45
Epoch 3/45
Epoch 4/45
Epoch 5/45
Epoch 6/45
Epoch 7/45
Epoch 8/45
Epoch 9/45
Epoch 10/45
Epoch 11/45
Epoch 12/45
Epoch 13/45
Epoch 14/45
Epoch 15/45
Epoch 16/45
Epoch 17/45
Epoch 18/45
Epoch 19/45
Epoch 20/45
Epoch 21/45
Epoch 22/45
Epoch 23/45
Epoch 24/45
Epoch 25/45
Epoch 26/45
Epoch 27/45
Epoch 28/45
Epoch 29/45
Epoch 30/45
Epoch 31/45
Epoch 32/45
Epoch 33/45
Epoch 34/45
Epoch 35/45
Epoch 36/45
Epoch 37/45
Epoch 38/45
Epoch 39/45
Epoch 40/45
Epoch 41/45
Epoch 42/45
Epoch 43/45
Epoch 44/45
Epoch 45/45


<keras.callbacks.History at 0x15245ed58128>

In [15]:
loss, accuracy = model.evaluate(x_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(x_val, y_val, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.8961
Testing Accuracy:  0.7156


In [16]:
model = Sequential()
model.add(googlenews_w2v_emb)
model.add(Dropout(0.25))
model.add(LSTM(units=20,activation='tanh',recurrent_activation='hard_sigmoid'))
model.add(Dropout(0.25))
model.add(Dense(len(macronum)))
model.add(Activation('sigmoid'))
model.summary()
opt = optimizers.adam(lr=0.0008)
model.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])


model.fit(x_train, y_train, batch_size=200, nb_epoch=45,validation_split=0.4,shuffle=True)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 300)          2904900   
_________________________________________________________________
dropout_5 (Dropout)          (None, 200, 300)          0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 20)                25680     
_________________________________________________________________
dropout_6 (Dropout)          (None, 20)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 42        
_________________________________________________________________
activation_5 (Activation)    (None, 2)                 0         
Total params: 2,930,622
Trainable params: 2,930,622
Non-trainable params: 0
_________________________________________________________________


  from ipykernel import kernelapp as app


Train on 768 samples, validate on 512 samples
Epoch 1/45
Epoch 2/45
Epoch 3/45
Epoch 4/45
Epoch 5/45
Epoch 6/45
Epoch 7/45
Epoch 8/45
Epoch 9/45
Epoch 10/45
Epoch 11/45
Epoch 12/45
Epoch 13/45
Epoch 14/45
Epoch 15/45
Epoch 16/45
Epoch 17/45
Epoch 18/45
Epoch 19/45
Epoch 20/45
Epoch 21/45
Epoch 22/45
Epoch 23/45
Epoch 24/45
Epoch 25/45
Epoch 26/45
Epoch 27/45
Epoch 28/45
Epoch 29/45
Epoch 30/45
Epoch 31/45
Epoch 32/45
Epoch 33/45
Epoch 34/45
Epoch 35/45
Epoch 36/45
Epoch 37/45
Epoch 38/45
Epoch 39/45
Epoch 40/45
Epoch 41/45
Epoch 42/45
Epoch 43/45
Epoch 44/45
Epoch 45/45


<keras.callbacks.History at 0x15245cdd84e0>

In [17]:
loss, accuracy = model.evaluate(x_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(x_val, y_val, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.8969
Testing Accuracy:  0.7250


In [22]:
#CNN + LSTM
from keras.layers import Convolution1D, MaxPooling1D
model = Sequential()
model.add(googlenews_w2v_emb)
model.add(Dropout(0.5))
model.add(Convolution1D(nb_filter=150,
                        filter_length=3,
                        border_mode='same',
                        activation='relu',
                        subsample_length=1))
model.add(MaxPooling1D(pool_length=2))
model.add(LSTM(units=20,activation='tanh',recurrent_activation='hard_sigmoid'))
model.add(Dropout(0.25))
model.add(Dense(len(macronum)))
model.add(Activation('sigmoid'))

model.summary()
opt = optimizers.adam(lr=0.0008)
model.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])


model.fit(x_train, y_train, batch_size=200, epochs=45,validation_split=0.4,shuffle=True)

  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 300)          2904900   
_________________________________________________________________
dropout_8 (Dropout)          (None, 200, 300)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 200, 150)          135150    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 100, 150)          0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 20)                13680     
_________________________________________________________________
dropout_9 (Dropout)          (None, 20)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 2)                 42        
__________

<keras.callbacks.History at 0x15245c0e9390>

In [23]:
loss, accuracy = model.evaluate(x_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(x_val, y_val, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.9102
Testing Accuracy:  0.7625


In [31]:
###########################################################
# CNN
# Based on "Convolutional Neural Networks for Sentence Classification" by Yoon Kim http://arxiv.org/pdf/1408.5882v2.pdf
# https://github.com/keon/keras-text-classification/blob/master/train.py
from keras.layers import Flatten
from keras.layers import concatenate
from keras.optimizers import SGD
filter_sizes = (3,4,5)
num_filters = 100
graph_in = Input(shape=(200, googlenews_w2v_size))
convs = []
for fsz in filter_sizes:
    conv = Convolution1D(nb_filter=100,
                         filter_length=fsz,
                         border_mode='valid',
                         activation='relu',
                         subsample_length=1)(graph_in)
    pool = MaxPooling1D(pool_length=2)(conv)
    flatten = Flatten()(pool)
    convs.append(flatten)

if len(filter_sizes) > 1:
    out = concatenate(convs)
    #out = Merge(mode='concat')(convs)
else:
    out = convs[0]

graph = Model(input=graph_in, output=out)
model = Sequential()
model.add(googlenews_w2v_emb)
model.add(Dropout(0.25, input_shape=(200, googlenews_w2v_size)))
model.add(graph)
model.add(Dense(64))
model.add(Dropout(0.5))
model.add(Activation('relu'))
model.add(Dense(len(macronum)))
model.add(Activation('sigmoid'))
opt = SGD(lr=0.01, momentum=0.80, decay=1e-6, nesterov=True)


model.summary()
opt = optimizers.adam(lr=0.0008)
model.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])


model.fit(x_train, y_train, batch_size=200, epochs=45,validation_split=0.4,shuffle=True)



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 300)          2904900   
_________________________________________________________________
dropout_14 (Dropout)         (None, 200, 300)          0         
_________________________________________________________________
model_3 (Model)              (None, 29500)             360300    
_________________________________________________________________
dense_11 (Dense)             (None, 64)                1888064   
_________________________________________________________________
dropout_15 (Dropout)         (None, 64)                0         
_________________________________________________________________
activation_11 (Activation)   (None, 64)                0         
_________________________________________________________________
dense_12 (Dense)             (None, 2)                 130       
__________

<keras.callbacks.History at 0x152446b3ada0>

In [32]:
loss, accuracy = model.evaluate(x_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(x_val, y_val, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.9164
Testing Accuracy:  0.7812
