In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd

In [2]:
# Store the filepaths of the data of all websites in a Dictionary
filepaths = {
             'amazon' : '../Datasets/labelled_sentences/amazon_cells_labelled.txt',
             'yelp' : '../Datasets/labelled_sentences/yelp_labelled.txt',
             'imdb' : '../Datasets/labelled_sentences/imdb_labelled.txt'
            }

# Used to store all the dataframes in a list
data_list = []
    
# Read the files into dataframes
for website, path in filepaths.items():
    website_data = pd.read_csv(path, names=['sentence', 'label'], sep='\t')
    
    # Adding a source column 
    website_data['source'] = website
    
    data_list.append(website_data)
    
# Concatenate all the dataframes
raw_data = pd.concat(data_list)

In [3]:
# Print the first 5 entries in dataset
raw_data.head()

Unnamed: 0,sentence,label,source
0,So there is no way for me to plug it in here i...,0,amazon
1,"Good case, Excellent value.",1,amazon
2,Great for the jawbone.,1,amazon
3,Tied to charger for conversations lasting more...,0,amazon
4,The mic is great.,1,amazon


In [4]:
# Print Unique values in Source
raw_data['source'].unique()

array(['amazon', 'yelp', 'imdb'], dtype=object)

## Amazon Reviews

In [5]:
# Extract the amazon reviews from the dataframe
data_amazon = raw_data[raw_data['source'] == 'amazon']

# Separate sentences and labels from amazon data
sentences = data_amazon['sentence'].values
y = data_amazon['label'].values

In [6]:
# Perform Stemming and remove stopwords
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

stop_words = stopwords.words('english')
stemmer = PorterStemmer()

mod_sentences = []

for sentence in sentences:
    sentence = [stemmer.stem(word) for word in sentence.split() if word not in stop_words]
    mod_sentences.append(' '.join(sentence))

In [7]:
# Print the sentence before and after stemming and stopword removal
print(sentences[2])
print(mod_sentences[2])

Great for the jawbone.
great jawbone.


In [8]:
# Tokenize the sentences
from keras.preprocessing.text import Tokenizer

# Create a tokenizer to get 5000 most used words
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(mod_sentences)

In [9]:
print(tokenizer.document_count)

1000


In [10]:
encoded_doc = tokenizer.texts_to_sequences(mod_sentences)

In [11]:
print(mod_sentences[2])
print(encoded_doc[2])

great jawbone.
[5, 746]


In [12]:
# Set the vocab size as 5000
vocab_size = 5000

In [13]:
from keras.preprocessing.sequence import pad_sequences
max_len = 30
encoded_doc = pad_sequences(encoded_doc, maxlen=max_len)

In [14]:
# Printing the padded sentence
print(encoded_doc[2, :])

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   5 746]


In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(encoded_doc, y, test_size=0.3, random_state=42)

In [16]:
print("Shape of X_train is: ", X_train.shape)
print("Shape of X_test is: ", X_test.shape)
print("Shape of y_train is: ", y_train.shape)
print("Shape of y_test is: ", y_test.shape)

Shape of X_train is:  (700, 30)
Shape of X_test is:  (300, 30)
Shape of y_train is:  (700,)
Shape of y_test is:  (300,)


In [17]:
# Create word embeddings
from nltk.corpus import brown
from gensim.models import Word2Vec
import multiprocessing

sents = brown.sents()
print(sents[2])

w2v = Word2Vec(sentences=sents,vector_size=300,window=5,min_count=5,negative=15,workers=multiprocessing.cpu_count())

word_vectors = w2v.wv



['The', 'September-October', 'term', 'jury', 'had', 'been', 'charged', 'by', 'Fulton', 'Superior', 'Court', 'Judge', 'Durwood', 'Pye', 'to', 'investigate', 'reports', 'of', 'possible', '``', 'irregularities', "''", 'in', 'the', 'hard-fought', 'primary', 'which', 'was', 'won', 'by', 'Mayor-nominate', 'Ivan', 'Allen', 'Jr.', '.']


In [18]:
result = word_vectors.similar_by_word('good')

print("Most similar words are:\n", result[:5])

Most similar words are:
 [('real', 0.8825404644012451), ('bad', 0.8674448132514954), ('quite', 0.8272847533226013), ('little', 0.7967464923858643), ('simply', 0.7869082689285278)]


In [19]:
# Creating embedding matrix
def create_embedding_matrix(word_vectors, word_index, embedding_dim):
    vocab_size = 5000
    
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    count = 0
    
    for word in word_vectors.key_to_index:
        if word in word_index:
            index = word_index[word]
            count += 1
            
            embedding_matrix[index] = np.array(word_vectors[word], dtype=np.float32)[:embedding_dim]
    
    return embedding_matrix, count

In [20]:
embedding_dim = 300
c = 0
embedding_matrix, c = create_embedding_matrix(word_vectors, tokenizer.word_index, embedding_dim)

In [21]:
embedding_matrix.shape

(5000, 300)

In [22]:
c

1000

## Model Creation

In [27]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Bidirectional, Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, 
                    weights=[embedding_matrix],
                    input_length=max_len,
                    trainable=True
                   )
         )

model.add(Bidirectional(LSTM(50), merge_mode='mul'))
model.add(Dropout(0.2))
model.add(Dense(10, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))



NotImplementedError: Cannot convert a symbolic Tensor (bidirectional/forward_lstm_1/strided_slice:0) to a numpy array. This error may indicate that you're trying to pass a Tensor to a NumPy call, which is not supported

In [None]:
model.compile(optimzer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
model.summary()

In [None]:
model.fit(X_train, y_train, epochs=10, verbose=True,
          validation_data=(X_test, y_test),
          batch_size=10)

loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy is: {:.5f}".format(accuracy))
loss, accuracy = model.evaulate(X_test, y_test, verbose=False)
print("Testing Accuracy is: {:.5f}".format(accuracy))