In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
# Get the Encoding format of the file so that it can be read correctly
import chardet

# Store the filepaths of the data of all websites in a Dictionary
filepaths = {
             'amazon' : '/content/drive/MyDrive/Datasets/labelled_sentences/amazon_cells_labelled.txt',
             'yelp' : '/content/drive/MyDrive/Datasets/labelled_sentences/yelp_labelled.txt',
             'imdb' : '/content/drive/MyDrive/Datasets/labelled_sentences/imdb_labelled.txt'
            }

encodings = {}

for key, value in filepaths.items():
    with open(value, 'rb') as rawdata:
        result = chardet.detect(rawdata.read(100000))
        encodings[key] = result['encoding']
        print("The encoding of {} file was {}".format(key, encodings[key]))

The encoding of amazon file was ascii
The encoding of yelp file was utf-8
The encoding of imdb file was utf-8


In [12]:


# Used to store all the dataframes in a list
data_list = []
    
# Read the files into dataframes
for website, path in filepaths.items():
    website_data = pd.read_csv(path, names=['sentence', 'label'], sep='\t', encoding=encodings[website])
    
    # Adding a source column 
    website_data['source'] = website
    
    data_list.append(website_data)
    
# Concatenate all the dataframes
raw_data = pd.concat(data_list)

In [13]:
# Print the first 5 entries in dataset
raw_data.head()

Unnamed: 0,sentence,label,source
0,So there is no way for me to plug it in here i...,0,amazon
1,"Good case, Excellent value.",1,amazon
2,Great for the jawbone.,1,amazon
3,Tied to charger for conversations lasting more...,0,amazon
4,The mic is great.,1,amazon


In [14]:
# Print Unique values in Source
raw_data['source'].unique()

array(['amazon', 'yelp', 'imdb'], dtype=object)

## Amazon Reviews

In [15]:
# Extract the amazon reviews from the dataframe
data_amazon = raw_data[raw_data['source'] == 'amazon']

# Separate sentences and labels from amazon data
sentences = data_amazon['sentence'].values
y = data_amazon['label'].values

In [32]:
# Import nltk and download the list of stopwords
import nltk
nltk.download('stopwords')
nltk.download('brown')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


True

In [20]:
# Perform Stemming and remove stopwords
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

stop_words = stopwords.words('english')
stemmer = PorterStemmer()

mod_sentences = []

for sentence in sentences:
    sentence = [stemmer.stem(word) for word in sentence.split() if word not in stop_words]
    mod_sentences.append(' '.join(sentence))

In [21]:
# Print the sentence before and after stemming and stopword removal
print(sentences[2])
print(mod_sentences[2])

Great for the jawbone.
great jawbone.


In [22]:
# Tokenize the sentences
from keras.preprocessing.text import Tokenizer

# Create a tokenizer to get 5000 most used words
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(mod_sentences)

In [23]:
print(tokenizer.document_count)

1000


In [24]:
encoded_doc = tokenizer.texts_to_sequences(mod_sentences)

In [25]:
print(mod_sentences[2])
print(encoded_doc[2])

great jawbone.
[5, 746]


In [26]:
# Set the vocab size as 5000
vocab_size = 5000

In [27]:
from keras.preprocessing.sequence import pad_sequences
max_len = 30
encoded_doc = pad_sequences(encoded_doc, maxlen=max_len)

In [28]:
# Printing the padded sentence
print(encoded_doc[2, :])

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   5 746]


In [29]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(encoded_doc, y, test_size=0.3, random_state=42)

In [30]:
print("Shape of X_train is: ", X_train.shape)
print("Shape of X_test is: ", X_test.shape)
print("Shape of y_train is: ", y_train.shape)
print("Shape of y_test is: ", y_test.shape)

Shape of X_train is:  (700, 30)
Shape of X_test is:  (300, 30)
Shape of y_train is:  (700,)
Shape of y_test is:  (300,)


In [34]:
# Create word embeddings
from nltk.corpus import brown
from gensim.models import Word2Vec
import multiprocessing

sents = brown.sents()
print(sents[2])

w2v = Word2Vec(sentences=sents,size=300,window=5,min_count=5,negative=15,workers=multiprocessing.cpu_count())

word_vectors = w2v.wv

['The', 'September-October', 'term', 'jury', 'had', 'been', 'charged', 'by', 'Fulton', 'Superior', 'Court', 'Judge', 'Durwood', 'Pye', 'to', 'investigate', 'reports', 'of', 'possible', '``', 'irregularities', "''", 'in', 'the', 'hard-fought', 'primary', 'which', 'was', 'won', 'by', 'Mayor-nominate', 'Ivan', 'Allen', 'Jr.', '.']


In [35]:
result = word_vectors.similar_by_word('good')

print("Most similar words are:\n", result[:5])

Most similar words are:
 [('real', 0.8915715217590332), ('bad', 0.8800411820411682), ('quite', 0.8509020805358887), ('simply', 0.8086235523223877), ('little', 0.8049530982971191)]


In [38]:
# Creating embedding matrix
def create_embedding_matrix(word_vectors, word_index, embedding_dim):
    vocab_size = 5000
    
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    count = 0
    
    for word in word_vectors.vocab:
        if word in word_index:
            index = word_index[word]
            count += 1
            
            embedding_matrix[index] = np.array(word_vectors[word], dtype=np.float32)[:embedding_dim]
    
    return embedding_matrix, count

In [39]:
embedding_dim = 300
c = 0
embedding_matrix, c = create_embedding_matrix(word_vectors, tokenizer.word_index, embedding_dim)

In [40]:
embedding_matrix.shape

(5000, 300)

In [41]:
c

1000

## Model Creation

In [42]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Bidirectional, Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, 
                    weights=[embedding_matrix],
                    input_length=max_len,
                    trainable=True
                   )
         )

model.add(Bidirectional(LSTM(50), merge_mode='mul'))
model.add(Dropout(0.2))
model.add(Dense(10, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))



In [44]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [45]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 30, 300)           1500000   
                                                                 
 bidirectional (Bidirectiona  (None, 50)               140400    
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 50)                0         
                                                                 
 dense (Dense)               (None, 10)                510       
                                                                 
 dropout_1 (Dropout)         (None, 10)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
                                                        

In [47]:
model.fit(X_train, y_train, epochs=10, verbose=True,
          validation_data=(X_test, y_test),
          batch_size=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f7051cfddd0>

In [48]:
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy is: {:.5f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy is: {:.5f}".format(accuracy))

Training Accuracy is: 1.00000
Testing Accuracy is: 0.79333
