In [2]:
import pandas as pd

filepath_dict = {'yelp':   'sentences/yelp_labelled.txt',
                 'amazon': 'sentences/amazon_cells_labelled.txt',
                 'imdb':   'sentences/imdb_labelled.txt'}

df_list = []
for source, filepath in filepath_dict.items():
    df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    df['source'] = source  # Add another column filled with the source name
    df_list.append(df)

df = pd.concat(df_list)
print(df.iloc[0])

sentence    Wow... Loved this place.
label                              1
source                          yelp
Name: 0, dtype: object


In [90]:
df_yelp = df[df['source'] == 'yelp']

sentences = df_yelp['sentence'].values
y = df_yelp['label'].values

In [99]:
from nltk.corpus import stopwords
est = stopwords.words('english')
sentences_mod = []
nlist = ['no', 'nor', 'not', 'don', "don't", 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", "won't", 'wouldn', "wouldn't"]
for sent in sentences:
    sent = [word for word in sent.split(' ') if ((word in nlist) or (not word in est))]
    sentences_mod.append(' '.join(sent))

In [100]:
print(sentences[1])
print(sentences_mod[1])

Crust is not good.
Crust not good.


In [101]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences_mod)

In [102]:
#print(tokenizer.word_counts)
print(tokenizer.document_count)
#print(tokenizer.word_index) #dictionary like {'the':1,'and':2,'i':3}
#print(tokenizer.word_docs)

1000


In [103]:
encoded_doc = tokenizer.texts_to_sequences(sentences_mod) 

In [104]:
print(sentences_mod[1])
print(encoded_doc[1])

Crust not good.
[543, 4, 6]


In [106]:
wleng = -2
for sent in sentences_mod:
    if(len(sent.split(' '))>wleng):
        wleng = len(sent.split(' '))
print(wleng)

20


In [107]:
vocab_size = len(tokenizer.word_index) + 1 

In [108]:
from keras.preprocessing.sequence import pad_sequences
maxlen = 30
encoded_doc = pad_sequences(encoded_doc, maxlen=maxlen)

In [109]:
print(encoded_doc[345,:])
#print(y)

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0  33   6 149 109 241]


In [110]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(encoded_doc, y, test_size=0.25, random_state=1000)

In [41]:
#lets create word embeddings
from nltk.corpus import brown
from gensim.models import Word2Vec
import multiprocessing
st = brown.sents() # Going through the data of brown corpus and seeing a sample sentence
print(st[1])
w2v = Word2Vec(st,size=300,window=5,min_count=5,negative=15,iter=10,workers=multiprocessing.cpu_count())
word_vectors = w2v.wv



['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.']


In [42]:
result = word_vectors.similar_by_word("good")
print("Most similar words are:\n ",result[:3])

Most similar words are:
  [('bad', 0.7930951118469238), ('nice', 0.7271484136581421), ('real', 0.7238540053367615)]


In [43]:
import numpy as np

def create_embedding_matrix(wordvectors, word_index, embedding_dim):
    vocab_size = len(word_index) + 1  # Adding again 1 because of reserved 0 index
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    count=0
    
    for word in wordvectors.vocab:
        if word in word_index:
            idx = word_index[word]
            count+=1
            embedding_matrix[idx] = np.array(wordvectors[word], dtype=np.float32)[:embedding_dim]
    return embedding_matrix,count

In [111]:
embedding_dim = 300
c=0
embedding_matrix, c = create_embedding_matrix(word_vectors,tokenizer.word_index,embedding_dim)

In [112]:
embedding_matrix.shape

(2038, 300)

In [113]:
len(tokenizer.word_index)
c

1420

In [114]:
nonzero_elements = np.count_nonzero(np.count_nonzero(embedding_matrix, axis=1))
nonzero_elements / vocab_size

0.6967615309126595

In [120]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Flatten, Dropout, Bidirectional
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.models import save_model

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, 
                           weights=[embedding_matrix], 
                           input_length=maxlen, 
                           trainable=True))
#model.add(Flatten())
model.add(Bidirectional(LSTM(50),merge_mode='mul'))
model.add(Dropout(0.2))
model.add(Dense(10,activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_20 (Embedding)     (None, 30, 300)           611400    
_________________________________________________________________
bidirectional_10 (Bidirectio (None, 50)                140400    
_________________________________________________________________
dropout_33 (Dropout)         (None, 50)                0         
_________________________________________________________________
dense_33 (Dense)             (None, 10)                510       
_________________________________________________________________
dropout_34 (Dropout)         (None, 10)                0         
_________________________________________________________________
dense_34 (Dense)             (None, 1)                 11        
Total params: 752,321
Trainable params: 752,321
Non-trainable params: 0
_________________________________________________________________


In [121]:
history = model.fit(X_train, y_train,
                    epochs=10,
                    verbose=True,
                    validation_data=(X_test, y_test),
                    batch_size=10)
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Train on 750 samples, validate on 250 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training Accuracy: 1.0000
Testing Accuracy:  0.7800


In [63]:
model.save('train_100_test_80.h5')