In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Tab-separated "<sentence>\t<label>" files, one per review source.
reviewDataPath = {
    'yelp': '../SentimentAnalysis/data/yelp_labelled.txt',
    'amazon': '../SentimentAnalysis/data/amazon_cells_labelled.txt',
    'imdb': '../SentimentAnalysis/data/imdb_labelled.txt',
}

# NOTE(review): read_csv applies its default quote handling here; if any file
# contains unbalanced double quotes, rows could be merged -- confirm the row
# counts per source.
reviewList = []
for source, filepath in reviewDataPath.items():
    # Tag every row with the source it was read from before collecting it.
    reviewList.append(
        pd.read_csv(filepath, sep='\t', names=['sentence', 'label'])
          .assign(source=source)
    )

# Stack the per-source frames into one frame (each keeps its own row index).
df = pd.concat(reviewList)



In [3]:
# Keep only the rows that came from the yelp file.
is_yelp = df['source'] == 'yelp'
review_yelp = df.loc[is_yelp]

# Raw sentences are the inputs, the label column is the target.
sentences = review_yelp['sentence'].values
y = review_yelp['label'].values

# Hold out 25% of the rows for testing; the fixed random_state keeps the
# split the same on every run.
sentences_train, sentences_test, y_train, y_test = train_test_split(
    sentences,
    y,
    test_size=0.25,
    random_state=1000,
)



In [4]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

# Build the word index from the training sentences only.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences_train)

# One extra slot because index 0 is reserved.
vocab_size = len(tokenizer.word_index) + 1

# Every sequence is padded (at the end) or cut to this many token ids.
maxlen = 100

# Sentences -> lists of word ids -> fixed-width integer arrays.
X_train = pad_sequences(tokenizer.texts_to_sequences(sentences_train),
                        maxlen=maxlen, padding='post')
X_test = pad_sequences(tokenizer.texts_to_sequences(sentences_test),
                       maxlen=maxlen, padding='post')


Using TensorFlow backend.


In [6]:
# Wrap the training sentences in a single-column frame.
dfSentences = pd.DataFrame({'sentence': sentences_train})

In [7]:
# Copy the sentence column into a new frame under the name 'review_text'
# (there is only one column, so nothing is actually combined here).
df1 = dfSentences[['sentence']].rename(columns={'sentence': 'review_text'})

In [9]:
from nltk.tokenize import word_tokenize
# If the NLTK tokenizer data is missing, run nltk.download('punkt') once.

# Tokenize: split every review into a list of word / punctuation tokens.
# NOTE(review): tokens keep their original casing and punctuation, whereas the
# Keras Tokenizer above runs with its defaults (which lowercase, per the Keras
# docs) -- confirm the two vocabularies line up.
df2 = df1['review_text'].apply(word_tokenize)
df2.head()

0    [The, food, was, barely, lukewarm, ,, so, it, ...
1    [Sorry, ,, I, will, not, be, getting, food, fr...
2    [Of, all, the, dishes, ,, the, salmon, was, th...
3    [The, fries, were, not, hot, ,, and, neither, ...
4    [In, fact, I, 'm, going, to, round, up, to, 4,...
Name: review_text, dtype: object

In [10]:
# gensim is trained on a list of token lists, so materialise the tokenised
# Series as a plain Python list (one inner list per review).
newlist = list(df2)

In [11]:
# Preview only the first few tokenised reviews; printing the whole corpus
# floods the cell output and bloats the saved notebook.
print(newlist[:5])



In [13]:
from gensim.models import FastText

# FastText word embedding trained on the tokenised training reviews:
# 100-dimensional vectors, context window of 5, words seen fewer than 4 times
# are dropped, character n-grams of length 3 to 10.
ft_model_review = FastText(
    newlist,
    size=100,
    window=5,
    min_count=4,
    min_n=3,
    max_n=10,
    workers=4,
)
 

In [15]:
# Show the first 50 components of the vector for 'take'.
# The lookup goes through .wv because indexing the model object directly is
# deprecated in gensim 3.x and no longer supported in gensim 4.
print(ft_model_review.wv['take'][0:50])

[ 0.01355879 -0.14747097  0.14288212  0.10979275  0.02218762  0.11582632
 -0.03886069 -0.0390524  -0.02623653  0.13664816 -0.02285317  0.0433706
 -0.0731048  -0.1364464  -0.03824088 -0.05605302 -0.07640898 -0.11711276
 -0.01456512  0.01591831 -0.1338591  -0.02567076 -0.03557475  0.1639803
  0.00757322  0.01696297  0.02539162 -0.1028528   0.11327029  0.07226017
  0.03497475  0.02524179  0.10158563  0.07132339  0.06801736 -0.03382779
 -0.09607399  0.17618418 -0.0816518   0.00401394 -0.12039667  0.07680925
 -0.1152503   0.12132717 -0.14810444 -0.0298215   0.09335233  0.00936908
  0.04789409  0.06505845]


  """Entry point for launching an IPython kernel.


In [18]:
# Create embedded matrix
import numpy as np
def create_embedding_matrix(model, word_index, embedding_dim):
    """Build an embedding matrix whose rows follow a tokenizer's word indices.

    Parameters
    ----------
    model : object
        Trained embedding model that exposes its vectors as ``model.wv``.
        ``model.wv`` must support ``model.wv[word]`` and provide its
        vocabulary as either ``key_to_index`` or ``vocab``.
    word_index : dict
        Maps each word to its positive integer index.
    embedding_dim : int
        Number of leading vector components copied for every word.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(word_index) + 1, embedding_dim)``. Row ``i``
        holds the vector of the word whose index is ``i``. Row 0 and the rows
        of words that are not in the model vocabulary stay all zeros.

    Notes
    -----
    Words are matched by exact, case-sensitive string equality, so a word
    whose spelling or casing differs between the model vocabulary and
    ``word_index`` is treated as missing.
    """
    keyed_vectors = model.wv

    # Newer gensim releases expose the vocabulary as ``key_to_index``;
    # older ones expose it as ``vocab``. Support both.
    if hasattr(keyed_vectors, "key_to_index"):
        known_words = keyed_vectors.key_to_index
    else:
        known_words = keyed_vectors.vocab

    # Adding 1 because index 0 is reserved
    vocab_size = len(word_index) + 1
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    for word in list(known_words):
        if word in word_index:
            idx = word_index[word]
            # Look the vector up on model.wv; indexing the model itself is deprecated.
            vector = np.array(keyed_vectors[word], dtype=np.float32)
            embedding_matrix[idx] = vector[:embedding_dim]
    return embedding_matrix

In [19]:
# Width of the FastText vectors trained above (size=100).
embedding_dime = 100

# Align the FastText vectors with the Keras tokenizer's word indices.
embedding_matrix_review = create_embedding_matrix(
    model=ft_model_review,
    word_index=tokenizer.word_index,
    embedding_dim=embedding_dime,
)

  


In [55]:
from gensim.models import Word2Vec

# word2vec embedding on the same tokenised reviews: skip-gram (sg=1),
# 200-dimensional vectors, context window of 3, every word kept (min_count=1).
wordEmbedding = Word2Vec(
    newlist,
    size=200,
    window=3,
    min_count=1,
    sg=1,
    workers=3,
)

In [56]:
# Width of the Word2Vec vectors trained above (size=200).
# This overwrites the value 100 that was used for the FastText matrix.
embedding_dime = 200

# Align the Word2Vec vectors with the Keras tokenizer's word indices.
embedding_matrix_review_w2vec = create_embedding_matrix(
    model=wordEmbedding,
    word_index=tokenizer.word_index,
    embedding_dim=embedding_dime,
)

  


In [57]:
from keras.models import Sequential
from keras import layers
from keras.layers.core import Dropout

In [58]:
# CNN sentiment classifier on top of pre-trained word vectors.
model3 = Sequential()

# Embedding layer initialised from the Word2Vec matrix.
# To try the FastText vectors instead, pass weights=[embedding_matrix_review]
# and use an output dimension of 100 (the width that matrix was built with);
# embedding_dime currently holds the Word2Vec width.
# input_length is tied to maxlen so it always matches the padded sequences.
# TODO: consider trainable=False to keep the pre-trained vectors frozen.
model3.add(layers.Embedding(vocab_size,
                            embedding_dime,
                            weights=[embedding_matrix_review_w2vec],
                            input_length=maxlen,
                            trainable=True))

# 128 convolution filters over windows of 5 tokens, then keep each filter's
# maximum response over the sequence.
model3.add(layers.Conv1D(128, 5, activation='relu'))
model3.add(layers.GlobalMaxPool1D())

# Small dense head ending in a single sigmoid unit for the binary label.
model3.add(layers.Dense(10, activation='relu'))
model3.add(layers.Dense(1, activation='sigmoid'))

model3.compile(optimizer='adam',
               loss='binary_crossentropy',
               metrics=['accuracy'])
model3.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 100, 200)          349400    
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 96, 128)           128128    
_________________________________________________________________
global_max_pooling1d_10 (Glo (None, 128)               0         
_________________________________________________________________
dense_19 (Dense)             (None, 10)                1290      
_________________________________________________________________
dense_20 (Dense)             (None, 1)                 11        
Total params: 478,829
Trainable params: 478,829
Non-trainable params: 0
_________________________________________________________________


In [59]:
# Train for 50 epochs in mini-batches of 10 samples.
# The held-out test split is passed as validation data, so the per-epoch
# validation metrics are computed on it.
history3 = model3.fit(
    X_train,
    y_train,
    batch_size=10,
    epochs=50,
    validation_data=(X_test, y_test),
    verbose=True,
)

Train on 750 samples, validate on 250 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [60]:
# Score the fitted model on the data it was trained on.
loss, accuracy = model3.evaluate(x=X_train, y=y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))

Training Accuracy: 1.0000


In [61]:
# Score the fitted model on the held-out test split.
loss, accuracy = model3.evaluate(X_test, y_test, verbose=False)
# Label fixed: this figure is measured on the test data, not the training data.
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.7680
