In [1]:
from google.colab import drive
drive.mount('drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at drive


In [0]:
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split

In [3]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, SpatialDropout1D, Dense
from keras.callbacks import ModelCheckpoint

#Tgnore tensorflow warnings
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

Using TensorFlow backend.


In [4]:
data = pd.read_csv('drive/My Drive/deep_learning/nlp/sentiment_analysis/imdb.csv')
data['review'] = data['review'].apply(lambda x: x.lower())
data['review'] = data['review'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))

pos = 0
neg = 0
for item in data['sentiment']:
    if item == 'positive':
        pos += 1
    elif item == 'negative':
        neg += 1

print(f'Positive: {pos} | Negative: {neg}')

Positive: 25000 | Negative: 25000


In [0]:
vocab_size = 7500

#by defining the vocab_size, we limit our vocabulary to the first 10,000 words of the tokenizer.word_index
#this prevents overfitting.

tokenizer = Tokenizer(num_words=vocab_size, split=' ')    #num_words : max integer in a sentences i.e. vocab_size.
tokenizer.fit_on_texts(data['review'].values)
X = tokenizer.texts_to_sequences(data['review'].values)     #X is a list of sentences in the form of a collention of tokens.
X = pad_sequences(X)       #pre-padding used. in post-padding the model might forget the previous content.

In [0]:
tokenizer.word_index #is a pre-defined dict of tokens for words in keras.

In [6]:
#length of each sentence
X.shape[1]

2099

In [7]:
embedding_size = 128
lstm_out = 196

model = Sequential()

#Layers
model.add(Embedding(vocab_size + 1, embedding_size, input_length=X.shape[1]))
#in case of less data. Use pre-trained embeddings (e.g. GloVe).
#then additional parameters to the Embedding layer : weights=[embedding_matrix], trainable=False.
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) 

print(model.summary())





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 2099, 128)         960128    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 2099, 128)         0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 394       
Total params: 1,215,322
Trainable params: 1,215,322
Non-trainable params: 0
_________________________________________________________________
None


In [51]:
Y = pd.get_dummies(data['sentiment']).values

for i in range(5):
    print(data['sentiment'][i], Y[i])

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=16)
#random_state: seed for the randomness generator
#'seed' is the initial value, on which ops are performed, to generate the random output.

print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

positive [0 1]
positive [0 1]
positive [0 1]
negative [1 0]
positive [0 1]
(33500, 2099) (33500, 2)
(16500, 2099) (16500, 2)


In [43]:
#To save the model with the best accuracy (checkpoint).
filepath='drive/My Drive/deep_learning/nlp/sentiment_analysis/weights.hdf5'
checkpoint = ModelCheckpoint(filepath, verbose=1, save_best_only=True)
callbacks_list = [checkpoint]

batch_size = 128
#a smaller batch size generalizes well. (preferred 32, 64, 128)
model.fit(X_train, Y_train, epochs = 1, batch_size=batch_size, callbacks = callbacks_list, verbose = 1)
#verbose: way of displaying the epochs.

Epoch 1/1




<keras.callbacks.History at 0x7f762aca89b0>

In [52]:
validation_size = 7000

X_val = X_test[-validation_size:]
Y_val = Y_test[-validation_size:]
X_test = X_test[:-validation_size]
Y_test = Y_test[:-validation_size]

X_test.shape

(9500, 2099)

In [0]:
from keras.models import load_model

model1 = load_model('drive/My Drive/deep_learning/nlp/sentiment_analysis/model.h5')





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.








Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where




In [53]:
score, acc = model.evaluate( X_test, Y_test, verbose=1, batch_size=256 )
print(f'Score: {score} | Accuracy: {acc}')

Score: 0.3547507938711267 | Accuracy: 0.8479999999247099


In [0]:
accuracy = acc * 100
accuracy = str(round(accuracy, 3))
model.save(f'drive/My Drive/deep_learning/nlp/sentiment_analysis/model_{accuracy}.h5')

In [89]:
#Test positive & negative accuracy separately. (for the 1st 100 values of X_val)
from tqdm import tqdm

pos_cnt, neg_cnt, pos_correct, neg_correct = 0, 0, 0, 0

# for x in tqdm(range(len(X_val))):
for x in tqdm(range(100)):
    if x == 100:
        break
    result = model.predict(X_val[x].reshape(1, X_test.shape[1]),batch_size=1,verbose = 0)[0]
   
    if np.argmax(result) == np.argmax(Y_val[x]):
        if np.argmax(Y_val[x]) == 0:
            neg_correct += 1
        else:
            pos_correct += 1
       
    if np.argmax(Y_val[x]) == 0:
        neg_cnt += 1
    else:
        pos_cnt += 1

print("pos_acc", pos_correct/pos_cnt*100, "%")
print("neg_acc", neg_correct/neg_cnt*100, "%")

100%|██████████| 100/100 [02:40<00:00,  1.60s/it]

pos_acc 88.23529411764706 %
neg_acc 83.6734693877551 %





In [91]:
while(True):
    message = [input('Enter message: ').lower()]
    if message[0]== 'exit':
        break
    message = tokenizer.texts_to_sequences(message)
    message = pad_sequences(message, maxlen=X_val.shape[1], dtype='int32', value=0)
    sentiment = model.predict(message, batch_size=1, verbose = 2)[0]
    if(np.argmax(sentiment) == 0):
        # print("Negative")
        if sentiment[0] > 0.5 and sentiment[0] <= 0.625:
            print("\U0001f610") #neg1
        elif sentiment[0] > 0.625 and sentiment[0] <= 0.75:
            print("\U0001f611") #neg2
        elif sentiment[0] > 0.75 and sentiment[0] <= 0.875:
            print("\U0001f624") #neg3
        elif sentiment[0] > 0.875 and sentiment[0] <= 1:
            print("\U0001f621") #neg4

    elif (np.argmax(sentiment) == 1):
        # print("Positive")
        if sentiment[1] > 0.5 and sentiment[1] <= 0.625:
            print("\U0001f642") #pos1
        elif sentiment[1] > 0.625 and sentiment[1] <= 0.75:
            print("\U0001f60A") #pos2
        elif sentiment[1] > 0.75 and sentiment[1] <= 0.875:
            print("\U0001f604") #pos3
        elif sentiment[1] > 0.875 and sentiment[1] <= 1:
            print("\U0001f929") #pos4

Enter message: hey lets hang out
🙂
Enter message: i am a kind and happy person
😊
Enter message: you are the worst
😡
Enter message: i am very bored
😑
Enter message: Today is an AMAZING day
🤩
Enter message: exit
