In [17]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.models import Sequential
from keras.layers import LSTM, GRU,SimpleRNN
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, add, concatenate
from keras.layers import CuDNNLSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D, BatchNormalization

from keras.layers.core import Dense, Activation, Dropout
#from keras.layers.embeddings import Embedding
#from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences


import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff

In [6]:
# Detect the available hardware and pick the matching tf.distribute strategy.
try:
    # TPU detection. No parameters are necessary if the TPU_NAME environment
    # variable is set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # No TPU could be resolved; the default strategy is selected below.
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # tf.distribute.experimental.TPUStrategy is deprecated; use the stable
    # tf.distribute.TPUStrategy entry point instead.
    strategy = tf.distribute.TPUStrategy(tpu)
else:
    # Default distribution strategy in TensorFlow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

REPLICAS:  1


In [7]:
# Read the IMDB reviews CSV file into a DataFrame.
imdb_csv_path = "/Users/hansangjun/Desktop/Springboard/Capstone3/Data/IMDB_Dataset.csv"
data = pd.read_csv(imdb_csv_path)

In [8]:
# Draw a fixed-seed random subsample of 5,000 rows, then show how many
# reviews fall into each sentiment class.
df = data.sample(
    n=5000,
    random_state=123,
)
df['sentiment'].value_counts()

positive    2519
negative    2481
Name: sentiment, dtype: int64

In [12]:
# Encode the sentiment labels as integers: positive -> 1, negative -> 0.
# The explicit cast keeps the column numeric on pandas versions that no longer
# downcast object columns inside replace(), and fails loudly if any label other
# than 'positive'/'negative' is left over. Re-running this cell is a no-op.
df['sentiment'] = (
    df['sentiment']
    .replace({'positive': 1, 'negative': 0})
    .astype('int64')
)

In [10]:
# Largest number of whitespace-separated words found in any single review.
df['review'].astype(str).str.split().str.len().max()


1196

In [11]:
def roc_auc(predictions,target):
    """Return the area under the ROC curve.

    `predictions` are the model scores and `target` the true labels; the
    ROC curve is built with `metrics.roc_curve` and integrated with
    `metrics.auc`.
    """
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(target, predictions)
    return metrics.auc(false_pos_rate, true_pos_rate)

In [29]:
# Stratified 80/20 split of review texts and labels with a fixed seed.
review_texts = df.review.values
sentiment_labels = df.sentiment.values
xtrain, xvalid, ytrain, yvalid = train_test_split(
    review_texts,
    sentiment_labels,
    test_size=0.2,
    shuffle=True,
    stratify=sentiment_labels,
    random_state=42,
)

In [30]:
# Sanity check: inputs and targets of each split must have matching lengths.
for inputs, targets in ((xtrain, ytrain), (xvalid, yvalid)):
    print(inputs.shape, targets.shape)

(4000,) (4000,)
(1000,) (1000,)


In [31]:
# Keras tokenizer with an unrestricted vocabulary (num_words=None).
max_len = 1200
token = text.Tokenizer(num_words=None)

# NOTE(review): the vocabulary is fitted on the training AND validation texts,
# so validation words leak into word_index -- confirm this is intended.
token.fit_on_texts([*xtrain, *xvalid])
word_index = token.word_index

# Map every review to its integer sequence, then zero-pad to max_len.
xtrain_seq, xvalid_seq = (
    token.texts_to_sequences(split_texts) for split_texts in (xtrain, xvalid)
)
xtrain_pad, xvalid_pad = (
    pad_sequences(split_seq, maxlen=max_len) for split_seq in (xtrain_seq, xvalid_seq)
)

In [32]:
# Inspect the padded training matrix (one row per review, max_len columns).
xtrain_pad

array([[   0,    0,    0, ...,  376, 1348,   22],
       [   0,    0,    0, ..., 2150,    4, 1646],
       [   0,    0,    0, ...,  750,   15,   49],
       ...,
       [   0,    0,    0, ...,  288,    3,  105],
       [   0,    0,    0, ...,   70, 1778,   68],
       [   0,    0,    0, ...,   25,   73, 2265]], dtype=int32)

In [33]:
%%time
with strategy.scope():
    # Baseline: embeddings learned from scratch -> SimpleRNN -> sigmoid unit.
    model = Sequential([
        Embedding(input_dim=len(word_index) + 1,
                  output_dim=300,
                  input_length=max_len),
        SimpleRNN(units=100),
        Dense(units=1, activation='sigmoid'),
    ])
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 1200, 300)         12907500  
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 100)               40100     
                                                                 
 dense_1 (Dense)             (None, 1)                 101       
                                                                 
Total params: 12,947,701
Trainable params: 12,947,701
Non-trainable params: 0
_________________________________________________________________
CPU times: user 205 ms, sys: 90.2 ms, total: 296 ms
Wall time: 188 ms


In [34]:
# Scale the global batch size by the number of replicas so that each replica
# still receives 64 samples per step when running on a TPU; with a single
# replica this is identical to batch_size=64.
model.fit(xtrain_pad, ytrain, epochs=5,
          batch_size=64 * strategy.num_replicas_in_sync)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f9a08e3a5b0>

In [35]:
# Score the validation split and report ROC AUC.
scores = model.predict(xvalid_pad)
# AUC is a ratio in [0, 1], not a percentage, so no '%' suffix is printed.
print("Auc: %.2f" % (roc_auc(scores,yvalid)))

Auc: 0.73%


In [37]:
# Report accuracy on the training split and on the held-out validation split
# (the latter is labelled "Testing" in the printed output).
for template, inputs, targets in (
    ("Training Accuracy: {:.4f}", xtrain_pad, ytrain),
    ("Testing Accuracy:  {:.4f}", xvalid_pad, yvalid),
):
    loss, accuracy = model.evaluate(inputs, targets, verbose=False)
    print(template.format(accuracy))

Training Accuracy: 1.0000
Testing Accuracy:  0.6530


In [38]:
# Start the per-model list of validation AUC scores with the SimpleRNN result.
scores_model = [
    {'Model': 'SimpleRNN', 'AUC_Score': roc_auc(scores, yvalid)},
]

In [39]:
# Show the AUC scores collected so far.
scores_model

[{'Model': 'SimpleRNN', 'AUC_Score': 0.7250184011776755}]

In [40]:
# Show the first training review as its (unpadded) sequence of integer word indices.
xtrain_seq[:1]


[[172,
  295,
  8,
  3,
  131,
  8,
  1,
  359,
  168,
  4,
  176,
  8925,
  23,
  2549,
  2,
  26,
  499,
  336,
  18234,
  31,
  90,
  1848,
  3758,
  10,
  6,
  58,
  330,
  71,
  3,
  4231,
  10,
  6,
  3,
  19,
  24395,
  4,
  14758,
  2111,
  6116,
  2,
  461,
  814,
  2,
  11,
  102,
  41,
  2452,
  5,
  132,
  26,
  4,
  54,
  1714,
  176,
  106,
  4,
  30,
  57,
  2,
  4,
  256,
  9,
  1391,
  259,
  1,
  762,
  81,
  977,
  42,
  673,
  721,
  8,
  1,
  208,
  4,
  494,
  4,
  176,
  172,
  57,
  11,
  60,
  1,
  19,
  2,
  9,
  218,
  53,
  5,
  1,
  215,
  115,
  22,
  119,
  1,
  4762,
  84,
  556,
  11,
  375,
  5,
  368,
  674,
  51,
  11,
  84,
  27,
  5475,
  42,
  4,
  54,
  6117,
  18,
  9,
  103,
  992,
  5,
  556,
  11,
  103,
  77,
  9,
  200,
  2,
  11,
  163,
  552,
  14,
  14759,
  14,
  1,
  86,
  57,
  11,
  225,
  9,
  139,
  9,
  199,
  27,
  331,
  5,
  2600,
  3,
  4588,
  176,
  328,
  37,
  12,
  12,
  6,
  152,
  332,
  4,
  1346,
  2647,
  987,
  1,
 

## Word Embedding

In [42]:
# Load the GloVe vectors into a dictionary mapping word -> coefficient array.
# Each line of the file is the word followed by its space-separated coefficients.

embeddings_index = {}
glove_path = '/Users/hansangjun/Desktop/Kaggle/nlp-getting-started/data/glove_word_embeddings/glove.6B.300d.txt'
# The context manager guarantees the file is closed even if a line fails to parse.
with open(glove_path, 'r', encoding='utf-8') as f:
    for line in tqdm(f):
        values = line.split(' ')
        word = values[0]
        # float() tolerates the trailing newline on the last coefficient.
        coefs = np.asarray([float(val) for val in values[1:]])
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

400000it [00:37, 10564.57it/s]

Found 400000 word vectors.





## LSTM

In [43]:
# Build the (vocabulary size + 1) x 300 embedding matrix: row i holds the GloVe
# vector of the word with tokenizer index i; words absent from GloVe stay zero.
vocab_rows = len(word_index) + 1
embedding_matrix = np.zeros((vocab_rows, 300))
for vocab_word, row in tqdm(word_index.items()):
    glove_vector = embeddings_index.get(vocab_word)
    if glove_vector is None:
        continue
    embedding_matrix[row] = glove_vector

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 43024/43024 [00:02<00:00, 18841.02it/s]


In [44]:
%%time
with strategy.scope():
    # Frozen GloVe embeddings -> LSTM (input and recurrent dropout) -> sigmoid unit.
    model = Sequential([
        Embedding(input_dim=len(word_index) + 1,
                  output_dim=300,
                  weights=[embedding_matrix],
                  input_length=max_len,
                  trainable=False),
        LSTM(units=100, dropout=0.3, recurrent_dropout=0.3),
        Dense(units=1, activation='sigmoid'),
    ])
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 1200, 300)         12907500  
                                                                 
 lstm (LSTM)                 (None, 100)               160400    
                                                                 
 dense_2 (Dense)             (None, 1)                 101       
                                                                 
Total params: 13,068,001
Trainable params: 160,501
Non-trainable params: 12,907,500
_________________________________________________________________
CPU times: user 373 ms, sys: 279 ms, total: 653 ms
Wall time: 736 ms


In [51]:
# Scale the global batch size by the replica count so a multi-replica (TPU)
# strategy keeps 64 samples per replica; unchanged with a single replica.
model.fit(xtrain_pad, ytrain, epochs=5,
          batch_size=64 * strategy.num_replicas_in_sync)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f999d58c9a0>

In [52]:
# Score the validation split with the LSTM model and report ROC AUC.
scores = model.predict(xvalid_pad)
# AUC is a ratio in [0, 1], not a percentage, so no '%' suffix is printed.
print("Auc: %.2f" % (roc_auc(scores,yvalid)))

Auc: 0.85%


In [54]:
# LSTM accuracy on the training split and on the held-out validation split
# (the latter is labelled "Testing" in the printed output).
for template, inputs, targets in (
    ("Training Accuracy: {:.4f}", xtrain_pad, ytrain),
    ("Testing Accuracy:  {:.4f}", xvalid_pad, yvalid),
):
    loss, accuracy = model.evaluate(inputs, targets, verbose=False)
    print(template.format(accuracy))

Training Accuracy: 0.7575
Testing Accuracy:  0.7370


In [53]:
# Record the LSTM validation AUC alongside the earlier results.
scores_model.append(
    {
        'Model': 'LSTM',
        'AUC_Score': roc_auc(scores, yvalid),
    }
)

## GRU

In [55]:
%%time
with strategy.scope():
    # GRU with frozen GloVe embeddings, spatial dropout and one dense output layer
    model = Sequential()
    model.add(Embedding(len(word_index) + 1,
                        300,
                        weights=[embedding_matrix],
                        input_length=max_len,
                        trainable=False))
    # SpatialDropout1D is applied to the embedding output before the GRU.
    model.add(SpatialDropout1D(0.3))
    model.add(GRU(300))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 1200, 300)         12907500  
                                                                 
 spatial_dropout1d (SpatialD  (None, 1200, 300)        0         
 ropout1D)                                                       
                                                                 
 gru (GRU)                   (None, 300)               541800    
                                                                 
 dense_3 (Dense)             (None, 1)                 301       
                                                                 
Total params: 13,449,601
Trainable params: 542,101
Non-trainable params: 12,907,500
_________________________________________________________________
CPU times: user 533 ms, sys: 562 ms, total: 1.09 s
Wall time: 1.83 s


In [57]:
# Scale the global batch size by the replica count so a multi-replica (TPU)
# strategy keeps 64 samples per replica; unchanged with a single replica.
model.fit(xtrain_pad, ytrain, epochs=5,
          batch_size=64 * strategy.num_replicas_in_sync)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f991d6de7c0>

In [58]:
# Score the validation split with the GRU model and report ROC AUC.
scores = model.predict(xvalid_pad)
# AUC is a ratio in [0, 1], not a percentage, so no '%' suffix is printed.
print("Auc: %.2f" % (roc_auc(scores,yvalid)))

Auc: 0.93%


In [59]:
# GRU accuracy on the training split and on the held-out validation split
# (the latter is labelled "Testing" in the printed output).
for template, inputs, targets in (
    ("Training Accuracy: {:.4f}", xtrain_pad, ytrain),
    ("Testing Accuracy:  {:.4f}", xvalid_pad, yvalid),
):
    loss, accuracy = model.evaluate(inputs, targets, verbose=False)
    print(template.format(accuracy))

Training Accuracy: 0.9035
Testing Accuracy:  0.8520


In [60]:
# Record the GRU validation AUC alongside the earlier results.
scores_model.append(
    {
        'Model': 'GRU',
        'AUC_Score': roc_auc(scores, yvalid),
    }
)

In [61]:
# Compare the validation AUC of every model trained so far.
scores_model


[{'Model': 'SimpleRNN', 'AUC_Score': 0.7250184011776755},
 {'Model': 'LSTM', 'AUC_Score': 0.8511824756784434},
 {'Model': 'GRU', 'AUC_Score': 0.9318476382488479}]

## Bidirectional RNNs

In [62]:
%%time
with strategy.scope():
    # Frozen GloVe embeddings -> bidirectional LSTM (input and recurrent
    # dropout) -> sigmoid unit.
    model = Sequential([
        Embedding(input_dim=len(word_index) + 1,
                  output_dim=300,
                  weights=[embedding_matrix],
                  input_length=max_len,
                  trainable=False),
        Bidirectional(LSTM(units=300, dropout=0.3, recurrent_dropout=0.3)),
        Dense(units=1, activation='sigmoid'),
    ])
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 1200, 300)         12907500  
                                                                 
 bidirectional (Bidirectiona  (None, 600)              1442400   
 l)                                                              
                                                                 
 dense_4 (Dense)             (None, 1)                 601       
                                                                 
Total params: 14,350,501
Trainable params: 1,443,001
Non-trainable params: 12,907,500
_________________________________________________________________
CPU times: user 696 ms, sys: 452 ms, total: 1.15 s
Wall time: 1.67 s


In [64]:
# Scale the global batch size by the replica count so a multi-replica (TPU)
# strategy keeps 64 samples per replica; unchanged with a single replica.
model.fit(xtrain_pad, ytrain, epochs=5,
          batch_size=64 * strategy.num_replicas_in_sync)


Epoch 1/5
Epoch 2/5

KeyboardInterrupt: 