<a href="https://colab.research.google.com/github/somkiatth/git/blob/master/Copy_of_4_Recurrent_Neural_Network.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#  Part 1:  Recurrent Neural Network 

###  Importing packages

In [None]:
%tensorflow_version 1.x
import re
import numpy as np
import pandas as pd 

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.datasets import imdb

from keras.utils.np_utils import to_categorical

import warnings
# Suppress every Python warning for the rest of the session. This also hides
# deprecation warnings emitted by libraries when legacy arguments are used.
warnings.filterwarnings('ignore')
import ssl
# Make the unverified SSL context the default for standard-library HTTPS
# requests, i.e. certificate verification is skipped for the whole process
# (presumably so the dataset downloads below succeed — TODO confirm it is needed).
# NOTE(review): acceptable only in a throwaway tutorial session; do not copy
# this line into production code.
ssl._create_default_https_context = ssl._create_unverified_context

TensorFlow 1.x selected.


Using TensorFlow backend.


### Preparing Dataset

In [None]:
# Settings shared by the IMDB experiments in this notebook.
max_features = 1000  # passed to imdb.load_data as num_words (top most common words)
maxlen = 80          # sequence length enforced by pad_sequences below
batch_size = 32

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
for _count, _caption in ((len(x_train), 'train sequences'),
                         (len(x_test), 'test sequences')):
    print(_count, _caption)

print('Pad sequences (samples x time)')
# Bring every review to exactly `maxlen` word ids so the splits become 2-D arrays.
x_train, x_test = [sequence.pad_sequences(_split, maxlen=maxlen)
                   for _split in (x_train, x_test)]
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

Loading data...
Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz
25000 train sequences
25000 test sequences
Pad sequences (samples x time)
x_train shape: (25000, 80)
x_test shape: (25000, 80)


In [None]:
# Peek at one padded training review (index 10) as its array of integer word ids.
x_train[10]

array([  6, 176,   7,   2,   2,   4,   2, 114,   4, 105,  26,  32,  55,
       221,  11,  68, 205,  96,   5,   4, 192,  15,   4, 274, 410, 220,
       304,  23,  94, 205, 109,   9,  55,  73, 224, 259,   2,  15,   4,
        22, 528,   2,  34,   4, 130, 528,  30, 685, 345,  17,   4, 277,
       199, 166, 281,   5,   2,   8,  30, 179,   2, 444,   2,   9,   6,
       371,  87, 189,  22,   5,  31,   7,   4, 118,   7,   4,   2, 545,
         2, 829], dtype=int32)

### Visualize the data

In [None]:
INDEX_FROM = 3  # offset added to every raw word index; ids 0-2 are assigned below

# Build the word -> id table: shift the raw IMDB index, then register the
# special tokens on the three lowest ids.
word_to_id = {
    word: rank + INDEX_FROM
    for word, rank in imdb.get_word_index().items()
}
word_to_id.update({"<PAD>": 0, "<START>": 1, "<UNK>": 2})

# Invert the table so a padded sequence of ids can be read back as text.
id_to_word = {token_id: word for word, token_id in word_to_id.items()}
print(' '.join(id_to_word[token_id] for token_id in x_train[10]))

Downloading data from https://s3.amazonaws.com/text-datasets/imdb_word_index.json
a lot of <UNK> <UNK> the <UNK> plot the characters are all very interesting in their own way and the fact that the book itself almost takes on its own character is very well done anyone <UNK> that the film won't <UNK> by the end won't be disappointed either as the ending both makes sense and <UNK> to be quite <UNK> overall <UNK> is a truly great horror film and one of the best of the <UNK> highly <UNK> viewing


### Building a Model

In [None]:
print('Build model...')
# Embedding (8-dim vectors) -> single LSTM (16 units, with input and recurrent
# dropout) -> one sigmoid unit for the binary sentiment label.
model = Sequential([
    Embedding(max_features, 8),
    LSTM(16, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])
model.summary()

Build model...
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 8)           8000      
_________________________________________________________________
lstm_1 (LSTM)                (None, 16)                1600      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 9,617
Trainable params: 9,617
Non-trainable params: 0
_________________________________________________________________


### Model Training

In [None]:
# Binary cross-entropy pairs with the single sigmoid output; other optimizers
# or optimizer settings can be swapped in here to compare.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train on the padded training reviews. The test split is passed as
# validation_data, so the per-epoch validation metrics are computed on it.
model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_test, y_test),
    epochs=3,
    batch_size=batch_size,
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x7fe42f26b710>

### Testing

In [None]:
# evaluate() returns the loss followed by the compiled metrics (accuracy here).
score, acc = model.evaluate(x=x_test, y=y_test, batch_size=batch_size)
for _caption, _value in (('Test score:', score), ('Test accuracy:', acc)):
    print(_caption, _value)

Test score: 0.4353389002799988
Test accuracy: 0.7972400188446045


### Prediction

In [None]:
# Score a single test review; a length-1 slice keeps the batch dimension.
sample = slice(221, 222)
prediction = model.predict(x_test[sample])
print('Prediction value:', prediction[0])
print('Test Label:', y_test[sample])

Prediction value: [0.96468526]
Test Label: [1]


### Other RNN Layers

* keras.layers.RNN(cell, return_sequences=False)
* keras.layers.SimpleRNN(units, activation='tanh')
* keras.layers.GRU(units, activation='tanh', recurrent_activation='hard_sigmoid')
* keras.layers.ConvLSTM2D(filters, kernel_size, strides=(1, 1), padding='valid')
* keras.layers.SimpleRNNCell(units, activation='tanh')
* keras.layers.GRUCell(units, activation='tanh', recurrent_activation='hard_sigmoid')
* keras.layers.LSTMCell(units, activation='tanh', recurrent_activation='hard_sigmoid')
* keras.layers.CuDNNGRU(units, kernel_initializer='glorot_uniform')
* keras.layers.CuDNNLSTM(units, kernel_initializer='glorot_uniform')

# Part 2: Recurrent Neural Network with Custom Dataset

In [None]:
# Credits to Peter Nagy

In [None]:
# Download Senti.csv from GitHub into the current working directory.
!wget https://github.com/mishravipul/personalityprediction01/raw/main/Senti.csv

--2022-03-10 09:11:08--  https://github.com/mishravipul/personalityprediction01/raw/main/Senti.csv
Resolving github.com (github.com)... 52.69.186.44
Connecting to github.com (github.com)|52.69.186.44|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/mishravipul/personalityprediction01/main/Senti.csv [following]
--2022-03-10 09:11:09--  https://raw.githubusercontent.com/mishravipul/personalityprediction01/main/Senti.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 384 [text/plain]
Saving to: ‘Senti.csv’


2022-03-10 09:11:09 (16.3 MB/s) - ‘Senti.csv’ saved [384/384]



### Load data

In [None]:
import pandas as pd
# NOTE(review): the download cell in this notebook saves 'Senti.csv', but this
# cell reads 'sentiment_train.csv' — confirm that file exists in the working
# directory before running.
# Only the text column and its label are needed downstream.
data = pd.read_csv('sentiment_train.csv')[['sentence', 'label']]

### Visualize data

In [None]:
data.head(1)

Unnamed: 0,sentence,label
0,Ok brokeback mountain is such a horrible movie.,0


### Format data

In [None]:
# Drop any rows labelled "Neutral" (presumably leaving the two sentiment
# classes — verify against the CSV). .copy() makes `data` an independent frame,
# so the column assignment below does not write into a slice of the original
# (avoids pandas' SettingWithCopyWarning).
data = data[data.label != "Neutral"].copy()

# Normalise the text: lower-case, then keep only letters, digits and whitespace.
# The class uses A-Z; the earlier A-z range also matched the ASCII characters
# between 'Z' and 'a' ([ \ ] ^ _ `), so those were never stripped. The raw
# string keeps \s from being parsed as a string escape.
_non_alnum = re.compile(r'[^a-zA-Z0-9\s]')
data['sentence'] = data['sentence'].apply(lambda text: _non_alnum.sub('', text.lower()))

# The former `for idx, row in data.iterrows(): row[0] = row[0].replace('rt', ' ')`
# loop has been removed: iterrows() hands out a copy of each row for a
# mixed-dtype frame, so the assignment never reached `data`. Had it worked, it
# would also have blanked "rt" inside ordinary words.

# Vocabulary size kept by the tokenizer. The spelling `max_fatures` is retained
# because later cells reference this exact name.
max_fatures = 2000
# `num_words` is the current name of the legacy `nb_words` argument.
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['sentence'].values)

# Integer-encode every sentence and pad to the length of the longest one.
X = pad_sequences(tokenizer.texts_to_sequences(data['sentence'].values))

### Training set

In [None]:
# One-hot encode the labels (one column per distinct label value).
Y = pd.get_dummies(data['label']).values

# Hold out 33% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.33, random_state=42
)
for _caption, _features, _targets in (
    ('Shape of training samples:', X_train, Y_train),
    ('Shape of testing samples:', X_test, Y_test),
):
    print(_caption, _features.shape, _targets.shape)

Shape of training samples: (3797, 40) (3797, 2)
Shape of testing samples: (1871, 40) (1871, 2)


### Design a model

In [None]:
# Sentiment classifier for the custom dataset:
# Embedding (128-dim) -> LSTM (128 units) -> 2-way softmax over the one-hot labels.
model = Sequential()
# The legacy `dropout=0.2` argument was removed from this call: Keras 2 no
# longer supports `dropout` on Embedding (it is discarded with a warning, and
# newer releases reject it), so it never regularised the model. To get embedding
# dropout, add a keras.layers.SpatialDropout1D(0.2) layer after this one.
model.add(Embedding(max_fatures, 128, input_length=X.shape[1]))
model.add(LSTM(128))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 40, 128)           256000    
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 258       
Total params: 387,842
Trainable params: 387,842
Non-trainable params: 0
_________________________________________________________________
None


### Training 

In [None]:
batch_size = 32
# verbose=2 prints one summary line per epoch instead of a progress bar.
model.fit(
    x=X_train,
    y=Y_train,
    batch_size=batch_size,
    epochs=5,
    verbose=2,
)


Epoch 1/5
 - 8s - loss: 0.2188 - accuracy: 0.9018
Epoch 2/5
 - 7s - loss: 0.0295 - accuracy: 0.9918
Epoch 3/5
 - 7s - loss: 0.0046 - accuracy: 0.9987
Epoch 4/5
 - 7s - loss: 0.0013 - accuracy: 1.0000
Epoch 5/5
 - 7s - loss: 4.6340e-04 - accuracy: 1.0000


<keras.callbacks.callbacks.History at 0x7f182a300e50>

### Validation

In [None]:
# Loss and accuracy on the held-out split created by train_test_split.
score, acc = model.evaluate(X_test, Y_test, batch_size=batch_size, verbose=2)
print("Score: %.2f" % score)
print("Accuracy: %.2f" % acc)

Score: 0.03
Accuracy: 0.99


### Formatting Test Example

In [None]:
text = 'We are so happy and going Delhi'
tester = pd.DataFrame({'text': [text]})

# Clean the sentence the way the training sentences are cleaned: lower-case,
# then keep only letters, digits and whitespace. The class uses A-Z; the
# earlier A-z range also let [ \ ] ^ _ ` through.
tester['text'] = tester['text'].apply(
    lambda raw: re.sub(r'[^a-zA-Z0-9\s]', '', raw.lower())
)

# Encode with the tokenizer fitted on the training data, then let pad_sequences
# left-pad or truncate to the training width X.shape[1]. The previous manual
# np.pad branch only handled shorter inputs; for inputs of equal or greater
# length it was skipped and np.array([test]) produced a 3-D array that the
# model cannot accept.
test = tokenizer.texts_to_sequences(tester['text'].values)
test = pad_sequences(test, maxlen=X.shape[1])

prediction = model.predict(test)
print('Prediction value:',prediction[0])

Prediction value: [0.02027714 0.9797229 ]


# Part 3: RNN Design Choices

## Influence of number of nodes

### LSTM with 8 nodes

In [None]:
# Baseline for the node-count comparison: 8-dim embedding, 8 LSTM units, no dropout.
# Uses max_features, batch_size and the padded IMDB arrays defined in Part 1.
model = Sequential([
    Embedding(max_features, 8),
    LSTM(8, dropout=0.0, recurrent_dropout=0.0),
    Dense(1, activation='sigmoid'),
])
model.summary()

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(
    x_train, y_train,
    validation_data=(x_test, y_test),
    epochs=1,
    batch_size=batch_size,
)

score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, None, 8)           8000      
_________________________________________________________________
lstm_4 (LSTM)                (None, 8)                 544       
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 9         
Total params: 8,553
Trainable params: 8,553
Non-trainable params: 0
_________________________________________________________________
Train on 25000 samples, validate on 25000 samples
Epoch 1/1
Test score: 0.42648837185241284
Test accuracy: 0.8046000003814697


### LSTM with 16 nodes

In [None]:
# Write your code here 

# Use the same layer design from the above cell 

## Influence of Embedding

In [None]:
# Embedding-size experiment: 4-dim word vectors feeding a 16-unit LSTM, no dropout.
embedding_layer = Embedding(max_features, 4)
recurrent_layer = LSTM(16, dropout=0.0, recurrent_dropout=0.0)
output_layer = Dense(1, activation='sigmoid')

model = Sequential()
for layer in (embedding_layer, recurrent_layer, output_layer):
    model.add(layer)
model.summary()

model.compile(metrics=['accuracy'], optimizer='adam', loss='binary_crossentropy')
# One epoch only; the test split doubles as validation data.
model.fit(x_train, y_train,
          epochs=1,
          batch_size=batch_size,
          validation_data=(x_test, y_test))

score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, None, 4)           4000      
_________________________________________________________________
lstm_5 (LSTM)                (None, 16)                1344      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 17        
Total params: 5,361
Trainable params: 5,361
Non-trainable params: 0
_________________________________________________________________
Train on 25000 samples, validate on 25000 samples
Epoch 1/1
Test score: 0.4381872456140816
Test accuracy: 0.8062400221824646


## Influence of Dropout

### Dropout with probability 0.5

In [None]:
# Dropout experiment: 32-dim embedding, 8 LSTM units, with 0.5 dropout on both
# the LSTM inputs and its recurrent connections.
model = Sequential([
    Embedding(max_features, 32),
    LSTM(8, dropout=0.5, recurrent_dropout=0.5),
    Dense(1, activation='sigmoid'),
])
model.summary()

compile_options = dict(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.compile(**compile_options)
model.fit(x_train, y_train,
          batch_size=batch_size,
          validation_data=(x_test, y_test),
          epochs=1)

score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, None, 32)          32000     
_________________________________________________________________
lstm_6 (LSTM)                (None, 8)                 1312      
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 9         
Total params: 33,321
Trainable params: 33,321
Non-trainable params: 0
_________________________________________________________________
Train on 25000 samples, validate on 25000 samples
Epoch 1/1
Test score: 0.47898627907700836
Test accuracy: 0.7699599862098694


### Dropout with probability 0.9

In [None]:
# Write your code here 

# Use the same model design from the above cell 

## Multilayered RNNs

### RNN with 2 layer LSTM

In [None]:
# Two stacked LSTM layers, as this section's heading states. The cell previously
# stacked three, which duplicated the "3 layer" exercise that follows it.
model = Sequential()
model.add(Embedding(max_features, 8))
# Every LSTM except the last returns its full output sequence so that the next
# LSTM receives one vector per time step.
model.add(LSTM(8, dropout=0.0, recurrent_dropout=0.0, return_sequences=True))
model.add(LSTM(8, dropout=0.0, recurrent_dropout=0.0))
model.add(Dense(1, activation='sigmoid'))
model.summary()

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=batch_size, epochs=1, validation_data=(x_test, y_test))

score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

### RNN with 3 layer LSTM

In [None]:
# Write your code here 

# Use the same node design from the above cell 

### What are your findings?