<a href="https://colab.research.google.com/github/retwick/NLTK-Sentiment-Analysis/blob/master/RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /gdrive; the dataset and output paths used by later
# cells all live under "/gdrive/My Drive/NLP-Project/".
from google.colab import drive
drive.mount('/gdrive')

import os
# NOTE(review): hash randomisation is normally fixed when the interpreter
# starts, so assigning PYTHONHASHSEED inside an already-running kernel
# presumably only affects child processes — confirm.
os.environ['PYTHONHASHSEED'] = '0'
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
# Restrict CUDA to device 0; this is set before TensorFlow is imported below.
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

import sys
import re
import numpy as np
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Masking, Embedding, Flatten
from keras.layers import Embedding, LSTM, BatchNormalization, Multiply, Permute, Dot
from keras.layers import Dropout, Lambda, RepeatVector, multiply
from keras.layers import Input, Activation, Bidirectional, GRU, Dense, CuDNNGRU, CuDNNLSTM
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Model
from keras.callbacks import EarlyStopping
from keras import backend as K
# from sklearn.metrics import f1_score, confusion_matrix, mean_squared_error
# Seed NumPy's global RNG.
np.random.seed(42)
# Run TensorFlow ops on a single intra-op and a single inter-op thread.
# NOTE(review): presumably done for run-to-run reproducibility; GPU kernels
# may still be non-deterministic — confirm.
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
# Graph-level TensorFlow seed; set before the session below is created.
tf.set_random_seed(1234)
# Make Keras use a session built on the default graph with the config above.
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)


# BASE_DIR is empty, so GLOVE_DIR also evaluates to ''. Neither name is
# referenced by the other cells visible in this notebook.
BASE_DIR = ''
GLOVE_DIR = os.path.join(BASE_DIR, '')
MAX_SEQUENCE_LENGTH = 100  # max input sequence length
EMBEDDING_DIM = 300  # word embedding size


def RNN_model(input_layer, num_class):  # RNN model
    """Build a bidirectional-GRU classifier with similarity-based attention.

    Two bidirectional CuDNNGRU encoders read ``input_layer``: one returns a
    single summary vector, the other returns the full output sequence. The
    normalised dot product (cosine similarity) between the summary and each
    timestep is squashed with a sigmoid and renormalised to sum to one; the
    resulting weights pool the sequence into one vector, which is passed
    through two Dense/BatchNorm/ReLU blocks and a softmax output layer.

    Args:
        input_layer: Keras tensor fed to both GRUs (presumably embedded
            sequences of shape (batch, timesteps, features) — confirm with
            the caller).
        num_class: number of units in the final softmax layer.

    Returns:
        An uncompiled ``keras.models.Model``. Its input is the ``Input``
        tensor that ``input_layer`` was derived from, so the function no
        longer depends on a notebook-global ``sequence_input``.
    """
    # Local import keeps the function self-contained within this cell.
    from keras.utils.layer_utils import get_source_inputs

    def smoothing_attention(x):
        """Sigmoid scores renormalised to sum to 1 along the last axis."""
        e = K.sigmoid(x)
        return e / K.sum(e, axis=-1, keepdims=True)

    dropout = 0.5
    hidden_dim = 1024

    # CuDNNGRU does not accept a `dropout` argument (passing one raises at
    # construction time), so the encoders are built without it; the Dropout
    # layer before the classifier provides the regularisation.
    vector = Bidirectional(CuDNNGRU(hidden_dim, return_sequences=False))(input_layer)
    lstm = Bidirectional(CuDNNGRU(hidden_dim, return_sequences=True))(input_layer)
    ee = Dot(axes=-1, normalize=True)([vector, lstm])  # calculate cosine similarity

    # Attention weights, then broadcast to the GRU output width so they can be
    # multiplied element-wise with the sequence (assumes `ee` holds one score
    # per timestep — TODO confirm).
    weights = Lambda(smoothing_attention)(ee)
    weights = RepeatVector(2 * hidden_dim)(weights)
    weights = Permute([2, 1])(weights)   # transpose

    # Weighted sum over axis 1 of the GRU output sequence.
    output = Multiply()([weights, lstm])
    output = Lambda(lambda x: K.sum(x, axis=1))(output)

    output = Dense(512)(output)
    output = BatchNormalization()(output)
    output = Activation("relu")(output)
    output = Dense(256)(output)
    output = BatchNormalization()(output)
    output = Activation("relu")(output)
    output = Dropout(dropout)(output)
    output = Dense(num_class, activation='softmax')(output)

    # Walk the layer graph back from `input_layer` to the Input tensor(s)
    # instead of relying on a global defined in a later cell.
    source_inputs = get_source_inputs(input_layer)
    if not isinstance(source_inputs, (list, tuple)):
        source_inputs = [source_inputs]
    model_inputs = source_inputs[0] if len(source_inputs) == 1 else list(source_inputs)

    model = Model(model_inputs, output)
    model.summary()  # prints itself; wrapping it in print() adds a stray "None"
    return model



Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /gdrive


Using TensorFlow backend.


In [2]:
## Label vocabulary for relation classification, plus the (initially empty)
## containers for the raw training sentences and their label ids.

X_train = []
Y_train = []
# Relation label -> integer class id.
# NOTE: the Instrument-Agency pair is listed (e2,e1) before (e1,e2), unlike
# every other pair; the ids are kept exactly as they were.
relation_id = {
    'Cause-Effect(e1,e2)':0,
    'Cause-Effect(e2,e1)':1,
    'Content-Container(e1,e2)':2,
    'Content-Container(e2,e1)':3,
    'Component-Whole(e1,e2)':4,
    'Component-Whole(e2,e1)':5,
    'Entity-Origin(e1,e2)':6,
    'Entity-Origin(e2,e1)':7,
    'Entity-Destination(e1,e2)':8,
    'Entity-Destination(e2,e1)':9,
    'Instrument-Agency(e2,e1)':10,
    'Instrument-Agency(e1,e2)':11,
    'Message-Topic(e1,e2)':12,
    'Message-Topic(e2,e1)':13,
    'Member-Collection(e1,e2)':14,
    'Member-Collection(e2,e1)':15,
    'Product-Producer(e1,e2)':16,
    'Product-Producer(e2,e1)':17,
    'Other':18
              }
# Derived from the mapping so the two can never drift apart (evaluates to 19).
num_class = len(relation_id)

def id_to_relation(id):
  """Return the relation label whose class id equals ``id``.

  Args:
    id: integer class id (plain or NumPy integer), compared with ``==``.

  Returns:
    The matching key of ``relation_id``.

  Raises:
    AssertionError: if no label has that id. Raised explicitly so the check
      is not stripped when Python runs with ``-O``.
  """
  for label, class_id in relation_id.items():
    if class_id == id:
      return label
  raise AssertionError('unknown relation id: %r' % (id,))

# Parse the training file. Records are read in strides of four lines: line i
# holds "<id>\t<sentence>" and line i+1 holds the relation label; the other
# two lines of each record are not used.
with open("/gdrive/My Drive/NLP-Project/Dataset/TRAIN_FILE.txt") as f:
  lines = f.readlines()
for i in range(0, len(lines), 4):
  # [1:-1] drops the first and last character of the sentence field
  # (presumably the surrounding quotes — confirm against the file).
  sentence = lines[i].strip().split('\t')[1][1:-1]
  relation = lines[i+1].strip()
  # Validate the label before appending so X_train and Y_train stay aligned.
  assert relation in relation_id, 'unknown relation label: %r' % (relation,)
  X_train.append(sentence)
  Y_train.append(relation_id[relation])

# Fix the one-hot width to num_class rather than letting it be inferred from
# the largest label that happens to occur in the data.
Y_train = to_categorical(Y_train, num_classes=num_class)

print('Created X_train, Y_train')


Created X_train, Y_train


In [3]:
from sklearn.model_selection import train_test_split

## TOKENIZE DATA POINTS
# The vocabulary is fitted on every training sentence, i.e. before the
# validation split below, so validation sentences also contribute to it.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)    #Fit on train data
sequences = tokenizer.texts_to_sequences(X_train)
# NOTE: X_train is rebound here from raw sentences to a padded integer matrix,
# so this cell cannot be re-run without first re-running the loading cell.
X_train = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)  #pad zeros at the beginning of each sequence to make all of the same length
print('Tokenized.')
# print(type(X_train))
## CREATE VALIDATION SET
# Hold out 10% of the examples for validation; random_state is fixed so the
# split is the same on every run.
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.1, random_state=42)

X_test = []
ID_test = []
print('Start test')
##PREPROCESSING
# Each non-empty line of the test file is "<id>\t<sentence>".
with open("/gdrive/My Drive/NLP-Project/Dataset/TEST_FILE.txt") as f:
    for raw_line in f:
        raw_line = raw_line.strip()
        if not raw_line:
            # A blank (e.g. trailing) line would otherwise break the unpacking.
            continue
        # maxsplit=1 keeps any further tab inside the sentence field.
        ID, sentence = raw_line.split("\t", 1)
        # Drop the first and last character of the sentence field
        # (presumably the surrounding quotes — confirm against the file).
        sentence = sentence[1:-1]
        ID_test.append(ID)
        X_test.append(sentence)
##TOKENIZE TEST POINTS
# Reuse the tokenizer fitted on the training sentences.
sequences = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
## END TOKENIZE
## END TOKENIZE


Tokenized.
Start test


In [4]:
print('Start embedding')
# Map each token in the pre-trained embedding file to its float32 vector.
embeddings_index = {}
with open('/gdrive/My Drive/NLP-Project/Dataset/glove.42B.300d.txt', encoding='utf-8') as f:  # read pre-trained word embedding
    for line in f:
        values = line.split()
        if len(values) <= EMBEDDING_DIM:
            # Blank or truncated line, or a token made only of whitespace
            # characters: there is no (token, vector) pair to store.
            continue
        # Take the vector from the right so a token that itself contains
        # whitespace cannot shift the numeric fields and break the conversion.
        word = ' '.join(values[:-EMBEDDING_DIM])
        coefs = np.asarray(values[-EMBEDDING_DIM:], dtype='float32')
        embeddings_index[word] = coefs
print('Created a dictionary for word embeddings')


Start embedding
Created a dictionary for word embeddings


In [5]:
print('Create embedding weights')

# Build the initial embedding weights: row i holds the pre-trained vector of
# the word whose tokenizer index is i. Words with no pre-trained vector keep
# an all-zero row, as does row 0, which no word index maps to here.
word_index = tokenizer.word_index  # word dictionary <word, index>
num_words = 1 + len(word_index)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    pretrained = embeddings_index.get(word)
    if pretrained is not None:
        embedding_matrix[i] = pretrained

print('')
print('Stored embedding weights.')

Create embedding weights

Stored embedding weights.


In [11]:
## MAGIC BEGINS 
print('Magic Begins')

##################################################################
## USED EMBEDDING FROM KERAS
# Embedding layer initialised from embedding_matrix and fine-tuned during
# training (trainable=True).
embedding_layer = Embedding(num_words,  # inital word embedding weights
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype='int32')  # input layer
embedded_sequences = embedding_layer(sequence_input)  # word embedding

print('Call RNN')
################################################
# Inline variant of RNN_model: smaller hidden size (512 vs 1024), softmax
# instead of sigmoid-normalised attention weights, and a 256/128 dense head.

dropout = 0.5
hidden_dim = 512

# Two bidirectional GRU encoders over the same embeddings: `vector` is the
# single summary output, `lstm` is the per-timestep output sequence.
vector = Bidirectional(CuDNNGRU(hidden_dim, return_sequences=False))(embedded_sequences)
lstm = Bidirectional(CuDNNGRU(hidden_dim, return_sequences=True))(embedded_sequences)
ee = Dot(axes=-1, normalize=True)([vector, lstm])  # calculate cosine similarity

# Source: https://github.com/keras-team/keras/issues/4962#issuecomment-271934502
# Softmax the similarity scores, broadcast them to the GRU output width,
# weight the sequence element-wise and sum over the second-to-last axis.
# NOTE(review): padded positions are not masked, so they presumably receive
# attention weight too — confirm whether that is intended.
weights = Activation('softmax')(ee)
weights = RepeatVector(2*hidden_dim)(weights)
weights = Permute([2, 1])(weights)
output = multiply([weights, lstm])
output = Lambda(lambda x: K.sum(x, axis=-2))(output)
# end of code from link

# Classification head: Dense -> BatchNorm -> ReLU (x2), dropout after the first.
output = Dense(256)(output)
output = BatchNormalization()(output)
output = Activation("relu")(output)
output = Dropout(dropout)(output)

output = Dense(128)(output)
output = BatchNormalization()(output)
output = Activation("relu")(output)

output = Dense(num_class, activation='softmax')(output)

model = Model(sequence_input, output)
model.summary()  # prints itself; wrapping it in print() adds a stray "None"

# run RNN 
##################################################################

print('Compiling Model')

# Use the documented optimizer class `Adam`; the lowercase `adam` name is only
# an internal alias and is not guaranteed to exist in every Keras version.
model.compile(loss='categorical_crossentropy',
            optimizer=keras.optimizers.Adam(lr=0.001, amsgrad=True, clipvalue=10),
            metrics=['accuracy'])
# Stop when val_loss has not improved for 15 consecutive epochs.
# NOTE(review): with epochs=25 this rarely triggers, and the weights kept are
# those of the last epoch run, not of the best epoch — confirm that is intended.
early_stop = EarlyStopping(monitor='val_loss', patience=15, mode='min')

print('Start Fitting')
model.fit(X_train, Y_train,
        batch_size=200,
        epochs=25,
        callbacks=[early_stop],
        validation_data=(X_val, Y_val))


Magic Begins
Call RNN
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 100, 300)     5865000     input_3[0][0]                    
__________________________________________________________________________________________________
bidirectional_5 (Bidirectional) (None, 1024)         2500608     embedding_3[0][0]                
__________________________________________________________________________________________________
bidirectional_6 (Bidirectional) (None, 100, 1024)    2500608     embedding_3[0][0]                
_______________________________________________________________________________________

<keras.callbacks.History at 0x7fd521421a58>

In [12]:
import time

# Build the timestamped output path first so the status message shows the
# file that is actually written.
time_string = time.strftime("%m_%d_%H:%M:%S", time.localtime())
outfile = "/gdrive/My Drive/NLP-Project/Output/" + time_string + '.txt'
# Create the output directory up front so a missing folder fails (or is fixed)
# before the prediction is computed rather than after.
os.makedirs(os.path.dirname(outfile), exist_ok=True)
print('Saving to file', outfile)

## run prediction on the test set
print('Start prediction')
class_probabilities = model.predict(X_test)
class_ids = np.argmax(class_probabilities, axis=1)  # most probable class per sentence
Y_pre = [id_to_relation(i) for i in class_ids]      # class id -> relation label

## write one "<id>\t<label>" line per test sentence
with open(outfile, 'w') as f:
    for ID, label in zip(ID_test, Y_pre):
        f.write(ID + "\t" + label + "\n")
print('Finished Prediction, Check result!')

Saving to file  05_06_13:56:12 .txt
Start prediction
Finished Prediction, Check result!
