### Import Libraries

In [29]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model, model_from_json
from tensorflow.keras.layers import Embedding,Dense, Activation, MaxPool1D, LSTM
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input, BatchNormalization, SpatialDropout1D, Embedding, Conv1D, GlobalAveragePooling1D, Activation 
import json
import pickle

In [30]:
# Report the library versions this notebook was run with.
for label, module in (
    ("Tensorflow Version - ", tf),
    ("Keras Version - ", keras),
    ("Pandas Version - ", pd),
):
    print(label, module.__version__)

Tensorflow Version -  2.3.0
Keras Version -  2.4.0
Pandas Version -  1.1.5


In [3]:
# Switch eager execution on through the TF1 compatibility API.
# NOTE(review): the version cell reports TensorFlow 2.3.0, where eager mode is
# presumably already the default — confirm whether this call is still needed.
tf.compat.v1.enable_eager_execution()

In [4]:
# Show the name of the GPU device TensorFlow can see.
tf.test.gpu_device_name()

'/device:GPU:0'

### Load input file: text and label data from Amazon reviews, Twitter data and IMDb reviews

In [5]:
# Read the master CSV and expose the "sentiment" column under the name "label".
df = (
    pd.read_csv('/data/master_data.csv', encoding='utf-8')
    .rename(columns={"sentiment": "label"})
)
# Encode each label as a two-element one-hot list: 1 -> [1, 0], anything else -> [0, 1].
df['label'] = df['label'].apply(lambda value: [1, 0] if value == 1 else [0, 1])
df.head(5)

Unnamed: 0,text,label
0,one of the other reviewers has mentioned that ...,"[1, 0]"
1,a wonderful little production. the filming tec...,"[1, 0]"
2,i thought this was a wonderful way to spend ti...,"[1, 0]"
3,basically there's a family where a little boy ...,"[0, 1]"
4,petter mattei's love in the time of money is a...,"[1, 0]"


In [6]:
# Strip every apostrophe from the review text.
df['text'] = df['text'].map(lambda review: review.replace("'", ""))

In [7]:
# Keep an independent deep copy of the frame; in this notebook it is only
# referenced by the commented-out sampling code in the next cell.
df_copy = df.copy(deep=True)

In [8]:
# sample to train
# import random
# # df_copy = df
# random.seed(42)
# df = df.sample(100000)
# df = df_copy.sample(int(len(df_copy)/3))

In [9]:
# Number of rows available for the train/test split.
len(df)

471330

In [10]:
# Train/test split: hold out 5% of the rows as the test set.
# A fixed random_state keeps the split reproducible across kernel restarts,
# so the fitted vocabulary and the reported metrics can be compared between runs.
X = df['text'].values
Y = df['label'].values
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.05, random_state=42
)

In [11]:
# Release the full frame and the unsplit arrays; the train/test splits are used from here on.
del df,X,Y

In [12]:
# Release the backup copy of the frame as well.
del df_copy

### Create a vocabulary index

In [13]:
# Build the vocabulary from the training texts only; "<UNK>" is the token
# used for words that are not in the vocabulary.
tokenizer = Tokenizer(oov_token="<UNK>")
tokenizer.fit_on_texts(X_train)

In [14]:
# word -> integer index mapping learned by the tokenizer, plus the list of its words.
word_index = tokenizer.word_index
voc = [*word_index]

In [15]:
# Number of distinct words in the tokenizer's word index.
len(voc)

406248

### Prepare training and testing vectors

In [16]:
# Every review is converted to a sequence of word indices and post-padded to this length.
maxlen = 196

x_train, x_test = (
    pad_sequences(tokenizer.texts_to_sequences(texts), padding='post', maxlen=maxlen)
    for texts in (X_train, X_test)
)

# The labels are stored as an object array of two-element lists; stack them into 2-D arrays.
y_train, y_test = (np.asarray(labels.tolist()) for labels in (Y_train, Y_test))

# The raw text/label splits are not needed after vectorisation.
del X_train, X_test, Y_train, Y_test

### Load pre-trained GloVe embeddings

In [17]:
%%time
# Parse the GloVe text file into a {word: vector} lookup.
embeddings_index = {}
path_to_glove_file = '/data/glove.6B.200d.txt'
# Decode explicitly as UTF-8 instead of relying on the platform default encoding,
# which can fail or mis-decode non-ASCII words on some systems.
# NOTE(review): assumes the GloVe file is UTF-8 encoded — confirm for this download.
with open(path_to_glove_file, encoding="utf-8") as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ..."; split off the word once and
        # parse the remainder as a float32 vector.
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.
CPU times: user 10 s, sys: 769 ms, total: 10.8 s
Wall time: 14.4 s


In [18]:
%%time
# Build the weight matrix for the Keras Embedding layer: row i holds the
# pre-trained GloVe vector of the word whose tokenizer index is i.

num_tokens = len(voc) + 2
embedding_dim = 200
hits = 0
misses = 0

# Start from all zeros so that any row which is never assigned below
# (words without a GloVe vector, and indices not present in word_index) stays zero.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No pre-trained vector for this word: leave its row as zeros.
        misses += 1
        continue
    embedding_matrix[i] = embedding_vector
    hits += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 116915 words (289333 misses)
CPU times: user 398 ms, sys: 204 ms, total: 602 ms
Wall time: 601 ms


### Build Sequence Model

In [19]:
# main model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,Dense, Activation, MaxPool1D, LSTM,Bidirectional,TimeDistributed,Flatten
from tensorflow.keras.optimizers import Adam

In [20]:
# Sequence length and embedding width used by the model (same values as set in the earlier cells).
maxlen, embedding_dim = 196, 200

In [21]:
# Input: one padded sequence of word indices per example.
int_sequences_input = Input(shape=(maxlen,))

# Embedding initialised from the pre-trained matrix and fine-tuned during training.
embedding_layer = Embedding(
    input_dim=num_tokens,
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    input_length=maxlen,
    trainable=True,
)
embedded_sequences = embedding_layer(int_sequences_input)

# Bidirectional LSTM returning the full sequence, with both directions concatenated.
hidden = Bidirectional(
    LSTM(100, return_sequences=True, dropout=0.30),
    merge_mode='concat',
)(embedded_sequences)
# (A BatchNormalization step at this point is currently disabled.)
hidden = TimeDistributed(Dense(100, activation='relu'))(hidden)
hidden = Flatten()(hidden)
hidden = Dense(50, activation='relu')(hidden)

# Two-way softmax matching the one-hot labels.
output = Dense(2, activation='softmax')(hidden)

model = Model(int_sequences_input, output)
model.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['accuracy'])
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 196)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 196, 200)          81250000  
_________________________________________________________________
bidirectional (Bidirectional (None, 196, 200)          240800    
_________________________________________________________________
time_distributed (TimeDistri (None, 196, 100)          20100     
_________________________________________________________________
flatten (Flatten)            (None, 19600)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 50)                980050    
_________________________________________________________________
dense_2 (Dense)              (None, 2)                

In [22]:
# Train for 5 epochs, holding out 10% of the training data for validation.
history = model.fit(
    x_train,
    y_train,
    batch_size=128,
    epochs=5,
    validation_split=0.10,
)

Epoch 1/5

KeyboardInterrupt: 

In [23]:
import matplotlib.pyplot as plt
def plot_graphs(history, string):
    """Plot a training metric and its validation counterpart against epochs.

    Parameters
    ----------
    history : object with a ``history`` mapping of metric name to per-epoch values
        (for example the object returned by ``model.fit``).
    string : str
        Metric name; the validation curve is read from ``'val_' + string``.
    """
    val_key = 'val_' + string
    for series_name in (string, val_key):
        plt.plot(history.history[series_name])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, val_key])
    plt.show()
# Draw the accuracy curves first, then the loss curves.
for metric in ("accuracy", "loss"):
    plot_graphs(history, metric)

NameError: name 'history' is not defined

In [24]:
# Evaluate on the held-out test split; the model was compiled with
# categorical cross-entropy loss and the accuracy metric.
model.evaluate (x_test, y_test)



[0.1761256605386734, 0.932108461856842]

### Save Model

In [37]:
# Write the model configuration to disk as JSON.
model_json = model.to_json()
config_path = "/data/sentiment/LSTM_config_emb_train.json"
with open(config_path, "w") as json_file:
    json_file.write(model_json)

In [38]:
# Pickle the fitted tokenizer so the same word index can be reused later.
tokenizer_path = '/data/sentiment/LSTM_model_tokenizer_emb_train.pickle'
with open(tokenizer_path, 'wb') as handle:
    pickle.dump(tokenizer, handle)

In [39]:
# Save the model itself to an HDF5 file (adjust the path to your own output directory).
model.save("/data/sentiment/LSTM_model_emb_train.h5")
# Save only the weights to a separate file; this is the kind of file that is loaded
# together with the JSON config when the model is rebuilt for inference.
model.save_weights("/data/sentiment/LSTM_weights_emb_train.h5")

### Load model and predict on new data

In [31]:
from tensorflow.keras.models import load_model

# Rebuild the architecture from its JSON config. The context manager closes the
# file even if reading raises, which the manual open()/close() pair did not.
# NOTE(review): these paths have no "_emb_train" suffix, unlike the files written
# in the "Save Model" section — confirm that a different model is intended here.
with open('/data/sentiment/LSTM_config.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# Add weights to the model structure
# NOTE(review): the recorded "'str' object has no attribute 'decode'" error was
# raised somewhere in this cell; presumably an h5py / TensorFlow version
# incompatibility when reading the HDF5 weights — verify the installed h5py version.
loaded_model.load_weights("/data/sentiment/LSTM_weights.h5")


AttributeError: 'str' object has no attribute 'decode'

In [26]:
# loading model tokenizer
# NOTE: unpickling can execute arbitrary code; only load pickle files from a trusted source.
with open('/data/sentiment/LSTM_model_tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [27]:
# Read the prediction file.
# Decode explicitly as UTF-8 so that non-ASCII review text is read the same way
# on every platform instead of depending on the locale's default encoding.
with open('senti.json', encoding='utf-8') as data:
    input_json = json.load(data)
# The records to score are the 'content' of the first entry under 'data'.
DATA = input_json['data'][0]['content']
# Keep the "text" column as plain Python objects rather than letting pandas infer a dtype.
df = pd.read_json(json.dumps(DATA), orient='records', dtype={"text": object})

In [None]:
# Pre-process the input file for predictions: word indices, post-padded to maxlen.
# NOTE(review): the training cells strip apostrophes from the text before
# tokenising, but the equivalent step is disabled here:
#   df['text'] = df['text'].apply(lambda x: x.replace('\'',''))
# Confirm whether the loaded tokenizer/model expect apostrophes to be removed.
maxlen = 196
x_pred = pad_sequences(
    tokenizer.texts_to_sequences(df.text),
    padding='post',
    maxlen=maxlen,
)

In [None]:
# Score the padded sequences and store the first output column as the sentiment score.
# NOTE(review): in the training cells the label [1, 0] encodes sentiment == 1, so
# column 0 is presumably that class — confirm this holds for the loaded model.
pred = loaded_model.predict(x_pred)
df['sentiment'] = pred[:,0]

In [None]:
# Display the prediction frame, including the new 'sentiment' column.
df

### Bonus: cleaning steps

In [None]:
import re
def remove_URL(text):
    """Return ``text`` with http(s):// links and www.-prefixed links removed."""
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

def remove_html(text):
    """Return ``text`` with every ``<...>`` span (shortest match) removed."""
    return re.sub(r'<.*?>', r'', text)

def remove_names(text):
    """Return ``text`` with @mentions (``@`` followed by word characters) removed."""
    mention = re.compile(r'@\w+')
    return mention.sub('', text)
    
# text = remove_URL(text)
# text = remove_html(text)
# text = remove_names(text)
# s = text.lower().replace("[^a-z0-9.\s]+","")

In [None]:
# Strip URLs and HTML tags, then lower-case the text and drop every character
# that is not a lowercase letter, a digit, a full stop or whitespace.
df['text'] = df['text'].apply(remove_URL)
df['text'] = df['text'].apply(remove_html)
# regex=True is passed explicitly: the default for Series.str.replace differs
# between pandas versions, and without it the pattern may be treated as a literal
# string and match nothing. The raw string avoids the invalid "\s" escape warning.
df['text'] = df['text'].str.lower().str.replace(r"[^a-z0-9.\s]+", "", regex=True)