In [1]:
#Import required libraries
import keras as k
import tensorflow as tf
import pandas as pd
import numpy as np

from keras.models import Sequential
from sklearn.metrics.pairwise import cosine_similarity
from keras.utils import plot_model
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.models import Model, load_model
from keras.layers import Dense, Input, Dropout, LSTM, Activation, Input, Bidirectional

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [5]:
#load GloVe embeddings in a matrix
word2idx = {} # dict so we can lookup indices for tokenising our text later from string to sequence of integers
idx2word = {}
weights = []

#with (glove_data_directory / 'glove.6B.50d.txt').open('r') as file:
with open('glove.6B/glove.6B.100d.txt', 'rb') as file:    
    for index, line in enumerate(file):
        values = line.split() # Word and weights separated by space
        word = values[0].decode("utf-8") # Word is first symbol on each line
        word_weights = np.asarray(values[1:], dtype=np.float32) # Remainder of line is weights for word
        word2idx[word] = index # PAD is our zeroth index so shift by one
        idx2word[index] = word
        weights.append(word_weights)

In [7]:
#Function to get word embedding and index
def get_embedding_and_index(v_word):
    #Print embeddings for a word
    ix = word2idx[v_word]
    return(ix , weights[ix])

print(get_embedding_and_index("product"))
emb1 = get_embedding_and_index("requirements")
emb2 = get_embedding_and_index("compliance")
emb3 = get_embedding_and_index("standards")
print("Sim1:", cosine_similarity(emb1[1].reshape(1,-1), emb2[1].reshape(1,-1)))
print("Sim2:", cosine_similarity(emb1[1].reshape(1,-1), emb3[1].reshape(1,-1)))
idx2word[5466]

(1459, array([ 0.12804  ,  0.34131  ,  0.33106  , -0.026678 , -0.022675 ,
       -1.0228   ,  0.65186  , -0.14204  ,  0.29102  ,  0.56137  ,
       -0.1294   , -0.77794  , -0.014738 , -0.0082412,  0.19769  ,
        0.42299  ,  0.64201  ,  0.89195  ,  0.28199  ,  0.038209 ,
       -0.066105 , -0.39848  , -0.025111 ,  0.45934  , -0.45628  ,
        0.36668  ,  0.56928  , -0.15604  , -0.82312  , -0.46751  ,
        0.35949  ,  0.97564  , -0.047988 , -0.47062  ,  0.65927  ,
        0.66212  ,  0.18403  , -0.052545 , -0.63723  , -0.53374  ,
        0.50934  , -0.55863  ,  0.011983 ,  0.096682 ,  0.053548 ,
        0.29566  , -0.15537  , -0.40615  , -0.58044  , -0.92148  ,
        0.61701  , -0.019925 , -0.19368  ,  0.72811  ,  0.076774 ,
       -1.6533   , -0.6374   , -0.060303 ,  1.9839   ,  0.13529  ,
        0.47406  , -0.1415   , -0.37578  ,  0.15041  ,  0.89496  ,
       -0.073249 ,  0.6373   , -0.33459  ,  0.97642  , -0.41846  ,
        0.26385  ,  0.6476   , -0.057542 ,  0.0052852, 

'funny'

In [12]:
embed_size = emb_dim = 50
vocab_len = len(word2idx) + 1
vocab_len

400001

In [13]:
#List of sentences to list of indices
def sentences_to_indices(v_sentences,max_len):
    #print(len(v_sentences))
    sentence_idxs = np.zeros((len(v_sentences),max_len),np.int64)

    #Pick up each sentence
    for isx, v_s in enumerate(v_sentences):
        #sentence to words
        words = v_s.lower().split()

        #For each word convert to index:
        #print(isx, words)
        for iwx, w in enumerate(words):
            sentence_idxs[isx, iwx] = get_embedding_and_index(w)[0]

    return(sentence_idxs)

In [14]:
sentences_to_indices(["funny lol", "lets play baseball", "food is ready for you"], max_len = 5)

array([[ 5466, 73048,     0,     0,     0],
       [ 8235,   282,  1444,     0,     0],
       [  565,    14,  1188,    10,    81]], dtype=int64)

In [15]:
#convert list of sentences to list of embed vectors (m, max_len) -> (m, max_len, embed_size)
def sentences_to_embeds(v_sentences, max_len):
    m = len(v_sentences)
    
    #Convert sentences to indices first
    sen_idx = sentences_to_indices(v_sentences, max_len = max_len)
    
    #Declare array of zeros first of shape (m, max_len, embed_size)
    sen_embs = np.zeros((m, max_len, embed_size))
    
    #For each index of word, convert to embedding
    for s_ix, s_txt in enumerate(v_sentences):
        #print(s_txt)
        for word_index in range(max_len):
            #Replace 0s with word embeddings
            #if s_ix == 0 and word_index == 0:
                #print(weights[int(sen_idx[s_ix, word_index])])
                #print(sen_idx[s_ix, word_index])
            sen_embs[s_ix,word_index,:] = weights[int(sen_idx[s_ix, word_index])]
    
    #Return embeddings
    return(sen_embs)

In [16]:
sentences_to_embeds(["funny lol", "lets play baseball", "food is ready for you"], max_len = 10)[0,0,0]

-0.014546999707818031

In [17]:
#Load training data
all_data = pd.read_csv('train_data.csv')
all_data.head()

Unnamed: 0,text,label
0,never talk to me again,3
1,I am proud of your achievements,2
2,It is the worst day in my life,3
3,Miss you so much,0
4,food is life,4


In [18]:
#Get length of maximum sentence
max_len = all_data.text.map(lambda x: len(x.split())).max()
max_len

10

In [21]:
#Create one hot encoding of labels
def create_yoh_list(v_classes, num_classes):
    yoh_class = np.zeros((len(v_classes),num_classes))
    
    for ix, v_cl in enumerate(v_classes):
        yoh_class[ix, v_cl] = 1
    
    return(yoh_class)

In [12]:
#Test one hot encoder code
create_yoh_list([1,3,4,2],num_classes = 5)

array([[0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.]])

In [22]:
#Def class to text
def class_to_text(cls):
    dict_class = {0:'love', 1:'playful', 2:'happy',3:'sad', 4:'foodie'}
    return dict_class[cls]

class_to_text(2)

'happy'

In [19]:
#Random split dataset into Train and Validation
train_data = all_data.sample(frac=0.95,random_state=1)
valid_data = all_data.drop(train_data.index)

train_data.info()
valid_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 179 entries, 31 to 134
Data columns (total 2 columns):
text     179 non-null object
label    179 non-null int64
dtypes: int64(1), object(1)
memory usage: 4.2+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 9 entries, 37 to 144
Data columns (total 2 columns):
text     9 non-null object
label    9 non-null int64
dtypes: int64(1), object(1)
memory usage: 216.0+ bytes


In [36]:
#Create Keras' embedding layer
#Emb layer => Takes word's index as input and converts it to word embeddings from preloaded GloVe embedding
def get_embedding_layer():
    # Initialize the embedding matrix as a numpy array of zeros of shape (vocab_len, dimensions of word vectors = emb_dim)
    emb_matrix = np.zeros((vocab_len,emb_dim))
    
    # Set each row "index" of the embedding matrix to be the word vector representation of the "index"th word of the vocabulary
    for index in range(len(weights)):
        emb_matrix[index, :] = weights[index]

    e = Embedding(len(word2idx) + 1, 50, input_length=max_len, trainable=False)
    
    # Build the embedding layer, it is required before setting the weights of the embedding layer. Do not modify the "None".
    e.build((None,))
    
    # Set the weights of the embedding layer to the embedding matrix
    e.set_weights([emb_matrix])
    return(e)

In [37]:
#Test embedding layer
embed_layer = get_embedding_layer()
print(embed_layer.get_weights()[0][5466][0]) #Get embedding of second word's "funny" first feature

-0.014547


In [38]:
def create_keras_model(max_len, embed_layer):
    #Start creating Tensorflow based Keras model
    model = Sequential()

    #Add input layer
    #You don't need to include the batch size here
    Inp = Input(shape=(max_len,))

    #Call embedding layer and pass input
    X = embed_layer(Inp)

    #Add first LSTM layer
    ###Very important -> Return_sequences is used to refer that every LSTM time step will return output or only last one
    X = Bidirectional(LSTM(units=128,return_sequences = True))(X)
    
    #Add dropout 0.5
    X = Dropout(0.5)(X)
    
    #Add second LSTM layer
    ###Very important -> Return_sequences is used to refer that every LSTM time step will return output or only last one
    X = Bidirectional(LSTM(units=256,return_sequences = True))(X)
    
    #Add dropout 0.5
    X = Dropout(0.5)(X)    

    #Add third LSTM layer
    ###Very important -> Return_sequences is used to refer that every LSTM time step will return output or only last one
    X = LSTM(units=128,return_sequences = False)(X)
    
    #Add dropout 0.5
    X = Dropout(0.5)(X)    

    #Add final softmax layer with 5 possible outputs
    X = Dense(5, activation='softmax')(X)
    
    # Create Model instance which converts sentence_indices into X.
    model = Model(inputs=Inp, outputs=X)
    
    return model

In [39]:
#Initialize model and show summary
k_model = create_keras_model(max_len ,embed_layer)
k_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 10)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 10, 50)            20000050  
_________________________________________________________________
bidirectional_3 (Bidirection (None, 10, 256)           183296    
_________________________________________________________________
dropout_4 (Dropout)          (None, 10, 256)           0         
_________________________________________________________________
bidirectional_4 (Bidirection (None, 10, 512)           1050624   
_________________________________________________________________
dropout_5 (Dropout)          (None, 10, 512)           0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 128)               328192    
__________

In [40]:
k_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [41]:
#Prepare training data using helper methods created above
X_train = sentences_to_indices(train_data['text'].values, max_len = max_len)
Y_train = create_yoh_list(train_data['label'].values,num_classes = 5)

In [42]:
#start training process
k_model.fit(X_train, Y_train, epochs = 100, batch_size = 32, shuffle=True)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x201008d5d68>

In [43]:
#Evaluate model
X_valid = sentences_to_indices(valid_data['text'].values, max_len = max_len)
Y_valid = create_yoh_list(valid_data['label'].values,num_classes = 5)

#Run on validation dataset
loss, acc = k_model.evaluate(X_valid, Y_valid)
print("Test accuracy = ", acc)

Test accuracy =  1.0


In [73]:
#Test model with custom sentence
test_sentence = 'stop it !'
x_test = np.array([test_sentence])
X_test_indices = sentences_to_indices(x_test, max_len = max_len)
print(test_sentence +' --->  '+  class_to_text(int(np.argmax(k_model.predict(X_test_indices)))))

stop it ! --->  sad


In [52]:
#Save trained model on system
k_model.save('trained_model.h5')

In [23]:
#Load pretrained model trial
k_model_trained = load_model('trained_model.h5')

test_sentence = 'i am not happy'
x_test = np.array([test_sentence])
X_test_indices = sentences_to_indices(x_test, max_len = max_len)
k_model_trained.predict(X_test_indices)
print(test_sentence +' --->  '+  class_to_text(int(np.argmax(k_model_trained.predict(X_test_indices)))))

i am not happy --->  sad
