In [1]:
import pandas as pd

In [2]:
import tensorflow as tf
devices = tf.config.experimental.list_physical_devices("GPU")
for device in devices:
    tf.config.experimental.set_memory_growth(device, True)

In [3]:
import spacy
import re
nlp = spacy.load('en_core_web_lg')
def tokenize(sentence):
    sentence = sentence.lower()
    sent = nlp(sentence)
    tokens = [token.lemma_ for token in sent if not token.is_stop]
    return tokens

def load_dataset(path):
    df = pd.read_csv(path, header=None, sep=';')
    df.columns=['sentence', 'sentiment']
    mapping = {'sadness': 1,
              'fear' : 1,
              'anger' : 1,
              'love' : 0,
              'surprise' : 0,
              'joy' : 0}
    df['sentiment'] = df.sentiment.map(mapping)
    df['sentence'] = df['sentence'].apply(tokenize)
    return df

In [4]:
#https://www.kaggle.com/praveengovi/emotions-dataset-for-nlp

df_train = load_dataset('train.txt')
df_val = load_dataset('val.txt')
df_test = load_dataset('test.txt')

In [5]:
import numpy as np
def X_y_builder(df):
    X = list([sent for sent in df.sentence ])
    y = np.array([label for label in df.sentiment])
    return X, y
X_train, y_train = X_y_builder(df_train)
X_val, y_val = X_y_builder(df_val)
X_test, y_test = X_y_builder(df_test)

In [6]:
import gensim.downloader as api
from gensim.models import Word2Vec

model_wiki = api.load('fasttext-wiki-news-subwords-300')
model_w2v = Word2Vec(X_train, size = 300, min_count = 2).wv  


In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer



DICT_SIZE = 15000

# creating a dictionary with most used words
# where num of words = DICT_SIZE
tokenizer = Tokenizer(num_words=DICT_SIZE)
tokenizer.fit_on_texts(X_train)
print(list(tokenizer.word_index.items())[:5])
len(tokenizer.word_index)


[('feel', 1), ('like', 2), ('m', 3), ('nt', 4), ('feeling', 5)]


11856

In [8]:
x_train_max_len = max([len(i) for i in X_train])
x_test_max_len = max([len(i) for i in X_test])
x_validation_max_len = max([len(i) for i in X_val])

MAX_LEN = max(x_train_max_len, x_test_max_len, x_validation_max_len)

In [9]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

X_train = tokenizer.texts_to_sequences(X_train)
X_train_pad = pad_sequences(X_train, maxlen=MAX_LEN)

X_test = tokenizer.texts_to_sequences(X_test)
X_test_pad = pad_sequences(X_test, maxlen=MAX_LEN)

X_val = tokenizer.texts_to_sequences(X_val)
X_val_pad = pad_sequences(X_val, maxlen=MAX_LEN)

In [10]:
def create_weight_matrix(model, second_model=False):
    '''
    Accepts word embedding model
    and the second model, if provided
    Returns weight matrix of size m*n, where
    m - size of the dictionary
    n - size of the word embedding vector
    '''
    vector_size = model.get_vector('like').shape[0]
    w_matrix = np.zeros((DICT_SIZE, vector_size))
    skipped_words = []

    for word, index in tokenizer.word_index.items():
        if index < DICT_SIZE:
            if word in model.index2word: 
                w_matrix[index] = model.get_vector(word)
        else:
            if second_model:
                if word in second_model.index2word:
                    w_matrix[index] = second_model.get_vector(word)
                else:
                    skipped_words.append(word)
            else:
                skipped_words.append(word)
 
    print(f'{len(skipped_words)} words were skipped. Some of them:')
    print(skipped_words[:50])
    return w_matrix

In [11]:
weight_matrix = create_weight_matrix(model_wiki, model_w2v)

0 words were skipped. Some of them:
[]


In [12]:
weight_matrix.shape

(15000, 300)

In [13]:
# import models, layers, optimizers from tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout
from tensorflow.keras.optimizers import Adam

In [14]:
# import and initialize early stopping
from tensorflow.keras.callbacks import EarlyStopping
stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)


In [15]:


# initialize sequential model
model = Sequential()
model.add(Embedding(input_dim = DICT_SIZE, # the whole vocabulary size
                    output_dim = weight_matrix.shape[1], # vector space dimension
                    input_length = X_train_pad.shape[1], # max_len of text sequence
                    weights=[weight_matrix], # assign the embedding weight with embedding marix
                    trainable=False)) # set the weight to be not trainable (static)



2021-09-22 10:56:28.352982: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [16]:


#model.add(Bidirectional(LSTM(128, return_sequences=True))) 
#model.add(Dropout(0.2))
#model.add(Bidirectional(LSTM(256, return_sequences=True)))
#model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(128, return_sequences=False)))
#model.add(Dropout(0.2))
model.add(Dense(2, activation = 'softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics='accuracy')


model.summary()



Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 34, 300)           4500000   
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               439296    
_________________________________________________________________
dense (Dense)                (None, 2)                 514       
Total params: 4,939,810
Trainable params: 439,810
Non-trainable params: 4,500,000
_________________________________________________________________


In [17]:

history = model.fit(X_train_pad, y_train, 
                    validation_data = (X_val_pad, y_val),
                    batch_size = 8,
                    epochs = 3, 
                    callbacks = stop)

2021-09-22 10:56:39.298144: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/3
Epoch 2/3
Epoch 3/3


In [18]:
model.evaluate(X_test_pad, y_test) 



[0.10989311337471008, 0.9585000276565552]

In [19]:
def predict(sentence):
    sent = [token for token in tokenize(sentence)]
    sent = [token[0]for token in tokenizer.texts_to_sequences(sent) if len(token)>0]
    sent = pad_sequences([sent], maxlen=MAX_LEN)
    return np.argmax(model(sent))
    
predict(" You're incompetent!")

1

In [23]:
model.save('../flask_app/static/sentiment_analyser/model')




INFO:tensorflow:Assets written to: ../flask_app/static/sentiment_analyser/model/assets


INFO:tensorflow:Assets written to: ../flask_app/static/sentiment_analyser/model/assets


In [25]:
import pickle

with open('../flask_app/static/tokenizer.pickle', 'wb') as file:
    pickle.dump(tokenizer, file, protocol=pickle.HIGHEST_PROTOCOL)

In [26]:
import json
import io
tokenizer_json = tokenizer.to_json()
with io.open('../flask_app/static/tokenizer.json', 'w', encoding='utf-8') as file:
    file.write(json.dumps(tokenizer_json, ensure_ascii=False))
