In [None]:
import pandas as pd

In [None]:
import tensorflow as tf
# Ask TensorFlow to allocate GPU memory on demand instead of reserving it all
# up front. Every visible GPU is configured (the setting has to be the same on
# all of them), and a machine without a GPU simply skips the loop instead of
# failing on an empty device list.
for device in tf.config.experimental.list_physical_devices("GPU"):
    tf.config.experimental.set_memory_growth(device, True)

In [None]:
import spacy
import re
# spaCy pipeline shared by every tokenize() call.
nlp = spacy.load('en_core_web_lg')


def tokenize(sentence):
    """Return the lemmas of all non-stop-word tokens in ``sentence``.

    The sentence is lower-cased before it is handed to the spaCy pipeline;
    tokens flagged as stop words by spaCy are dropped and the remaining
    tokens are represented by their lemma strings.
    """
    lemmas = []
    for tok in nlp(sentence.lower()):
        if tok.is_stop:
            continue
        lemmas.append(tok.lemma_)
    return lemmas

def load_dataset(path):
    """Read a ``sentence;emotion`` file into a tokenized, binary-labelled frame.

    The file is parsed as header-less, semicolon-separated text with two
    columns. The emotion label is collapsed to 1 for sadness/fear/anger and
    0 for love/surprise/joy (any other label becomes NaN, because it has no
    entry in the mapping), and each sentence is replaced by the token list
    produced by ``tokenize``.

    Returns the resulting DataFrame with columns ``sentence`` and ``sentiment``.
    """
    negative = ('sadness', 'fear', 'anger')
    positive = ('love', 'surprise', 'joy')
    label_of = {**dict.fromkeys(negative, 1), **dict.fromkeys(positive, 0)}

    frame = pd.read_csv(path, header=None, sep=';')
    frame.columns = ['sentence', 'sentiment']
    frame['sentiment'] = frame['sentiment'].map(label_of)
    frame['sentence'] = frame['sentence'].apply(tokenize)
    return frame

In [None]:
# Data source: https://www.kaggle.com/praveengovi/emotions-dataset-for-nlp
# The three split files are read from the current working directory.

df_train, df_val, df_test = (
    load_dataset(split_file)
    for split_file in ('train.txt', 'val.txt', 'test.txt')
)

In [None]:
import numpy as np
def X_y_builder(df):
    """Split a prepared frame into model inputs and labels.

    Returns a tuple ``(X, y)`` where ``X`` is a plain list holding the values
    of the ``sentence`` column and ``y`` is a NumPy array built from the
    values of the ``sentiment`` column.
    """
    features = list(df['sentence'])
    labels = np.array(df['sentiment'].tolist())
    return features, labels
# Same train / validation / test order as the frames above.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = map(
    X_y_builder, (df_train, df_val, df_test)
)

In [None]:
import gensim.downloader as api
from gensim.models import Word2Vec

# Pretrained vectors fetched by name through gensim's downloader API.
model_wiki = api.load('fasttext-wiki-news-subwords-300')
# Word2Vec trained on the training token lists; only the keyed vectors (.wv)
# are kept. `size` is the gensim 3.x keyword (gensim 4 renamed it to
# `vector_size`). This must run while X_train still holds token lists, i.e.
# before the cell that rebinds X_train to index sequences.
model_w2v = Word2Vec(X_train, size = 300, min_count = 2).wv  


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer



DICT_SIZE = 15_000

# Build the word -> index vocabulary from the training tokens only.
# word_index keeps every word seen, but with num_words=DICT_SIZE only indices
# below DICT_SIZE are emitted when encoding; index 0 is reserved, so that is
# the DICT_SIZE - 1 most frequent words.
tokenizer = Tokenizer(num_words=DICT_SIZE)
tokenizer.fit_on_texts(X_train)
print([*tokenizer.word_index.items()][:5])
len(tokenizer.word_index)


In [None]:
# Longest sequence in each split.
x_train_max_len = max(map(len, X_train))
x_test_max_len = max(map(len, X_test))
x_validation_max_len = max(map(len, X_val))

# Common padded length: the longest sequence found in any of the three splits.
MAX_LEN = max((x_train_max_len, x_test_max_len, x_validation_max_len))

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Replace the token lists by vocabulary-index sequences. This rebinds
# X_train / X_test / X_val, so the cell is meant to run once per kernel session.
X_train, X_test, X_val = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test, X_val)
)

# Pad each split to the common length MAX_LEN.
X_train_pad, X_test_pad, X_val_pad = (
    pad_sequences(sequences, maxlen=MAX_LEN)
    for sequences in (X_train, X_test, X_val)
)

In [None]:
def create_weight_matrix(model, second_model=False):
    '''
    Build the embedding weight matrix for the words known to ``tokenizer``.

    Accepts a word embedding model (keyed vectors) and, optionally, a second
    model that is consulted for words missing from the first one.

    Returns a weight matrix of size m*n, where
    m - size of the dictionary (DICT_SIZE)
    n - size of the word embedding vector
    Row i holds the vector of the word whose tokenizer index is i. Rows of
    words found in neither model (and row 0, which has no word) stay zero.

    Raises ValueError if the second model's vector size differs from the
    first one's, since both must fit the same matrix.
    '''
    vector_size = model.vector_size
    # Both False (the default) and None mean "no fallback model".
    has_fallback = second_model is not None and second_model is not False
    if has_fallback and second_model.vector_size != vector_size:
        raise ValueError(
            f'second_model vector size {second_model.vector_size} '
            f'does not match primary vector size {vector_size}'
        )

    w_matrix = np.zeros((DICT_SIZE, vector_size))
    skipped_words = []

    for word, index in tokenizer.word_index.items():
        # The matrix only has DICT_SIZE rows, so anything beyond that has no
        # row to fill and is ignored.
        if index >= DICT_SIZE:
            continue
        # Membership is tested on the model itself (vocabulary lookup) rather
        # than by scanning its word list for every word.
        if word in model:
            w_matrix[index] = model.get_vector(word)
        elif has_fallback and word in second_model:
            w_matrix[index] = second_model.get_vector(word)
        else:
            skipped_words.append(word)

    print(f'{len(skipped_words)} words were skipped. Some of them:')
    print(skipped_words[:50])
    return w_matrix

In [None]:
# fastText vectors are passed as the main model, the corpus-trained Word2Vec
# vectors as the optional second model.
weight_matrix = create_weight_matrix(model_wiki, model_w2v)

In [None]:
# Sanity check: the matrix is allocated as (DICT_SIZE, embedding vector size).
weight_matrix.shape

In [None]:
# import models, layers, optimizers from tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout
from tensorflow.keras.optimizers import Adam

In [None]:
# import and initialize early stopping
from tensorflow.keras.callbacks import EarlyStopping
# Watch the validation loss (lower is better) and stop once it has failed to
# improve for 3 consecutive epochs; verbose=1 logs when that happens.
stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)


In [None]:


# Sequential model that starts with a frozen embedding layer initialised from
# the precomputed weight matrix.
model = Sequential([
    Embedding(
        input_dim=DICT_SIZE,                  # number of rows in the vocabulary
        output_dim=weight_matrix.shape[1],    # embedding vector dimension
        input_length=X_train_pad.shape[1],    # padded sequence length
        weights=[weight_matrix],              # start from the embedding matrix
        trainable=False,                      # keep the vectors static
    )
])



In [None]:


# NOTE: this cell appends layers to `model`, so re-running it without
# re-creating the model stacks the layers a second time.

# Deeper stacked variant, kept disabled:
#model.add(Bidirectional(LSTM(128, return_sequences=True)))
#model.add(Dropout(0.2))
#model.add(Bidirectional(LSTM(256, return_sequences=True)))
#model.add(Dropout(0.2))

# Single bidirectional LSTM; return_sequences=False keeps only the last output.
model.add(Bidirectional(LSTM(128, return_sequences=False)))
#model.add(Dropout(0.2))

# Two-unit softmax head, one unit per class label (0 and 1).
model.add(Dense(2, activation='softmax'))

# Integer class labels go with the sparse categorical cross-entropy loss.
# `metrics` is passed as a list, which is the form the compile API documents;
# a bare string is not accepted by every Keras version.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


model.summary()



In [None]:

# `callbacks` is passed as a list, which is the form the fit API documents.
# NOTE(review): `stop` is created with patience=3, so within epochs=3 it can
# never trigger; raise `epochs` if early stopping is meant to take effect.
history = model.fit(
    X_train_pad, y_train,
    validation_data=(X_val_pad, y_val),
    batch_size=8,
    epochs=3,
    callbacks=[stop],
)

In [None]:
# Loss and accuracy of the trained model on the test split.
model.evaluate(X_test_pad, y_test) 

In [None]:
def predict(sentence):
    """Classify a raw sentence and return the predicted class index.

    The sentence is tokenized with ``tokenize``, encoded with the fitted
    Keras ``tokenizer``, padded to ``MAX_LEN`` and passed through ``model``.

    Returns the index of the highest-scoring output unit: 1 for the
    sadness/fear/anger group, 0 for the love/surprise/joy group.
    """
    tokens = tokenize(sentence)
    # Encode the whole token list as ONE text, exactly like the training data.
    # Passing each token as its own string would send it through Keras' string
    # filtering/splitting, which token lists bypass, so the same token could be
    # encoded differently (or dropped) at prediction time.
    encoded = tokenizer.texts_to_sequences([tokens])
    padded = pad_sequences(encoded, maxlen=MAX_LEN)
    return np.argmax(model(padded))


predict(" You're incompetent!")

In [None]:
# Persist the trained model to the path sentiment_analyser/model.
model.save('sentiment_analyser/model')


In [None]:
import pickle

# Binary snapshot of the fitted tokenizer. Pickle files execute code when
# loaded, so only unpickle this file if it comes from a trusted source.
with open('tokenizer.pickle', 'wb') as file:
    file.write(pickle.dumps(tokenizer, protocol=pickle.HIGHEST_PROTOCOL))

In [None]:
import json
import io
# to_json() already returns a JSON string. It is JSON-encoded once more when
# written, so a reader has to json.load() the file to get that string back
# before rebuilding the tokenizer from it.
tokenizer_json = tokenizer.to_json()
with open('tokenizer.json', 'w', encoding='utf-8') as file:
    json.dump(tokenizer_json, file, ensure_ascii=False)
