In [None]:
import json
import random
import re

import numpy as np
import spacy
from keras.callbacks import CSVLogger, TensorBoard
from keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout
from keras.models import Sequential, load_model
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

In [2]:
# spaCy pipeline shared by the text-cleaning helpers (stop-word flags and lemmas).
nlp = spacy.load('en_core_web_sm')

# Extra, problem-specific stop words. They were picked for the data set used in
# the original study; adapt these lists to your own problem.
DAYS = [
    'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
]
MONTHS = [
    'january', 'february', 'march', 'april', 'may', 'june',
    'july', 'august', 'september', 'october', 'november', 'december',
    'ja', 'jan', 'fe', 'feb', 'march', 'mar', 'ma', 'ap', 'apr',
    'jun', 'ju', 'jul', 'aug', 'au', 'sept', 'sep', 'se',
    'oct', 'nov', 'decembe', 'dec',
]
SUFIX = ['st', 'nd', 'rd', 'th']
TIME = ['hr', 'hrs', 'hour', 'hours', 'date', 'gmt', 'time']
MEASUREMENT_UNITS = ['mt', 'lt', 'foot', 'lat']

# Flag every custom word as a stop word on the pipeline's vocabulary so that
# `token.is_stop` is True for it in the helpers below.
for extra_stop_word in (*DAYS, *MONTHS, *SUFIX, *TIME, *MEASUREMENT_UNITS):
    nlp.vocab[extra_stop_word].is_stop = True

# The pipeline's default stop-word collection, kept for inspection.
stop_words = nlp.Defaults.stop_words

In [5]:
def remove_stop_words(text):
    """Drop every token that the spaCy pipeline flags as a stop word.

    Args:
        text: String to process with the module-level `nlp` pipeline.

    Returns:
        The texts of the remaining tokens joined with single spaces.
    """
    kept_tokens = []
    for token in nlp(text):
        if token.is_stop:
            continue
        kept_tokens.append(token.text)
    return " ".join(kept_tokens)

def remove_numbers_caracters(text):
    """Replace everything except ASCII letters and whitespace, then lowercase.

    Args:
        text: Value to clean. Anything that is not a `str` yields "".

    Returns:
        `text` with each character outside `a-z`, `A-Z` and whitespace replaced
        by a single space, converted to lowercase; "" for non-string input.
    """
    if isinstance(text, str):
        # One space per removed character: runs of spaces are NOT collapsed here.
        letters_only = re.sub(r'[^a-zA-Z\s]', ' ', text)
        return letters_only.lower()
    return ""

def correct_white_spaces(text):
    """Collapse every run of whitespace in `text` into one space.

    Leading and trailing whitespace is collapsed too, but not removed.

    Args:
        text: String to normalise.

    Returns:
        The normalised string.
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)

# Optional data-cleaning helper; applying it is not required.
def remove_data_by_key(data, key='example_key'):
    """Return the records whose 'key' field differs from `key`.

    Args:
        data: Iterable of dict-like records, each having a 'key' entry.
        key: Label to exclude. Defaults to 'example_key', the value that was
            previously hard-coded, so existing calls behave as before.

    Returns:
        A new list with the matching records left out; `data` is not modified.

    Raises:
        KeyError: If a record has no 'key' entry.
    """
    return [item for item in data if item['key'] != key]

# Not used by the pipeline below; kept from an experiment with lemmatisation.
def lemmatize_text(text):
    """Return the lemmas of all non-stop-word tokens in `text`.

    Args:
        text: String to process with the module-level `nlp` pipeline.

    Returns:
        The lemmas of the kept tokens joined with single spaces.
    """
    lemmas = []
    for token in nlp(text):
        if not token.is_stop:
            lemmas.append(token.lemma_)
    return " ".join(lemmas)

In [6]:
# Read the raw records from disk.
with open('/data/output.json', encoding='utf-8') as raw_file:
    json_data = json.load(raw_file)

data = json_data

# Clean each record's 'lineText' in place: strip non-letters and lowercase,
# drop stop words, then collapse whitespace (trimming after every step).
for item in data:
    line_text = remove_numbers_caracters(item['lineText']).strip()
    line_text = remove_stop_words(line_text).strip()
    item['lineText'] = correct_white_spaces(line_text).strip()

In [7]:

# Write the cleaned records back to disk as indented JSON.
# NOTE(review): this is the same path the cleaning cell reads from, so the raw
# file is overwritten, while the following cell loads '/data/clean.json' —
# confirm which output path is intended.
with open('/data/output.json', 'w') as cleaned_file:
    json.dump(data, cleaned_file, indent=4)

In [None]:
# Load the records used for training and evaluation.
with open('/data/clean.json', encoding='utf-8') as file:
    json_data = json.load(file)

data = json_data

In [3]:
# Fixed seed so the 80/20 split is identical on every run.
SPLIT_SEED = 42

# Shuffle a copy rather than `data` itself: re-running this cell then yields
# the same split instead of re-shuffling an already shuffled list.
shuffled_data = list(data)
random.Random(SPLIT_SEED).shuffle(shuffled_data)

# First 80% of the shuffled records for training, the rest for testing.
split_index = int(0.8 * len(shuffled_data))
train_data = shuffled_data[:split_index]
test_data = shuffled_data[split_index:]

In [4]:
# Input texts of each split.
train_texts = [item['lineText'] for item in train_data]
test_texts = [item['lineText'] for item in test_data]

# Labels of each split.
train_labels = [item['key'] for item in train_data]
test_labels = [item['key'] for item in test_data]

# All distinct labels across both splits, in a deterministic order.
# Iteration order of a set of strings can differ between interpreter sessions,
# which would silently change the label <-> index mapping and make a model
# saved in one session decode to the wrong labels in another. Sorting pins the
# order; key=str keeps the sort working even if labels are not all strings.
all_labels_unique = sorted(set(train_labels) | set(test_labels), key=str)

# Label -> integer class index, and the encoded labels of each split.
class_to_index = {cls: i for i, cls in enumerate(all_labels_unique)}
train_label_indices = [class_to_index[cls] for cls in train_labels]
test_label_indices = [class_to_index[cls] for cls in test_labels]

num_classes = len(all_labels_unique)

In [5]:
# Every sequence is padded/truncated to this many tokens before reaching the model.
max_sequence_length = 100

# The vocabulary is fitted on the training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)

# Training texts as lists of integer word ids.
train_sequences = tokenizer.texts_to_sequences(train_texts)

In [None]:
#########CALLBACKS##############
# Per-epoch metrics are appended to a CSV-style log file.
csv_logger = CSVLogger('training_final.log')

# TensorBoard event files. The legacy `write_grads` and `embeddings_layer_names`
# arguments were dropped: the TF2-era Keras callback ignores them (with a
# warning), so they had no effect on what gets logged.
tbCallBack = TensorBoard(
    log_dir='./tensorboard_final',
    histogram_freq=0,
    write_graph=True,
    write_images=False,
    embeddings_freq=0,
    embeddings_metadata=None,
)


In [12]:
# One more than the number of known words — presumably because tokenizer word
# ids start at 1 and 0 is the padding value; confirm against the Keras docs.
vocab_size = len(tokenizer.word_index) + 1

# Embedding -> LSTM -> dense hidden layer -> softmax over the classes.
# NOTE(review): the summary output recorded under this cell shows a different
# architecture (200-dim embedding, 256-unit dense, a Dropout layer), so that
# output appears to come from an earlier version of this cell.
model = Sequential([
    Embedding(vocab_size, 100, input_length=max_sequence_length),
    LSTM(100),
    Dense(128, activation='relu'),
    Dense(num_classes, activation='softmax'),
])

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

model.summary()



Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 200)          842400    
                                                                 
 lstm_1 (LSTM)               (None, 200)               320800    
                                                                 
 dense_2 (Dense)             (None, 256)               51456     
                                                                 
 dropout_1 (Dropout)         (None, 256)               0         
                                                                 
 dense_3 (Dense)             (None, 191)               49087     
                                                                 
Total params: 1,263,743
Trainable params: 1,263,743
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Display the number of distinct labels (the size of the model's softmax output).
num_classes

191

In [14]:
# Bring every training sequence to exactly `max_sequence_length` ids.
train_sequences_padded = pad_sequences(train_sequences, maxlen=max_sequence_length)

# One-hot encode the training labels: row i gets a 1 in the column of its class index.
train_labels_onehot = np.zeros((len(train_label_indices), num_classes))
train_rows = np.arange(len(train_label_indices))
train_labels_onehot[train_rows, np.asarray(train_label_indices, dtype=int)] = 1

In [None]:
# Train for 100 epochs in batches of 64, logging through both callbacks.
# NOTE(review): `use_multiprocessing` is documented for generator/Sequence
# input; confirm it is needed when fitting on in-memory arrays.
training_callbacks = [csv_logger, tbCallBack]
model.fit(
    train_sequences_padded,
    train_labels_onehot,
    epochs=100,
    batch_size=64,
    callbacks=training_callbacks,
    use_multiprocessing=True,
)

# Persist the trained model so it can be reloaded for inference.
model.save('model_example.h5')

In [None]:
# Encode the test texts with the tokenizer fitted on the training texts,
# then pad them to the same length as the training sequences.
test_sequences = tokenizer.texts_to_sequences(test_texts)
test_sequences_padded = pad_sequences(test_sequences, maxlen=max_sequence_length)

# One-hot encode the test labels: row i gets a 1 in the column of its class index.
test_labels_onehot = np.zeros((len(test_label_indices), num_classes))
test_rows = np.arange(len(test_label_indices))
test_labels_onehot[test_rows, np.asarray(test_label_indices, dtype=int)] = 1

# Loss and accuracy on the held-out split.
loss, accuracy = model.evaluate(test_sequences_padded, test_labels_onehot)
for metric_name, metric_value in (('Test Loss:', loss), ('Test Accuracy:', accuracy)):
    print(metric_name, metric_value)

In [None]:
# Reload the saved model from disk (`load_model` comes from the import cell
# at the top of the notebook).
model = load_model('model_example.h5')

# NOTE(review): the training texts appear to have gone through
# remove_numbers_caracters / remove_stop_words / correct_white_spaces, while
# these phrases are fed to the tokenizer as-is — confirm whether inference
# input should be cleaned the same way.
new_texts = ['here you can insert your phrase to test the resulted model with the phrase and check the result']
new_sequences = tokenizer.texts_to_sequences(new_texts)
new_sequences_padded = pad_sequences(new_sequences, maxlen=max_sequence_length)

# One row of class probabilities per input text.
predictions = model.predict(new_sequences_padded)

# Decode with the inverse of the mapping used to encode the training labels,
# instead of relying on the iteration order of the label collection matching it.
index_to_class = {index: cls for cls, index in class_to_index.items()}
predicted_labels = [index_to_class[int(np.argmax(pred))] for pred in predictions]

for pred, text, label in zip(predictions, new_texts, predicted_labels):
    print('Text:', text)
    print('Predicted label:', label)
    print('higher probability:', pred.max())
    print('---')