In [1]:
import tensorflow as tf
import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Bidirectional, GlobalMaxPooling1D, Dropout
from keras.models import Sequential
import json
import csv
import numpy as np

In [2]:
def load_data_from_csv(file_path):
    data = []
    with open(file_path, 'r') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        for row in csv_reader:
            row['label'] = int(row['label'])
            data.append(row)
    return data

train_data = load_data_from_csv(r'C:\Users\srini\Documents\text-deploy\models\splitdata\train.csv')
val_data = load_data_from_csv(r'C:\Users\srini\Documents\text-deploy\models\splitdata\validation.csv')
test_data = load_data_from_csv(r'C:\Users\srini\Documents\text-deploy\models\splitdata\test.csv')

train_texts = [entry['text'] for entry in train_data]
train_labels = [entry['label'] for entry in train_data]

val_texts = [entry['text'] for entry in val_data]
val_labels = [entry['label'] for entry in val_data]

test_texts = [entry['text'] for entry in test_data]
test_labels = [entry['label'] for entry in test_data]

train_labels = np.array(train_labels)
val_labels = np.array(val_labels)

max_words = 30000
max_sequence_length = 100
embedding_dim = 100
lstm_units = 64
dropout_rate = 0.5
num_classes = 6

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_texts)

In [3]:
train_sequences = tokenizer.texts_to_sequences(train_texts)
val_sequences = tokenizer.texts_to_sequences(val_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

# Pad sequences to ensure consistent length
train_sequences = pad_sequences(train_sequences, maxlen=max_sequence_length)
val_sequences = pad_sequences(val_sequences, maxlen=max_sequence_length)
test_sequences = pad_sequences(test_sequences, maxlen=max_sequence_length)

In [4]:
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_sequence_length))
model.add(Bidirectional(LSTM(lstm_units, return_sequences=True)))
model.add(GlobalMaxPooling1D())
model.add(Dropout(dropout_rate))
model.add(Dense(128, activation='relu'))
model.add(Dropout(dropout_rate))
model.add(Dense(num_classes, activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          3000000   
                                                                 
 bidirectional (Bidirectiona  (None, 100, 128)         84480     
 l)                                                              
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0         
 lMaxPooling1D)                                                  
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense (Dense)               (None, 128)               16512     
                                                                 
 dropout_1 (Dropout)         (None, 128)               0

In [5]:
model.fit(train_sequences, train_labels, epochs=10, batch_size=64, validation_data=(val_sequences, val_labels))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x18d104cee50>

In [6]:
model.save("models/emotion.h5")

In [7]:
tokenizer_word_index = tokenizer.word_index
tokenizer_config = tokenizer.get_config()

with open("tokenizer_word_index.json", "w") as json_file:
    json.dump(tokenizer_word_index, json_file)

with open("tokenizer_config.json", "w") as json_file:
    json.dump(tokenizer_config, json_file)