In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN
from keras.layers import Dropout
from nltk.tokenize import word_tokenize
import tensorflow as tf
# from nltk.corpus import stopwords


In [2]:
import pandas as pd
import numpy as np
import re

# Read the Excel file into a pandas DataFrame
df = pd.read_csv('Dataset/Text_Emotion_Data.csv')

# Define a function to tokenize text into word sequences and remove stopwords
def tokenize_text(text):
    # Remove non-letter characters using the defined pattern
    pattern = r'[^a-zA-Z]'
    text = re.sub(pattern, ' ', text)
    # Convert the text to lowercase
    text = text.lower()
    # Tokenize the text into word sequences
    word_sequences = text.split()
    # Remove stopwords using the provided list
    with open('Dataset/stopwords.txt', 'r') as f:
        stopwords = f.read().splitlines()
    # Remove words with length less than or equal to 2
    word_sequences = [word for word in word_sequences if (word not in stopwords and len(word) > 2)]
    return word_sequences

# Tokenize each row of the text column into word sequences using the defined function
df['word_sequences'] = df['Text'].apply(tokenize_text)

# Find the maximum length of a word sequence
max_len = max(df['word_sequences'].apply(len))

# Define a function to pad the sequences to the maximum length
def pad_sequence(sequence):
    padded_sequence = sequence[:max_len] + ['']*(max_len-len(sequence))
    return padded_sequence

# Pad each sequence to the maximum length
df['word_sequences'] = df['word_sequences'].apply(pad_sequence)

# Combine all word sequences into a single list
all_sequences = []
for seq in df['word_sequences']:
    all_sequences.append(seq)

# Create a dictionary with unique words as keys and their corresponding index as values
word_dict = {}
index = 0
for seq in all_sequences:
    for word in seq:
        if word not in word_dict:
            word_dict[word] = index
            index += 1

# Convert each word sequence into a numerical vector with the corresponding index in the dictionary
num_vectors = []
for sequence in all_sequences:
    vector = []
    for word in sequence:
        if word in word_dict:
            index = word_dict[word]
            vector.append(index)
    num_vectors.append(vector)

# Convert the list of numerical vectors into a numpy array
X = np.array(num_vectors)

# Define the labels
labels = df['Label'].values
classes = np.unique(labels)
label_map = {label: i for i, label in enumerate(classes)}
y = np.array([label_map[label] for label in labels])

# Split last 150 text of each class for the test dataset
test_data = []
for c in classes:
    class_data = [(X[i], y[i]) for i in range(len(X)) if y[i] == label_map[c]]
    test_data.extend(class_data[-150:])

# Use the rest of the data for training
train_data = []
for i in range(len(X)):
    found = False
    for j in range(len(test_data)):
        if all(X[i] == test_data[j][0]) and y[i] == test_data[j][1]:
            found = True
            break
    if not found:
        train_data.append((X[i], y[i]))

# Separate the input features and labels for the training and test sets
X_train, y_train = zip(*train_data)
X_test, y_test = zip(*test_data)


X_train = np.array(X_train)
y_train = np.array(y_train)

X_test = np.array(X_test)
y_test = np.array(y_test)



In [3]:
df

Unnamed: 0,Text,Label,word_sequences
0,i now feel like im finally in a position to de...,joy,"[now, feel, like, finally, position, decide, w..."
1,i feel resigned to my lot in life being that i...,sadness,"[feel, resigned, lot, life, being, that, watch..."
2,i feel thrilled with your presence in your eye...,joy,"[feel, thrilled, with, your, presence, your, e..."
3,i may finally sit down and feel sweet release ...,joy,"[may, finally, sit, down, and, feel, sweet, re..."
4,i feel a bit jealous because i been trying to ...,anger,"[feel, bit, jealous, because, been, trying, da..."
...,...,...,...
3245,i just want to say all the things i want to sa...,sadness,"[just, want, say, all, the, things, want, say,..."
3246,i feel that i need to be more generous with my...,love,"[feel, that, need, more, generous, with, offer..."
3247,im sitting here in the belmont library listeni...,sadness,"[sitting, here, the, belmont, library, listeni..."
3248,i can t let go of that sad feeling that i want...,love,"[can, let, that, sad, feeling, that, want, acc..."


In [38]:
print('X_train shape :', X_train.shape)
print('X_test shape :' ,X_test.shape)
print('y_train shape :' ,y_train.shape)
print('y_test shape :', y_test.shape)
print('word_dict len :', len(word_dict))
print('max_len  :' ,max_len) 
X_train[20]

X_train shape : (2500, 27)
X_test shape : (750, 27)
y_train shape : (2500,)
y_test shape : (750,)
word_dict len : 6623
max_len  : 27


array([203,  65, 204,  10, 205,   2,  19, 206, 207, 133, 208, 176, 209,
       210,  15,  15,  15,  15,  15,  15,  15,  15,  15,  15,  15,  15,
        15])

In [39]:
X_train[20]

array([203,  65, 204,  10, 205,   2,  19, 206, 207, 133, 208, 176, 209,
       210,  15,  15,  15,  15,  15,  15,  15,  15,  15,  15,  15,  15,
        15])

In [40]:
# Reshape the input data to have a third dimension
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))
# Convert y_train to one-hot encoding
y_train = to_categorical(y_train, num_classes=5)
y_test= to_categorical(y_test, num_classes=5)

In [41]:
print('X_train shape :', X_train.shape)
print('X_test shape :' ,X_test.shape)
print('y_train shape :' ,y_train.shape)
print('y_test shape :', y_test.shape)
print('word_dict len :', len(word_dict))
print('max_len  :' ,max_len) 
X_train[20]

X_train shape : (2500, 27, 1)
X_test shape : (750, 27, 1)
y_train shape : (2500, 5)
y_test shape : (750, 5)
word_dict len : 6623
max_len  : 27


array([[203],
       [ 65],
       [204],
       [ 10],
       [205],
       [  2],
       [ 19],
       [206],
       [207],
       [133],
       [208],
       [176],
       [209],
       [210],
       [ 15],
       [ 15],
       [ 15],
       [ 15],
       [ 15],
       [ 15],
       [ 15],
       [ 15],
       [ 15],
       [ 15],
       [ 15],
       [ 15],
       [ 15]])

In [31]:
# Define the model architecture
model = Sequential()
model.add(SimpleRNN(64, input_shape=(max_len, 1), return_sequences=True))
model.add(Dropout(0.2))
model.add(SimpleRNN(32))
model.add(Dropout(0.2))
model.add(Dense(len(classes), activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=1000, batch_size=32)

# Evaluate the model on the train dataset
train_loss, train_acc = model.evaluate(X_train, y_train, verbose=0)
print('Train Loss:', train_loss)
print('Train Accuracy:', train_acc)

# Evaluate the model on the test dataset
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)
print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)


Epoch 1/1000


InvalidArgumentError: Graph execution error:

Detected at node 'sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits' defined at (most recent call last):
    File "/Users/saeed/opt/anaconda3/lib/python3.9/runpy.py", line 197, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/Users/saeed/opt/anaconda3/lib/python3.9/runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "/Users/saeed/opt/anaconda3/lib/python3.9/site-packages/ipykernel_launcher.py", line 16, in <module>
      app.launch_new_instance()
    File "/Users/saeed/opt/anaconda3/lib/python3.9/site-packages/traitlets/config/application.py", line 992, in launch_instance
      app.start()
    File "/Users/saeed/opt/anaconda3/lib/python3.9/site-packages/ipykernel/kernelapp.py", line 677, in start
      self.io_loop.start()
    File "/Users/saeed/opt/anaconda3/lib/python3.9/site-packages/tornado/platform/asyncio.py", line 215, in start
      self.asyncio_loop.run_forever()
    File "/Users/saeed/opt/anaconda3/lib/python3.9/asyncio/base_events.py", line 601, in run_forever
      self._run_once()
    File "/Users/saeed/opt/anaconda3/lib/python3.9/asyncio/base_events.py", line 1905, in _run_once
      handle._run()
    File "/Users/saeed/opt/anaconda3/lib/python3.9/asyncio/events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "/Users/saeed/opt/anaconda3/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 471, in dispatch_queue
      await self.process_one()
    File "/Users/saeed/opt/anaconda3/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 460, in process_one
      await dispatch(*args)
    File "/Users/saeed/opt/anaconda3/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 367, in dispatch_shell
      await result
    File "/Users/saeed/opt/anaconda3/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 662, in execute_request
      reply_content = await reply_content
    File "/Users/saeed/opt/anaconda3/lib/python3.9/site-packages/ipykernel/ipkernel.py", line 360, in do_execute
      res = shell.run_cell(code, store_history=store_history, silent=silent)
    File "/Users/saeed/opt/anaconda3/lib/python3.9/site-packages/ipykernel/zmqshell.py", line 532, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/Users/saeed/opt/anaconda3/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 2863, in run_cell
      result = self._run_cell(
    File "/Users/saeed/opt/anaconda3/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 2909, in _run_cell
      return runner(coro)
    File "/Users/saeed/opt/anaconda3/lib/python3.9/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/Users/saeed/opt/anaconda3/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3106, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/Users/saeed/opt/anaconda3/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3309, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/Users/saeed/opt/anaconda3/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3369, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/var/folders/gv/29bx0dbj5fvf2ql1sq4xwxsh0000gn/T/ipykernel_91652/2009924155.py", line 13, in <cell line: 13>
      model.fit(X_train, y_train, epochs=1000, batch_size=32)
    File "/Users/saeed/opt/anaconda3/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "/Users/saeed/opt/anaconda3/lib/python3.9/site-packages/keras/engine/training.py", line 1409, in fit
      tmp_logs = self.train_function(iterator)
    File "/Users/saeed/opt/anaconda3/lib/python3.9/site-packages/keras/engine/training.py", line 1051, in train_function
      return step_function(self, iterator)
    File "/Users/saeed/opt/anaconda3/lib/python3.9/site-packages/keras/engine/training.py", line 1040, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/saeed/opt/anaconda3/lib/python3.9/site-packages/keras/engine/training.py", line 1030, in run_step
      outputs = model.train_step(data)
    File "/Users/saeed/opt/anaconda3/lib/python3.9/site-packages/keras/engine/training.py", line 890, in train_step
      loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/Users/saeed/opt/anaconda3/lib/python3.9/site-packages/keras/engine/training.py", line 948, in compute_loss
      return self.compiled_loss(
    File "/Users/saeed/opt/anaconda3/lib/python3.9/site-packages/keras/engine/compile_utils.py", line 201, in __call__
      loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/Users/saeed/opt/anaconda3/lib/python3.9/site-packages/keras/losses.py", line 139, in __call__
      losses = call_fn(y_true, y_pred)
    File "/Users/saeed/opt/anaconda3/lib/python3.9/site-packages/keras/losses.py", line 243, in call
      return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/Users/saeed/opt/anaconda3/lib/python3.9/site-packages/keras/losses.py", line 1860, in sparse_categorical_crossentropy
      return backend.sparse_categorical_crossentropy(
    File "/Users/saeed/opt/anaconda3/lib/python3.9/site-packages/keras/backend.py", line 5238, in sparse_categorical_crossentropy
      res = tf.nn.sparse_softmax_cross_entropy_with_logits(
Node: 'sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits'
logits and labels must have the same first dimension, got logits shape [32,5] and labels shape [160]
	 [[{{node sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits}}]] [Op:__inference_train_function_38593]