In [None]:
import numpy as np
import pandas as pd
import re
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN
from keras.layers import Dropout


In [3]:
import pandas as pd
import numpy as np
import re

# Read the CSV file into a pandas DataFrame.
# The code below reads its 'Text' column (raw sentences) and its 'Label' column.
df = pd.read_csv('Dataset/Text_Emotion_Data.csv')

# Stop-word sets already read from disk, keyed by file path, so the file is
# parsed once per kernel session instead of once per tokenized row.
_STOPWORDS_CACHE = {}


def _load_stopwords(path='Dataset/stopwords.txt'):
    """Return the stop words listed one per line in `path` as a frozenset (cached)."""
    if path not in _STOPWORDS_CACHE:
        with open(path, 'r') as f:
            _STOPWORDS_CACHE[path] = frozenset(f.read().splitlines())
    return _STOPWORDS_CACHE[path]


# Define a function to tokenize text into word sequences and remove stopwords
def tokenize_text(text, stopwords=None):
    """Split `text` into lowercase alphabetic words, dropping stop words and short words.

    Parameters
    ----------
    text : str
        Raw sentence. Any value that is not a ``str`` (for example the NaN that
        pandas uses for a missing cell, or ``None``) yields an empty list.
    stopwords : collection of str, optional
        Words to discard. When omitted, the words are read from
        'Dataset/stopwords.txt' (one word per line); that file is read only on
        the first call and reused afterwards.

    Returns
    -------
    list of str
        The remaining words, in their original order. Every word is lowercase,
        longer than 2 characters, and not contained in `stopwords`.

    Raises
    ------
    FileNotFoundError
        If `stopwords` is omitted and the default stop-word file does not exist.
    """
    # Missing values cannot be passed to re.sub; treat them as empty text.
    if not isinstance(text, str):
        return []
    if stopwords is None:
        stopwords = _load_stopwords()
    # Every character that is not an ASCII letter becomes a separator.
    text = re.sub(r'[^a-zA-Z]', ' ', text).lower()
    return [word for word in text.split() if len(word) > 2 and word not in stopwords]

# Turn every entry of the 'Text' column into its list of cleaned words
df['word_sequences'] = df['Text'].map(tokenize_text)

# Length of the longest word list; used below as the common padded length
max_len = max(len(words) for words in df['word_sequences'])

# Define a function to pad the sequences to the maximum length
def pad_sequence(sequence):
    """Return a copy of `sequence` that is exactly `max_len` items long.

    Items beyond the module-level `max_len` are cut off; shorter inputs are
    extended on the right with empty strings. The input list is not modified.
    """
    kept = sequence[:max_len]
    # A negative count yields an empty filler, so over-long inputs are only cut.
    filler = [''] * (max_len - len(sequence))
    return kept + filler

# Bring every row's word sequence to the common length max_len
df['word_sequences'] = df['word_sequences'].map(pad_sequence)

# Gather the padded sequences into one plain list (one entry per row)
all_sequences = list(df['word_sequences'])

# Give each distinct word (the '' padding token included) the next free
# integer id, in order of first appearance
word_dict = {}
for seq in all_sequences:
    for word in seq:
        # len(word_dict) is evaluated before insertion, so a new word gets
        # the count of words seen so far; known words keep their id.
        word_dict.setdefault(word, len(word_dict))
index = len(word_dict)

# Replace every word by its integer id from word_dict, one id list per row
num_vectors = [
    [word_dict[word] for word in sequence if word in word_dict]
    for sequence in all_sequences
]

# Stack the id lists into a numpy array (one row per text)
X = np.array(num_vectors)

# Encode the labels: each distinct value of the 'Label' column gets an integer
# id equal to its position in the sorted array returned by np.unique
labels = df['Label'].values
classes = np.unique(labels)
label_map = dict(zip(classes, range(len(classes))))
y = np.array(list(map(label_map.__getitem__, labels)))

# Split the last 150 texts of each class off as the test dataset
TEST_SAMPLES_PER_CLASS = 150

# Row positions of the held-out samples, grouped by class (in the order of
# `classes`) and, within a class, in their original order.
test_idx_per_class = [
    np.flatnonzero(y == label_map[c])[-TEST_SAMPLES_PER_CLASS:] for c in classes
]
test_idx = (
    np.concatenate(test_idx_per_class)
    if test_idx_per_class
    else np.array([], dtype=int)
)

# Use the rest of the data for training. A row is left out of the training set
# when its (vector, label) pair equals that of any test sample -- this covers
# the test rows themselves and also exact duplicates of them elsewhere in the
# data. Hashing the row bytes makes this one set lookup per row instead of an
# element-wise comparison against every test sample.
test_keys = {(X[i].tobytes(), int(y[i])) for i in test_idx}
train_idx = np.array(
    [i for i in range(len(X)) if (X[i].tobytes(), int(y[i])) not in test_keys],
    dtype=int,
)

# Separate the input features and labels for the training and test sets.
# Index-based selection also works when a split turns out to be empty.
X_train, y_train = X[train_idx], y[train_idx]
X_test, y_test = X[test_idx], y[test_idx]

# The splits as lists of (vector, label) pairs
train_data = list(zip(X_train, y_train))
test_data = list(zip(X_test, y_test))



In [4]:
# Report the split sizes, the vocabulary size and the padded sequence length
for caption, value in (
    ('X_train shape :', X_train.shape),
    ('X_test shape :', X_test.shape),
    ('y_train shape :', y_train.shape),
    ('y_test shape :', y_test.shape),
    ('word_dict len :', len(word_dict)),
    ('max_len  :', max_len),
):
    print(caption, value)
# Last expression of the cell: displays one encoded training sample
X_train[20]

X_train shape : (2500, 27)
X_test shape : (750, 27)
y_train shape : (2500,)
y_test shape : (750,)
word_dict len : 6623
max_len  : 27


(27,)

In [85]:
# Reshape the input data to have a third dimension of size 1, i.e.
# (samples, max_len, 1), matching the model's input_shape=(max_len, 1).
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

# Convert the integer labels to one-hot encoding. The width comes from
# `classes`, the same value that sizes the model's output layer, instead of a
# hard-coded 5. The ndim check keeps the cell safe to re-run: labels that are
# already one-hot (2-D) are left untouched rather than encoded a second time.
if y_train.ndim == 1:
    y_train = to_categorical(y_train, num_classes=len(classes))
if y_test.ndim == 1:
    y_test = to_categorical(y_test, num_classes=len(classes))

In [5]:
# Define the model architecture: two stacked SimpleRNN layers, each followed
# by dropout, and a softmax output layer with one unit per class
model = Sequential([
    SimpleRNN(64, input_shape=(max_len, 1), return_sequences=True),
    Dropout(0.2),
    SimpleRNN(64),
    Dropout(0.2),
    Dense(len(classes), activation='softmax'),
])

# Compile the model
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train the model
model.fit(x=X_train, y=y_train, batch_size=32, epochs=1000)

# Evaluate the model on the train dataset
train_loss, train_acc = model.evaluate(x=X_train, y=y_train, verbose=0)
print('Train Loss:', train_loss)
print('Train Accuracy:', train_acc)

# Evaluate the model on the test dataset
test_loss, test_acc = model.evaluate(x=X_test, y=y_test, verbose=0)
print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)


NameError: name 'Sequential' is not defined