A classification label, with possible values sadness (0), joy (1), love (2), anger (3), fear (4), and surprise (5).

In [3]:
import pandas as pd 
import numpy as np
import re
import string
import calendar
import nltk 

In [4]:
# IMPORT THE DOCUMENTS TEST, TRAINING AND VALIDATION DATASETS

# Single place to repoint the notebook at another copy of the dataset.
DATA_DIR = '/Users/churnika/Desktop/Projects/IoT_Project/Speech/Dataset'


def load_split(csv_path):
    """Read one dataset split from ``csv_path``.

    Rows containing any missing value are dropped, and the number of
    remaining rows is printed so each split's size is visible in the
    cell output.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The split with incomplete rows removed.
    """
    split = pd.read_csv(csv_path).dropna()
    print(len(split))
    return split


test_document = f'{DATA_DIR}/test.csv'
test_doc = load_split(test_document)

training_document = f'{DATA_DIR}/training.csv'
# Misspelled name used by the earlier version of this cell; kept bound so
# any code still referring to it keeps working.
training_documnent = training_document
training_doc = load_split(training_document)

validation_document = f'{DATA_DIR}/validation.csv'
validation_doc = load_split(validation_document)

2000
16000
2000


In [5]:
# Pool the three splits into one frame. ignore_index=True gives every row a
# unique 0..n-1 label instead of carrying over each split's own index
# labels, which would otherwise repeat across the splits.
# NOTE(review): the predefined splits are pooled here and re-split randomly
# by train_test_split later in the notebook, so the original test split is
# not kept held out.
frames = [test_doc, training_doc, validation_doc]
text = pd.concat(frames, ignore_index=True)

print(len(text))
# Last expression of the cell, so it is what the notebook displays.
text.columns

20000


Index(['text', 'label'], dtype='object')

In [6]:
# Take the 'text' column of the pooled frame and lower-case every entry.

msg_exp = text.loc[:, 'text'].str.lower()

In [7]:
# removing punctuations

# Translation table that deletes every character listed in
# string.punctuation. Built once here instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Return ``text`` with every ``string.punctuation`` character deleted.

    Only the characters in ``string.punctuation`` are removed; all other
    characters, including whitespace, are left untouched.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        A copy of ``text`` without punctuation characters.
    """
    return text.translate(_PUNCTUATION_TABLE)

# Report how many messages are about to be processed.
# NOTE(review): the label says text_without_urls, but the value printed is
# the length of msg_exp; text_without_urls is only created in a later cell.
print("Length of text_without_urls:", msg_exp.shape[0])

# Strip punctuation from every message.
msg_exp = msg_exp.map(remove_punctuation)

Length of text_without_urls: 20000


In [8]:
from nltk.corpus import stopwords

# Resources needed by nltk.word_tokenize and the stop-word list.
# NOTE(review): newer NLTK releases reportedly need the 'punkt_tab' resource
# for word_tokenize instead of 'punkt' - confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

# Lower-cased month and weekday names, as provided by the calendar module.
months_days = [calendar.month_name[i].lower() for i in range(1, 13)] + [calendar.day_name[i].lower() for i in range(7)]
# Extra hand-picked tokens to discard.
remove_words = ["vo","n","m","c","ra","xx","r","date","hii","hi","ye","pa","xxx","p","sir","mam","good","morning","time","ur","you","status","father"]

# Every token that must be discarded, merged into one set so the check in
# clean_text is a single O(1) lookup instead of three separate scans.
_EXCLUDED_TOKENS = frozenset(stop_words).union(months_days, remove_words)


def clean_text(text):
    """Tokenize ``text`` and keep only the informative alphabetic tokens.

    A token is kept when it is purely alphabetic and is not a stop word,
    a month or weekday name, or an entry of ``remove_words``.

    No separate URL filter is needed: a token starting with ``http://`` or
    ``https://`` contains ``:`` and ``/`` and therefore already fails the
    ``isalpha()`` test.

    Parameters
    ----------
    text : str
        The message to tokenize.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    return [
        token
        for token in nltk.word_tokenize(text)
        if token.isalpha() and token not in _EXCLUDED_TOKENS
    ]

# Tokenize and filter every message with clean_text, then renumber the
# resulting Series of token lists 0..n-1.
text_without_urls = msg_exp.apply(clean_text).reset_index(drop=True)


[nltk_data] Downloading package punkt to /Users/churnika/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/churnika/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# LEMMATIZATION

from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

# Lemmatize every token of every message. No part-of-speech tag is passed,
# so the lemmatizer's default is used for all tokens.
lemmatized_tokens = [
    list(map(lemmatizer.lemmatize, message_tokens))
    for message_tokens in text_without_urls
]

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/churnika/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
# REMOVING RARE OR FREQUENTLY USED WORDS
# Only the rare end is filtered here: a token is kept when it occurs strictly
# more than freq_threshold times across the whole corpus.
# NOTE: this cell overwrites lemmatized_tokens, so running it a second time
# recounts and filters the already-filtered tokens.

from collections import Counter

freq_threshold = 10

# Corpus-wide occurrence count of every lemma.
freq = Counter()
for message_tokens in lemmatized_tokens:
    freq.update(message_tokens)

lemmatized_tokens = [
    [token for token in message_tokens if freq[token] > freq_threshold]
    for message_tokens in lemmatized_tokens
]

In [11]:
#REMOVING WHITESPACES

# Join each message's tokens with single spaces and split on whitespace
# again: this produces a fresh list per message in which no token is empty
# or contains whitespace.
cleaned_lemmatized_tokens = [
    ' '.join(message_tokens).split()
    for message_tokens in lemmatized_tokens
]

In [12]:
# Lookup table: token -> replacement text (possibly several words).
# NOTE(review): by the time tokens reach this step, punctuation has been
# stripped by remove_punctuation and clean_text keeps only isalpha() tokens,
# so the two apostrophe keys cannot match as the pipeline stands.
abbreviation_dict = {
    "don't": "do not",
    "can't": "cannot",
    "u": "you",
    "approv": "approve",
    "approved": "approve",
    "rejected": "reject",
}


def expand_abbreviations(tokens):
    """Replace every token found in ``abbreviation_dict`` by its expansion.

    A multi-word expansion contributes one output token per word; tokens
    without an entry are passed through unchanged. Order is preserved.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one message.

    Returns
    -------
    list of str
        The tokens with abbreviations expanded.
    """
    return [
        piece
        for token in tokens
        for piece in (
            abbreviation_dict[token].split()
            if token in abbreviation_dict
            else (token,)
        )
    ]

# Run the abbreviation expansion over every message's token list.
expanded_lemmatized_tokens = list(map(expand_abbreviations, cleaned_lemmatized_tokens))


In [13]:
#REMOVE ACCENTS AND DIACRITICS FROM EXPANDED_LEMMATIZED_TOKENS

from unidecode import unidecode

def remove_accents_diacritics(tokens):
    """Pass every token through ``unidecode`` and return the results.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one message.

    Returns
    -------
    list of str
        One ``unidecode`` output per input token, in the same order.
    """
    return [unidecode(token) for token in tokens]

# Apply the conversion to every message, overwriting the token lists.
expanded_lemmatized_tokens = list(map(remove_accents_diacritics, expanded_lemmatized_tokens))

In [14]:
# Renumber the rows 0..n-1 so that row i of the frame pairs with
# expanded_lemmatized_tokens[i]. Rebinding (instead of inplace=True) keeps
# the cell safe to re-run.
text = text.reset_index(drop=True)

# Compare the full frame with the token lists. Checking a frame that was
# first truncated to the token count would always pass when the frame is
# longer, hiding a real mismatch between rows and token lists.
assert len(expanded_lemmatized_tokens) == len(text), "Lengths do not match"

# Independent copy used downstream as the label source.
text_sync = text.copy()


In [15]:
# TOKENIZATION OF TEXT INTO INTEGER VALUES

from tensorflow.keras.preprocessing.text import Tokenizer

# Upper bound passed as num_words, so that only the most common words are
# taken into account when texts are converted to index sequences.
vocab_limit = 10000
tokenizer = Tokenizer(num_words=vocab_limit)

# Build the word index from every message's token list.
# NOTE(review): this is fitted on the whole pooled corpus, i.e. before the
# train/test split performed later in the notebook.
tokenizer.fit_on_texts(expanded_lemmatized_tokens)

# Convert each message's tokens to a list of integer indices.
sequences = tokenizer.texts_to_sequences(expanded_lemmatized_tokens)

In [16]:
from keras.preprocessing.sequence import pad_sequences

# Fixed length every index sequence is brought to (tune to the data).
max_seq_length = 100

# Pad or truncate every sequence to max_seq_length. The padding and
# truncating modes are spelled out with their default value 'pre', which
# matches the leading zeros visible when the matrix is printed.
padded_sequences = pad_sequences(
    sequences,
    maxlen=max_seq_length,
    padding='pre',
    truncating='pre',
)

In [25]:
# Quick look at the padded index matrix; the recorded output shows the
# middle rows and columns elided as "...".
print(padded_sequences)

[[   0    0    0 ...  602    4   35]
 [   0    0    0 ...  113    1  345]
 [   0    0    0 ...    1    3  288]
 ...
 [   0    0    0 ...  212  232   13]
 [   0    0    0 ...  395  601 2058]
 [   0    0    0 ... 1202   20   14]]


In [17]:
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical

# Label column of the synchronised frame, one entry per padded sequence.
label_ids = text_sync['label']

# One-hot encode the labels into 6 columns.
labels = to_categorical(label_ids, num_classes=6)

# 80/20 split with a fixed seed so the partition is repeatable.
# NOTE(review): no stratify argument is passed, so class proportions may
# differ between the two parts.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    labels,
    test_size=0.2,
    random_state=42,
)


In [18]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

# Two stacked LSTM layers on top of an embedding, followed by a small dense
# head that outputs a 6-way softmax.
model = Sequential([
    # 10000 possible input indices, each mapped to a 100-dimensional vector.
    Embedding(input_dim=10000, output_dim=100),
    # First LSTM returns the full sequence so the second LSTM can consume it.
    LSTM(units=64, dropout=0.2, recurrent_dropout=0.2, return_sequences=True),
    # Second LSTM returns only its final output.
    LSTM(units=32, dropout=0.2, recurrent_dropout=0.2),
    Dense(32, activation='relu'),
    # Dropout for regularization before the output layer.
    Dropout(0.5),
    # One unit per class; softmax for multi-class output.
    Dense(6, activation='softmax'),
])

# Categorical cross-entropy pairs with the one-hot encoded labels.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


In [19]:
# Train for 10 epochs in batches of 32, holding out 20% of the training
# data for validation. The call is the cell's last expression, so the
# returned History object is what the notebook displays.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
)

Epoch 1/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 41ms/step - accuracy: 0.3086 - loss: 1.6265 - val_accuracy: 0.6109 - val_loss: 1.0984
Epoch 2/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 39ms/step - accuracy: 0.6737 - loss: 0.8899 - val_accuracy: 0.8628 - val_loss: 0.4324
Epoch 3/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 40ms/step - accuracy: 0.8757 - loss: 0.3850 - val_accuracy: 0.9075 - val_loss: 0.2921
Epoch 4/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 42ms/step - accuracy: 0.9113 - loss: 0.2747 - val_accuracy: 0.9094 - val_loss: 0.2821
Epoch 5/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 41ms/step - accuracy: 0.9320 - loss: 0.2144 - val_accuracy: 0.9128 - val_loss: 0.2825
Epoch 6/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 41ms/step - accuracy: 0.9390 - loss: 0.1798 - val_accuracy: 0.9191 - val_loss: 0.2766
Epoch 7/10
[1m4

<keras.src.callbacks.history.History at 0x3207ae510>

In [20]:
# Score the trained model on the held-out part of the split; the model was
# compiled with a single metric, so evaluate yields the loss and accuracy.
test_metrics = model.evaluate(x=X_test, y=y_test)
loss, accuracy = test_metrics

[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.9117 - loss: 0.4034


In [21]:
# Persist the trained model to the working directory.
# NOTE(review): only the model is written; the fitted tokenizer's word index
# is not saved by this cell.
model_path = 'my_model.h5'
model.save(model_path)

