### Install necessary packages and libraries

In [23]:
!pip install fasttext
!pip install tensorflow




### Import necessary packages and libraries

In [22]:
import fasttext
import fasttext.util
import gensim.models as gsm
import numpy as np
import pandas as pd
from google.colab import drive
from keras.callbacks import EarlyStopping
from keras.layers import Dense, Embedding, Flatten, Dropout,Bidirectional, BatchNormalization
from keras.layers import Input
from keras.layers import LeakyReLU
from keras.layers import LSTM
from keras.models import Sequential
from keras.optimizers import Adam
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.utils import to_categorical

### Import the dataset

In [24]:
# Mount Google Drive so the dataset stored there is reachable from this Colab VM.
drive.mount('/content/drive')
dataset_file = '/content/drive/MyDrive/new_updated_dataset.csv'

# Load the whole CSV into a DataFrame; later cells read its 'comment',
# 'expression_emoji' and 'label' columns.
data=pd.read_csv(dataset_file)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Import pre-trained models (fastText and emoji2vec)

In [25]:
# import fastText model for text

fastText_model_path = '/content/drive/MyDrive/cc.si.300.bin/cc.si.300.bin'
# Replace fastText's internal eprint helper with a no-op before loading.
# NOTE(review): presumably this silences the warning load_model prints —
# confirm it is still needed for the installed fasttext version.
fasttext.FastText.eprint = lambda x: None
ft = fasttext.load_model(fastText_model_path)


#import emoji2Vec model for emojis

# The emoji vectors are read from a binary word2vec-format file into a gensim KeyedVectors object.
e2v = gsm.KeyedVectors.load_word2vec_format('/content/drive/MyDrive/emoji2vec.bin', binary=True)

### Shuffle the dataset

In [26]:

# Shuffle the rows with a fixed seed. Without it the row order differs on every
# run, so the seeded train_test_split calls below would still pick different
# rows each time and the reported metrics would not be reproducible.
data = shuffle(data, random_state=42)


### Split the dataset

In [27]:
# Keep the comment text and its emoji side by side so that both columns are
# split at exactly the same rows.
combined_data = data[['comment', 'expression_emoji']]

# Hold out 20% of the rows as the test set, stratified on the label.
combined_train, combined_test, y_train, y_test = train_test_split(
    combined_data,
    data['label'],
    test_size=0.2,
    random_state=42,
    stratify=data['label'],
)

# Carve the validation set (20% of the remaining rows) out of the training portion.
combined_actual_train, combined_val, y_actual_train, y_val = train_test_split(
    combined_train,
    y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train,
)

# Separate the text column and the emoji column of each split.
X_actual_train_text = combined_actual_train['comment']
X_actual_train_emoji = combined_actual_train['expression_emoji']

X_test_text = combined_test['comment']
X_test_emoji = combined_test['expression_emoji']

x_val_text = combined_val['comment']
x_val_emoji = combined_val['expression_emoji']


### Convert the comment column to vectors using the fastText model

In [28]:
def get_word_vector(word):
    """Look up the fastText vector of a single word.

    Returns whatever ``ft.get_word_vector(word)`` yields. If that lookup raises
    ``KeyError``, an all-zero vector of length ``ft.get_dimension()`` is
    returned instead.
    """
    try:
        return ft.get_word_vector(word)
    except KeyError:
        dimension = ft.get_dimension()
        return np.zeros(dimension)



def get_sentence_vector(sentence):
    """Return the average word vector of a comment.

    The comment is split on whitespace, every token is embedded with
    ``get_word_vector`` and the token vectors are averaged component-wise.

    Parameters
    ----------
    sentence : str or any
        The comment. Non-string values are converted with ``str()``; a missing
        value (``None`` or a float NaN) is treated as an empty comment.

    Returns
    -------
    numpy.ndarray
        1-D vector of length ``ft.get_dimension()``. An all-zero vector is
        returned when the comment is missing or contains no tokens, so every
        comment maps to a vector of the same shape.
    """
    # A missing cell must not be embedded as the literal token "nan" / "None".
    if sentence is None or (isinstance(sentence, float) and np.isnan(sentence)):
        return np.zeros(ft.get_dimension())

    if not isinstance(sentence, str):
        sentence = str(sentence)

    vectors = [get_word_vector(word) for word in sentence.split()]

    # np.mean of an empty list is a NaN scalar (plus a RuntimeWarning), which
    # would break stacking the sentence vectors into one matrix downstream.
    if not vectors:
        return np.zeros(ft.get_dimension())

    return np.mean(vectors, axis=0)

In [29]:
# Embed every comment of each split; each row of the resulting matrix is the
# sentence vector of one comment.
X_train_text_vectors = np.array(list(map(get_sentence_vector, X_actual_train_text)))
X_test_text_vectors = np.array(list(map(get_sentence_vector, X_test_text)))
x_text_val_vectors = np.array(list(map(get_sentence_vector, x_val_text)))

### Convert the expression_emoji column to vectors using the emoji2vec model

In [None]:
# Sanity check: print the mean of the components of one word's fastText vector.
print(np.mean(get_word_vector('තියෙනවා')))

0.0009083676


In [None]:
# Sanity check: print the mean of the components of one full comment's averaged vector.
print(np.mean(get_sentence_vector('මේකෙත් වෙනමම ආතල් එකක් තියෙනවා!!')))

In [30]:
def get_emoji_vector(emoji):
    """Return the (negated) emoji2vec vector of an emoji.

    Parameters
    ----------
    emoji : str or any
        The emoji to look up in ``e2v``. Anything that is not a string (for
        example a missing cell) is treated as "no emoji".

    Returns
    -------
    numpy.ndarray
        The negated ``e2v`` vector of the emoji, or an all-zero vector of length
        ``e2v.vector_size`` when the value is not a string or is not in the
        emoji2vec vocabulary.
    """
    # Non-string values are not vocabulary keys; map them to the same
    # "unknown emoji" vector instead of letting the lookup fail.
    if not isinstance(emoji, str):
        return np.zeros(e2v.vector_size)

    try:
        # NOTE(review): the vector is negated here ("=-" in the original code).
        # This looks like it could be a typo for a plain lookup, but every
        # result in this notebook was produced with the negation, so it is kept
        # as is — confirm whether it is intentional.
        return -e2v[emoji]
    except KeyError:
        # Emoji missing from the emoji2vec vocabulary.
        return np.zeros(e2v.vector_size)

In [None]:
# Exploration: reduce one word vector and one emoji vector to the mean of their
# components and print both scalars (used by the comparison cell further down).
textHate=np.mean(get_word_vector('පොන්නයෙක්ද'))
print(textHate)
emojiHate=np.mean(get_emoji_vector('😡'))
print(emojiHate)

In [None]:
# Exploration: same reduction for a second word and the '😂' emoji.
# textNon / emojiNon are assigned again by the next cell, so whichever of the
# two cells ran last decides the values seen by the comparison cell.
textNon=np.mean(get_word_vector('තියෙනවා'))
print(textNon)
emojiNon=np.mean(get_emoji_vector('😂'))
print(emojiNon)

In [None]:
# Exploration: same word as the previous cell, now paired with the '🥰' emoji.
# This overwrites textNon / emojiNon set by the previous cell.
textNon=np.mean(get_word_vector('තියෙනවා'))
print(textNon)
emojiNon=np.mean(get_emoji_vector('🥰'))
print(emojiNon)

In [None]:
# Exploration: average the scalar summaries computed in the cells above for
# three text/emoji pairings and print each result.
# Depends on textHate, emojiHate, textNon and emojiNon from those cells.
CombineHate=np.mean([textHate,emojiHate])
print(CombineHate)

combineNon=np.mean([textNon,emojiNon])
print(combineNon)

# Mixed pairing: the text summary from the first example with the emoji summary from the last one.
combineMid=np.mean([textHate,emojiNon])
print(combineMid)

In [31]:
# Embed the emoji of every row in each split; one emoji vector per row.
X_train_emoji_vectors = np.array(list(map(get_emoji_vector, X_actual_train_emoji)))
X_test_emoji_vectors = np.array(list(map(get_emoji_vector, X_test_emoji)))
X_val_emoji_vectors = np.array(list(map(get_emoji_vector, x_val_emoji)))

### Combine the emoji and comment vectors

In [32]:
# Fuse the two modalities: the feature vector of a row is the element-wise
# average of its text vector and its emoji vector.
X_train_vectors = np.stack([X_train_text_vectors, X_train_emoji_vectors]).mean(axis=0)

X_test_vectors = np.stack([X_test_emoji_vectors, X_test_text_vectors]).mean(axis=0)

X_val_vectors = np.stack([x_text_val_vectors, X_val_emoji_vectors]).mean(axis=0)

### Define training settings (early stopping and class weights)

In [33]:
# Stop training once validation accuracy has not improved for 6 epochs and
# roll the model back to the weights of its best epoch.
early_stopping = EarlyStopping(monitor='val_accuracy', patience=6, restore_best_weights=True)

# 'balanced' weights are inversely proportional to how often each class occurs.
class_labels = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=class_labels, y=y_train)

# Key every weight by its actual class label. Enumerating the weights would
# only be correct if the labels happened to be exactly 0, 1, 2, ...
class_weight_dict = {int(label): weight for label, weight in zip(class_labels, class_weights)}

In [34]:
# One-hot encode the integer class labels of every split.
num_classes = 3
y_actual_train_encoded, y_val_encoded, y_test_encoded = (
    to_categorical(labels, num_classes=num_classes)
    for labels in (y_actual_train, y_val, y_test)
)

# Define the model

### Using a feed-forward neural network

In [None]:
# Baseline feed-forward classifier over the 300-dimensional combined vectors.
# NOTE: `model` is re-created by the "optimized" cell further down, so this
# baseline is only trained if that cell is skipped.
input_dim = 300

model = Sequential()

# Declare the input shape with an explicit Input object; passing `input_dim`
# to the first Dense layer is deprecated and triggers a Keras warning.
model.add(Input(shape=(input_dim,)))

# Five Dense -> Dropout stages of decreasing width.
for units in (512, 256, 128, 64, 32):
    model.add(Dense(units, activation='relu'))
    model.add(Dropout(0.2))

# Flatten has no effect on the already 2-D (batch, 32) activations; it is kept
# so the layer stack matches the original definition.
model.add(Flatten())
# One softmax probability per class.
model.add(Dense(3, activation='softmax'))

In [None]:
# Compile the model
# categorical_crossentropy matches the one-hot targets built with to_categorical.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

## Trying to optimize the simple neural network

In [35]:
# Feed-forward classifier with batch normalization over the 300-dimensional
# combined text/emoji vectors. This is the `model` trained and evaluated below.
input_dim = 300

model = Sequential()

# Declare the input shape with an explicit Input object; passing `input_dim`
# to the first Dense layer is deprecated and triggers a Keras warning.
model.add(Input(shape=(input_dim,)))

# Five Dense -> BatchNormalization -> Dropout stages of decreasing width.
for units in (512, 256, 128, 64, 16):
    model.add(Dense(units, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.3))

# Output layer with 3 neurons
model.add(Dense(3, activation='softmax'))

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy',  metrics=['accuracy'])

# Model summary
model.summary()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [36]:
# Fit the feed-forward network on the combined vectors. Validation data drives
# early stopping, and the class weights rebalance the loss across classes.
history = model.fit(
    X_train_vectors,
    y_actual_train_encoded,
    epochs=100,
    batch_size=64,
    validation_data=(X_val_vectors, y_val_encoded),
    callbacks=[early_stopping],
    class_weight=class_weight_dict,
)


Epoch 1/100
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 17ms/step - accuracy: 0.6643 - loss: 0.8114 - val_accuracy: 0.3336 - val_loss: 1.1120
Epoch 2/100
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.8155 - loss: 0.4862 - val_accuracy: 0.3336 - val_loss: 1.2274
Epoch 3/100
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.8368 - loss: 0.4472 - val_accuracy: 0.3349 - val_loss: 1.2045
Epoch 4/100
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.8515 - loss: 0.4008 - val_accuracy: 0.3397 - val_loss: 1.1697
Epoch 5/100
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.8520 - loss: 0.4045 - val_accuracy: 0.6138 - val_loss: 0.7822
Epoch 6/100
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.8563 - loss: 0.3828 - val_accuracy: 0.6398 - val_loss: 0.7124
Epoch 7/100
[1m92/92[0m [32m

In [37]:
# test the model

# `prediction` holds the softmax outputs for every test row; the
# classification-report cell below turns them into class labels.
prediction=model.predict(X_test_vectors)
# evaluate() returns the loss followed by the compiled metrics (accuracy here).
loss, accuracy = model.evaluate(X_test_vectors, y_test_encoded)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8778 - loss: 0.3176
Test Loss: 0.347136914730072, Test Accuracy: 0.8649535179138184


### Classification report for the feed-forward neural network

In [38]:
# Per-class precision / recall / F1 of the feed-forward network on the test set.
# classification_report comes from the notebook's import cell.
y_test = np.array(y_test)

# Turn the softmax outputs into class labels under a separate name. Overwriting
# `prediction` with its own argmax made this cell fail when run a second time,
# because the array no longer had a class axis.
predicted_classes = np.asarray(prediction).argmax(axis=1)

print("Classification Report:")
print(classification_report(y_test, predicted_classes))

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.88      0.83       610
           1       0.85      0.86      0.86       610
           2       1.00      0.85      0.92       609

    accuracy                           0.86      1829
   macro avg       0.88      0.86      0.87      1829
weighted avg       0.88      0.86      0.87      1829



### Using an LSTM model

In [20]:

# Bidirectional LSTM classifier. Every sample is presented as a sequence of
# length `timesteps` (a single step) of 300-dimensional combined vectors.
input_dim = 300

modelLSTM = Sequential()
timesteps=1

# Declare the input shape with an explicit Input object. Passing `input_shape`
# to the LSTM wrapped inside Bidirectional is deprecated and triggers a Keras
# warning.
modelLSTM.add(Input(shape=(timesteps, input_dim)))
modelLSTM.add(Bidirectional(LSTM(128, return_sequences=True)))
# return_sequences=True keeps the time axis, so the model emits one prediction
# per timestep: (batch, timesteps, num_classes). The targets are reshaped to
# the same layout in a later cell.
modelLSTM.add(Bidirectional(LSTM(64, return_sequences=True)))

modelLSTM.add(Dense(num_classes, activation='softmax'))

  super().__init__(**kwargs)


In [21]:
# Give every split a time axis: (samples, features) -> (samples, timesteps, features).
X_train_vecotrs_reshaped, X_val_vectors_reshaped, X_test_vecotrs_reshaped = (
    split.reshape(split.shape[0], timesteps, split.shape[1])
    for split in (X_train_vectors, X_val_vectors, X_test_vectors)
)

In [None]:
# compile the model
# NOTE(review): run_eagerly=True makes Keras run the training step eagerly
# instead of as a compiled graph, which is mainly a debugging aid and slows
# training down — confirm it is still needed.
modelLSTM.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'],run_eagerly=True)

In [None]:
# The LSTM stack ends with return_sequences=True, so it predicts one class
# distribution per timestep. Give the one-hot targets the same singleton time
# axis: (samples, num_classes) -> (samples, 1, num_classes).
y_actual_train_encoded_reshaped = y_actual_train_encoded.reshape(y_actual_train_encoded.shape[0], 1, num_classes)
y_val_encoded_reshaped = y_val_encoded.reshape(y_val_encoded.shape[0], 1, num_classes)
# The test targets were previously reshaped "onto themselves" without ever being
# created, which raises NameError on a fresh kernel; the LSTM evaluation cell
# needs this array.
y_test_encoded_reshaped = y_test_encoded.reshape(y_test_encoded.shape[0], 1, num_classes)

# Shapes of the targets and inputs that are fed to the LSTM.
print(y_actual_train_encoded_reshaped.shape)
print(y_val_encoded_reshaped.shape)
print(X_train_vecotrs_reshaped.shape)
print(X_val_vectors_reshaped.shape)

(5851, 1, 3)
(1463, 1, 3)
(5851, 1, 300)
(1463, 1, 300)


In [None]:
# Convert the training labels to a plain NumPy array, then print the shape of
# the (un-reshaped) training feature matrix.
y_actual_train=np.array(y_actual_train)
print(X_train_vectors.shape)

(5851, 300)


In [None]:
# Print the layer-by-layer summary of the LSTM model.
modelLSTM.summary()

In [None]:
# Train the LSTM on the inputs and targets that carry the extra time axis.
# The same early_stopping callback as for the feed-forward model is used;
# unlike that run, no class_weight is passed here.
modelLSTM.fit(X_train_vecotrs_reshaped, y_actual_train_encoded_reshaped, epochs=100, batch_size=64, validation_data=(X_val_vectors_reshaped, y_val_encoded_reshaped), callbacks=[early_stopping])

In [None]:
# Test LSTM model

# `prediction` is replaced by the LSTM outputs here; the classification-report
# cell below consumes it.
prediction=modelLSTM.predict(X_test_vecotrs_reshaped)

# Requires y_test_encoded_reshaped, the test targets with the extra time axis.
loss, accuracy = modelLSTM.evaluate(X_test_vecotrs_reshaped, y_test_encoded_reshaped)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

Classification report for LSTM

In [None]:
# Per-class precision / recall / F1 of the LSTM on the test set.
# classification_report comes from the notebook's import cell.
y_test = np.array(y_test)

# The LSTM output has shape (samples, 1, num_classes). The class axis is the
# last one, so argmax must run over axis=-1; axis=1 would reduce over the
# singleton time axis and return all zeros. reshape(-1) drops the time axis so
# there is exactly one label per sample.
predicted_classes = np.asarray(prediction).argmax(axis=-1).reshape(-1)

print("Classification Report:")
print(classification_report(y_test, predicted_classes))

In [None]:
# Re-create the (samples, 1, features) inputs used by the LSTM. The training
# array was previously reshaped to 2-D here while validation and test were 3-D,
# leaving the three "reshaped" arrays with inconsistent ranks.
X_train_vecotrs_reshaped = X_train_vectors.reshape(X_train_vectors.shape[0], 1, X_train_vectors.shape[1])
X_val_vectors_reshaped = X_val_vectors.reshape(X_val_vectors.shape[0], 1, X_val_vectors.shape[1])
X_test_vecotrs_reshaped = X_test_vectors.reshape(X_test_vectors.shape[0], 1, X_test_vectors.shape[1])

### Manual test

In [None]:
# Classify one comment typed by the user with the trained feed-forward `model`.
# The third-party `regex` package is required: the standard `re` module does
# not support Unicode property classes such as \p{So}.
import regex as re

user_comment = input("Enter a comment: ")

# \p{So} ("Symbol, other") is used here to pick out the emoji characters.
emoji_pattern = re.compile(r'\p{So}')

emojis = emoji_pattern.findall(user_comment)

text_without_emojis = emoji_pattern.sub('', user_comment)

# Embed the text part without the emoji characters; the emoji is embedded
# separately below. A comment with no text at all maps to a zero vector.
# NOTE(review): assumes the training 'comment' column holds text without the
# emoji (it has its own 'expression_emoji' column) — confirm.
if text_without_emojis.strip():
    user_text_vector = get_sentence_vector(text_without_emojis)
else:
    user_text_vector = np.zeros(ft.get_dimension())

# Only the first emoji is used; a comment without any emoji maps to a zero
# vector instead of failing with IndexError on emojis[0].
if emojis:
    user_emoji_vector = get_emoji_vector(emojis[0])
else:
    user_emoji_vector = np.zeros(e2v.vector_size)

# Same fusion as for the training data: element-wise average of both vectors.
user_comment_vector = np.mean([user_text_vector, user_emoji_vector], axis=0)

# The model expects a batch: (1, features).
user_comment_vector = user_comment_vector.reshape(1, -1)

# The original call used `knn`, which is not defined anywhere in this notebook.
# Use the feed-forward network trained above and take the most probable class.
class_probabilities = model.predict(user_comment_vector)
prediction = class_probabilities.argmax(axis=1)

print(f"Predicted Class: {prediction}")

Enter a comment: සංහිදියාවට අවුලක් වෙයි ද😂
Predicted Class: [0]
