### Install necessary packages and libraries

In [None]:
!pip install fasttext
!pip install tensorflow


### Import necessary packages and libraries

In [None]:
import numpy as np
import pandas as pd
from google.colab import drive
import fasttext
import fasttext.util
from keras.models import Sequential
from keras.layers import Dense, Embedding, Flatten, Dropout,Bidirectional
from sklearn.model_selection import train_test_split
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from sklearn.utils import shuffle
from sklearn.utils.class_weight import compute_class_weight
import gensim.models as gsm
from keras.layers import LeakyReLU
from keras.layers import LSTM
from tensorflow.keras.utils import to_categorical

### Import the dataset

In [None]:
# Mount Google Drive so files stored there are readable from this runtime.
drive.mount('/content/drive')
dataset_file = '/content/drive/MyDrive/new_updated_dataset.csv'

# Load the CSV; later cells read its 'comment', 'expression_emoji' and 'label' columns.
data=pd.read_csv(dataset_file)

Mounted at /content/drive


### Load the pre-trained models (fastText and emoji2vec)

In [None]:
# Load the fastText model used to embed the comment text.

fastText_model_path = '/content/drive/MyDrive/cc.si.300.bin/cc.si.300.bin'
# Replace fastText's `eprint` with a no-op; presumably this silences the
# warning printed by `load_model` — TODO confirm.
fasttext.FastText.eprint = lambda x: None
ft = fasttext.load_model(fastText_model_path)


# Load the emoji2vec embeddings (binary word2vec format) used to embed the emoji column.

e2v = gsm.KeyedVectors.load_word2vec_format('/content/drive/MyDrive/emoji2vec.bin', binary=True)

### Shuffle the dataset

In [None]:

# Shuffle the rows with a fixed seed. Without it the row order changes on every
# run, so the seeded train/validation/test split below would still select
# different rows each time and results would not be reproducible.
data = shuffle(data, random_state=42)


### Split the dataset

In [None]:
# Keep the text and emoji columns side by side so both are split at exactly the same rows.
combined_data = data[['comment', 'expression_emoji']]

# Hold out 20% of the rows as the test set, stratified by label.
combined_train, combined_test, y_train, y_test = train_test_split(
    combined_data,
    data['label'],
    test_size=0.2,
    random_state=42,
    stratify=data['label'],
)

# Take 20% of the remaining rows as the validation set, again stratified.
combined_actual_train, combined_val, y_actual_train, y_val = train_test_split(
    combined_train,
    y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train,
)

# Separate the text column and the emoji column again for each split.
X_actual_train_text = combined_actual_train['comment']
X_actual_train_emoji = combined_actual_train['expression_emoji']

X_test_text = combined_test['comment']
X_test_emoji = combined_test['expression_emoji']

x_val_text = combined_val['comment']
x_val_emoji = combined_val['expression_emoji']


### Convert the comment column to vectors using the fastText model

In [None]:
def get_word_vector(word):
    """Return the fastText embedding of ``word``.

    The lookup goes through the notebook-level ``ft`` model. If it raises
    ``KeyError``, a zero vector with ``ft.get_dimension()`` entries is returned.
    """
    try:
        return ft.get_word_vector(word)
    except KeyError:
        # NOTE(review): fastText usually builds vectors for unseen words from
        # subwords, so this fallback may never trigger — confirm before relying on it.
        return np.zeros(ft.get_dimension())



def get_sentence_vector(sentence):
    """Return the mean of the word vectors of ``sentence``.

    Parameters
    ----------
    sentence : str or any
        Text to embed. Non-string values are converted with ``str``. Missing
        values (``None`` / float NaN) are treated as empty text.

    Returns
    -------
    numpy.ndarray
        One vector with ``ft.get_dimension()`` entries. Empty or
        whitespace-only input yields a zero vector.
    """
    # A missing cell must not be embedded as the literal word "nan".
    if sentence is None or (isinstance(sentence, float) and np.isnan(sentence)):
        tokens = []
    else:
        tokens = str(sentence).split()

    # np.mean([]) would return a NaN scalar, which makes the stacked feature
    # matrix ragged; return a correctly shaped zero vector instead.
    if not tokens:
        return np.zeros(ft.get_dimension())

    return np.mean([get_word_vector(token) for token in tokens], axis=0)

In [None]:
# Embed every comment of each split; each array holds one sentence vector per row.
X_train_text_vectors = np.array(list(map(get_sentence_vector, X_actual_train_text)))
X_test_text_vectors = np.array(list(map(get_sentence_vector, X_test_text)))
x_text_val_vectors = np.array(list(map(get_sentence_vector, x_val_text)))

### Convert the expression_emoji column to vectors using the emoji2vec model

In [None]:
# Sanity check: print the mean of the embedding components of a single word.
print(np.mean(get_word_vector('තියෙනවා')))

0.0009083676


In [None]:
# Sanity check: print the mean of the components of a whole-sentence vector.
print(np.mean(get_sentence_vector('මේකෙත් වෙනමම ආතල් එකක් තියෙනවා!!')))

In [None]:
def get_emoji_vector(emoji):
    """Return the negated emoji2vec embedding of ``emoji``.

    Parameters
    ----------
    emoji : str or any
        Key to look up in the notebook-level ``e2v`` vectors. Non-string
        values (for example a NaN from an empty cell) are treated as unknown.

    Returns
    -------
    numpy.ndarray
        ``-e2v[emoji]`` when the key exists, otherwise a zero vector with
        ``e2v.vector_size`` entries.
    """
    # Size the fallback from the loaded model instead of hard-coding 300.
    fallback = np.zeros(e2v.vector_size)

    # A non-string key is not guaranteed to raise KeyError inside gensim,
    # so reject it here rather than relying on the except clause.
    if not isinstance(emoji, str):
        return fallback

    try:
        # NOTE(review): the sign flip is kept from the original code because the
        # trained features depend on it; confirm whether it is intentional.
        return -e2v[emoji]
    except KeyError:
        return fallback

In [None]:
# Exploratory probe: mean embedding component of one word and of one emoji.
textHate=np.mean(get_word_vector('පොන්නයෙක්ද'))
print(textHate)
emojiHate=np.mean(get_emoji_vector('😡'))
print(emojiHate)

In [None]:
# Exploratory probe: same measurement for a different word/emoji pair.
textNon=np.mean(get_word_vector('තියෙනවා'))
print(textNon)
emojiNon=np.mean(get_emoji_vector('😂'))
print(emojiNon)


In [None]:
# Exploratory probe with another emoji. This reassigns `textNon` and `emojiNon`,
# so the values used by the following cell are the ones computed here.
textNon=np.mean(get_word_vector('තියෙනවා'))
print(textNon)
emojiNon=np.mean(get_emoji_vector('🥰'))
print(emojiNon)


In [None]:
# Average the scalar text and emoji probes for three pairings and print each result.
CombineHate=np.mean([textHate,emojiHate])
print(CombineHate)

combineNon=np.mean([textNon,emojiNon])
print(combineNon)

# Text probe from the first pairing combined with the emoji probe from the second.
combineMid=np.mean([textHate,emojiNon])
print(combineMid)

In [None]:
# Embed the emoji of every row in each split; one emoji vector per row.
X_train_emoji_vectors = np.array(list(map(get_emoji_vector, X_actual_train_emoji)))
X_test_emoji_vectors = np.array(list(map(get_emoji_vector, X_test_emoji)))
X_val_emoji_vectors = np.array(list(map(get_emoji_vector, x_val_emoji)))

### Combine the emoji and comment vectors

In [None]:
# Fuse the two modalities by taking the element-wise mean of the text matrix
# and the emoji matrix of each split (axis=0 averages across the two inputs).
X_train_vectors=np.mean([X_train_text_vectors,X_train_emoji_vectors],axis=0)

# The operands are listed in the opposite order here; the mean is unaffected.
X_test_vectors=np.mean([X_test_emoji_vectors,X_test_text_vectors],axis=0)

X_val_vectors=np.mean([x_text_val_vectors,X_val_emoji_vectors],axis=0)

### Define the training callbacks and class weights

In [None]:
# Stop when validation accuracy has not improved for 6 epochs and restore the best weights.
early_stopping = EarlyStopping(monitor='val_accuracy', patience=6, restore_best_weights=True)

# Compute balanced class weights from the labels the model is actually fitted on
# (y_actual_train), not from y_train, which still contains the validation rows.
class_labels = np.unique(y_actual_train)
class_weights = compute_class_weight(class_weight='balanced', classes=class_labels, y=y_actual_train)

# Key each weight by its label value rather than by its position, so the mapping
# stays correct even if the labels are not exactly 0..n-1.
class_weight_dict = {int(label): float(weight) for label, weight in zip(class_labels, class_weights)}

In [None]:
# One-hot encode the labels of every split for use with categorical cross-entropy.
# num_classes is fixed at 3 and is reused by the LSTM cells further down.
num_classes = 3
y_actual_train_encoded = to_categorical(y_actual_train, num_classes=num_classes)
y_val_encoded = to_categorical(y_val, num_classes=num_classes)
y_test_encoded=to_categorical(y_test,num_classes=num_classes)

# Define the model

Using a feed-forward neural network

In [None]:
# Width of one combined (text + emoji) feature vector.
input_dim = 300

model = Sequential()

# First hidden block declares the input width.
model.add(Dense(512, activation='relu', input_dim=input_dim))
model.add(Dropout(0.2))

# Remaining hidden blocks halve the width each time; each is followed by 20% dropout.
for units in (256, 128, 64, 32):
    model.add(Dense(units, activation='relu'))
    model.add(Dropout(0.2))

# Flatten followed by a 3-way softmax output layer.
model.add(Flatten())
model.add(Dense(3, activation='softmax'))

In [None]:
# Compile with Adam and categorical cross-entropy (targets are one-hot), tracking accuracy.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Fit the feed-forward network with early stopping on the validation split
# and the class weights computed earlier.
model.fit(
    X_train_vectors,
    y_actual_train_encoded,
    epochs=100,
    batch_size=64,
    validation_data=(X_val_vectors, y_val_encoded),
    callbacks=[early_stopping],
    class_weight=class_weight_dict,
)


In [None]:
# Evaluate the feed-forward network on the held-out test split.

# `prediction` holds the raw model output; it is reduced to class ids in the report cell.
prediction=model.predict(X_test_vectors)
loss, accuracy = model.evaluate(X_test_vectors, y_test_encoded)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

classification report for feedforward neural network

In [None]:
from sklearn.metrics import classification_report

y_test = np.array(y_test)
# Reduce the softmax output to the most likely class id per row. The result is
# stored under its own name so `prediction` is not overwritten; the cell can
# therefore be re-run without argmax failing on an already-reduced array.
predicted_classes = np.asarray(prediction).argmax(axis=1)
print("Classification Report:")
print(classification_report(y_test, predicted_classes))

Using LSTM model

In [None]:

# Width of one combined feature vector.
input_dim = 300

modelLSTM = Sequential()
# Each sample is fed as a sequence of length 1.
timesteps=1

# Two stacked bidirectional LSTM layers. Both keep return_sequences=True, so the
# time axis is preserved all the way to the output layer.
# NOTE(review): input_shape is given to the inner LSTM rather than the
# Bidirectional wrapper — confirm the model is built before summary() is called.
modelLSTM.add(Bidirectional(LSTM(128, return_sequences=True,input_shape=(timesteps, input_dim))))
modelLSTM.add(Bidirectional(LSTM(64, return_sequences=True)))

# Softmax applied per timestep; the targets are reshaped to a matching 3-D shape in a later cell.
modelLSTM.add(Dense(num_classes, activation='softmax'))

In [None]:
# Insert a time axis so each 2-D feature matrix becomes (samples, timesteps, features),
# which is the layout the LSTM layers are given.
X_train_vecotrs_reshaped = X_train_vectors.reshape(X_train_vectors.shape[0],timesteps, X_train_vectors.shape[1])
X_val_vectors_reshaped = X_val_vectors.reshape(X_val_vectors.shape[0], timesteps, X_val_vectors.shape[1])
X_test_vecotrs_reshaped = X_test_vectors.reshape(X_test_vectors.shape[0], timesteps, X_test_vectors.shape[1])

In [None]:
# Compile the LSTM model with the same optimizer, loss and metric as the feed-forward model.
# NOTE(review): run_eagerly=True is normally a debugging aid and slows training — confirm it is still needed.
modelLSTM.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'],run_eagerly=True)

In [None]:
# The LSTM model keeps the time axis up to its softmax layer, so the one-hot
# targets need a matching (samples, 1, num_classes) shape.
y_actual_train_encoded_reshaped = y_actual_train_encoded.reshape(y_actual_train_encoded.shape[0], 1, num_classes)
y_val_encoded_reshaped = y_val_encoded.reshape(y_val_encoded.shape[0], 1, num_classes)
# The test targets were previously reshaped from a name that had never been
# assigned, which raised NameError on a fresh kernel; build them from
# y_test_encoded like the other two splits.
y_test_encoded_reshaped = y_test_encoded.reshape(y_test_encoded.shape[0], 1, num_classes)

# Confirm that targets and inputs agree on the number of samples and timesteps.
print(y_actual_train_encoded_reshaped.shape)
print(y_val_encoded_reshaped.shape)
print(X_train_vecotrs_reshaped.shape)
print(X_val_vectors_reshaped.shape)

(5851, 1, 3)
(1463, 1, 3)
(5851, 1, 300)
(1463, 1, 300)


In [None]:
# Convert the training labels to a NumPy array (the KNN cell later fits on y_actual_train)
# and print the shape of the 2-D training feature matrix.
y_actual_train=np.array(y_actual_train)
print(X_train_vectors.shape)

(5851, 300)


In [None]:
# Print the layer-by-layer summary of the LSTM model.
modelLSTM.summary()

In [None]:
# Fit the LSTM model on the 3-D inputs and 3-D one-hot targets, with early
# stopping on the validation split. No class weights are passed here.
modelLSTM.fit(
    X_train_vecotrs_reshaped,
    y_actual_train_encoded_reshaped,
    epochs=100,
    batch_size=64,
    validation_data=(X_val_vectors_reshaped, y_val_encoded_reshaped),
    callbacks=[early_stopping],
)

In [None]:
# Evaluate the LSTM model on the held-out test split.

# `prediction` is replaced by the LSTM output here; it keeps the time axis of the model.
prediction=modelLSTM.predict(X_test_vecotrs_reshaped)

# Uses the 3-D test targets, which must exist under the name y_test_encoded_reshaped.
loss, accuracy = modelLSTM.evaluate(X_test_vecotrs_reshaped, y_test_encoded_reshaped)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

Classification report for LSTM

In [None]:
from sklearn.metrics import classification_report

y_test = np.array(y_test)
# The LSTM output keeps a length-1 time axis: (samples, 1, num_classes).
# argmax over axis 1 would run over that singleton axis and return all zeros,
# so take argmax over the class axis and flatten to one class id per sample.
predicted_classes = np.asarray(prediction).argmax(axis=-1).reshape(-1)
print("Classification Report:")
print(classification_report(y_test, predicted_classes))

In [None]:
# Reassigns the reshaped arrays: the training matrix goes back to 2-D
# (samples, features) while validation and test get a singleton middle axis.
# NOTE(review): the shapes are inconsistent with each other and no later cell
# shown here reads these names — this looks like a leftover experiment; confirm and remove.
X_train_vecotrs_reshaped = X_train_vectors.reshape(X_train_vectors.shape[0],X_train_vectors.shape[1])
X_val_vectors_reshaped = X_val_vectors.reshape(X_val_vectors.shape[0], 1,X_val_vectors.shape[1])
X_test_vecotrs_reshaped = X_test_vectors.reshape(X_test_vectors.shape[0],1, X_test_vectors.shape[1])

# Try with KMeans (Optional)

In [None]:
from sklearn.cluster import KMeans

# Cluster the training vectors into as many groups as there are classes.
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(X_train_vectors)

# K-means cluster ids are arbitrary and do not correspond to class labels, so
# comparing them with y_test directly is meaningless. Map every cluster to the
# most frequent training label among its members before predicting.
train_labels = np.asarray(y_actual_train)
cluster_to_label = {}
for cluster_id in range(kmeans.n_clusters):
    member_labels = train_labels[kmeans.labels_ == cluster_id]
    # Fall back to the overall majority label if a cluster has no members.
    candidates = member_labels if member_labels.size else train_labels
    values, counts = np.unique(candidates, return_counts=True)
    cluster_to_label[cluster_id] = values[counts.argmax()]

# Predicted class label (not raw cluster id) for every test row.
predictWithKmeans = np.array([cluster_to_label[cluster_id] for cluster_id in kmeans.predict(X_test_vectors)])




In [None]:
from sklearn.metrics import accuracy_score

# Fraction of test rows where the K-means output equals the true label.
accuracy = accuracy_score(y_test, predictWithKmeans)
print("Accuracy of K-means on test data:", accuracy)


Accuracy of K-means on test data: 0.13504647348277748


In [None]:
# roc_auc_score is imported but not used in this cell.
from sklearn.metrics import classification_report, roc_auc_score

y_test = np.array(y_test)
predicted_classes = np.array(predictWithKmeans)


# Per-class precision, recall and F1 for the K-means output.
print("Classification Report:")
print(classification_report(y_test, predicted_classes))


Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       610
           1       0.23      0.40      0.30       610
           2       0.00      0.00      0.00       609

    accuracy                           0.14      1829
   macro avg       0.08      0.13      0.10      1829
weighted avg       0.08      0.14      0.10      1829



# Try with K-Nearest Neighbours (Optional)

In [None]:
from sklearn.neighbors import KNeighborsClassifier


# 5-nearest-neighbour classifier; closer neighbours get a larger vote (weights='distance').
knn = KNeighborsClassifier(n_neighbors=5, weights='distance')
knn.fit(X_train_vectors, y_actual_train)

# Predicted label for every test row.
predictWithKNN = knn.predict(X_test_vectors)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of test rows where the KNN prediction equals the true label.
accuracy = accuracy_score(y_test, predictWithKNN)
print("Accuracy of KNN on test data:", accuracy)
# Print true and predicted labels for a quick visual comparison.
print(y_test)
print(predictWithKNN)

Accuracy of KNN on test data: 0.8305084745762712
[1 1 2 ... 2 1 1]
[0 1 2 ... 2 1 1]


In [None]:
# roc_auc_score is imported but not used in this cell.
from sklearn.metrics import classification_report, roc_auc_score

y_test = np.array(y_test)
predicted_classes = np.array(predictWithKNN)
print(predicted_classes)

# Per-class precision, recall and F1 for the KNN predictions.
print("Classification Report:")
print(classification_report(y_test, predicted_classes))


[0 1 2 ... 2 1 1]
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.76      0.77       610
           1       0.75      0.89      0.82       610
           2       0.98      0.84      0.91       609

    accuracy                           0.83      1829
   macro avg       0.84      0.83      0.83      1829
weighted avg       0.84      0.83      0.83      1829



### Manual test

In [None]:
# Third-party `regex` package (not the stdlib `re`): it supports the \p{So}
# Unicode property class used below.
import regex as re

user_comment = input("Enter a comment: ")

# \p{So} ("Symbol, other") is used to pick the emoji characters out of the comment.
emoji_pattern = re.compile(r'\p{So}')

emojis = emoji_pattern.findall(user_comment)

text_without_emojis = emoji_pattern.sub('', user_comment)

# Only the first emoji is used. When the comment has none, an empty key is
# looked up, which get_emoji_vector answers with its zero-vector fallback
# instead of emojis[0] raising IndexError.
user_emoji_vector = get_emoji_vector(emojis[0] if emojis else '')

# Embed the text with the emojis stripped out; previously the raw comment was
# embedded, so emoji characters leaked into the text vector.
if text_without_emojis.strip():
    user_text_vector = get_sentence_vector(text_without_emojis)
else:
    # Emoji-only input: use a zero text vector of matching shape.
    user_text_vector = np.zeros_like(user_emoji_vector)

# Same fusion as for the training data: element-wise mean of text and emoji vectors.
user_comment_vector = np.mean([user_text_vector, user_emoji_vector], axis=0)

# The classifier expects a 2-D array with one row per sample.
user_comment_vector = user_comment_vector.reshape(1, -1)

prediction = knn.predict(user_comment_vector)

print(f"Predicted Class: {prediction}")

Enter a comment: සංහිදියාවට අවුලක් වෙයි ද😂
Predicted Class: [0]
