### install necessary packages and libraries

In [64]:
!pip install fasttext
!pip install tensorflow




### Import necessary packages and libraries

In [65]:
import numpy as np
import pandas as pd

### Import the dataset

In [66]:
from google.colab import drive

drive.mount('/content/drive')
dataset_file = '/content/drive/MyDrive/new_updated_dataset.csv'

data=pd.read_csv(dataset_file)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Import pre-trained models(fasttext and emoji2Vec)

In [67]:
# import fastText model for text
import fasttext
import fasttext.util
import gensim.models as gsm

fastText_model_path = '/content/drive/MyDrive/cc.si.300.bin/cc.si.300.bin'
fasttext.FastText.eprint = lambda x: None
ft = fasttext.load_model(fastText_model_path)


#import emoji2Vec model for emojis

e2v = gsm.KeyedVectors.load_word2vec_format('/content/drive/MyDrive/emoji2vec.bin', binary=True)

####Preprocessing data

In [68]:
import re

# Preprocessing function
def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE) # Eliminate links
    text = re.sub(r'@\w+', '', text) # Eliminate mention names
    text = re.sub(r'#\w+', '', text) #Eliminate # signs
    text = re.sub(r'$\w+', '', text) # Eliminate $ sign
    text = re.sub(r"[^අ-ෆa-zA-Z\s]", '', text)  # Keep Sinhala and English letters only
    text = re.sub(r'\s+', ' ', text).strip() # k
    return text


# Preprocess comments
column_name = 'comments'
if column_name in data.columns:
    data[column_name] = data[column_name].apply(preprocess_text)

shuffle the dataset

In [69]:
from sklearn.utils import shuffle

data = shuffle(data)


### Split the dataset

In [70]:
from sklearn.model_selection import train_test_split

# combined the emoji and text column for split in same points

combined_data = pd.concat([data['comment'], data['expression_emoji']], axis=1)

#split the dataset into train, test, validation
combined_train, combined_test, y_train, y_test = train_test_split(combined_data, data['label'], test_size=0.2, random_state=42, stratify=data['label'])

combined_actual_train,combined_val,y_actual_train,y_val=train_test_split(combined_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

#devide each column under train, test, validation
X_actual_train_text, X_actual_train_emoji = combined_actual_train['comment'], combined_actual_train['expression_emoji']

X_test_text, X_test_emoji= combined_test['comment'], combined_test['expression_emoji']

x_val_text,x_val_emoji=combined_val['comment'],combined_val['expression_emoji']


Perform word to vector using fastText model for comment column

In [71]:
def get_word_vector(word):
    try:
        wordVec=ft.get_word_vector(word)

        return wordVec
    except KeyError:
        return np.zeros(ft.get_dimension())

def get_sentence_vector(sentence):

    if not isinstance(sentence, str):
        sentence = str(sentence)

    if not sentence.strip():
        return np.zeros(ft.get_dimension())

    vectors = [get_word_vector(word) for word in sentence.split()]

    return np.mean(vectors, axis=0)

In [72]:
X_train_text_vectors=np.array([get_sentence_vector(sentence) for sentence in X_actual_train_text])
X_test_text_vectors=np.array([get_sentence_vector(sentence) for sentence in X_test_text])
x_text_val_vectors=np.array([get_sentence_vector(sentence) for sentence in x_val_text])

 perform Emoji to vector task using emoji2Vec model for expression_emoji column

In [None]:
print(np.mean(get_word_vector('තියෙනවා')))

0.0009083676


In [None]:
print(np.mean(get_sentence_vector('මේකෙත් වෙනමම ආතල් එකක් තියෙනවා!!')))

In [73]:
def get_emoji_vector(emoji):

    try:
        value=-e2v[emoji]

        return value

    except KeyError:
        return np.zeros(300)

In [None]:
textHate=np.mean(get_word_vector('පොන්නයෙක්ද'))
print(textHate)
emojiHate=np.mean(get_emoji_vector('😡'))
print(emojiHate)

In [None]:
textNon=np.mean(get_word_vector('තියෙනවා'))
print(textNon)
emojiNon=np.mean(get_emoji_vector('😂'))
print(emojiNon)

In [None]:
textNon=np.mean(get_word_vector('තියෙනවා'))
print(textNon)
emojiNon=np.mean(get_emoji_vector('🥰'))
print(emojiNon)

In [None]:
CombineHate=np.mean([textHate,emojiHate])
print(CombineHate)

combineNon=np.mean([textNon,emojiNon])
print(combineNon)

combineMid=np.mean([textHate,emojiNon])
print(combineMid)

In [74]:
X_train_emoji_vectors = np.array([get_emoji_vector(emoji) for emoji in X_actual_train_emoji])
X_test_emoji_vectors = np.array([get_emoji_vector(emoji) for emoji in X_test_emoji])
X_val_emoji_vectors=np.array([get_emoji_vector(emoji) for emoji in x_val_emoji])

### Combined the emoji and comment vectors

In [75]:
X_train_vectors=np.mean([X_train_text_vectors,X_train_emoji_vectors],axis=0)

X_test_vectors=np.mean([X_test_emoji_vectors,X_test_text_vectors],axis=0)

X_val_vectors=np.mean([x_text_val_vectors,X_val_emoji_vectors],axis=0)

Define model nature and aspects

In [76]:
from keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_accuracy', patience=6, restore_best_weights=True)


Dataset convertion into the one hot encoding

In [77]:
from tensorflow.keras.utils import to_categorical

num_classes = 3
y_actual_train_encoded = to_categorical(y_actual_train, num_classes=num_classes)
y_val_encoded = to_categorical(y_val, num_classes=num_classes)
y_test_encoded=to_categorical(y_test,num_classes=num_classes)

#Define the model

Using Feed forword neural network

In [78]:
from keras.models import Sequential
from keras.layers import Dense, Flatten, Dropout

input_dim = 300

model = Sequential()

model.add(Dense(512,activation='relu',input_dim=input_dim))
model.add(Dropout(0.3))

model.add(Dense(256,activation='relu'))
model.add(Dropout(0.3))

model.add(Dense(128,activation='relu'))
model.add(Dropout(0.3))

model.add(Dense(64,activation='relu'))
model.add(Dropout(0.3))

model.add(Dense(32,activation='relu'))
model.add(Dropout(0.3))

model.add(Dense(16,activation='relu'))
model.add(Dropout(0.3))

model.add(Flatten())
model.add(Dense(3, activation='softmax'))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [79]:
from keras.optimizers import Adam

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [80]:

# Train the model
history = model.fit(X_train_vectors, y_actual_train_encoded, epochs=100, batch_size=64, validation_data=(X_val_vectors, y_val_encoded), callbacks=[early_stopping])


Epoch 1/100
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - accuracy: 0.4284 - loss: 0.9399 - val_accuracy: 0.7943 - val_loss: 0.5036
Epoch 2/100
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.7922 - loss: 0.5225 - val_accuracy: 0.7936 - val_loss: 0.4829
Epoch 3/100
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.8043 - loss: 0.5085 - val_accuracy: 0.7943 - val_loss: 0.4919
Epoch 4/100
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 19ms/step - accuracy: 0.8031 - loss: 0.4952 - val_accuracy: 0.7929 - val_loss: 0.4807
Epoch 5/100
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.8134 - loss: 0.4763 - val_accuracy: 0.7929 - val_loss: 0.4771
Epoch 6/100
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.8177 - loss: 0.4559 - val_accuracy: 0.7943 - val_loss: 0.4580
Epoch 7/100
[1m92/92[0m [3

In [81]:
# test the model

prediction=model.predict(X_test_vectors)
loss, accuracy = model.evaluate(X_test_vectors, y_test_encoded)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8554 - loss: 0.3459
Test Loss: 0.3524697721004486, Test Accuracy: 0.8605795502662659


classification report for feedforward neural network

In [82]:
from sklearn.metrics import classification_report

y_test = np.array(y_test)
predicted_classes = np.array(prediction)
prediction=prediction.argmax(axis=1)
print("Classification Report:")
print(classification_report(y_test, prediction))

Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.91      0.83       610
           1       0.87      0.82      0.84       610
           2       1.00      0.85      0.92       609

    accuracy                           0.86      1829
   macro avg       0.87      0.86      0.86      1829
weighted avg       0.87      0.86      0.86      1829



Test by manually

In [None]:

user_comment = input("Enter a comment: ")


emoji_pattern = re.compile(r'\p{So}')

emojis = emoji_pattern.findall(user_comment)

text_without_emojis = emoji_pattern.sub('', user_comment)

user_text_vector = get_sentence_vector(user_comment)
user_emoji_vector=get_emoji_vector(emojis[0])

user_comment_vector=np.mean([user_text_vector,user_emoji_vector],axis=0)

user_comment_vector = user_comment_vector.reshape(1,-1)

prediction = knn.predict(user_comment_vector)

print(f"Predicted Class: {prediction}")

Enter a comment: සංහිදියාවට අවුලක් වෙයි ද😂
Predicted Class: [0]
