<a href="https://colab.research.google.com/github/senaldm/Research_Project/blob/main/hate_speech_detection/train_models_with_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Install necessary packages and libraries

In [None]:

!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/68.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.12.0-py3-none-any.whl (234 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-linux_x86_64.whl size=4227138 sha256=c1c36776f8801aae99beb87e5498bd3275701e2e3fb361a6d2f66650999bc267
  Stored in directory: /root/.cache/pip/wheels/a5/13/75/f811c84a8ab36eedbaef977a6a58a98990e8e0f1967f98f394
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.12.0


### Import necessary packages and libraries

In [None]:
import numpy as np
import pandas as pd
from google.colab import drive
import fasttext
import fasttext.util
from keras.models import Sequential
from keras.layers import Dense, Embedding, Flatten, Dropout,Bidirectional
from sklearn.model_selection import train_test_split
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from sklearn.utils import shuffle
from sklearn.utils.class_weight import compute_class_weight
import gensim.models as gsm

### Import the dataset

In [None]:
# Mount Google Drive so files stored there can be read from this runtime.
drive.mount('/content/drive')
dataset_file = '/content/drive/MyDrive/dataset.csv'
# Later cells read the `comment`, `expression_emoji` and `label` columns
# of this frame.
data=pd.read_csv(dataset_file)

Mounted at /content/drive


### Load pre-trained models (fastText and emoji2vec)

In [None]:
# Load the fastText model used to embed the comment text.

fastText_model_path = '/content/drive/MyDrive/cc.si.300.bin/cc.si.300.bin'
# Swap fastText's `eprint` for a no-op before loading
# (presumably to silence a message printed by load_model - TODO confirm).
fasttext.FastText.eprint = lambda x: None
ft = fasttext.load_model(fastText_model_path)


# Load the emoji2vec vectors (binary word2vec format) used to embed emojis.

e2v = gsm.KeyedVectors.load_word2vec_format('/content/drive/MyDrive/emoji2vec.bin', binary=True)

### Shuffle the dataset

In [None]:

# Shuffle with a fixed seed: the splits below select rows by position with
# their own fixed seed, so an unseeded shuffle here would still hand them
# different rows on every run.
data = shuffle(data, random_state=42)
print(data.tail())

      index                                            comment  \
2468   2469  අම්මො මට නං මේකත් ඇති .. 2021 වෙනකල් ඉන්න හිත ...   
9044   9045  ජාතිවාදය අවුස්සන්නට තැත් කරමින් ගොන් කම ප්‍රදර...   
891     892  මිනිස්සු ආතල් එකේ ඉන්නවට මං පට්ට කැමතියි. . හැ...   
5859   5860  ඇත්ත ජීවිතේ කොන්ද පන නැති වුන් ට්විටරෙට ආවම පක...   
4183   4184                        නියමයි වැඩේ දිගටම කරගෙන යමං   

     expression_emoji  label  
2468                😂      0  
9044               ❤️      2  
891                 😡      1  
5859                😂      1  
4183                😂      0  


### Split the dataset

In [None]:
# Keep the comment and emoji columns in one frame so that both are cut at
# exactly the same rows.
combined_data = data[['comment', 'expression_emoji']]

# First cut: 80% train+validation / 20% test, stratified on the label.
combined_train, combined_test, y_train, y_test = train_test_split(
    combined_data,
    data['label'],
    test_size=0.2,
    random_state=42,
    stratify=data['label'],
)

# Second cut: carve a validation set (20%) out of the training portion.
combined_actual_train, combined_val, y_actual_train, y_val = train_test_split(
    combined_train,
    y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train,
)

# Pull the text and emoji columns back out of each split.
X_actual_train_text = combined_actual_train['comment']
X_actual_train_emoji = combined_actual_train['expression_emoji']

X_test_text = combined_test['comment']
X_test_emoji = combined_test['expression_emoji']

x_val_text = combined_val['comment']
x_val_emoji = combined_val['expression_emoji']


In [None]:
# Text and labels only: train+validation / test first, then
# train / validation, both stratified on the label with the same seed.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    data['comment'],
    data['label'],
    test_size=0.2,
    random_state=42,
    stratify=data['label'],
)

X_actual_train_text, x_val_text, y_actual_train, y_val = train_test_split(
    X_train_text,
    y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train,
)


In [None]:
# Emoji column, cut with the same seed and the same stratification targets as
# the text/label splits so that the emoji rows line up with the text rows.
# The column is named `expression_emoji` (lower-case), matching the frame
# printed after the shuffle.
X_train_emoji, X_test_emoji = train_test_split(
    data['expression_emoji'],
    test_size=0.2,
    random_state=42,
    stratify=data['label'],
)

# `y_train` comes from the label split, which used the same seed and targets,
# so it is row-aligned with X_train_emoji and can drive the stratification.
X_actual_emoji_train, x_emoji_val = train_test_split(
    X_train_emoji,
    test_size=0.2,
    random_state=42,
    stratify=y_train,
)

### Convert the comment column to vectors using the fastText model

In [None]:
def get_word_vector(word):
    """Look up the fastText embedding of a single word.

    Args:
        word: Token to embed with the module-level fastText model `ft`.

    Returns:
        The vector returned by `ft.get_word_vector(word)`; if that lookup
        raises KeyError, a zero vector of length `ft.get_dimension()`.
    """
    try:
        return ft.get_word_vector(word)
    except KeyError:
        dimension = ft.get_dimension()
        return np.zeros(dimension)



def get_sentence_vector(sentence):
    """Embed a comment as the mean of its whitespace-separated word vectors.

    Args:
        sentence: The comment text. Non-string values are converted with
            `str()` before tokenising.

    Returns:
        A 1-D vector: the element-wise mean of `get_word_vector(token)` over
        all tokens, or a zero vector of length `ft.get_dimension()` when the
        text contains no tokens.
    """
    if not isinstance(sentence, str):
        sentence = str(sentence)

    tokens = sentence.split()
    if not tokens:
        # np.mean over an empty list is NaN (plus a RuntimeWarning); one such
        # row would put NaN into the feature matrix, so return zeros instead.
        return np.zeros(ft.get_dimension(), dtype=np.float32)

    return np.mean([get_word_vector(token) for token in tokens], axis=0)

# data=data['comment'].head(4)

# for comment in data:
#   print(get_sentence_vector(comment))



In [None]:
# Embed every comment of each split; row order follows the source Series.
X_train_text_vectors = np.array(list(map(get_sentence_vector, X_actual_train_text)))
X_test_text_vectors = np.array(list(map(get_sentence_vector, X_test_text)))
x_text_val_vectors = np.array(list(map(get_sentence_vector, x_val_text)))

### Convert the expression_emoji column to vectors using the emoji2vec model

In [None]:
# Sanity check: mean component of one word's embedding.
print(get_word_vector('තියෙනවා').mean())

0.0009083676


In [None]:
# Sanity check: mean component of a whole sentence's embedding.
print(get_sentence_vector('මේකෙත් වෙනමම ආතල් එකක් තියෙනවා!!').mean())

0.00010161365


In [None]:
def get_emoji_vector(emoji):
    """Return the negated emoji2vec embedding of `emoji`, or zeros if unknown.

    Args:
        emoji: The emoji string to look up in the module-level `e2v` vectors.
            Non-string values (for example NaN from an empty CSV cell) are
            treated as "no emoji".

    Returns:
        `-e2v[emoji]` when the emoji is in the vocabulary; otherwise a zero
        vector of length `e2v.vector_size`. If the exact string is missing,
        the lookup is retried with surrounding whitespace and the U+FE0F
        variation selector removed.
    """
    if not isinstance(emoji, str):
        return np.zeros(e2v.vector_size)

    candidates = [emoji]
    normalized = emoji.strip().replace('\ufe0f', '')
    if normalized != emoji:
        candidates.append(normalized)

    for key in candidates:
        try:
            # NOTE(review): the sign flip is kept from the original code. It
            # looks deliberate (the probe cells compare the sign of text and
            # emoji means) - confirm it is still wanted.
            return -e2v[key]
        except KeyError:
            continue

    return np.zeros(e2v.vector_size)

Vector Normalization

In [None]:
# Mean embedding component of one sample word and of the 😡 emoji.
textHate = get_word_vector('පොන්නයෙක්ද').mean()
print(textHate)
emojiHate = get_emoji_vector('😡').mean()
print(emojiHate)

-0.0007260342
-0.0015154603


In [None]:
# Mean embedding component of a second sample word and of the 😂 emoji.
textNon = get_word_vector('තියෙනවා').mean()
print(textNon)
emojiNon = get_emoji_vector('😂').mean()
print(emojiNon)


0.0009083676
0.0021346046


In [None]:
# Same probe with the 🥰 emoji; this overwrites textNon / emojiNon from the
# previous cell, and these are the values the next cell combines.
textNon = get_word_vector('තියෙනවා').mean()
print(textNon)
emojiNon = get_emoji_vector('🥰').mean()
print(emojiNon)


0.0009083676
0.0


In [None]:
# Average the text and emoji probe values pairwise, then print the three
# results in the order hate / non / mixed.
CombineHate = np.mean([textHate, emojiHate])
combineNon = np.mean([textNon, emojiNon])
combineMid = np.mean([textHate, emojiNon])

for blended in (CombineHate, combineNon, combineMid):
    print(blended)

-0.0011207473
0.0004541838134173304
-0.0003630171122495085


In [None]:
# Embed the emoji of every row in each split; row order follows the Series.
X_train_emoji_vectors = np.array(list(map(get_emoji_vector, X_actual_train_emoji)))
X_test_emoji_vectors = np.array(list(map(get_emoji_vector, X_test_emoji)))
X_val_emoji_vectors = np.array(list(map(get_emoji_vector, x_val_emoji)))

In [None]:
# X_train_emoji_vectors = np.linalg.norm(X_train_emoji_vectors, axis=1, keepdims=True)
# X_test_emoji_vectors = np.linalg.norm(X_test_emoji_vectors, axis=1, keepdims=True)
# X_val_emoji_vectors = np.linalg.norm(X_val_emoji_vectors, axis=1, keepdims=True)

### Combine the emoji and comment vectors

In [None]:
# Fuse the two modalities per row by averaging the text vector and the emoji
# vector element-wise (stack on a new leading axis, then mean over it).
X_train_vectors = np.stack([X_train_text_vectors, X_train_emoji_vectors]).mean(axis=0)

X_test_vectors = np.stack([X_test_emoji_vectors, X_test_text_vectors]).mean(axis=0)

X_val_vectors = np.stack([x_text_val_vectors, X_val_emoji_vectors]).mean(axis=0)

In [None]:
# Eyeball the fused test features (NumPy abbreviates the printed array).
print(X_test_vectors)

[[-1.0005704e-01 -2.5081947e-02 -9.6125782e-02 ...  2.8096240e-02
  -7.6157525e-02 -6.9955558e-02]
 [-3.6899541e-03  7.4781240e-03 -4.7976583e-02 ...  6.2614366e-02
  -7.2210715e-03  4.8995636e-02]
 [-3.6899541e-03  7.4781240e-03 -4.7976583e-02 ...  6.2614366e-02
  -7.2210715e-03  4.8995636e-02]
 ...
 [ 2.6991028e-02  1.2518818e-02  1.2590548e-02 ...  1.1667472e-02
  -1.8293642e-02 -1.0773489e-02]
 [ 1.7217604e-02 -3.7000027e-02  3.2607704e-02 ...  1.7880958e-02
  -7.7907625e-03  9.4048366e-05]
 [ 5.2058958e-02  1.5794160e-02 -1.4743252e-03 ...  1.9772295e-02
  -1.9245077e-02  1.6191572e-02]]


### Define the early-stopping callback and class weights

In [None]:
# Stop once validation accuracy has not improved for 6 epochs and roll back to
# the best weights seen.
early_stopping = EarlyStopping(monitor='val_accuracy', patience=6, restore_best_weights=True)

# Balance the loss using only the rows the model is fitted on, so validation
# labels do not influence training.
train_classes = np.unique(y_actual_train)
class_weights = compute_class_weight(class_weight='balanced', classes=train_classes, y=y_actual_train)

# Key each weight by its actual label value; enumerate() would only be right
# if the labels happened to be exactly 0..n-1.
class_weight_dict = dict(zip(train_classes.tolist(), class_weights.tolist()))

Define the model

In [None]:
from keras.layers import LeakyReLU
from keras.layers import LSTM

# Size of one fused text+emoji feature vector; the sequence length is left
# unspecified.
input_dim = 300
timesteps = None

model = Sequential()

# Stacked bidirectional LSTMs; every layer except the last returns the full
# sequence so that the next recurrent layer can consume it.
model.add(Bidirectional(LSTM(512, return_sequences=True, input_shape=(timesteps, input_dim))))
for width in (128, 100, 64):
    model.add(Bidirectional(LSTM(width, return_sequences=True)))
model.add(Bidirectional(LSTM(32)))

model.add(Dense(64, activation='relu'))

# Output layer.
# NOTE(review): a single sigmoid unit separates two classes, but the label
# column printed earlier contains 0, 1 and 2 - confirm the intended output
# head and loss.
model.add(Dense(1, activation='sigmoid'))

### Define the optimizer

In [None]:
# Adam with an explicit learning rate; passed to model.compile() further down.
custom_optimizer = Adam(learning_rate=0.001)


In [None]:
# Class balance of the test split.
# NOTE(review): in the saved outputs the test split is balanced while the
# train/validation splits are heavily skewed, which a stratified split of one
# frame would not produce - the outputs may come from different runs or
# different data; re-run from a fresh kernel to confirm.
y_test = pd.Series(y_test)

print(y_test.value_counts())

label
1    610
0    610
2    609
Name: count, dtype: int64


In [None]:
# Class balance of the validation split.
print(y_val.value_counts())

label
1    1081
0     465
2      32
Name: count, dtype: int64


In [None]:
# Class balance of the rows actually used for fitting.
print(y_actual_train.value_counts())

label
1    4324
0    1857
2     130
Name: count, dtype: int64


Train the model

In [None]:
# Insert a length-1 time axis: each sample becomes a one-step sequence of its
# fused feature vector, the 3-D layout the recurrent layers are fed with.
X_train_vecotrs_reshaped = X_train_vectors.reshape(X_train_vectors.shape[0], 1, X_train_vectors.shape[1])
X_val_vectors_reshaped = X_val_vectors.reshape(X_val_vectors.shape[0], 1, X_val_vectors.shape[1])
X_test_vecotrs_reshaped = X_test_vectors.reshape(X_test_vectors.shape[0], 1, X_test_vectors.shape[1])

# The labels take more than two values, so the network needs one softmax
# output per class. A single sigmoid unit trained with binary cross-entropy on
# labels 0/1/2 yields a meaningless (even negative) loss. Make sure the output
# head matches the label set before compiling; this is a no-op when it
# already does.
num_classes = int(np.unique(y_actual_train).size)
if getattr(model.layers[-1], 'units', None) != num_classes:
    model.pop()
    model.add(Dense(num_classes, activation='softmax'))

# Sparse categorical cross-entropy consumes the integer labels directly
# (assumes they are encoded as 0..num_classes-1 - TODO confirm for new data).
model.compile(optimizer=custom_optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(
    X_train_vecotrs_reshaped,
    y_actual_train,
    epochs=100,
    batch_size=64,
    validation_data=(X_val_vectors_reshaped, y_val),
    callbacks=[early_stopping],
    class_weight=class_weight_dict,
)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100


Test LSTM model

In [None]:

# Raw model outputs for the test split, one row per sample.
prediction=model.predict(X_test_vecotrs_reshaped)

print(prediction)
# evaluate() reports the compiled loss followed by the compiled metric
# (accuracy).
loss, accuracy = model.evaluate(X_test_vecotrs_reshaped, y_test)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

[[0.4832496 ]
 [0.48325294]
 [1.        ]
 ...
 [1.        ]
 [1.        ]
 [0.4832514 ]]
Test Loss: -14.873109817504883, Test Accuracy: 0.5254237055778503


Classification report for LSTM

In [None]:
from sklearn.metrics import classification_report

y_test = np.array(y_test)

# Turn the raw model outputs into class ids without overwriting `prediction`,
# so this cell can be re-run safely.
scores = np.asarray(prediction)
if scores.ndim == 2 and scores.shape[1] > 1:
    # One score per class: pick the most probable class.
    predicted_classes = scores.argmax(axis=1)
else:
    # A single sigmoid column: argmax over a width-1 axis is always 0, so
    # threshold the probability instead.
    predicted_classes = (scores.reshape(-1) > 0.5).astype(int)

print("Classification Report:")
print(classification_report(y_test, predicted_classes))

Classification Report:
              precision    recall  f1-score   support

           0       0.33      1.00      0.50       610
           1       0.00      0.00      0.00       610
           2       0.00      0.00      0.00       609

    accuracy                           0.33      1829
   macro avg       0.11      0.33      0.17      1829
weighted avg       0.11      0.33      0.17      1829



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Try with KMeans (Optional)



In [None]:
# scikit-learn estimators take 2-D (samples, features) input, so rebind the
# *_reshaped names to the 2-D feature matrices.
# NOTE: the same names held 3-D arrays for the LSTM cells above; re-run the
# training cell before re-running those cells.
X_train_vecotrs_reshaped = X_train_vectors.reshape(X_train_vectors.shape[:2])
X_val_vectors_reshaped = X_val_vectors.reshape(X_val_vectors.shape[:2])
X_test_vecotrs_reshaped = X_test_vectors.reshape(X_test_vectors.shape[:2])

In [None]:
from sklearn.cluster import KMeans

# n_init is set explicitly so the result does not depend on the scikit-learn
# version's default.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)

kmeans.fit(X_train_vecotrs_reshaped)

train_clusters = kmeans.predict(X_train_vecotrs_reshaped)
test_clusters = kmeans.predict(X_test_vecotrs_reshaped)

# K-means is unsupervised: its cluster ids are arbitrary and say nothing about
# the class labels. Map each cluster to the most frequent training label in it
# so the predictions can be scored against y_test.
train_labels = np.asarray(y_actual_train)
majority_label = pd.Series(train_labels).mode().iloc[0]
cluster_to_label = {
    cluster: pd.Series(train_labels[train_clusters == cluster]).mode().iloc[0]
    for cluster in np.unique(train_clusters)
}

y_train_pred = np.array([cluster_to_label[cluster] for cluster in train_clusters])

# The name is kept for the cells below, although these are predictions for the
# test split. A cluster that received no training rows falls back to the
# overall majority label.
y_val_pred = np.array([cluster_to_label.get(cluster, majority_label) for cluster in test_clusters])




In [None]:
from sklearn.metrics import accuracy_score

# `y_val_pred` holds the K-means predictions for the test split (despite its
# name), so the score is labelled as test accuracy.
accuracy = accuracy_score(y_test, y_val_pred)
print("Accuracy of K-means on test data:", accuracy)


Accuracy of K-means on validation data: 0.33351558228540185


In [None]:
from sklearn.metrics import classification_report, roc_auc_score

# Per-class precision/recall/F1 of the K-means predictions on the test split.
y_test = np.array(y_test)
predicted_classes = np.array(y_val_pred)


print("Classification Report:")
print(classification_report(y_test, predicted_classes))


Classification Report:
              precision    recall  f1-score   support

           0       0.57      1.00      0.73       610
           1       0.00      0.00      0.00       610
           2       0.00      0.00      0.00       609

    accuracy                           0.33      1829
   macro avg       0.19      0.33      0.24      1829
weighted avg       0.19      0.33      0.24      1829



# Try with K-Nearest Neighbours (Optional)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 3-nearest-neighbour classifier with distance-weighted votes, fitted on the
# fused training vectors (fit() returns the estimator itself).
knn = KNeighborsClassifier(n_neighbors=3, weights='distance').fit(
    X_train_vecotrs_reshaped,
    y_actual_train,
)

# Predictions on the data it was fitted on, and on the held-out test split.
y_train_pred = knn.predict(X_train_vecotrs_reshaped)
KNN_predict = knn.predict(X_test_vecotrs_reshaped)


In [None]:
from sklearn.metrics import accuracy_score

# KNN_predict was computed on the test split, so the score is labelled as test
# accuracy.
accuracy = accuracy_score(y_test, KNN_predict)
print("Accuracy of KNN on test data:", accuracy)
print(y_test)
print(KNN_predict)

Accuracy of KNN on validation data: 0.7993439037725533
[1 1 2 ... 2 1 1]
[0 0 2 ... 2 1 1]


In [None]:
from sklearn.metrics import classification_report, roc_auc_score

# Per-class precision/recall/F1 of the KNN predictions on the test split.
y_test = np.array(y_test)
predicted_classes = np.array(KNN_predict)
print(predicted_classes)

print("Classification Report:")
print(classification_report(y_test, predicted_classes))


[0 0 2 ... 2 1 1]
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.65      0.71       610
           1       0.71      0.88      0.78       610
           2       0.94      0.87      0.90       609

    accuracy                           0.80      1829
   macro avg       0.81      0.80      0.80      1829
weighted avg       0.81      0.80      0.80      1829



### Test manually with a user-supplied comment

In [None]:
import regex as re

user_comment = input("Enter a comment: ")

# \p{So} (Unicode category "Symbol, other") is used to pick out emoji; the
# \p{...} syntax needs the third-party `regex` module, not the stdlib `re`.
emoji_pattern = re.compile(r'\p{So}')

emojis = emoji_pattern.findall(user_comment)
text_without_emojis = emoji_pattern.sub('', user_comment)

# Embed the text with the emoji removed: the emoji gets its own vector below,
# so it should not also leak into the text embedding.
if text_without_emojis.strip():
    user_text_vector = get_sentence_vector(text_without_emojis)
else:
    # Emoji-only input: there are no words to average.
    user_text_vector = np.zeros(ft.get_dimension())

# Only the first emoji is used. A comment without any emoji gets a zero emoji
# vector instead of raising IndexError.
if emojis:
    user_emoji_vector = get_emoji_vector(emojis[0])
else:
    user_emoji_vector = np.zeros_like(user_text_vector)

# Fuse the two vectors as in training (element-wise mean), shaped as one sample.
user_comment_vector = np.mean([user_text_vector, user_emoji_vector], axis=0)
user_comment_vector = user_comment_vector.reshape(1, -1)

prediction = knn.predict(user_comment_vector)

print(f"Predicted Class: {prediction}")

Enter a comment: සංහිදියාවට අවුලක් වෙයි ද😂
Predicted Class: [0]
