In [163]:
#imports libraries
import numpy as np
import pandas as pd

import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.optimizers.legacy import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from keras.utils import to_categorical
from keras import backend as K
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score


In [164]:
#import data
# df_twitter = pd.read_csv("twitter_hate_speech.csv")
# df_nepali = pd.read_excel("nepali_hate_speech.xlsx")

df = pd.read_csv("merge_data.csv")

In [165]:
df

Unnamed: 0,text,label,category
0,!!! RT @mayasolovely: As a woman you shouldn't...,2,neither
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1,offensive_language
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,1,offensive_language
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,1,offensive_language
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,1,offensive_language
...,...,...,...
25855,hutihara,0,hate_speech
25856,hutihara,0,hate_speech
25857,haija,2,neither
25858,hwau nabhayeko,0,hate_speech


In [63]:
# df_nepali

### Pre Process

In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25860 entries, 0 to 25859
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   text      25860 non-null  object
 1   label     25860 non-null  int64 
 2   category  25860 non-null  object
dtypes: int64(1), object(2)
memory usage: 606.2+ KB


In [166]:
df.shape
df.head(5)

Unnamed: 0,text,label,category
0,!!! RT @mayasolovely: As a woman you shouldn't...,2,neither
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1,offensive_language
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,1,offensive_language
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,1,offensive_language
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,1,offensive_language


In [169]:
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    
    # Removing special characters, URLs, and extra whitespaces
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^A-Za-z0-9]+', ' ', text)
    # Tokenization
    text = text.split()
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    text = [word for word in text if word not in stop_words]
    
    # Stemming
    stemmer = SnowballStemmer('english')
    text = [stemmer.stem(word) for word in text]
    return ' '.join(text)

In [170]:
df['text'] = df['text'].apply(preprocess_text)

In [171]:
df['text']

0        rt mayasolov woman complain clean hous amp man...
1        rt mleew17 boy dat cold tyga dwn bad cuffin da...
2        rt urkindofbrand dawg rt 80sbaby4lif ever fuck...
3               rt c g anderson viva base look like tranni
4        rt shenikarobert shit hear might true might fa...
                               ...                        
25855                                             hutihara
25856                                             hutihara
25857                                                haija
25858                                       hwau nabhayeko
25859                                                 fuck
Name: text, Length: 25860, dtype: object

In [172]:
# Label Encoding
# label_encoder = LabelEncoder()
# df['label'] = label_encoder.fit_transform(data['class'])

# Text Vectorization using TF-IDF
# tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')
# X = tfidf_vectorizer.fit_transform(data['text'])

In [173]:
# Feature Selection using chi-squared test
# k_best = SelectKBest(chi2, k=1000)
# X = k_best.fit_transform(X, data['label'])
# X = X.toarray()

In [174]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

In [175]:
# Split the data into training and validation sets
# y = data['label']
# Split the data into training, validation, and test sets
# X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
# X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42)

In [176]:
y_test

18696    1
341      2
1510     1
2201     1
24739    1
        ..
5198     1
2405     1
17462    2
21735    1
24631    1
Name: label, Length: 5172, dtype: int64

In [177]:
# declare the tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# tokenize the train and test dataset
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

vocab_size = len(tokenizer.word_index) + 1

In [178]:
## Padding -> to uniform the datas
max_length = max(len(seq) for seq in X_train)

# to test an outlier case (if one of the test dataset has longer length)
for x in X_test:
    if len(x) > max_length:
        print(f"an outlier detected: {x}")

X_train = pad_sequences(X_train, maxlen = max_length)
X_test = pad_sequences(X_test, maxlen = max_length)

In [179]:
y_test = to_categorical(y_test, num_classes=3)
y_train = to_categorical(y_train, num_classes=3)

In [180]:
X_train

array([[   0,    0,    0, ..., 3378, 9744, 4836],
       [   0,    0,    0, ...,  112, 2215,  323],
       [   0,    0,    0, ...,  899, 1456, 9748],
       ...,
       [   0,    0,    0, ...,  345, 2212, 1362],
       [   0,    0,    0, ...,  214,  307,   18],
       [   0,    0,    0, ...,   27,  707,   91]], dtype=int32)

In [181]:
print(f"num test tweet: {y_test.shape[0]}")
print(f"num train tweet: {y_train.shape[0]}")

num test tweet: 5172
num train tweet: 20688


In [182]:
def recall(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1(y_true, y_pred):
    precisions = precision(y_true, y_pred)
    recalls = recall(y_true, y_pred)
    return 2*((precisions*recalls)/(precisions+recalls+K.epsilon()))

In [85]:
# change dis if u want
output_dim = 200

new_model = Sequential([
    Embedding(vocab_size, output_dim, input_length=max_length),
    # lstm for xxx
    Bidirectional(LSTM(64, dropout=0.3, recurrent_dropout=0.3)),
    # dropout to prevent overfitting
    Dropout(0.5),
    # dense to connect the previous output with current layer
    Dense(128, activation="relu"),
    # dropout to prevent overfitting
    Dropout(0.5),
    # this is output layer, with 3 class (0, 1, 2)
    Dense(3, activation="softmax"),
])

new_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy',f1,precision, recall])

In [86]:
new_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 85, 200)           5063200   
                                                                 
 bidirectional_2 (Bidirecti  (None, 128)               135680    
 onal)                                                           
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 128)               16512     
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 3)                 387       
                                                      

In [88]:
# Train the model
model_history = new_model.fit(
    X_train,
    y_train,
    batch_size = 64,
    epochs=1,
    validation_data=(X_test, y_test)
)



In [None]:
# model_history

In [17]:
## Padding -> to uniform the datas
# max_length = max(seq.shape[0] for seq in X_train)

In [18]:
# Define the BiLSTM Model
# model = Sequential()
# model.add(Embedding(X_train.shape[0], output_dim=200, input_length=max_length))
# model.add(Bidirectional(LSTM(64, return_sequences=True)))
# model.add(Bidirectional(LSTM(64)))
# model.add(Dense(1, activation='sigmoid'))

In [19]:
# Compile the Model
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

In [20]:
# Define callbacks for optimization
checkpoint = ModelCheckpoint("best_model.h5", save_best_only=True, monitor='val_loss', mode='min')
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=1e-6)

In [21]:
#  Training
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=1, batch_size=64, callbacks=[checkpoint, early_stopping, reduce_lr])

Epoch 1/10
Epoch 2/10


  saving_api.save_model(


Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x2a0bff790>

In [22]:
model.save('my_model')

INFO:tensorflow:Assets written to: my_model/assets


INFO:tensorflow:Assets written to: my_model/assets


In [41]:
X_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [23]:
# Evaluation (using the X_test and y_test from previous split)
y_pred = model.predict(X_test)
y_pred_binary = (y_pred > 0.5).astype(int)



In [24]:
y_pred_binary

array([[1],
       [1],
       [1],
       ...,
       [1],
       [1],
       [1]])

### Optimizing

In [92]:
from kerastuner.tuners import RandomSearch

  from kerastuner.tuners import RandomSearch


In [90]:
# Define the hypermodel
def build_model(hp):
    model = Sequential()
    model.add(Embedding(vocab_size, hp.Int('embedding_dim', min_value=50, max_value=300, step=50), input_length=max_length))
    model.add(Bidirectional(LSTM(hp.Int('lstm_units', min_value=32, max_value=128, step=32), dropout=hp.Float('lstm_dropout', min_value=0.2, max_value=0.5, step=0.1), recurrent_dropout=hp.Float('recurrent_dropout', min_value=0.2, max_value=0.5, step=0.1))))
    model.add(Dropout(hp.Float('dropout_1', min_value=0.2, max_value=0.5, step=0.1)))
    model.add(Dense(hp.Int('dense_units', min_value=64, max_value=256, step=32), activation="relu"))
    model.add(Dropout(hp.Float('dropout_2', min_value=0.2, max_value=0.5, step=0.1)))
    model.add(Dense(3, activation="softmax"))

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy', f1, precision, recall])
    
    return model

In [94]:
# Instantiate the tuner
tuner = RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=5, 
    directory='my_dir',
    project_name='hate_speech'
)

In [96]:
# Search for the best hyperparameter configuration
tuner.search(X_train, y_train, epochs=1, validation_data=(X_test, y_test))

Trial 5 Complete [00h 01m 19s]
val_accuracy: 0.8837973475456238

Best val_accuracy So Far: 0.895591676235199
Total elapsed time: 00h 14m 13s


In [97]:
# Get the best model
best_model = tuner.get_best_models(num_models=1)[0]

In [98]:
# Train the best model
best_model_history = best_model.fit(X_train, y_train, epochs=1, validation_data=(X_test, y_test))



###  GRU

In [100]:
from tensorflow.keras.layers import GRU

# Define the hypermodel for GRU
def build_gru_model(hp):
    model = Sequential()
    model.add(Embedding(vocab_size, hp.Int('embedding_dim', min_value=50, max_value=300, step=50), input_length=max_length))
    model.add(Bidirectional(GRU(hp.Int('gru_units', min_value=32, max_value=128, step=32), dropout=hp.Float('gru_dropout', min_value=0.2, max_value=0.5, step=0.1), recurrent_dropout=hp.Float('recurrent_dropout', min_value=0.2, max_value=0.5, step=0.1))))
    model.add(Dropout(hp.Float('dropout_1', min_value=0.2, max_value=0.5, step=0.1)))
    model.add(Dense(hp.Int('dense_units', min_value=64, max_value=256, step=32), activation="relu"))
    model.add(Dropout(hp.Float('dropout_2', min_value=0.2, max_value=0.5, step=0.1)))
    model.add(Dense(3, activation="softmax"))

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy', f1, precision, recall])
    
    return model

# Instantiate the tuner for GRU
gru_tuner = RandomSearch(
    build_gru_model,
    objective='val_accuracy',
    max_trials=5,
    directory='my_gru_dir',
    project_name='hate_speech_gru'
)

# Search for the best hyperparameter configuration for GRU
gru_tuner.search(X_train, y_train, epochs=1, validation_data=(X_test, y_test))

# Get the best GRU model
best_gru_model = gru_tuner.get_best_models(num_models=1)[0]

# Train the best GRU model
best_gru_model_history = best_gru_model.fit(X_train, y_train, epochs=1, validation_data=(X_test, y_test))

Trial 5 Complete [00h 02m 47s]
val_accuracy: 0.8934648036956787

Best val_accuracy So Far: 0.8934648036956787
Total elapsed time: 00h 07m 27s


In [150]:
def predict_category(text, model, max_length):
    # Preprocess the input text
    preprocessed_text = preprocess_text(text)

    # Tokenize and pad the sequence
    # declare the tokenizer
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(text)

    # tokenize the train and test dataset
    X_input = tokenizer.texts_to_sequences(text)
    X_input = pad_sequences(X_input, maxlen=max_length)

    # Make predictions using the trained model
    prediction = model.predict(X_input)[0]

    #Convert the prediction to a category label
    predicted_label = np.argmax(prediction)

    # Map the category label to its original class name
#     predicted_category = category_mapping[predicted_label]

    return predicted_label

In [162]:
# Example usage:
input_text = "Black people not allowed"
predicted_category = predict_category(input_text, best_model, max_length)

print("Predicted Category:", predicted_category)

Predicted Category: 1
