In [None]:
!pip install pandas scikit-learn matplotlib tensorflow





**Import the necessary libraries**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional


**Dataset**

In [None]:
from google.colab import files

uploaded = files.upload()

Saving speech.csv to speech.csv


In [None]:
# Load the dataset
data = pd.read_csv('speech.csv')

# Preprocessing
data['tweet'] = data['tweet'].str.lower()



In [None]:
class_mapping = {
    0: 'Offensive',
    1: 'Neither',
    2: 'Hate Speech'
}

# Map the 'class' column to descriptive labels
data['class'] = data['class'].map(class_mapping)

In [None]:

print(data.head())


   Unnamed: 0  count  hate_speech  offensive_language  neither        class  \
0           0      3            0                   0        3  Hate Speech   
1           1      3            0                   3        0      Neither   
2           2      3            0                   3        0      Neither   
3           3      3            0                   2        1      Neither   
4           4      6            0                   6        0      Neither   

                                               tweet  
0  !!! rt @mayasolovely: as a woman you shouldn't...  
1  !!!!! rt @mleew17: boy dats cold...tyga dwn ba...  
2  !!!!!!! rt @urkindofbrand dawg!!!! rt @80sbaby...  
3  !!!!!!!!! rt @c_g_anderson: @viva_based she lo...  
4  !!!!!!!!!!!!! rt @shenikaroberts: the shit you...  


In [None]:
import pickle

In [None]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['tweet'])
X = tokenizer.texts_to_sequences(data['tweet'])
X = pad_sequences(X)


# Encode the labels
label_encoder = LabelEncoder()
data['encoded_label'] = label_encoder.fit_transform(data['class'])

# Set the encoded labels as y
y = data['encoded_label']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
y_train = np.array(y_train)
y_test = np.array(y_test)

with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Save the label encoder
with open('label_encoder.pickle', 'wb') as handle:
    pickle.dump(label_encoder, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [None]:
embedding_dim = 100

model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=X.shape[1]))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 87, 100)           3650900   
                                                                 
 bidirectional (Bidirection  (None, 128)               84480     
 al)                                                             
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 3735509 (14.25 MB)
Trainable params: 3735509 (14.25 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


**Model Training**

In [None]:
model.fit(X_train, y_train, validation_split=0.1, epochs=5, batch_size=64)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
 64/279 [=====>........................] - ETA: 46s - loss: -4.9499 - accuracy: 0.9304

**Model Evaluation**

In [None]:
y_pred = model.predict(X_test)
y_pred_binary = [1 if pred > 0.5 else 0 for pred in y_pred]

print("Accuracy:", accuracy_score(y_test, y_pred_binary))
print("Classification Report:\n", classification_report(y_test, y_pred_binary, target_names=label_encoder.classes_))


Accuracy: 0.8380068589872907
Classification Report:
               precision    recall  f1-score   support

 Hate Speech       0.83      0.48      0.61       835
     Neither       0.84      0.98      0.90      3832
   Offensive       0.00      0.00      0.00       290

    accuracy                           0.84      4957
   macro avg       0.56      0.49      0.50      4957
weighted avg       0.79      0.84      0.80      4957



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


** Save the model as a HDF5 file**

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.optimizers import Adam


# Save the model to an HDF5 file
model.save('hate_speech_detection_model.h5')


  saving_api.save_model(


In [None]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model

# Load the model and tokenizer
model = load_model('hate_speech_detection_model.h5')  # Load your trained model

with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

# Load the label encoder
with open('label_encoder.pickle', 'rb') as handle:
    label_encoder = pickle.load(handle)

def detect_hate_speech(text, hate_speech_range=(0.0, 0.48), offensive_range=(0.48, 0.99)):
    # Preprocess the input text
    text = text.lower()
    text_seq = tokenizer.texts_to_sequences([text])
    text_seq = pad_sequences(text_seq, maxlen=87)

    # Predict using the trained model
    prediction = model.predict(text_seq)
    predicted_prob = prediction[0][0]  # Probability of hate speech

    # Classify based on the predicted probability
    if hate_speech_range[0] <= predicted_prob <= hate_speech_range[1]:
        prediction_label = "Non Hate Speech"
    elif offensive_range[0] <= predicted_prob <= offensive_range[1]:
        prediction_label = "Offensive"
    else:
        prediction_label = "Hate Speech"

    return prediction_label

# Example usage
user_input = input("Enter a text: ")
prediction = detect_hate_speech(user_input)

print("Prediction:", prediction)








Enter a text: You suck loser, you're gonna die alone!!
Prediction: Hate Speech
