<a href="https://colab.research.google.com/github/rakeshpuppala2590/Hate_speech_detection/blob/main/Hate_Speech_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install torch transformers scikit-learn pandas




In [None]:
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import os

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
MAX_LEN = 128
BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 5e-5

In [None]:
!kaggle datasets download -d waalbannyantudre/hate-speech-detection-curated-dataset --unzip

Dataset URL: https://www.kaggle.com/datasets/waalbannyantudre/hate-speech-detection-curated-dataset
License(s): Attribution 4.0 International (CC BY 4.0)
Downloading hate-speech-detection-curated-dataset.zip to /content
 78% 89.0M/114M [00:00<00:00, 98.8MB/s]
100% 114M/114M [00:01<00:00, 113MB/s]  


In [None]:
data_path = "HateSpeechDataset.csv"
df = pd.read_csv(data_path)

In [None]:
df.head()

Unnamed: 0,Content,Label,Content_int
0,denial of normal the con be asked to comment o...,1,"[146715, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,..."
1,just by being able to tweet this insufferable ...,1,"[146715, 14, 15, 16, 17, 7, 18, 19, 20, 21, 22..."
2,that is retarded you too cute to be single tha...,1,"[146715, 28, 29, 30, 26, 31, 32, 7, 5, 33, 28,..."
3,thought of a real badass mongol style declarat...,1,"[146715, 35, 1, 24, 36, 37, 38, 39, 40, 1, 41,..."
4,afro american basho,1,"[146715, 46, 47, 48, 146714]"


In [None]:
print(df['Label'].unique())

['1' '0' 'Label']


In [None]:
df = df[df['Label'].isin(['0', '1'])]

In [None]:
df = df[['Content', 'Label']].dropna()
print(df.head())

                                             Content Label
0  denial of normal the con be asked to comment o...     1
1  just by being able to tweet this insufferable ...     1
2  that is retarded you too cute to be single tha...     1
3  thought of a real badass mongol style declarat...     1
4                                afro american basho     1


In [None]:
df.dtypes

Unnamed: 0,0
Content,object
Label,object


In [None]:
df['Label'] = df['Label'].astype(int)

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [None]:
class HateSpeechDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [None]:
texts = df['Content'].tolist()
labels = df['Label'].tolist()
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)


In [None]:
train_dataset = HateSpeechDataset(train_texts, train_labels, tokenizer, MAX_LEN)
val_dataset = HateSpeechDataset(val_texts, val_labels, tokenizer, MAX_LEN)

In [None]:
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)




In [None]:
def train_model(model, train_loader, val_loader, optimizer, epochs, device):
    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for batch in train_loader:
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            optimizer.step()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {avg_loss:.4f}")

        evaluate_model(model, val_loader, device)

In [None]:
def evaluate_model(model, val_loader, device):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    print(classification_report(all_labels, all_preds))

In [None]:
train_model(model, train_loader, val_loader, optimizer, EPOCHS, device)

KeyboardInterrupt: 

In [None]:
model_save_path = "bert_hate_speech_model.pt"
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")


Model saved to bert_hate_speech_model.pt


In [104]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)  # Match num_labels to your dataset
model.load_state_dict(torch.load('bert_hate_speech_model.pt'))
model = model.to(device)
model.eval()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load('bert_hate_speech_model.pt'))


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [105]:
def test_model(model, text, tokenizer, device, max_len=128):
    encoding = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)

    label_map = {0: "Neutral", 1: "Offensive", 2: "Hate Speech"}
    return label_map[preds.item()]


In [106]:
text = "This is a sample hate speech!"
prediction = test_model(model, text, tokenizer, device)
print(f"Prediction: {prediction}")


Prediction: Neutral


In [107]:
text = "shut your mouth!"
prediction = test_model(model, text, tokenizer, device)
print(f"Prediction: {prediction}")


Prediction: Offensive


In [108]:
text = "can i order coffee please!"
prediction = test_model(model, text, tokenizer, device)
print(f"Prediction: {prediction}")


Prediction: Neutral


In [None]:
!pip install streamlit pyngrok


Collecting streamlit
  Downloading streamlit-1.41.1-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.2-py3-none-any.whl.metadata (8.4 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.41.1-py2.py3-none-any.whl (9.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m72.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyngrok-7.2.2-py3-none-any.whl (22 kB)
Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m77.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64

In [110]:
%%writefile app.py
import streamlit as st
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load the saved model and tokenizer
model_path = 'bert_hate_speech_model.pt'  # Path to your saved model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
model.load_state_dict(torch.load(model_path))  # Load the saved model weights
model.eval()  # Set the model to evaluation mode
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to predict the label for the given text
def predict(text):
    # Preprocess input
    encoding = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128)
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']

    # Make prediction
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=1).item()

    # Map predicted class to label
    label_map = {0: "Neutral", 1: "Offensive", 2: "Hate Speech"}
    return label_map[predicted_class]

# Streamlit UI
st.title("Hate Speech Detection using BERT")
st.write("This app detects hate speech, offensive, and neutral content in text.")

# Text input
user_input = st.text_area("Enter text for prediction:")

# Optionally, run the model prediction on button click
if st.button("Run Prediction"):
    if user_input:
        prediction = predict(user_input)
        st.success(f"Prediction: {prediction}")
    else:
        st.error("Please enter some text to predict.")


Overwriting app.py


In [102]:
from pyngrok import ngrok
from subprocess import Popen
from threading import Thread

# Function to run Streamlit
def run_streamlit():
    Popen(['streamlit', 'run', 'app.py'])

# Start Streamlit in a background thread
thread = Thread(target=run_streamlit)
thread.start()

# Start ngrok tunnel for Streamlit (on port 8501)
public_url = ngrok.connect(8501)
print(f"Streamlit app is live at: {public_url}")


Streamlit app is live at: NgrokTunnel: "https://8a6c-34-19-82-184.ngrok-free.app" -> "http://localhost:8501"


In [None]:
!killall ngrok