# **Data preprocessing for Hate speech detection**

**Citation**
@inproceedings{chung-etal-2019-conan,
    title = "{CONAN} - {CO}unter {NA}rratives through Nichesourcing: a Multilingual Dataset of Responses to Fight 
    #Online Hate Speech",
    author = "Chung, Yi-Ling and Kuzmenko, Elizaveta and Tekiroglu, Serra Sinem and Guerini, Marco",
    booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics",
    month = jul,
    year = "2019",
    address = "Florence, Italy",
    publisher = "Association for Computational Linguistics",
    url = "https://www.aclweb.org/anthology/P19-1271",
    doi = "10.18653/v1/P19-1271",
    pages = "2819--2829"
}

In [3]:
!pip install nltk
!pip install keras
!pip install tensorflow
!pip install tensorflow_hub
!pip install transformers

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hCollecting click (from nltk)
  Obtaining dependency information for click from https://files.pythonhosted.org/packages/00/2e/d53fa4befbf2cfa713304affc7ca780ce4fc1fd8710527771b58311a3229/click-8.1.7-py3-none-any.whl.metadata
  Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting regex>=2021.8.3 (from nltk)
  Obtaining dependency information for regex>=2021.8.3 from https://files.pythonhosted.org/packages/8f/3e/4b8b40eb3c80aeaf360f0361d956d129bb3d23b2a3ecbe3a04a8f3bdd6d3/regex-2023.10.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading regex-2023.10.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import log_loss


##### Data visualization

In [92]:
#importing and reading the dataset
read_data = pd.read_csv('/workspaces/CONAN/CONAN/CONAN.csv')
df = read_data.copy()
df.head()

Unnamed: 0,cn_id,hateSpeech,counterSpeech,hsType,hsSubType,cnType,age,gender,educationLevel
0,ENT1ST0001HS0033CN000021,"According to a recent OFSTED report, a school ...","To be fair, the OFSTED report is more concerne...",Islamophobia,crimes,facts,61.0,male,Bachelor
1,ENT1ST0001HS0033CN000021P1,In Birmingham there is a school where girls an...,"To be fair, the OFSTED report is more concerne...",Islamophobia,crimes,facts,61.0,male,Bachelor
2,ENT1ST0001HS0033CN000021P2,A school in Birningham is still separating gir...,"To be fair, the OFSTED report is more concerne...",Islamophobia,crimes,facts,61.0,male,Bachelor
3,ENT1ST0001HS0033CN000022,"According to a recent OFSTED report, a school ...",While this story is presented as an 'Islamic i...,Islamophobia,crimes,facts,61.0,male,Bachelor
4,ENT1ST0001HS0033CN000022P1,In Birmingham there is a school where girls an...,While this story is presented as an 'Islamic i...,Islamophobia,crimes,facts,61.0,male,Bachelor


In [93]:
#print distinct values in column'hsType'
print(df['hsType'].unique())
#print all columns in the dataset
print(df.columns)


['Islamophobia']
Index(['cn_id', 'hateSpeech', 'counterSpeech', 'hsType', 'hsSubType', 'cnType',
       'age', 'gender', 'educationLevel'],
      dtype='object')


The dataset Conan.csv only contains islamophobia related hate speech data and it's counterSpeech

In [94]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14988 entries, 0 to 14987
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   cn_id           14988 non-null  object 
 1   hateSpeech      14988 non-null  object 
 2   counterSpeech   14988 non-null  object 
 3   hsType          14988 non-null  object 
 4   hsSubType       14988 non-null  object 
 5   cnType          14988 non-null  object 
 6   age             12207 non-null  float64
 7   gender          12207 non-null  object 
 8   educationLevel  12207 non-null  object 
dtypes: float64(1), object(8)
memory usage: 1.0+ MB
None


##### Data Tokenizing

In [99]:
from transformers import BertTokenizer

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the text data
df['text_tokens'] = df['cn_id'].apply(lambda text: tokenizer.encode(text, add_special_tokens=True, truncation=True, max_length=512))
print(df['text_tokens'].head())

0    [101, 4372, 2102, 2487, 3367, 8889, 24096, 789...
1    [101, 4372, 2102, 2487, 3367, 8889, 24096, 789...
2    [101, 4372, 2102, 2487, 3367, 8889, 24096, 789...
3    [101, 4372, 2102, 2487, 3367, 8889, 24096, 789...
4    [101, 4372, 2102, 2487, 3367, 8889, 24096, 789...
Name: text_tokens, dtype: object


##### Data Padding and Truncating

In [100]:
from keras.preprocessing.sequence import pad_sequences
import torch
max_seq_length = 128
# Convert the 'text_tokens' column from a list of integers to PyTorch tensors
df['text_tokens'] = df['text_tokens'].apply(lambda x: torch.tensor(x))
# Ensure all sequences are of the same length
df['text_tokens'] = pad_sequences(df['text_tokens'], maxlen=max_seq_length, dtype="long", value=0, truncating="post", padding="post")
df['text_tokens']

# Set the maximum sequence length

# Pad or truncate the sequences
#df['text_tokens'] = pad_sequences(df['text_tokens'], maxlen=max_seq_length, dtype="long", value=0, truncating="post", padding="post")
# print some tokenized sentences
#df['text_tokens'].head()

0        101
1        101
2        101
3        101
4        101
        ... 
14983    101
14984    101
14985    101
14986    101
14987    101
Name: text_tokens, Length: 14988, dtype: int64

##### Hate speech encoding

In [101]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df['hateSpeech'] = label_encoder.fit_transform(df['hateSpeech'])


##### data split

In [102]:
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)


In [103]:
#print a snippet of thetraining dataset
train_df.head()


Unnamed: 0,cn_id,hateSpeech,counterSpeech,hsType,hsSubType,cnType,age,gender,educationLevel,text_tokens
14472,ITT1ST0019HS0010CN000943,173,"Anche gli Italiani violentano, che si fa, ci c...",Islamophobia,rapism,hypocrisy,37.0,male,High school,101
820,ENT1ST0008HS0053CN000445P1,613,Define western society? Who chooses this?,Islamophobia,culture,question,25.0,male,Master,101
1643,ENT1ST0012HS0032CN000548P2,244,"Many Muslims aren't homophobic and sexist, but...",Islamophobia,"culture , women",facts,21.0,female,Master,101
7530,FRT1ST0014HS0038CN000996P2,354,Avez vous étudiez l'islam?,Islamophobia,generic,question,34.0,male,Bachelor,101
11851,ITT1ST0013HS0013CN000287T1,729,I do not think there is anything that can conf...,Islamophobia,economics,denouncing,,,,101


##### Data Loader and Batch Processing:

In [104]:
import torch
from torch.utils.data import DataLoader, TensorDataset
batch_size = 32

train_data = TensorDataset(torch.tensor(train_df['text_tokens'].tolist()), torch.tensor(train_df['hateSpeech'].tolist()))
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)


In [105]:
# print input_ids of the first batch
data = next(iter(train_loader))
print(data[0])


tensor([101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
        101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
        101, 101, 101, 101])


##### Fine-Tuning BERT Model

In [108]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Define lists to store training and validation metrics
train_losses = []
train_accuracies = []
val_losses = []
val_accuracies = []

# Fine-tune the model
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        input_ids, labels = batch
        optimizer.zero_grad()

        outputs = model(input_ids)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Calculate training accuracy
    train_accuracy = accuracy_score(true_labels, predicted_labels)
    train_losses.append(total_loss / len(train_loader))
    train_accuracies.append(train_accuracy)

    print(f"Epoch {epoch+1}, Training Loss: {train_losses[-1]:.4f}, Training Accuracy: {train_accuracy:.4f}")

    # Validation
    model.eval()
    val_true_labels = []
    val_predicted_labels = []
    val_total_loss = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids, labels = batch
            outputs = model(input_ids)
            loss = outputs.loss

            _, predicted = torch.max(outputs.logits, 1)

            labels = labels.cpu().numpy()
            predicted = predicted.cpu().numpy()

            val_true_labels.extend(labels)
            val_predicted_labels.extend(predicted)
            val_total_loss += loss.item()

    # Calculate validation accuracy
    val_accuracy = accuracy_score(val_true_labels, val_predicted_labels)
    val_losses.append(val_total_loss / len(val_loader))
    val_accuracies.append(val_accuracy)

    print(f"Validation Loss: {val_losses[-1]:.4f}, Validation Accuracy: {val_accuracy:.4f}")

# Save the trained model
model.save_pretrained("path_to_save_model")


IndexError: too many indices for tensor of dimension 1

##### Model Evaluation:

In [None]:

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score



# Lists to store the true labels and predicted labels
true_labels = []
predicted_labels = []

model.eval()

val_data = TensorDataset(torch.tensor(val_df['text_tokens'].tolist()), torch.tensor(val_df['hateSpeech'].tolist()))
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)


with torch.no_grad():
    for batch in val_loader:
        input_ids, labels = batch
        outputs = model(input_ids)

        # Get predicted labels
        _, predicted = torch.max(outputs.logits, 1)

        # Convert tensors to numpy arrays
        labels = labels.cpu().numpy()
        predicted = predicted.cpu().numpy()

        # Append true and predicted labels to the lists
        true_labels.extend(labels)
        predicted_labels.extend(predicted)

# Calculate evaluation metrics
accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)
roc_auc = roc_auc_score(true_labels, predicted_labels)

# Print the evaluation metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")

