In [None]:
import warnings

from transformers import BertTokenizer
from transformers import BertForSequenceClassification
import torch
from transformers import logging
# Restrict the transformers logger to error-level messages.
logging.set_verbosity_error()

# Suppress every Python warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings("ignore")


class hate_Classifier():
    """BERT sequence classifier over three classes: hate speech, offensive language, neither.

    Wraps the ``bert-base-uncased`` tokenizer and a ``BertForSequenceClassification``
    model with one output per entry in ``label_dict``.

    NOTE: the fine-tuned checkpoint is not loaded (the ``load_state_dict`` line
    in ``__init__`` is commented out), so predictions come from a model whose
    classification head has not been fine-tuned.
    """

    def __init__(self):
        # load the tokenizer
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
        # label name -> class index emitted by the model
        self.label_dict = {'neither': 2, 'offensive_language': 1, 'hate_speech': 0}
        # initialize the model
        self.model = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased',
            num_labels=len(self.label_dict),
            output_attentions=False,
            output_hidden_states=False,
        )
        # this class is inference-only: keep dropout disabled explicitly
        self.model.eval()
        # set the device:
        # load the model:
        #self.model.load_state_dict(torch.load('./model/finetuned_BERT.model', map_location=torch.device(device)))

    def get_encoding(self, text: str):
        """Tokenize ``text`` into fixed-length model inputs.

        Args:
            text: a single string, or an iterable of strings.

        Returns:
            The tokenizer's batch encoding (``input_ids`` and ``attention_mask``),
            one row per input text, each padded or truncated to 256 tokens.
        """
        # batch_encode_plus expects a sequence of texts; a bare string would be
        # iterated character by character, so wrap it in a one-element batch.
        texts = [text] if isinstance(text, str) else list(text)
        return self.tokenizer.batch_encode_plus(
            texts,
            add_special_tokens=True,
            return_attention_mask=True,
            padding='max_length',  # replaces the deprecated pad_to_max_length=True
            truncation=True,       # keep every row at exactly max_length tokens
            max_length=256,
        )

    def predict(self, text: str):
        """Classify ``text`` and return its label name.

        Args:
            text: the string to classify.

        Returns:
            The key of ``label_dict`` whose index has the highest logit
            (see ``map_output`` for the multi-row case).
        """
        # convert the text to encoding
        encoded = self.get_encoding(text)
        # structure the input in the format the model expects
        model_inputs = {
            'input_ids': torch.tensor(encoded["input_ids"], dtype=torch.long),
            'attention_mask': torch.tensor(encoded["attention_mask"], dtype=torch.long),
        }
        # no gradients are needed for inference
        with torch.no_grad():
            y = self.model(**model_inputs)

        logits = y['logits']
        return self.map_output(logits)

    def map_output(self, x: torch.Tensor):
        """Map logits to label names.

        Args:
            x: logits with the class scores along the last dimension.

        Returns:
            A single label name when ``x`` holds one row, otherwise a list with
            one label name per row. An index missing from ``label_dict`` maps
            to ``None``.
        """
        index_to_label = {index: label for label, index in self.label_dict.items()}
        # argmax yields class indices; torch.max(x, dim) would return a
        # (values, indices) pair that never compares equal to a label index.
        predicted = torch.argmax(x, dim=-1).reshape(-1).tolist()
        labels = [index_to_label.get(index) for index in predicted]
        return labels[0] if len(labels) == 1 else labels

    def predict_batch(self, texts: list) -> list:
        """Classify several records.

        Args:
            texts: iterable of dicts, each with an ``'id'`` and a ``'text'`` key.

        Returns:
            A list of ``{'id': ..., 'result': <label name>}`` dicts in input order.
        """
        return [
            {'id': item['id'], 'result': self.predict(item['text'])}
            for item in texts
        ]

In [None]:
# Build the classifier; the constructor loads the 'bert-base-uncased' tokenizer and model via from_pretrained.
model = hate_Classifier()

In [None]:
import pandas as pd 
import numpy as np
import re

In [None]:
# CSV file to load; the path is relative to the notebook's working directory.
dataset_path = './hate_speech_dataset.csv'

In [None]:
# Read the CSV, using its 'Unnamed: 0' column as the DataFrame index.
df = pd.read_csv(dataset_path,index_col='Unnamed: 0')
# Preview the first rows.
df.head()

In [None]:
# Spans replaced by a space, tried left to right at each position:
# @mentions, #hashtags, any character other than ASCII letters/digits/comma/
# period/space/tab, and scheme://... URLs. Written as a raw string so the
# backslash escapes reach the regex engine without invalid-escape warnings.
_TWEET_NOISE_RE = re.compile(
    r"(@[A-Za-z0-9]+)|(#[A-Za-z0-9]+)|([^0-9A-Za-z,. \t])|(\w+:\/\/\S+)"
)


def preprocessing(x):
    """Strip tweet noise from ``x`` and collapse whitespace.

    Newlines become spaces; mentions, hashtags, URLs and every character other
    than ASCII letters, digits, commas and periods are replaced by a space;
    runs of whitespace are then collapsed to single spaces and the ends trimmed.

    Mention and hashtag patterns cover ASCII letters and digits only, so for
    ``@user_name`` only ``@user`` and the underscore are removed.

    Args:
        x: the raw text (a ``str``).

    Returns:
        The cleaned text.
    """
    x = x.replace("\n", " ")
    return ' '.join(_TWEET_NOISE_RE.sub(" ", x).split())

In [None]:
# Add a 'text' column holding the preprocessed version of each entry in 'tweet'.
df['text'] = df['tweet'].map(preprocessing)

In [None]:
# Display the cleaned column to inspect the preprocessing result.
df['text']

In [None]:
# Classify the first cleaned tweet. Select it by position: the index comes from
# the CSV's 'Unnamed: 0' column, so a row labelled 0 is not guaranteed to exist.
model.predict(df['text'].iloc[0])