In [None]:
import warnings
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
import torch
from transformers import logging
logging.set_verbosity_error()

warnings.filterwarnings("ignore")


class hate_Classifier():
    """Three-way tweet classifier built on a fine-tuned ``bert-base-uncased``.

    The possible predictions are the keys of ``label_dict``:
    ``'hate_speech'``, ``'offensive_language'`` and ``'neither'``.
    """

    def __init__(self, model_path: str = './model/finetuned_BERT.model'):
        """Load the tokenizer, build the model and restore fine-tuned weights.

        Args:
            model_path: Path of the saved ``state_dict`` to load into the
                sequence-classification model.
        """
        # load the tokenizer
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
        self.label_dict = {'neither': 2, 'offensive_language': 1, 'hate_speech': 0}
        # initialize the model with one output per label
        self.model = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased',
            num_labels=len(self.label_dict),
            output_attentions=False,
            output_hidden_states=False,
        )
        # keep the device on the instance so predict() can place inputs on it
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # restore the fine-tuned weights
        # NOTE: torch.load unpickles the file; only load checkpoints you trust.
        self.model.load_state_dict(torch.load(model_path, map_location=self.device))
        self.model.to(self.device)
        # inference only: switch to evaluation mode
        self.model.eval()

    def get_encoding(self, text: str):
        """Tokenize ``text`` and return the tokenizer's batch encoding.

        Args:
            text: A single string (or an iterable of strings).

        Returns:
            The encoding produced by ``batch_encode_plus``; every sequence is
            padded or truncated to 256 tokens.
        """
        # batch_encode_plus expects a batch; a bare string would be iterated
        # character by character, so wrap it in a one-element list.
        batch = [text] if isinstance(text, str) else list(text)
        encoded = self.tokenizer.batch_encode_plus(
            batch,
            add_special_tokens=True,
            return_attention_mask=True,
            padding='max_length',  # replaces the deprecated pad_to_max_length
            truncation=True,
            max_length=256,
        )
        return encoded

    def predict(self, text: str):
        """Classify one text.

        Args:
            text: The text to classify.

        Returns:
            The predicted class name (a key of ``label_dict``).
        """
        # convert the text to encoding
        encoded = self.get_encoding(text)
        # structure the input in the format the model expects
        model_inputs = {
            'input_ids': torch.tensor(encoded["input_ids"], dtype=torch.long).to(self.device),
            'attention_mask': torch.tensor(encoded["attention_mask"], dtype=torch.long).to(self.device),
        }
        # no gradients are needed for prediction
        with torch.no_grad():
            y = self.model(**model_inputs)

        logits = y['logits']
        return self.map_output(logits)

    def map_output(self, x: torch.Tensor):
        """Map the logits of a single example to its class name.

        Args:
            x: Logits for exactly one example, either 1-D or with a leading
                batch dimension of size one.

        Returns:
            The key of ``label_dict`` whose id equals the arg-max of ``x``,
            or ``None`` if no label has that id.

        Raises:
            ValueError: If ``x`` holds logits for more than one example.
        """
        # argmax yields the class index; torch.max(x, axis=1) would yield a
        # (values, indices) pair that never equals a label id.
        predicted = torch.argmax(x, dim=-1).reshape(-1)
        if predicted.numel() != 1:
            raise ValueError(
                "map_output expects logits for exactly one example, got %d" % predicted.numel()
            )
        class_id = int(predicted.item())
        id_to_label = {value: key for key, value in self.label_dict.items()}
        return id_to_label.get(class_id)

    def predict_batch(self, texts: list, verbose: bool = False) -> list:
        """Classify a list of ``{'id': ..., 'text': ...}`` records.

        Args:
            texts: Records with an ``'id'`` and a ``'text'`` entry.
            verbose: When true, print each record and its prediction.

        Returns:
            One ``{'id': ..., 'result': ...}`` dict per input record, in order.
        """
        result = []
        for record in texts:
            prediction = {'id': record['id'], 'result': self.predict(record['text'])}
            if verbose:
                print("Text:", record)
                print(prediction)
            result.append(prediction)
        return result

In [None]:
# Instantiate the classifier; the constructor loads the tokenizer and the
# fine-tuned weights, so this cell needs the checkpoint file to be present.
model = hate_Classifier()

In [1]:
import pandas as pd 
import numpy as np
import re

In [2]:
# Location of the tweet dataset CSV, relative to the working directory.
dataset_path = './hate_speech_dataset.csv'

In [3]:
# Load the dataset, using the CSV column named 'Unnamed: 0' as the row index.
df = pd.read_csv(dataset_path,index_col='Unnamed: 0')
# Preview the first rows.
df.head()

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [4]:
def preprocessing(x):
    """Clean a raw tweet.

    Removes @mentions, #hashtags, URLs and every character other than ASCII
    letters, digits, commas, periods, spaces and tabs, then collapses runs of
    whitespace into single spaces.

    Args:
        x: Raw tweet text. ``None`` and float NaN are treated as empty text.

    Returns:
        The cleaned, single-spaced string.
    """
    # Missing values have no .replace(); map them to empty text.
    if x is None or (isinstance(x, float) and x != x):
        return ""
    x = x.replace("\n", " ")
    # Raw string so \w and \S reach the regex engine unchanged. Mentions and
    # hashtags use \w+ so that names containing underscores are removed whole
    # instead of leaving fragments behind.
    pattern = r"(@\w+)|(#\w+)|([^0-9A-Za-z,. \t])|(\w+://\S+)"
    return ' '.join(re.sub(pattern, " ", x).split())

In [5]:
# Store the cleaned tweet in a new 'text' column; 'tweet' keeps the raw text.
df['text'] = df['tweet'].map(preprocessing)

In [10]:
# Binary target: True when the 'hate_speech' value of the row is positive.
df['target'] = df['hate_speech'] > 0

In [13]:
# Share of each target value, as a percentage of all rows.
100*df['target'].value_counts()/len(df)

False    79.853125
True     20.146875
Name: target, dtype: float64

In [None]:
# Inspect the cleaned tweets produced by preprocessing().
df['text']

In [None]:
# Smoke-test the classifier on the row whose index label is 0.
# NOTE(review): `model` must still be the hate_Classifier instance here; a
# later cell rebinds `model` to a Keras model, so run the cells in order.
model.predict(df['text'][0])

## Alternative model: binary classifier on a TensorFlow Hub BERT encoder

In [6]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from sklearn.metrics import classification_report,confusion_matrix

In [7]:
# Pretrained BERT text preprocessor from TF Hub.
bert_preprocess = hub.KerasLayer(
    handle='https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
)
# Pretrained BERT encoder from TF Hub, created with trainable=False.
bert_encoder = hub.KerasLayer(
    handle='https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4',
    trainable=False,
)

In [None]:
# Pipeline: raw string -> BERT preprocessing -> BERT encoder -> sigmoid unit.
query = tf.keras.layers.Input(shape=(), dtype='string', name='input')
preprocessed_text = bert_preprocess(query)
outputs = bert_encoder(preprocessed_text)

# A single sigmoid unit on the encoder's pooled output gives the prediction.
final = tf.keras.layers.Dense(units=1, activation='sigmoid')(outputs['pooled_output'])

model = tf.keras.models.Model(inputs=query, outputs=final)
model.summary()

In [None]:
# Training configuration: loss function, optimizer and tracked metrics.
loss = tf.keras.losses.binary_crossentropy
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
METRICS = [
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(),
    tf.keras.metrics.Recall(),
]

In [None]:
# Attach the loss, optimizer and metrics defined above to the model.
model.compile(
    loss=loss,
    optimizer=optimizer,
    metrics=METRICS,
)