In [None]:
!pip install emoji
!pip install langdetect
!pip install contractions
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
# Text cleaning
import re
import string
import emoji
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords

# Data preprocessing
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from langdetect import detect, LangDetectException
import contractions
from nltk.tokenize import word_tokenize
# Fetch the NLTK corpora/models needed for stop-word removal, tokenisation
# and lemmatisation (a no-op when they are already up to date).
for resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource)

# Transformers library for BERT
import transformers
from transformers import BertModel
from transformers import BertTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import classification_report, confusion_matrix

import time

# Set seed for reproducibility (Python, NumPy and PyTorch CPU/GPU generators)
import random
seed_value = 2042
random.seed(seed_value)
np.random.seed(seed_value)
torch.manual_seed(seed_value)
torch.cuda.manual_seed_all(seed_value)

# Set style for plots.
# sns.despine() is deliberately not called here: it acts on the current
# figure, so with nothing plotted yet it would only create an empty figure.
sns.set_style("whitegrid")
try:
    # Matplotlib >= 3.6 ships the seaborn styles under a versioned name.
    plt.style.use("seaborn-v0_8-whitegrid")
except OSError:
    # Older Matplotlib releases only know the unversioned name.
    plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True)
plt.rc("axes", labelweight="bold", labelsize="large", titleweight="bold", titlepad=10)

# Define stop words for text cleaning
stop_words = set(stopwords.words('english'))
batch_size=32
# Initialize lemmatizer for text cleaning
lemmatizer = WordNetLemmatizer()

# Load the profanity word list (one entry per line).
# The context manager guarantees the file handle is closed after reading.
with open('/content/drive/MyDrive/FYP/profanityList.txt', 'r', encoding='utf-8') as profanity_file:
    profanity_txt = profanity_file.read().strip()

# Strip each entry and drop blank lines: an empty entry would turn into a
# pattern that matches at every word boundary, and a stray '\r' from
# Windows line endings would stop an entry from ever matching.
profanity_list = [entry.strip() for entry in profanity_txt.splitlines() if entry.strip()]

def highlight(text, words=None):
    """Wrap every profane word found in ``text`` in ANSI red colour codes.

    Matching is case-insensitive and restricted to whole words/phrases: a
    match may not be directly preceded or followed by a word character.
    The matched text itself is re-emitted, so the original casing of
    ``text`` is preserved.

    Args:
        text: The string to scan.
        words: Optional iterable of words/phrases to highlight. Defaults to
            the module-level ``profanity_list``.

    Returns:
        ``text`` with each match wrapped in ``\\033[91m`` ... ``\\033[0m``.
        Returned unchanged when ``text`` is empty or there is nothing to
        search for.
    """
    if words is None:
        words = profanity_list

    # Drop blank entries (they would match at every word boundary) and try
    # longer entries first so a phrase wins over a shorter word it contains.
    candidates = sorted(
        {word.strip() for word in words if word and word.strip()},
        key=lambda word: (-len(word), word),
    )
    if not text or not candidates:
        return text

    # Lookarounds instead of `\b` so entries that start or end with a
    # non-word character (e.g. "a$$") are still matched as whole words.
    pattern = re.compile(
        r'(?<!\w)(?:{})(?!\w)'.format('|'.join(re.escape(word) for word in candidates)),
        flags=re.IGNORECASE,
    )

    # A callable replacement keeps the matched text verbatim and avoids
    # interpreting backslashes in list entries as regex template escapes.
    # A single pass also means already-inserted colour codes are never
    # re-scanned.
    return pattern.sub(lambda match: '\033[91m{}\033[0m'.format(match.group(0)), text)

# Name of the compute device: the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Lower-casing WordPiece tokenizer for the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

def bert_tokenizer(data, max_length=128):
    """Encode sentences into fixed-length BERT input tensors.

    Every sentence gets the `[CLS]`/`[SEP]` special tokens and is padded or
    truncated to exactly ``max_length`` tokens.

    Args:
        data: An iterable of sentences. A single string is treated as one
            sentence rather than being iterated character by character.
        max_length: Length every sequence is padded/truncated to.

    Returns:
        A tuple ``(input_ids, attention_masks)`` of integer tensors, each of
        shape ``(number_of_sentences, max_length)``.
    """
    if isinstance(data, str):
        data = [data]
    sentences = list(data)

    # Nothing to encode: return correctly-shaped empty tensors instead of
    # handing an empty batch to the tokenizer.
    if not sentences:
        empty_ids = torch.empty((0, max_length), dtype=torch.long)
        return empty_ids, empty_ids.clone()

    encoded = tokenizer(
        sentences,
        add_special_tokens=True,        # Add `[CLS]` and `[SEP]` special tokens
        max_length=max_length,          # Target length for padding/truncation
        padding='max_length',           # Pad every sentence to max_length
        truncation=True,                # Explicitly cut longer sentences
        return_attention_mask=True,     # Return attention mask
        return_tensors='pt',            # Return PyTorch tensors directly
    )

    return encoded['input_ids'], encoded['attention_mask']

import torch.nn as nn
from transformers import BertModel

class Bert_Classifier(nn.Module):
    """BERT encoder followed by a small feed-forward classification head.

    The head is ``Linear -> ReLU -> Linear`` and is applied to the last
    hidden state of the first (`[CLS]`) token.
    """

    def __init__(self, freeze_bert=False, n_hidden=50, n_output=6):
        """Build the encoder and the classification head.

        Args:
            freeze_bert: When True, the BERT weights are excluded from
                gradient updates so only the head is trainable.
            n_hidden: Width of the hidden layer of the head.
            n_output: Number of output logits (classes).
        """
        super().__init__()

        # Instantiate BERT model
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        # Take the head's input width from the encoder configuration rather
        # than hard-coding it, so the two can never disagree.
        n_input = self.bert.config.hidden_size

        # Classification head: fully connected layer, ReLU, fully connected layer
        self.classifier = nn.Sequential(
            nn.Linear(n_input, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, n_output)
        )

        # Freeze the BERT weights if requested (feature extraction without fine-tuning)
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask):
        """Compute class logits for a batch.

        Args:
            input_ids: Token id tensor passed to BERT as ``input_ids``.
            attention_mask: Mask tensor passed to BERT as ``attention_mask``.

        Returns:
            The logits produced by the classification head.
        """
        # Feed input data (input_ids and attention_mask) to BERT
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)

        # First element of the output is the last hidden state; position 0
        # along the sequence axis is the `[CLS]` token.
        last_hidden_state_cls = outputs[0][:, 0, :]

        # Feed the extracted hidden state to the classifier to compute logits
        return self.classifier(last_hidden_state_cls)

# Build the classifier with the BERT weights left trainable (not frozen).
model = Bert_Classifier(freeze_bert=False)

# Location of the saved checkpoint on the mounted Drive.
model_path = '/content/drive/MyDrive/FYP/bertmodel.pth'

# Read the checkpoint onto the CPU and restore the weights stored under its
# 'model_state_dict' key. NOTE: torch.load unpickles the file, so only load
# checkpoints from a trusted source.
model.load_state_dict(
    torch.load(
        model_path,
        map_location=torch.device('cpu'),
    )['model_state_dict']
)

# Map from the model's output index to the human-readable class name.
class_id_to_label = dict(enumerate([
    'age',
    'ethnicity',
    'gender',
    'not_cyberbullying',
    'other_cyberbullying',
    'religion',
]))





Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  plt.style.use("seaborn-whitegrid")


<Figure size 640x480 with 0 Axes>

In [None]:
# Example input text to classify.
sentence="smiteis on X: @XraigFree @hurrica45391237 @InvalidDimensi1 An example: everyone in school growing up during that period had gay, rape, murder, STD, etc jokes in their back pocket because that’s what got laughs. It wasn’t cool then and it isn’t cool now, but that’s how things were. / X (twitter.com)"
# bert_tokenizer iterates over its argument, so the single sentence is
# wrapped in a list to form a one-element batch.
test_inputs, test_masks = bert_tokenizer([sentence])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# Colour the profane words in the sentence for display.
sentenceCheck = highlight(sentence)

# Switch to evaluation mode and run the forward pass without autograd:
# no gradients are needed for inference, so no graph should be recorded.
model.eval()
with torch.no_grad():
    logits = model(test_inputs, test_masks)
    probs = nn.functional.softmax(logits, dim=-1)

# Index of the most probable class, mapped to its label.
predicted_class = torch.argmax(probs, dim=-1)
predicted_label = class_id_to_label[predicted_class.item()]

print(sentenceCheck)
print()
print("Predicted class:", predicted_label)

# Per-class probabilities (as percentages) for the single input sentence.
probabilitys=probs.tolist()[0]
for idx,prob in enumerate(probabilitys):
  print(class_id_to_label[idx]," : ",prob*100)

smiteis on X: @XraigFree @hurrica45391237 @InvalidDimensi1 An example: everyone in school growing up during that period had gay, [91mrape[0m, murder, STD, etc jokes in their back pocket because that’s what got laughs. It wasn’t cool then and it isn’t cool now, but that’s how things were. / X (twitter.com)

Predicted class: gender
age  :  13.765475153923035
ethnicity  :  1.4450113289058208
gender  :  71.86416387557983
not_cyberbullying  :  10.500755906105042
other_cyberbullying  :  1.9156375899910927
religion  :  0.5089579150080681
