In [None]:
# Mount Google Drive at /content/drive so files stored there are reachable
# from this Colab runtime (the notebook later changes into a Drive folder).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
cd drive/My \Drive/NLP

In [None]:
pip install torch~=2.4.0 torch_xla[tpu]~=2.4.0 -f https://storage.googleapis.com/libtpu-releases/index.html

In [None]:
!pip install textattack==0.3.7

In [None]:
!pip install lime

In [None]:
# Load libraries
import nltk
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.optim import AdamW
import torch_xla
import torch_xla.core.xla_model as xm
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
import random
import nltk
from lime.lime_text import LimeTextExplainer
import numpy as np
from lime.lime_text import LimeTextExplainer
import torch
from transformers import DistilBertTokenizer
import random
from nltk.corpus import wordnet
# Fetch the WordNet corpus backing `nltk.corpus.wordnet`; the synonym lookup
# in the adversarial-attack cell queries it.
nltk.download('wordnet')

In [None]:
# Load the dataset
df = pd.read_csv('./Data/KaggleData.csv')

# Clean the tweet text. The order of the steps matters:
#   1. lowercase;
#   2. drop URLs, @mentions and #hashtags while their marker characters
#      (':', '/', '@', '#') are still present -- stripping punctuation first
#      would delete the markers and leave the mention/hashtag words in the text;
#   3. strip the remaining punctuation (this also removes double quotes);
#   4. collapse whitespace runs left behind by the removals and trim the ends.
# Removed spans are replaced by a space so neighbouring words are not glued together.
df['tweet'] = (
    df['tweet']
    .str.lower()
    .str.replace(r'http\S+|www\.\S+|@\w+|#\w+', ' ', regex=True)
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

# Define Dataset class for tokenization and encoding
# Labels: 0 - hate speech, 1 - offensive language, 2 - neither
class TweetDataset(Dataset):
    """Map-style dataset that tokenizes one tweet per item.

    Args:
        tweets: Sequence of tweet texts (list, array or pandas Series).
        labels: Sequence of integer class ids, aligned with ``tweets``.
        tokenizer: Object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
        max_len: Every item is padded/truncated to this many tokens.

    Each item is a dict with:
        ``tweet_text``      the tweet as ``str``
        ``input_ids``       1-D tensor of token ids
        ``attention_mask``  1-D tensor marking real tokens
        ``labels``          0-D ``torch.long`` tensor holding the class id
    """

    def __init__(self, tweets, labels, tokenizer, max_len):
        self.tweets = tweets
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.tweets)

    @staticmethod
    def _item_at(seq, idx):
        """Return the ``idx``-th element of ``seq`` by position.

        ``series[idx]`` on a pandas Series looks ``idx`` up as an index *label*,
        which raises KeyError (or returns the wrong row) whenever the index is
        not 0..n-1, e.g. straight after ``train_test_split``. ``.iloc`` is used
        when available so the dataset does not depend on the caller having
        reset the index.
        """
        positional = getattr(seq, 'iloc', None)
        return seq[idx] if positional is None else positional[idx]

    def __getitem__(self, idx):
        tweet = str(self._item_at(self.tweets, idx))
        label = self._item_at(self.labels, idx)
        encoding = self.tokenizer.encode_plus(
            tweet,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        return {
            'tweet_text': tweet,
            # encode_plus returns tensors with a leading batch axis of size 1;
            # flatten() drops it so the DataLoader can stack items itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Tokenizer and hyper-parameters
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
MAX_LEN = 128
BATCH_SIZE = 32
EPOCHS = 5

# Stratified 70/30 train-test split
X_train, X_test, y_train, y_test = train_test_split(
    df['tweet'],
    df['class'],
    test_size=0.3,
    stratify=df['class'],
    random_state=42,
)

# Renumber every split from 0 so positions and index labels coincide
X_train, y_train, X_test, y_test = (
    split.reset_index(drop=True)
    for split in (X_train, y_train, X_test, y_test)
)

# Datasets and their loaders (only the training loader shuffles)
train_dataset = TweetDataset(X_train, y_train, tokenizer, MAX_LEN)
test_dataset = TweetDataset(X_test, y_test, tokenizer, MAX_LEN)
train_data_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_data_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Model on the XLA device, with one output per distinct training label
device = xm.xla_device()
# For a GPU/CPU runtime use instead:
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(set(y_train)),
).to(device)

# AdamW with a linear decay (no warm-up) across all training steps
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Training loop
# Runs EPOCHS passes over train_data_loader, printing the mean batch loss and
# writing a checkpoint at the end of every epoch.
for epoch in range(EPOCHS):
    model.train()
    epoch_loss = 0
    for batch in train_data_loader:
          # Move the batch to the training device.
          input_ids = batch["input_ids"].to(device)
          attention_mask = batch["attention_mask"].to(device)
          labels = batch["labels"].to(device)
          # Passing `labels` makes the model return the loss alongside the logits.
          outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

          loss = outputs.loss
          loss.backward()
          # Cap the global gradient norm at 1.0 before the parameter update.
          torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

          # Parameter update via the XLA helper, then advance the LR schedule
          # (one scheduler step per batch, matching total_steps).
          xm.optimizer_step(optimizer)
          scheduler.step()
          optimizer.zero_grad()
          xm.mark_step()
          # .item() copies the scalar loss back to the host for logging.
          epoch_loss += loss.item()

    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {epoch_loss / len(train_data_loader)}")

    # Save Model (overwrites the same file after every epoch)
    # NOTE(review): this pickles the whole module object rather than its state_dict,
    # so loading needs the same class definitions importable -- confirm this is intended.
    torch.save(model, './Weights/KaggleDistilBERT.pth')

# Reload the checkpoint written above.
# NOTE(review): torch.load unpickles arbitrary objects; only load files you trust.
model = torch.load('./Weights/KaggleDistilBERT.pth')

# Evaluation
# Collect predictions for the whole test split, then report weighted metrics.
model.eval()
y_pred = []
y_true = []

with torch.no_grad():
    for batch in test_data_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # Labels stay on the host: they are only compared with the predictions
        # afterwards, so there is no reason to ship them to the device or to
        # make the model compute a loss nobody reads.
        labels = batch["labels"]
        outputs = model(input_ids, attention_mask=attention_mask)

        preds = outputs.logits.argmax(dim=1)
        y_pred.extend(preds.cpu().tolist())
        y_true.extend(labels.tolist())
true_labels, predictions = np.asarray(y_true), np.asarray(y_pred)

# Calculate accuracy, precision, recall, F1-score, and confusion matrix
# (precision/recall/F1 are averaged over classes, weighted by class support)
accuracy = accuracy_score(true_labels, predictions)
precision, recall, f1_score, _ = precision_recall_fscore_support(true_labels, predictions, average='weighted')
conf_mat = confusion_matrix(true_labels, predictions)

print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1-score: ", f1_score)
print("Confusion Matrix:\n", conf_mat)

In [None]:
# Build a word -> integer id vocabulary over all tweets.
# Ids are 0-based and assigned in order of first appearance, so the mapping is
# deterministic across runs. Each tweet is split on whitespace; iterating the
# string directly would index single characters instead of words.
word2index = {}
for tweet in df['tweet'].dropna():
    for word in str(tweet).split():
        # len() is evaluated before the insert, so a new word gets the next free id
        word2index.setdefault(word, len(word2index))

import random
import numpy as np
import torch
import nltk
from lime.lime_text import LimeTextExplainer
from transformers import DistilBertTokenizer

# Wrapper for the DistilBERT model
class BERTWrapper:
    """Callable adapter from raw texts to class probabilities.

    LIME calls the object with a list of perturbed texts and expects an array
    of shape ``(n_texts, n_classes)`` back.

    Args:
        model: Sequence-classification model whose output exposes ``.logits``.
        tokenizer: Callable Hugging Face-style tokenizer.
        max_len: Every text is padded/truncated to this many tokens.
        batch_size: Number of texts sent through the model per forward pass.
    """

    def __init__(self, model, tokenizer, max_len=128, batch_size=32):
        self.model = model
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.batch_size = batch_size
        # Inputs are moved to wherever the model's parameters live.
        self.device = next(model.parameters()).device

    def __call__(self, text_input_list):
        """Return softmax probabilities, one row per input text.

        A bare string is treated as a single text. An empty input yields an
        empty array.
        """
        if isinstance(text_input_list, str):
            # Iterating a str would otherwise classify it character by character.
            text_input_list = [text_input_list]
        texts = [str(text) for text in text_input_list]
        if not texts:
            return np.array([])

        self.model.eval()
        prob_chunks = []
        # Batched inference: one forward pass per `batch_size` texts instead of
        # one per text. Padding to max_len keeps the tensor shape fixed.
        for start in range(0, len(texts), self.batch_size):
            encoding = self.tokenizer(
                texts[start:start + self.batch_size],
                add_special_tokens=True,
                max_length=self.max_len,
                return_token_type_ids=False,
                padding='max_length',
                return_attention_mask=True,
                return_tensors='pt',
                truncation=True
            )
            input_ids = encoding['input_ids'].to(self.device)
            attention_mask = encoding['attention_mask'].to(self.device)

            with torch.no_grad():
                outputs = self.model(input_ids, attention_mask=attention_mask)

            # No squeeze(): the class axis is kept even for single-logit models.
            probs = torch.softmax(outputs.logits, dim=1)
            prob_chunks.append(probs.cpu().numpy())
        return np.concatenate(prob_chunks, axis=0)

# Display names for the three class ids, in label order
class_names = ['hate_speech', 'offensive_language', 'neither']

# Tokenizer matching the fine-tuned checkpoint
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Probability function handed to LIME
wrapped_model = BERTWrapper(model=model, tokenizer=tokenizer, max_len=MAX_LEN)

# Explainability
def lime_analysis(text, wrapped_model, class_names, num_features=10,
                  num_samples=100, label=1, random_state=None):
    """Explain one text with LIME and return its (word, weight) pairs.

    Args:
        text: The text to explain.
        wrapped_model: Callable mapping a list of texts to an
            ``(n_texts, n_classes)`` probability array.
        class_names: Class display names, ordered by class id.
        num_features: Maximum number of words in the explanation.
        num_samples: Number of perturbed texts LIME generates.
        label: Class id the explanation is computed for. The default of 1
            matches LIME's own default label.
        random_state: Optional seed forwarded to LIME so an explanation can
            be reproduced; ``None`` leaves LIME unseeded.

    Returns:
        List of ``(word, weight)`` tuples for ``label``.
    """
    explainer = LimeTextExplainer(class_names=class_names, random_state=random_state)
    exp = explainer.explain_instance(
        text,
        wrapped_model,
        labels=(label,),
        num_features=num_features,
        num_samples=num_samples,
    )
    return exp.as_list(label=label)

# Ensure df['tweet'] contains preprocessed tweets
# Pick from a plain list of the non-missing tweets: random.choice indexes its
# argument with an integer, which on a pandas Series is a *label* lookup and
# breaks as soon as the frame's index is not 0..n-1 (e.g. after rows were dropped).
text_to_explain = random.choice(df['tweet'].dropna().tolist())
print("Text to explain:", text_to_explain)
lime_results = lime_analysis(text_to_explain, wrapped_model, class_names)
print("LIME analysis results:")
print(lime_results)

def calculate_doe(lime_results):
    """Degree of Explainability of one LIME explanation.

    Defined here as the fraction of features whose absolute weight is larger
    than the standard deviation of all absolute weights.

    Args:
        lime_results: Iterable of ``(feature, weight)`` pairs.

    Returns:
        A float in ``[0, 1]``; ``0.0`` when the explanation has no features
        (instead of dividing by zero).
    """
    feature_scores = [abs(score) for _, score in lime_results]
    if not feature_scores:
        return 0.0
    std_dev = np.std(feature_scores)
    significant_features = sum(1 for score in feature_scores if score > std_dev)
    return significant_features / len(feature_scores)

# DoE of the single explanation computed above: the share of its features whose
# absolute weight exceeds the standard deviation of all absolute weights.
doe = calculate_doe(lime_results)
print("Degree of Explainability (DoE):", doe)

In [None]:
import random
import numpy as np
import pandas as pd
from lime.lime_text import LimeTextExplainer
from transformers import DistilBertTokenizer

# Define number of samples for analysis
# (default `samples` of calculate_average_doe, and echoed in the final report line)
num_sam = 100

# Load dataset
def load_custom_dataset(path):
    """Read the CSV at ``path`` and keep only rows that have both a tweet and a class."""
    required_columns = ['tweet', 'class']
    return pd.read_csv(path).dropna(subset=required_columns)

# LIME Analysis
def lime_analysis(text, wrapped_model, class_names, num_features=10,
                  num_samples=100, label=1, random_state=None):
    """Explain one text with LIME and return its (word, weight) pairs.

    Args:
        text: The text to explain.
        wrapped_model: Callable mapping a list of texts to an
            ``(n_texts, n_classes)`` probability array.
        class_names: Class display names, ordered by class id.
        num_features: Maximum number of words in the explanation.
        num_samples: Number of perturbed texts LIME generates.
        label: Class id the explanation is computed for. The default of 1
            matches LIME's own default label.
        random_state: Optional seed forwarded to LIME so an explanation can
            be reproduced; ``None`` leaves LIME unseeded.

    Returns:
        List of ``(word, weight)`` tuples for ``label``.
    """
    explainer = LimeTextExplainer(class_names=class_names, random_state=random_state)
    exp = explainer.explain_instance(
        text,
        wrapped_model,
        labels=(label,),
        num_features=num_features,
        num_samples=num_samples,
    )
    return exp.as_list(label=label)

# Calculate Degree of Explainability (DoE)
def calculate_doe(lime_results):
    """Degree of Explainability of one LIME explanation.

    Defined here as the fraction of features whose absolute weight is larger
    than the standard deviation of all absolute weights.

    Args:
        lime_results: Iterable of ``(feature, weight)`` pairs.

    Returns:
        A float in ``[0, 1]``; ``0.0`` when the explanation has no features
        (instead of dividing by zero).
    """
    feature_scores = [abs(score) for _, score in lime_results]
    if not feature_scores:
        return 0.0
    std_dev = np.std(feature_scores)
    significant_features = sum(1 for score in feature_scores if score > std_dev)
    return significant_features / len(feature_scores)

# Calculate average DoE for Multiple samples
def calculate_average_doe(df, wrapped_model, class_names, samples=num_sam):
    """Mean Degree of Explainability over a random sample of tweets.

    Args:
        df: DataFrame with a ``tweet`` column.
        wrapped_model: Probability function passed through to ``lime_analysis``.
        class_names: Class display names passed through to ``lime_analysis``.
        samples: Number of tweets to draw without replacement. Capped at the
            number of available tweets, so a small frame no longer makes
            ``random.sample`` raise.

    Returns:
        The mean of the per-tweet DoE values.

    Raises:
        ValueError: If ``df`` contains no tweets.
    """
    candidate_texts = list(df['tweet'])
    if not candidate_texts:
        raise ValueError("cannot compute an average DoE: df['tweet'] is empty")

    sample_texts = random.sample(candidate_texts, min(samples, len(candidate_texts)))
    doe_values = [
        calculate_doe(lime_analysis(text, wrapped_model, class_names))
        for text in sample_texts
    ]
    return np.mean(doe_values)

# Path to the dataset
# NOTE(review): despite the variable name, this is the same CSV the training
# cell reads, not a separate held-out test file.
test_data_path = "./Data/KaggleData.csv"
# NOTE(review): this rebinds the global `df` to the raw file contents.
# load_custom_dataset only drops rows with a missing tweet/class; it does not
# apply the text cleaning done in the training cell -- confirm that is intended.
df = load_custom_dataset(test_data_path)

# Load the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Wrapper for the DistilBERT model
class BERTWrapper:
    """Callable adapter from raw texts to class probabilities.

    LIME calls the object with a list of perturbed texts and expects an array
    of shape ``(n_texts, n_classes)`` back.

    Args:
        model: Sequence-classification model whose output exposes ``.logits``.
        tokenizer: Callable Hugging Face-style tokenizer.
        max_len: Every text is padded/truncated to this many tokens.
        batch_size: Number of texts sent through the model per forward pass.
    """

    def __init__(self, model, tokenizer, max_len=128, batch_size=32):
        self.model = model
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.batch_size = batch_size
        # Inputs are moved to wherever the model's parameters live.
        self.device = next(model.parameters()).device

    def __call__(self, text_input_list):
        """Return softmax probabilities, one row per input text.

        A bare string is treated as a single text. An empty input yields an
        empty array.
        """
        if isinstance(text_input_list, str):
            # Iterating a str would otherwise classify it character by character.
            text_input_list = [text_input_list]
        texts = [str(text) for text in text_input_list]
        if not texts:
            return np.array([])

        self.model.eval()
        prob_chunks = []
        # Batched inference: one forward pass per `batch_size` texts instead of
        # one per text. Padding to max_len keeps the tensor shape fixed.
        for start in range(0, len(texts), self.batch_size):
            encoding = self.tokenizer(
                texts[start:start + self.batch_size],
                add_special_tokens=True,
                max_length=self.max_len,
                return_token_type_ids=False,
                padding='max_length',
                return_attention_mask=True,
                return_tensors='pt',
                truncation=True
            )
            input_ids = encoding['input_ids'].to(self.device)
            attention_mask = encoding['attention_mask'].to(self.device)

            with torch.no_grad():
                outputs = self.model(input_ids, attention_mask=attention_mask)

            # No squeeze(): the class axis is kept even for single-logit models.
            probs = torch.softmax(outputs.logits, dim=1)
            prob_chunks.append(probs.cpu().numpy())
        return np.concatenate(prob_chunks, axis=0)

# Initialize the wrapped model
# Relies on `model` and `MAX_LEN` still being in the kernel from the training cell.
wrapped_model = BERTWrapper(model, tokenizer, MAX_LEN)

# Define class names
# Listed in class-id order (0, 1, 2); LIME only uses them as display labels.
class_names = ['Hate speech', 'Offensive language', 'Neutral']

# Calculate and print average DoE
# `samples` is left at its default (num_sam), which is why num_sam is echoed below.
average_doe = calculate_average_doe(df, wrapped_model, class_names)
print(f"Average Degree of Explainability (DoE) for {num_sam} samples:", average_doe)

In [None]:
# Load the tokenizer
# (model_predict, defined further down in this cell, reads this module-level `tokenizer`)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Function to get synonyms for a word
def get_synonym(word):
    """Return a randomly chosen WordNet synonym of ``word``.

    Candidates are all lemma names across the word's synsets, with underscores
    turned into spaces. Lemmas that differ from ``word`` only by letter case
    are not treated as synonyms.

    Returns:
        A synonym, or ``word`` itself when WordNet offers no alternative.
    """
    target = word.lower()
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            candidate = lemma.name().replace('_', ' ')
            if candidate.lower() != target:
                synonyms.add(candidate)
    # Sort before choosing: set iteration order varies between interpreter
    # runs, so picking from an unsorted set is not reproducible even when
    # `random` is seeded.
    return random.choice(sorted(synonyms)) if synonyms else word

# Function to generate adversarial example using LIME
def generate_adversarial_example(text, predictor, explainer, num_features=2, num_samples=5000):
    """Replace the words LIME ranks as most influential with synonyms.

    Args:
        text: Text to perturb.
        predictor: Callable mapping a list of texts to class probabilities.
        explainer: A ``LimeTextExplainer``-like object.
        num_features: How many top-ranked words to replace.
        num_samples: Number of perturbed texts LIME generates. The default of
            5000 is LIME's own default; lower it for a faster, coarser attack.

    Returns:
        The perturbed text, re-joined with single spaces.
    """
    exp = explainer.explain_instance(
        text, predictor, num_features=num_features, num_samples=num_samples
    )
    words = text.split()
    for feature, _ in exp.as_list()[:num_features]:
        if feature not in words:
            # The feature is not a whitespace-delimited token of the text
            # (e.g. it carries attached punctuation there): nothing to replace.
            continue
        replacement = get_synonym(feature)
        # LIME's default bag-of-words mode scores a word once for all of its
        # occurrences, so every occurrence is replaced, not only the first.
        words = [replacement if word == feature else word for word in words]
    return ' '.join(words)

# Wrapper for model prediction
def model_predict(texts, batch_size=32):
    """Return class probabilities for ``texts`` as an ``(n_texts, n_classes)`` array.

    Uses the module-level ``model``, ``tokenizer``, ``device`` and ``MAX_LEN``.

    Args:
        texts: A list of strings (a single string is treated as one text).
        batch_size: Number of texts per forward pass. LIME hands over all of
            its perturbed samples at once (5000 by default), so the input is
            processed in chunks rather than as one huge batch.
    """
    if isinstance(texts, str):
        texts = [texts]
    texts = [str(text) for text in texts]
    if not texts:
        return np.empty((0, model.config.num_labels), dtype=np.float32)

    model.eval()
    prob_chunks = []
    for start in range(0, len(texts), batch_size):
        # Pad to MAX_LEN, as TweetDataset does, so every full chunk has the
        # same tensor shape; fixed shapes avoid per-shape recompilation on XLA
        # devices. Padding is masked out, so the predictions are unaffected.
        encoded_inputs = tokenizer(
            texts[start:start + batch_size],
            padding='max_length',
            truncation=True,
            max_length=MAX_LEN,
            return_tensors="pt",
        )
        input_ids = encoded_inputs['input_ids'].to(device)
        attention_mask = encoded_inputs['attention_mask'].to(device)
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        prob_chunks.append(torch.softmax(outputs.logits, dim=1).cpu().numpy())
    return np.concatenate(prob_chunks, axis=0)

# LIME-based adversarial attack
def lime_based_attack(dataset, samples=20):
    """Attack up to ``samples`` items and count correct predictions.

    ``dataset`` is iterated batch by batch; each batch must provide
    ``'tweet_text'`` and ``'labels'``. For every item the model is queried on
    the original text and on its LIME-guided synonym-substituted variant.

    Returns:
        ``(items_processed, correct_on_original, correct_on_adversarial)``.
    """
    explainer = LimeTextExplainer(class_names=['hate speech', 'offensive language', 'neither'])

    seen = 0
    hits_clean = 0
    hits_adv = 0

    # Lazily flatten the batches into a stream of (text, label) pairs.
    labelled_texts = (
        pair
        for batch in dataset
        for pair in zip(batch['tweet_text'], batch['labels'])
    )

    for text, label in labelled_texts:
        if seen >= samples:
            break

        # Prediction on the untouched text
        if model_predict([text])[0].argmax() == label:
            hits_clean += 1

        # Prediction on the adversarial variant
        adv_text = generate_adversarial_example(text, model_predict, explainer)
        if model_predict([adv_text])[0].argmax() == label:
            hits_adv += 1

        seen += 1

    return seen, hits_clean, hits_adv

# Perform the attack
# NOTE(review): the attack samples from train_data_loader, i.e. data the model
# was fitted on; consider test_data_loader for a held-out robustness estimate.
total_samples, correct_before_attack, correct_after_attack = lime_based_attack(train_data_loader)

# Calculate metrics
# Every denominator is guarded: an empty loader yields total_samples == 0,
# which would otherwise abort the cell with ZeroDivisionError.
if total_samples > 0:
    accuracy_before_attack = correct_before_attack / total_samples
    accuracy_after_attack = correct_after_attack / total_samples
else:
    accuracy_before_attack = 0.0
    accuracy_after_attack = 0.0

# AdvRob: accuracy retained under attack, relative to the clean accuracy
adv_rob = accuracy_after_attack / accuracy_before_attack if accuracy_before_attack > 0 else 0

# Ar: AdvRob normalised by the average DoE computed in the previous cell
attack_resilience = adv_rob / average_doe if average_doe > 0 else 0

# Print results
print("LIME-based adversarial attack results:")
print(f"Total samples: {total_samples}")
print(f"Correct before attack: {correct_before_attack}")
print(f"Correct after attack: {correct_after_attack}")
print(f"Accuracy before attack: {accuracy_before_attack}")
print(f"Accuracy after attack: {accuracy_after_attack}")
print("")
print("Results: ")
print("Adversarial Robustness (AdvRob):", adv_rob)
print("Attack Resilience (Ar):", attack_resilience)