In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd drive

/content/drive


In [3]:
cd My \Drive

/content/drive/My Drive


In [4]:
cd NLP/

/content/drive/My Drive/NLP


In [5]:
# !pip install lime
# !pip install textattack

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 KB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25l[?25hdone
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283859 sha256=d868e275199d18b262e3f041c8867d32d899f8d4774d5c1e9f18771b73552de2
  Stored in directory: /root/.cache/pip/wheels/ed/d7/c9/5a0130d06d6310bc6cbe55220e6e72dcb8c4eff9a478717066
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting textattack
  Downloading textattack-0.3.8-py3-none-any.whl (418 kB)
[2K     [90m━━━━━━━━━━━━━━━

In [7]:
# Load libraries
import pandas as pd
import nltk
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Libraries for experimentation
import random
from lime.lime_text import LimeTextExplainer
from textattack.transformations import WordSwapQWERTY, CompositeTransformation
from textattack.attack_recipes import TextFoolerJin2019
from textattack.datasets import HuggingFaceDataset
from datasets import load_dataset
from textattack.models.wrappers import ModelWrapper
from textattack.attack_recipes import TextFoolerJin2019
from textattack.datasets import HuggingFaceDataset
from transformers import GPT2Tokenizer

# Load the dataset
df = pd.read_csv('./Data/KaggleData.csv')

# Convert to lowercase, remove punctuation, extra spaces, URLs, mentions, and hashtags
df['tweet'] = df['tweet'].str.lower().replace(r'[^\w\s]', '', regex=True).replace(' {2,}', ' ', regex=True).replace('"', '')
df['tweet'] = df['tweet'].replace(r'http\S+|www.\S+|@\w+|#\w+', '', regex=True)

# Tokenization
nltk.download('punkt')
df['tweet'] = df['tweet'].apply(nltk.word_tokenize)

# Lemmatization
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
df['tweet'] = df['tweet'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

# Removing stop-words
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
df['tweet'] = df['tweet'].apply(lambda x: ' '.join([word for word in x if word not in stop_words]))

# Create a custom dataset class
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

# Encode the labels
# 0 - hate speech, 1 - offensive language, 2 - neither as positive or negative
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['class'])

# Splitting the Data using Stratified split
X_train, X_test, y_train, y_test = train_test_split(df['tweet'], y, test_size=0.3, stratify=y, random_state=42)

# Tokenize and pad the input sequences
def tokenize_and_pad(texts, maxlen=100):
    tokenized_texts = [nltk.word_tokenize(text) for text in texts]
    return pad_sequence([torch.tensor([word_to_index[word] for word in text if word in word_to_index][:maxlen]) for text in tokenized_texts], batch_first=True, padding_value=len(word_to_index))

word_to_index = {word: i for i, word in enumerate(set(df['tweet'].str.cat(sep=' ').split()), 1)}
X_train = tokenize_and_pad(X_train)
X_test = tokenize_and_pad(X_test)

# Create PyTorch Datasets and DataLoaders
train_dataset = TextDataset(X_train, y_train)
test_dataset = TextDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

# Create a PyTorch LSTM model
class LSTMBaseline(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        packed_output, (hidden, cell) = self.lstm(x)
        x = self.fc(hidden[-1])
        return x

# Initialize the model, optimizer, and loss function
model = LSTMBaseline(len(word_to_index) + 1, 50, 100, len(set(y)))
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
    
# Train the model
epochs = 10
for epoch in range(epochs):
    model.train()
    epoch_loss = 0

    for texts, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss / len(train_loader)}")

    # Save Model
    torch.save(model, './Weights/KaggleLSTM.pth')

# Test the model and collect predictions and true labels
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for texts, labels in test_loader:
        outputs = model(texts)
        _, predicted = torch.max(outputs, 1)
        predictions.extend(predicted.numpy())
        true_labels.extend(labels.numpy())

# Calculate accuracy, precision, recall, F1-score, and confusion matrix
accuracy = np.mean(np.array(predictions) == np.array(true_labels))
precision, recall, f1_score, _ = precision_recall_fscore_support(true_labels, predictions, average='weighted')
conf_mat = confusion_matrix(true_labels, predictions)

print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1-score: ", f1_score)
print("Confusion Matrix:\n", conf_mat)

# Experimentation
# Drop the rows with NaN values
df = df.dropna(subset=['tweet'])
df = df.dropna(subset=['class'])

# Reset the index
df = df.reset_index(drop=True)

# Wrapper
class LSTMBaselineWrapper(ModelWrapper):
    def __init__(self, model):
        self.model = model

    def __call__(self, text_input_list):
        preds = []
        for text in text_input_list:
            input_tensor = tokenize_and_pad([text]).long()
            output = self.model(input_tensor)
            pred = torch.softmax(output, dim=1).squeeze().tolist()
            preds.append(pred)
        return np.array(preds)

wrapped_model = LSTMBaselineWrapper(model)
class_names = ['hate_speech', 'offensive_language', 'neither']

# Explainability
def lime_analysis(text, wrapped_model, class_names):
    explainer = LimeTextExplainer(class_names=class_names)
    exp = explainer.explain_instance(text, wrapped_model, num_features=10)
    return exp.as_list()

text_to_explain = random.choice(df['tweet'])
print(text_to_explain)
lime_results = lime_analysis(text_to_explain, wrapped_model, class_names)
print("LIME analysis results:")
print(lime_results)

def calculate_doe(lime_results):
    feature_scores = [abs(score) for _, score in lime_results]
    std_dev = np.std(feature_scores)
    significant_features = len([score for score in feature_scores if score > std_dev])
    return significant_features

doe = calculate_doe(lime_results)
print("Degree of Explainability (DoE):", doe)

# Adversarial Robustness
def attack(model, tokenizer, dataset, samples=20):
    correct_before_attack = 0
    attacked = 0
    correct_after_attack = 0

    attack = TextFoolerJin2019.build(model)

    for i in range(samples):
        example = dataset[i]
        input_text = example['tweet']
        ground_truth_label = example['class']
        result = attack.attack(input_text, ground_truth_label)
        if result.original_result.raw_output[0] > 0.5:
            correct_before_attack += 1
            if result.perturbed_result.raw_output[0] <= 0.5:
                attacked += 1
        elif result.original_result.raw_output[0] <= 0.5:
            correct_after_attack += 1

    return attacked, correct_before_attack, correct_after_attack

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

def load_custom_dataset(path):
    dataset = load_dataset("csv", data_files=path, split="train")
    return dataset

test_data_path = "./Data/KaggleData.csv"
test_dataset = load_custom_dataset(test_data_path)
attacked, correct_before_attack, correct_after_attack = attack(wrapped_model, tokenizer, test_dataset)

print("Adversarial attack results:")
print(f"Total samples attacked: {attacked}")

if attacked > 0:
    print(f"Accuracy before attack: {correct_before_attack / attacked}")
    print(f"Accuracy after attack: {correct_after_attack / attacked}")

    adv_rob = correct_after_attack / correct_before_attack
    print("Adversarial Robustness (AdvRob):", adv_rob)

    attack_resilience = 1 - abs(adv_rob - 1)
    print("Attack Resilience (Ar):", attack_resilience)
else:
    print("No successful adversarial attacks.")

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch 1/10, Loss: 0.6687087821697003
Epoch 2/10, Loss: 0.663867186226678
Epoch 3/10, Loss: 0.5440628485441976
Epoch 4/10, Loss: 0.3620768300708706
Epoch 5/10, Loss: 0.30553061058178793
Epoch 6/10, Loss: 0.2544358592912637
Epoch 7/10, Loss: 0.21587258701523146
Epoch 8/10, Loss: 0.18585542365771412
Epoch 9/10, Loss: 0.15818725700076827
Epoch 10/10, Loss: 0.13018615218651228
Accuracy:  0.861600537995965
Precision:  0.8547182026080377
Recall:  0.861600537995965
F1-score:  0.8577852707075768
Confusion Matrix:
 [[ 131  253   45]
 [ 178 5348  231]
 [  35  287  927]]
rt nissa_jadee bad bitch thing like
LIME analysis results:
[('bitch', 0.5549173038783538), ('like', 0.0751452485259446), ('nissa_jadee', 0.05032505306768707), ('thing', -0.04642329515078761), ('bad', 0.02146980971795797), ('rt', -0.012113468316031627)]
Degree of Explainability (DoE): 1


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-c7e321e638aeeca9/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

textattack: Downloading https://textattack.s3.amazonaws.com/word_embeddings/paragramcf.


Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-c7e321e638aeeca9/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.


100%|██████████| 481M/481M [00:44<00:00, 10.9MB/s]
textattack: Unzipping file /root/.cache/textattack/tmpprjsyoci.zip to /root/.cache/textattack/word_embeddings/paragramcf.
textattack: Successfully saved word_embeddings/paragramcf to cache.
textattack: Unknown if model of class <class '__main__.LSTMBaseline'> compatible with goal function <class 'textattack.goal_functions.classification.untargeted_classification.UntargetedClassification'>.


Adversarial attack results:
Total samples attacked: 0
No successful adversarial attacks.


In [1]:
import pandas as pd
import nltk
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/pthamminedi/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/pthamminedi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/pthamminedi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/pthamminedi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download()


In [2]:
import random
from lime.lime_text import LimeTextExplainer
from textattack.transformations import WordSwapQWERTY, CompositeTransformation
from textattack.attack_recipes import TextFoolerJin2019
from textattack.datasets import HuggingFaceDataset
from datasets import load_dataset
from textattack.models.wrappers import ModelWrapper
from textattack.attack_recipes import TextFoolerJin2019
from textattack.datasets import HuggingFaceDataset
from transformers import GPT2Tokenizer

  from .autonotebook import tqdm as notebook_tqdm
2023-04-09 15:42:09.133870: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-04-09 15:42:09.482899: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
df = pd.read_csv('/mnt/c/Users/tpras/Documents/UF/2nd Sem/NLP/Project/KaggleData.csv')

# Convert to lowercase, remove punctuation, extra spaces, URLs, mentions, and hashtags
df['tweet'] = df['tweet'].str.lower().replace(r'[^\w\s]', '', regex=True).replace(' {2,}', ' ', regex=True).replace('"', '')
df['tweet'] = df['tweet'].replace(r'http\S+|www.\S+|@\w+|#\w+', '', regex=True)

# Tokenization
nltk.download('punkt')
df['tweet'] = df['tweet'].apply(nltk.word_tokenize)

# Lemmatization
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
df['tweet'] = df['tweet'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

# Removing stop-words
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
df['tweet'] = df['tweet'].apply(lambda x: ' '.join([word for word in x if word not in stop_words]))

# Create a custom dataset class
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

# Encode the labels
# 0 - hate speech, 1 - offensive language, 2 - neither as positive or negative
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['class'])

# Splitting the Data using Stratified split
X_train, X_test, y_train, y_test = train_test_split(df['tweet'], y, test_size=0.3, stratify=y, random_state=42)

# Tokenize and pad the input sequences
def tokenize_and_pad(texts, maxlen=100):
    tokenized_texts = [nltk.word_tokenize(text) for text in texts]
    return pad_sequence([torch.tensor([word_to_index[word] for word in text if word in word_to_index][:maxlen]) for text in tokenized_texts], batch_first=True, padding_value=len(word_to_index))

word_to_index = {word: i for i, word in enumerate(set(df['tweet'].str.cat(sep=' ').split()), 1)}
X_train = tokenize_and_pad(X_train)
X_test = tokenize_and_pad(X_test)

# Create PyTorch Datasets and DataLoaders
train_dataset = TextDataset(X_train, y_train)
test_dataset = TextDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)


[nltk_data] Downloading package punkt to
[nltk_data]     /home/pthamminedi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/pthamminedi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/pthamminedi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Create a PyTorch LSTM model
class LSTMBaseline(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        packed_output, (hidden, cell) = self.lstm(x)
        x = self.fc(hidden[-1])
        return x

# Initialize the model, optimizer, and loss function
model = LSTMBaseline(len(word_to_index) + 1, 50, 100, len(set(y)))
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
    
# Train the model
epochs = 10
for epoch in range(epochs):
    model.train()
    epoch_loss = 0

    for texts, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss / len(train_loader)}")

    # Save Model
    torch.save(model, '/mnt/c/Users/tpras/Documents/UF/2nd Sem/NLP/Project/Weights/KaggleLSTM.pth')

# Test the model and collect predictions and true labels
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for texts, labels in test_loader:
        outputs = model(texts)
        _, predicted = torch.max(outputs, 1)
        predictions.extend(predicted.numpy())
        true_labels.extend(labels.numpy())

#Calculate accuracy, precision, recall, F1-score, and confusion matrix
accuracy = np.mean(np.array(predictions) == np.array(true_labels))
precision, recall, f1_score, _ = precision_recall_fscore_support(true_labels, predictions, average='weighted')
conf_mat = confusion_matrix(true_labels, predictions)

print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1-score: ", f1_score)
print("Confusion Matrix:\n", conf_mat)

Epoch 1/10, Loss: 0.6664963504556793
Epoch 2/10, Loss: 0.6635816231726722
Epoch 3/10, Loss: 0.5274130837446418
Epoch 4/10, Loss: 0.3618412032503889
Epoch 5/10, Loss: 0.3004763749038756
Epoch 6/10, Loss: 0.2547307630819071
Epoch 7/10, Loss: 0.21596188936554048
Epoch 8/10, Loss: 0.17579061188091888
Epoch 9/10, Loss: 0.14049971539829043
Epoch 10/10, Loss: 0.11049812881303454
Accuracy:  0.8516476126429052
Precision:  0.8550234441144426
Recall:  0.8516476126429052
F1-score:  0.8520053692937981
Confusion Matrix:
 [[ 146  260   23]
 [ 254 5339  164]
 [ 100  302  847]]


In [5]:
df = df.dropna(subset=['tweet'])
df = df.dropna(subset=['class'])

# Reset the index
df = df.reset_index(drop=True)

# Wrapper
class LSTMBaselineWrapper(ModelWrapper):
    def __init__(self, model):
        self.model = model

    def __call__(self, text_input_list):
        preds = []
        for text in text_input_list:
            input_tensor = tokenize_and_pad([text]).long()
            output = self.model(input_tensor)
            pred = torch.softmax(output, dim=1).squeeze().tolist()
            preds.append(pred)
        return np.array(preds)

wrapped_model = LSTMBaselineWrapper(model)
class_names = ['hate_speech', 'offensive_language', 'neither']

In [6]:
df.isnull().values.any()
df.isnull().sum().sum()

0

In [7]:
# Explainability
def lime_analysis(text, wrapped_model, class_names):
    explainer = LimeTextExplainer(class_names=class_names)
    exp = explainer.explain_instance(text,wrapped_model, num_features=10, num_samples=2)
    return exp.as_list()


In [8]:

text_to_explain = random.choice(df['tweet'])
print(text_to_explain)
lime_results = lime_analysis(text_to_explain, wrapped_model, class_names)
print("LIME analysis results:")

print(lime_results)

8220goldmind___ smoke good weed bad bitch8221128079
LIME analysis results:
[('8220goldmind___', 0.10348278885051554), ('good', 0.10348278885051554), ('weed', 0.10348278885051554), ('smoke', 0.0), ('bad', 0.0), ('bitch8221128079', 0.0)]


def calculate_doe(lime_results):
    feature_scores = [abs(score) for _, score in lime_results]
    std_dev = np.std(feature_scores)
    significant_features = len([score for score in feature_scores if score > std_dev])
    return significant_features

doe = calculate_doe(lime_results)
print("Degree of Explainability (DoE):", doe)

# Adversarial Robustness
def attack(model, tokenizer, dataset, samples=20):
    correct_before_attack = 0
    attacked = 0
    correct_after_attack = 0

    attack = TextFoolerJin2019.build(model)

    for i in range(samples):
        example = dataset[i]
        print(example)
        input_text = example['tweet']
        ground_truth_label = example['class']
        result = attack.attack(input_text, ground_truth_label)
        print("1st")
        print(result.original_result.raw_output[0])
        print("2nd")
        print(result.perturbed_result.raw_output[0])
        
        if result.original_result.raw_output[0] > 0.5:
            correct_before_attack += 1
            
            if result.perturbed_result.raw_output[0] <= 0.5:
                attacked += 1
        elif result.original_result.raw_output[0] <= 0.5:
            correct_after_attack += 1

    return attacked, correct_before_attack, correct_after_attack

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

def load_custom_dataset(path):
    dataset = load_dataset("csv", data_files=path, split="train")
    return dataset

test_data_path = "/mnt/c/Users/tpras/Documents/UF/2nd Sem/NLP/Project/KaggleData.csv"
test_dataset = load_custom_dataset(test_data_path)
attacked, correct_before_attack, correct_after_attack = attack(wrapped_model, tokenizer, test_dataset)

print("Adversarial attack results:")
print(f"Total samples attacked: {attacked}")

if attacked > 0:
    print(f"Accuracy before attack: {correct_before_attack / attacked}")
    print(f"Accuracy after attack: {correct_after_attack / attacked}")

    adv_rob = correct_after_attack / correct_before_attack
    print("Adversarial Robustness (AdvRob):", adv_rob)

    attack_resilience = 1 - abs(adv_rob - 1)
    print("Attack Resilience (Ar):", attack_resilience)
else:
    print("No successful adversarial attacks.")

In [9]:
import textattack
import transformers
from textattack.transformations import WordSwapEmbedding
from textattack.search_methods import GreedyWordSwapWIR
from textattack import Attack

# Load model, tokenizer, and model_wrapper
model = transformers.AutoModelForSequenceClassification.from_pretrained("textattack/bert-base-uncased-imdb")
tokenizer = transformers.AutoTokenizer.from_pretrained("textattack/bert-base-uncased-imdb")
model_wrapper = textattack.models.wrappers.HuggingFaceModelWrapper(model, tokenizer)

# Construct our four components for `Attack`
from textattack.constraints.pre_transformation import RepeatModification, StopwordModification
from textattack.constraints.semantics import WordEmbeddingDistance

goal_function = textattack.goal_functions.UntargetedClassification(model_wrapper)
constraints = [
    RepeatModification(),
    StopwordModification(),
    WordEmbeddingDistance(min_cos_sim=0.9)
]
transformation = WordSwapEmbedding(max_candidates=50)
search_method = GreedyWordSwapWIR(wir_method="delete")



def calculate_doe(lime_results):
    feature_scores = [abs(score) for _, score in lime_results]
    std_dev = np.std(feature_scores)
    significant_features = len([score for score in feature_scores if score > std_dev])
    return significant_features

doe = calculate_doe(lime_results)
print("Degree of Explainability (DoE):", doe)

# Adversarial Robustness
def attack(model, tokenizer, dataset, samples=20):
    correct_before_attack = 0
    attacked = 0
    correct_after_attack = 0

    #attack = TextFoolerJin2019.build(model)
    # Construct the actual attack
    attack1 = Attack(goal_function, constraints, transformation, search_method)

    for i in range(samples):
        example = dataset[i]
        #print(example)
        input_text = example['tweet']
        ground_truth_label = 0
        result = attack1.attack(input_text, ground_truth_label)
        print(result)
        print("1st")
        print(result.original_result.raw_output[0])
        print("2nd")
        print(result.perturbed_result.raw_output[0])
        
        if result.original_result.raw_output[0] > 0.5:
            correct_before_attack += 1
            
            if result.perturbed_result.raw_output[0] <= 0.5:
                attacked += 1
        elif result.original_result.raw_output[0] <= 0.5:
            correct_after_attack += 1

    return attacked, correct_before_attack, correct_after_attack

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

def load_custom_dataset(path):
    dataset = load_dataset("csv", data_files=path, split="train")
    return dataset

test_data_path = "/mnt/c/Users/tpras/Documents/UF/2nd Sem/NLP/Project/KaggleData.csv"
test_dataset = load_custom_dataset(test_data_path)
attacked, correct_before_attack, correct_after_attack = attack(wrapped_model, tokenizer, test_dataset)

print("Adversarial attack results:")
print(f"Total samples attacked: {attacked}")

if attacked > 0:
    print(f"Accuracy before attack: {correct_before_attack / attacked}")
    print(f"Accuracy after attack: {correct_after_attack / attacked}")

    adv_rob = correct_after_attack / correct_before_attack
    print("Adversarial Robustness (AdvRob):", adv_rob)

    attack_resilience = 1 - abs(adv_rob - 1)
    print("Attack Resilience (Ar):", attack_resilience)
else:
    print("No successful adversarial attacks.")

textattack: Unknown if model of class <class 'transformers.models.bert.modeling_bert.BertForSequenceClassification'> compatible with goal function <class 'textattack.goal_functions.classification.untargeted_classification.UntargetedClassification'>.


Degree of Explainability (DoE): 3


Using custom data configuration default-17b54d524b96dfad
Reusing dataset csv (/home/pthamminedi/.cache/huggingface/datasets/csv/default-17b54d524b96dfad/0.0.0/568fe90a74a751c3380eb1a61a0322c09e7edc4dbe9ee7300005a8dcd8a8902e)


1 (92%) --> [SKIPPED]

!!! RT @mayasolovely: As a woman you shouldn't complain about cleaning up your house. &amp; as a man you should always take the trash out...
1st
0.08013989
2nd
0.08013989
0 (93%) --> [FAILED]

!!!!! RT @mleew17: boy dats cold...tyga dwn bad for cuffin dat hoe in the 1st place!!
1st
0.93134826
2nd
0.6682295
0 (94%) --> [FAILED]

!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby4life: You ever fuck a bitch and she start to cry? You be confused as shit
1st
0.93808246
2nd
0.8677309
0 (99%) --> [FAILED]

!!!!!!!!! RT @C_G_Anderson: @viva_based she look like a tranny
1st
0.99300826
2nd
0.99289775
0 (95%) --> [FAILED]

!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you hear about me might be true or it might be faker than the bitch who told it to ya &#57361;
1st
0.94569564
2nd
0.9253586
0 (94%) --> [FAILED]

!!!!!!!!!!!!!!!!!!"@T_Madison_x: The shit just blows me..claim you so faithful and down for somebody but still fucking with hoes! &#128514;&#128514;&#128514;"
1st
0.937164

In [10]:
import textattack
import transformers
from textattack.transformations import WordSwapEmbedding
from textattack.search_methods import GreedyWordSwapWIR
from textattack import Attack

# Load model, tokenizer, and model_wrapper
model = transformers.AutoModelForSequenceClassification.from_pretrained("textattack/bert-base-uncased-imdb")
tokenizer = transformers.AutoTokenizer.from_pretrained("textattack/bert-base-uncased-imdb")
model_wrapper = textattack.models.wrappers.HuggingFaceModelWrapper(model, tokenizer)

# Construct our four components for `Attack`
from textattack.constraints.pre_transformation import RepeatModification, StopwordModification
from textattack.constraints.semantics import WordEmbeddingDistance

goal_function = textattack.goal_functions.UntargetedClassification(model_wrapper)
constraints = [
    RepeatModification(),
    StopwordModification(),
    WordEmbeddingDistance(min_cos_sim=0.9)
]
transformation = WordSwapEmbedding(max_candidates=50)
search_method = GreedyWordSwapWIR(wir_method="delete")

# Construct the actual attack
attack = Attack(goal_function, constraints, transformation, search_method)

input_text = '!!!!!!!!!!!!!!!!!!"@T_Madison_x: The shit just blows me..claim you so faithful and down for somebody but still fucking with hoes! &#128514;&#128514;&#128514;"'
label = 0 #Positive
result = attack.attack(input_text, label)
print(result)
print(result.original_result.raw_output[0])
#print(result.original_text)
print(result.perturbed_result.raw_output[0])
#print(result.perturbed_text.raw_output[0])


def lime_analysis(text, wrapped_model, class_names):
    explainer = LimeTextExplainer(class_names=class_names)
    exp = explainer.explain_instance(text,wrapped_model, num_features=10, num_samples=2)
    return exp.as_list()

print("Before attack")
text_to_explain = "!!! RT @mayasolovely: As a woman you shouldn't complain about cleaning up your house. &amp; as a man you should always take the trash out..."
print(text_to_explain)
lime_results = lime_analysis(text_to_explain, wrapped_model, class_names)
print("LIME analysis results:")

print(lime_results)

print("After attack")
text_to_explain = "!!! RT @mayasolovely: As a woman you shouldn't complain about cleaning up your house. &amp; as a man you should permanently take the rubbish out..."
print(text_to_explain)
lime_results = lime_analysis(text_to_explain, wrapped_model, class_names)
print("LIME analysis results:")

print(lime_results)

textattack: Unknown if model of class <class 'transformers.models.bert.modeling_bert.BertForSequenceClassification'> compatible with goal function <class 'textattack.goal_functions.classification.untargeted_classification.UntargetedClassification'>.


0 (94%) --> [FAILED]

!!!!!!!!!!!!!!!!!!"@T_Madison_x: The shit just blows me..claim you so faithful and down for somebody but still fucking with hoes! &#128514;&#128514;&#128514;"
0.93716407
0.59986943
Before attack
!!! RT @mayasolovely: As a woman you shouldn't complain about cleaning up your house. &amp; as a man you should always take the trash out...
LIME analysis results:
[('RT', 0.013777142912941999), ('woman', 0.013777142912941999), ('you', 0.013777142912941999), ('t', 0.013777142912941999), ('cleaning', 0.013777142912941999), ('up', 0.013777142912941999), ('your', 0.013777142912941999), ('man', 0.013777142912941999), ('trash', 0.013777142912941999), ('mayasolovely', 0.0)]
After attack
!!! RT @mayasolovely: As a woman you shouldn't complain about cleaning up your house. &amp; as a man you should permanently take the rubbish out...
LIME analysis results:
[('shouldn', 0.03335869844036424), ('your', 0.03335869844036424), ('as', 0.03335869844036424), ('man', 0.03335869844036424), (