<a href="https://colab.research.google.com/github/Mekatebi/NMA_DL_2023_Project/blob/main/NMA_DL_2023.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Code for the NMA DL 2023 project

## Set-up environment

In [6]:
!pip install -q requests nlpaug sacremoses datasets transformers[torch] evaluate

In [7]:
from transformers import AutoTokenizer, RobertaForSequenceClassification
from transformers import TrainingArguments, Trainer
from sklearn.preprocessing import LabelEncoder
from datasets import load_dataset
import torch
import evaluate
import numpy as np
import requests
from torch import nn
import pandas as pd

In [8]:
# Hub repository that holds both the tokenizer files and the fine-tuned weights.
# (A specific snapshot can be pinned by passing revision="v2.0.0" to both calls.)
HUB_CHECKPOINT = "Mekatebi/NMA_DL_2023_Project"

tokenizer = AutoTokenizer.from_pretrained(HUB_CHECKPOINT)
# output_attentions=True: forward passes also return the attention weights,
# which the BertViz section reads from the model output.
model = RobertaForSequenceClassification.from_pretrained(
    HUB_CHECKPOINT,
    output_attentions=True,
)

## RoBERTa

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face login. Presumably only required for the
# push_to_hub calls further down; loading the public checkpoint above already
# happened before this cell.
notebook_login()

### Loading dataset

In [13]:
# Define the remote file to retrieve
url = 'https://zenodo.org/record/2667859/files/500_Reddit_users_posts_labels.csv'
# Define the local filename to save data
local_file = '/content/Dataset.csv'
# Make http request for remote file data; the timeout keeps the cell from
# hanging indefinitely on a stalled connection
data = requests.get(url, timeout=60)
# Fail loudly on a 4xx/5xx answer instead of saving an error page as the CSV
data.raise_for_status()
# Save file data to local copy
with open(local_file, 'wb') as file:
    file.write(data.content)

### Augmentation

In [14]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as naf
import nlpaug.augmenter.word as naw

from nlpaug.util import Action

In [15]:
# Every augmenter in the pipeline is configured with the same
# aug_min / aug_max / aug_p settings, so they are spelled out once.
_shared_aug_settings = dict(aug_min=0, aug_max=1024, aug_p=0.04)

_augmentation_steps = [
    naw.SpellingAug(**_shared_aug_settings),
    naw.SynonymAug(aug_src='wordnet', **_shared_aug_settings),
    naw.RandomWordAug(action='delete', **_shared_aug_settings),
    naw.RandomWordAug(action='swap', **_shared_aug_settings),
]

# Chain the steps: spelling noise, synonym replacement, word deletion, word swap.
aug = naf.Sequential(_augmentation_steps)

#    naw.BackTranslationAug(
#    from_model_name='facebook/wmt19-en-de',
#    to_model_name='facebook/wmt19-de-en',
#    max_length=512,
#    device='cpu')

In [18]:
# Edit the dataset

# Labels that are kept, and the class each one is folded into. Rows with any
# other label are dropped.
_label_merge = {
    'Indicator': 'Ideation',
    'Ideation': 'Ideation',
    'Behavior': 'Attempt',
    'Attempt': 'Attempt',
}

_raw_posts = pd.read_csv(local_file)

# Explicit copy so the column assignment below works on an independent frame.
dataset = _raw_posts[_raw_posts['Label'].isin(list(_label_merge))].copy()
dataset['Label'] = dataset['Label'].map(_label_merge)

# Renumber rows 0..n-1; the previous row labels are kept as an 'index' column.
dataset = dataset.reset_index()

In [19]:
def _augment_once(frame):
    """Return ``frame`` followed by one augmented copy of each of its rows.

    Only 'Post' and 'Label' are filled in for the new rows; every other column
    is left as NaN, exactly as when the rows were appended one cell at a time.

    Args:
        frame: DataFrame with at least the columns 'Post' and 'Label'.

    Returns:
        A new DataFrame with twice as many rows and a fresh 0..2n-1 index.
    """
    def _augment_text(text):
        augmented = aug.augment(text)
        # Recent nlpaug releases return a list of generated samples, older ones
        # a plain string. Unwrap so a str (not a list) is stored in the frame.
        if isinstance(augmented, (list, tuple)):
            augmented = augmented[0] if augmented else text
        return augmented

    augmented_rows = pd.DataFrame({
        'Post': [_augment_text(post) for post in frame['Post']],
        'Label': frame['Label'].to_list(),
    })
    # A single concat replaces growing the frame cell-by-cell through .loc,
    # which enlarges the frame on every new row.
    return pd.concat([frame, augmented_rows], ignore_index=True)


# Two passes: the second pass also augments the rows produced by the first one,
# so the dataset ends up four times its original size.
for _ in range(2):
    dataset = _augment_once(dataset)

dataset.to_csv('/content/Modified_Dataset.csv')

In [None]:
# Reload the augmented CSV through Hugging Face `datasets`. From here on
# `dataset` is no longer the pandas frame; the preprocessing cell reads its
# 'train' split.
dataset = load_dataset('csv', data_files='/content/Modified_Dataset.csv')

In [None]:
# Display the loaded dataset object to check what was read from the CSV.
dataset

### Preprocess

In [None]:
def not_none(example):
    """Filter predicate: keep only rows whose 'Post' field holds a value.

    Args:
        example: Mapping for one dataset row; must contain the key 'Post'.

    Returns:
        bool: False when the post is None, True for anything else
        (including an empty string).
    """
    post = example['Post']
    if post is None:
        return False
    return True

# Drop rows that carry no post text.
dataset = dataset.filter(not_none)

# Work on the 99% side of a seeded 99/1 split of the 'train' split.
dataset_sampled = dataset['train'].train_test_split(test_size=0.01, seed=2023)['train']

# 70% train / 30% held out; the held-out part is then halved into validation
# and test. All splits use the same seed.
# NOTE(review): the augmented copies were appended to the data before this
# random split, so variants of one original post can end up in different
# splits. Confirm this leakage is acceptable for the reported test score.
train_val_test = dataset_sampled.train_test_split(test_size=0.3, seed=2023)
train_dataset, test_val_dataset = train_val_test['train'], train_val_test['test']

test_val_split = test_val_dataset.train_test_split(test_size=0.5, seed=2023)
validation_dataset, test_dataset = test_val_split['train'], test_val_split['test']

# Only the text and its label are used downstream; every other column is removed.
columns_to_keep = ['Post', 'Label']
columns_to_remove = [
    name
    for name in dataset_sampled.column_names
    if name not in columns_to_keep
]

train_dataset, validation_dataset, test_dataset = (
    split.remove_columns(columns_to_remove)
    for split in (train_dataset, validation_dataset, test_dataset)
)

# Learn the label-name -> integer-id mapping from the sampled data
# (fit() returns the encoder itself, so it can be bound in one step).
le = LabelEncoder().fit(dataset_sampled['Label'])

def encode_labels(example):
    """Replace the textual 'Label' of one row by its integer class id.

    The row is modified in place and also returned, as `Dataset.map` expects.

    Args:
        example: Mapping for one dataset row with a 'Label' known to the
            fitted encoder `le`.

    Returns:
        The same mapping, with 'Label' overwritten by the encoded id.
    """
    # transform() works on sequences, so wrap the single label and unwrap the result.
    (encoded_label,) = le.transform([example['Label']])
    example['Label'] = encoded_label
    return example

# Encode the labels of all three splits with the same fitted encoder.
train_dataset, validation_dataset, test_dataset = (
    split.map(encode_labels)
    for split in (train_dataset, validation_dataset, test_dataset)
)

In [None]:
# Class names, positioned at the integer id the fitted encoder assigned to them.
labels = le.classes_

# Lookup tables in both directions (id -> name and name -> id).
id2label = dict(enumerate(labels))
label2id = {name: class_id for class_id, name in id2label.items()}

id2label

In [None]:
# Rebinds `tokenizer` to the stock roberta-base tokenizer, replacing the one
# loaded from the project checkpoint in the set-up section.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

def prepare_data(example):
    """Tokenise one row and package it under the keys the Trainer consumes.

    The post is truncated or padded to exactly 512 tokens.

    Args:
        example: Row with 'Post' (the text) and 'Label' (integer class id).

    Returns:
        dict with 'input_ids' and 'attention_mask' (each flattened to 1-D)
        and 'labels' (a scalar tensor of dtype long).
    """
    tokenizer_options = dict(
        truncation=True,
        padding='max_length',
        max_length=512,
        return_tensors='pt',
    )
    encoded = tokenizer.encode_plus(example['Post'], **tokenizer_options)

    # return_tensors='pt' yields a leading batch axis of size 1; flatten drops it.
    features = {
        field: encoded[field].flatten()
        for field in ('input_ids', 'attention_mask')
    }
    features['labels'] = torch.tensor(example['Label'], dtype=torch.long)
    return features

# Tokenise every split with the same function.
train_dataset, validation_dataset, test_dataset = (
    split.map(prepare_data)
    for split in (train_dataset, validation_dataset, test_dataset)
)

# Return these three columns as torch tensors when rows are read
# (set_format changes the dataset object in place).
model_input_columns = ['input_ids', 'attention_mask', 'labels']
for split in (train_dataset, validation_dataset, test_dataset):
    split.set_format(type='torch', columns=model_input_columns)

In [None]:
# Histogram (numpy's default binning) of the encoded class ids in the test split.
test_label_ids = test_dataset['Label']
np.histogram(test_label_ids)

### Evaluate

In [None]:
# Load the accuracy metric from the `evaluate` library; compute_metrics below
# calls its .compute() with the predicted and reference class ids.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute classification accuracy for one Trainer evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` is either
            the logits array of shape (n_samples, n_classes) or a tuple whose
            first element is that array. The Trainer hands over a tuple when
            the model returns extra outputs besides the logits, which is the
            case here because the model is loaded with
            ``output_attentions=True``.

    Returns:
        The dict produced by ``accuracy.compute`` for the arg-max class ids
        against ``labels``.
    """
    predictions, labels = eval_pred
    # With extra model outputs the logits are the first entry of the tuple;
    # taking the arg-max over the whole tuple would fail or give nonsense.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

### Model

In [None]:
# model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=len(le.classes_), id2label=id2label, label2id=label2id) # , problem_type="multi_label_classification")

In [None]:
# Display the configuration of the model currently bound to `model`.
model.config

### Train

In [31]:
# Optimisation schedule.
_optimisation_options = dict(
    num_train_epochs=6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=100,
    weight_decay=0.01,
)

# Where outputs go and how often to evaluate / checkpoint: once per epoch,
# keeping at most one checkpoint on disk.
_bookkeeping_options = dict(
    output_dir='./Model',
    logging_dir='./logs',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
)

# Automatic pushing is switched off; the repository id is still recorded.
_hub_options = dict(
    push_to_hub=False,
    hub_model_id="NMA_DL_2023_Project",
)

training_args = TrainingArguments(
    **_optimisation_options,
    **_bookkeeping_options,
    **_hub_options,
)

In [32]:
class CustomTrainer(Trainer):
    """Trainer whose loss is a class-weighted cross-entropy.

    Set ``class_weights`` (one float per class, ordered by label id) on the
    class or on an instance to re-balance the loss. While it is ``None`` every
    class gets weight 1.0, sized to the number of classes the model predicts.
    The previous hard-coded vector had five entries and would be rejected by
    ``CrossEntropyLoss`` for the two-class model used in this notebook.
    """

    # Optional per-class weights; None means "all ones".
    class_weights = None

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run a forward pass and return the weighted cross-entropy loss.

        Args:
            model: The model being trained.
            inputs: Batch dict; must contain "labels" next to the model inputs.
            return_outputs: When True, return ``(loss, outputs)``.
            num_items_in_batch: Accepted because newer Trainer versions pass it
                to ``compute_loss``; it is not used here.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # compute custom loss; the weight vector follows the logits' class axis
        num_labels = logits.size(-1)
        if self.class_weights is None:
            weight = torch.ones(num_labels, dtype=logits.dtype, device=logits.device)
        else:
            weight = torch.as_tensor(self.class_weights, dtype=logits.dtype, device=logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits.view(-1, num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [33]:
# CustomTrainer for Multiclass (swap `Trainer` for `CustomTrainer` below to use
# the weighted loss)

def _keep_logits_only(logits, labels):
    """Reduce the per-batch model outputs to the logits before they are cached.

    The model is loaded with ``output_attentions=True``, so during evaluation
    each batch yields ``(logits, attentions)``. Without this hook the Trainer
    would accumulate every layer's attention tensors for the whole evaluation
    set and pass that tuple on to ``compute_metrics``.

    Args:
        logits: Logits tensor, or a tuple/list whose first entry is the logits.
        labels: Batch labels (unused; part of the hook signature).

    Returns:
        The logits tensor alone.
    """
    return logits[0] if isinstance(logits, (tuple, list)) else logits


trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=_keep_logits_only,
)

In [None]:
# Fine-tune on train_dataset; per training_args, validation_dataset is
# evaluated and a checkpoint is saved once per epoch.
trainer.train()

In [None]:
# Score the model on the test split, which was not passed to the Trainer as
# training or validation data.
trainer.evaluate(test_dataset)

In [None]:
# Push to Hugging Face

# Tokenizer and model are uploaded to the same repository, tokenizer first.
hub_repo_name = "NMA_DL_2023_Project"

tokenizer.push_to_hub(hub_repo_name)
model.push_to_hub(hub_repo_name)

## BertViz

In [None]:
!pip install -q bertviz

from bertviz import head_view, model_view

### Head View

In [8]:
# Sample post whose attention pattern is visualised in the next cells.
Example = "I tried to kill my self once and failed badly cause in the moment i wanted to do it i realized that i want to live!"

In [9]:
# Tokenise the example and move it to the model's device: the tokenizer returns
# CPU tensors, while the model sits on the GPU if it was trained in this session.
inputs = tokenizer.encode(Example, return_tensors='pt').to(model.device)

# Inference only: switch off dropout and gradient tracking. Attentions are
# requested explicitly so this cell does not depend on the model config.
model.eval()
with torch.no_grad():
    outputs = model(inputs, output_attentions=True)

# With attentions requested they are the last entry of the output: one tensor
# per layer, moved to the CPU for the visualisation.
attention = tuple(layer_attention.cpu() for layer_attention in outputs[-1])
tokens = tokenizer.convert_ids_to_tokens(inputs[0].tolist())

In [None]:
# Render the BertViz head view for the attention weights and tokens computed
# in the previous cell.
head_view(attention, tokens)

## LIME

In [4]:
!pip install -q lime

In [None]:
import numpy as np
import lime
import torch
import torch.nn.functional as F
from lime.lime_text import LimeTextExplainer

# Display names handed to LIME, positioned by class id.
# NOTE(review): this is the alphabetical order in which the two merged labels
# would be numbered; confirm it matches the loaded checkpoint's id2label.
class_names = ['Attempt', 'Ideation']

def predictor(texts, batch_size=32):
    """Return class probabilities for a list of texts (LIME classifier function).

    Args:
        texts: Sequence of strings to classify.
        batch_size: Number of texts sent through the model at once. LIME passes
            all perturbed samples in a single call, so batching bounds memory use.

    Returns:
        numpy array of shape (len(texts), n_classes) whose rows sum to 1.
    """
    texts = list(texts)
    if not texts:
        return np.empty((0, model.config.num_labels))

    batch_probas = []
    # Inference only, so no autograd graph is built.
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            encoded = tokenizer(
                texts[start:start + batch_size],
                return_tensors="pt",
                padding=True,
                truncation=True,  # over-long inputs would otherwise exceed the model's limit
            )
            # The tokenizer yields CPU tensors; the model may live on the GPU.
            encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}
            tensor_logits = model(**encoded)[0]
            # Explicit dim: normalise over the class axis of every row.
            batch_probas.append(F.softmax(tensor_logits, dim=-1).cpu().numpy())
    return np.concatenate(batch_probas, axis=0)

# Sentence to explain.
text = "I tried to kill myself."

# Explain the prediction for `text`: LIME queries `predictor` on 1000 samples
# derived from the sentence and renders the resulting explanation inline.
explainer = LimeTextExplainer(class_names=class_names)
exp = explainer.explain_instance(text, predictor, num_samples=1000)
exp.show_in_notebook(text=text)

## Transformers Interpret

In [None]:
!pip install -q transformers-interpret

In [15]:
# Longer sample post explained with Transformers Interpret in the next cell
# (rebinds the `Example` used in the BertViz section).
Example = "I just took 10 more. Okay I threw up a little bit but now I feel weird and bloated. Its not so much that I want to die but Im scared and I dont see a way out. I dont see the light at the end of the tunnel anymore. Its just black. I feel like my life is hopeless so why prolong the suffering. Is there anyone out there?"

In [16]:
from transformers_interpret import SequenceClassificationExplainer

# Wrap the model and tokenizer in an explainer and compute attributions for
# `Example`. The commented-out class_name argument would presumably pin the
# explanation to the "Attempt" class instead of the default target.
cls_explainer = SequenceClassificationExplainer(model, tokenizer)
word_attributions = cls_explainer(Example) # , class_name="Attempt")

In [None]:
# Show the class name the explainer reports as predicted (presumably for the
# text explained in the previous cell).
cls_explainer.predicted_class_name

In [None]:
# Render the explainer's visualisation of the attributions inline.
cls_explainer.visualize()