# **TRAIN MODEL**

In [None]:
# Mount Google Drive at /content/drive; the dataset paths and the adapter
# output directory used further down all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install -U git+https://github.com/Adapter-Hub/adapter-transformers.git 

In [None]:
import json
import torch
import sklearn
from transformers import RobertaTokenizer
from transformers import RobertaConfig, RobertaModelWithHeads
from pathlib import Path
import numpy as np
import nltk
from nltk.corpus import stopwords
from transformers import TrainingArguments, Trainer, EvalPrediction
from transformers import TextClassificationPipeline
# Fetch the NLTK stop-word corpus that stopwords.words("english") reads in the
# pre-processing section.
nltk.download('stopwords')

## LOAD DATA

In [None]:
# Seed NumPy and PyTorch so repeated runs start from the same random state.
np.random.seed(0)
torch.manual_seed(0)

# Single place to point the notebook at another Drive mount or dataset folder
# (e.g. swap "drive" for "gdrive" here if the dataset cannot be found).
DATA_DIR = '/content/drive/MyDrive/Machine Learning/datasets/how2'
train_path = f'{DATA_DIR}/train.jsonl'
test_path = f'{DATA_DIR}/test.jsonl'
dev_path = f'{DATA_DIR}/valid.jsonl'

# read_split substitutes this value into the paths with str.format; the paths
# above contain no "{}" placeholder, so it currently has no effect on them.
dataset_name = 'amazon' # CHANGE

def read_split(split_dir, dataset, text_key="text", label_key="label"):
    """Read one JSON-lines split into parallel lists of texts and labels.

    Args:
        split_dir: Path of the ``.jsonl`` file. It is passed through
            ``str.format(dataset)``, so a ``{}`` placeholder, if present, is
            replaced by ``dataset``.
        dataset: Name of the folder containing the dataset ('ag', 'chemprot',
            ...), substituted into ``split_dir``.
        text_key: JSON field that holds the input text.
        label_key: JSON field that holds the class label.

    Returns:
        ``(texts, labels)``: two lists of equal length, in file order.

    Raises:
        KeyError: if a record lacks ``text_key`` or ``label_key``.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    file_path = Path(split_dir.format(dataset))
    texts = []
    labels = []
    with open(file_path, encoding="utf-8") as f:
        for row in f:
            # Blank lines (e.g. an extra newline at the end of the file) hold
            # no record and would make json.loads fail.
            if not row.strip():
                continue
            data = json.loads(row)
            texts.append(data[text_key])
            labels.append(data[label_key])

    return texts, labels

# Load the three splits as parallel (texts, labels) lists.
train_texts, train_labels = read_split(train_path, dataset_name)
test_texts, test_labels = read_split(test_path, dataset_name)
val_texts, val_labels = read_split(dev_path, dataset_name)

# Sorted unique training labels, converted from NumPy scalars to plain Python
# values. Not wrapped in try/except: if this fails, `classes` would be missing
# (or stale from an earlier run) and the report below would be wrong.
classes = [x.item() for x in np.unique(train_labels)]

datapoints = {'Train': len(train_texts), 'Test': len(test_texts), 'valid': len(val_texts)}
print('NUM DATAPOINTS:\n', datapoints)
print('\nNUM CLASSES: ', len(classes))

## PRE-PROCESSING

In [None]:
#REMOVE UNWANTED CLASSES
# List the labels to drop from every split here; an empty list keeps everything.
class_list = []
class_list = set(class_list)


def _drop_classes(texts, labels, excluded):
    """Return (texts, labels) without the examples whose label is in `excluded`."""
    kept = [(text, label) for text, label in zip(texts, labels) if label not in excluded]
    return [text for text, _ in kept], [label for _, label in kept]


# The same filter is applied to each split so they stay consistent.
train_texts, train_labels = _drop_classes(train_texts, train_labels, class_list)
val_texts, val_labels = _drop_classes(val_texts, val_labels, class_list)
test_texts, test_labels = _drop_classes(test_texts, test_labels, class_list)

# Recompute the class list from the filtered training labels. Errors are left
# to propagate: swallowing them would keep a stale `classes` from before the
# filtering and silently mis-size the model head.
classes = [x.item() for x in np.unique(train_labels)]

datapoints = {'Train': len(train_texts), 'Test': len(test_texts), 'valid': len(val_texts)}
print('NUM DATAPOINTS:\n', datapoints)
print('\nNUM CLASSES: ', len(classes))

In [None]:
#REMOVE STOP WORDS and normalize case
def remove_stop_words(data, stop_words=None):
  """Lower-case each text and drop its stop words.

  Args:
    data: Iterable of strings.
    stop_words: Optional collection of lower-case words to remove. When
      omitted, NLTK's English stop-word list is used.

  Returns:
    A list with one string per input text: lower-cased, split on whitespace,
    stop words removed, remaining tokens joined by single spaces.
  """
  if stop_words is None:
    stop_words = stopwords.words("english")
  words = set(stop_words)

  cleaned = []
  for text in data:
    # Lower-case BEFORE matching: comparing the raw token against a lower-case
    # stop-word list lets capitalised stop words ("The", "And") slip through.
    tokens = text.lower().split()
    cleaned.append(' '.join(token for token in tokens if token not in words))
  return cleaned

# Clean every split with the same routine so train/test/valid share one vocabulary treatment.
train_texts, test_texts, val_texts = map(
    remove_stop_words, (train_texts, test_texts, val_texts)
)

## TRAIN MODEL

In [None]:
# Load the pretrained "roberta-base" tokenizer; the model further down is
# created from the same checkpoint name.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")


In [None]:
# All three splits are tokenised with identical settings: truncation enabled,
# padding="max_length", max_length=65.
encode_options = {"max_length": 65, "truncation": True, "padding": "max_length"}
train_encodings = tokenizer(train_texts, **encode_options)
val_encodings = tokenizer(val_texts, **encode_options)
test_encodings = tokenizer(test_texts, **encode_options)

In [None]:
class Dataset(torch.utils.data.Dataset): # might need to change depending on label type
    """Torch dataset pairing tokenizer encodings with integer class ids.

    Each label is replaced by its position in `classes`; a label that does not
    occur in `classes` raises KeyError at construction time.
    """

    def __init__(self, encodings, labels, classes):
        self.encodings = encodings
        # label value -> index of that label within `classes`
        index_of = {name: position for position, name in enumerate(classes)}
        self.labels = list(map(index_of.__getitem__, labels))
        self.classes = classes

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors, including a 'labels' entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

    def __len__(self):
        """Number of examples."""
        return len(self.labels)

    def len_classes(self):
        """Number of entries in `classes`."""
        return len(self.classes)

# Wrap each split's encodings and labels; every split shares the same `classes`
# ordering so label ids agree across train/valid/test.
train_dataset, val_dataset, test_dataset = (
    Dataset(split_encodings, split_labels, classes)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

In [None]:
# roberta-base configuration and model, sized to the number of training classes.
num_classes = train_dataset.len_classes()
config = RobertaConfig.from_pretrained("roberta-base", num_labels=num_classes)
model = RobertaModelWithHeads.from_pretrained("roberta-base", config=config)

# Add a new adapter
adapter_name = "Name"
model.add_adapter(adapter_name)

# Class index -> original label, in the order stored on the training dataset.
# MAY HAVE TO CHANGE DEPENDING ON LABELS OF THE DATASET
id2label = dict(enumerate(train_dataset.classes))

# Add a matching classification head
model.add_classification_head(adapter_name, num_labels=num_classes, id2label=id2label)

# Activate the adapter
model.train_adapter(adapter_name)

# Hyper-parameters handed to the Trainer constructed later in this cell.
training_args = TrainingArguments(
    learning_rate=1e-4,  # CHANGE as needed
    num_train_epochs=75,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    logging_steps=500,
    # Trainer output goes to a local directory; an existing one is overwritten.
    output_dir="./training_output",
    overwrite_output_dir=True,
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns=False,
)
def compute_accuracy(p: EvalPrediction):
    """Accuracy metric: share of rows whose arg-max prediction equals the label.

    Returns a dict with the single key "acc".
    """
    predicted = np.argmax(p.predictions, axis=1)
    hits = predicted == p.label_ids
    return {"acc": hits.mean()}

def compute_f1_macro(p: EvalPrediction):
    """Macro-averaged F1 of the arg-max predictions against the label ids.

    Returns a dict with the single key 'f1 macro'.
    """
    # Import the submodule explicitly: a bare `import sklearn` does not by
    # itself guarantee that `sklearn.metrics` is available as an attribute.
    from sklearn.metrics import f1_score

    preds = np.argmax(p.predictions, axis=1)
    return {'f1 macro': f1_score(p.label_ids, preds, average='macro')}

def compute_f1_micro(p: EvalPrediction):
    """Micro-averaged F1 of the arg-max predictions against the label ids.

    Returns a dict with the single key 'f1 micro'.
    """
    # Import the submodule explicitly: a bare `import sklearn` does not by
    # itself guarantee that `sklearn.metrics` is available as an attribute.
    from sklearn.metrics import f1_score

    preds = np.argmax(p.predictions, axis=1)
    return {'f1 micro': f1_score(p.label_ids, preds, average='micro')}

# Tie the model, hyper-parameters and datasets together; compute_f1_macro
# supplies the metric.
# NOTE(review): evaluation uses test_dataset, and val_dataset is not passed to
# the Trainer at all. Confirm that skipping the validation split is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_f1_macro,
)

In [None]:
# Train the adapter, then evaluate on the Trainer's eval_dataset.
trainer.train()
# Named `eval_metrics` so the built-in eval() is not shadowed for the rest of
# the kernel session.
eval_metrics = trainer.evaluate()
print(eval_metrics)

# Save the adapter and its evaluation metrics side by side in one directory.
output_dir = '/content/drive/MyDrive/Machine Learning/TextClassification/oct/'
model.save_adapter(output_dir, adapter_name)
with open(output_dir + 'EVALS.json', 'w', encoding='utf-8') as f:
  json.dump(eval_metrics, f, indent=4)

# **RUN SAVED MODEL**

In [None]:
!pip install -U git+https://github.com/Adapter-Hub/adapter-transformers.git

In [None]:
import json
import torch
import sklearn
from transformers import RobertaTokenizer
from transformers import RobertaConfig, RobertaModelWithHeads
from pathlib import Path
import numpy as np
from transformers import TrainingArguments, Trainer, EvalPrediction
from transformers import TextClassificationPipeline
from transformers import AdapterLoader

In [None]:
# Mount Google Drive again (with force_remount=True) so the saved adapter
# directory under /content/drive is reachable.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
# Rebuild the roberta-base tokenizer and model, then attach the adapter saved
# by the training section.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
# NOTE(review): `classes` is only assigned in the training section of this
# notebook. On a fresh kernel that starts at "RUN SAVED MODEL" the next
# statement raises NameError. Persist the label list next to the adapter, or
# re-run the data-loading cells first.
config = RobertaConfig.from_pretrained(
    "roberta-base",
    num_labels=len(classes),
)
model = RobertaModelWithHeads.from_pretrained(
    "roberta-base",
    config=config,
)

# Same directory that model.save_adapter(...) writes to in the training section.
weightsdir = '/content/drive/MyDrive/Machine Learning/TextClassification/oct/'

# Whatever load_adapter returns is handed straight to set_active_adapters
# (presumably the loaded adapter's name; confirm against the library docs).
x = model.load_adapter(weightsdir)
model.set_active_adapters(x)

# Inference pipeline built from the model and tokenizer above.
classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer)

In [None]:
# Smoke test: classify one string and show the first result's label and score.
c = classifier('Example')
first_result = c[0]
print('Label: ', first_result['label'], 'score: ', first_result['score'])