<a href="https://colab.research.google.com/github/sefeoglu/neurogenesis-cre/blob/master/FS_CRE_neurogenesis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip3 install --upgrade pip
!pip3 install --upgrade transformers
!pip3 install --upgrade accelerate
!pip3 install sentencepiece
!pip install pytesseract transformers datasets rouge-score nltk tensorboard py7zr --upgrade
!pip install ipywidgets
!pip install peft
!pip install bitsandbytes
!pip install evaluate
!pip install trl


In [None]:
# !pip uninstall transformers
!pip install transformers==4.45.2

In [None]:
# !unzip relations.zip
!unzip 5way5shot_data.zip

In [None]:
!unzip relations.zip

In [None]:
"""Continuous instruction fine-tuning of Flan T5 model for Relation Extraction"""
import sys
import os
import json

from sklearn.metrics import accuracy_score
from datasets import load_dataset
from trl import SFTTrainer
from random import randrange
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments
from peft import  get_peft_model, LoraConfig, TaskType
from transformers import BitsAndBytesConfig
from huggingface_hub import HfFolder
import evaluate
import nltk, torch
import numpy as np
from nltk.tokenize import sent_tokenize
nltk.download("punkt")

# Metric
metric = evaluate.load("rouge")
from sklearn.metrics import accuracy_score

from sklearn.metrics import precision_recall_fscore_support
from transformers import DataCollatorForSeq2Seq

from datasets import concatenate_datasets


In [17]:
class CustomTrainer(Seq2SeqTrainer):
    """Seq2SeqTrainer that computes its loss as an explicit token-level cross-entropy.

    The loss is recomputed from the model logits instead of using the loss
    returned by the model's forward pass.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the cross-entropy between the model logits and ``inputs["labels"]``.

        Args:
            model: the model being trained; called as ``model(**inputs)``.
            inputs: batch dict that must contain ``"labels"``.
            return_outputs: when True, return ``(loss, outputs)`` as the Trainer
                API expects; otherwise return only the loss.
            num_items_in_batch: accepted for compatibility with Trainer versions
                that pass it to ``compute_loss``; it is not used here.

        Returns:
            The scalar loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # CrossEntropyLoss ignores targets equal to -100 by default, which is the
        # value used for padded label positions.
        loss_fct = torch.nn.CrossEntropyLoss()
        # Flatten (batch, seq, vocab) -> (batch*seq, vocab); the vocabulary size is
        # taken from the logits themselves so it always matches the tensor.
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [18]:
def read_json(path):
    """Parse the UTF-8 encoded JSON file at ``path`` and return its content."""
    with open(path, mode='r', encoding="utf-8") as handle:
        return json.load(handle)


def set_dataset(dataset_id):
    """Load ``dataset_id`` with ``load_dataset`` and return it.

    The number of examples in the ``validation`` split is printed as a quick
    sanity check, so the loaded dataset must expose that split.
    """
    loaded = load_dataset(dataset_id)
    validation_size = len(loaded['validation'])
    print(validation_size)
    return loaded


def set_tokenizer(model_id="google/flan-t5-base"):
    """Return the pretrained tokenizer registered under ``model_id``."""
    return AutoTokenizer.from_pretrained(model_id)


# Module-level tokenizer: preprocess_function and compute_metrics read this global.
tokenizer = set_tokenizer()
def preprocess_function(sample,padding="max_length"):
    """Tokenize a batch of prompts and target relations for seq2seq training.

    Uses the module-level ``tokenizer``, ``max_source_length`` and
    ``max_target_length``.

    Args:
        sample: batch mapping with a ``"prompt"`` and a ``"relation"`` column.
        padding: padding strategy forwarded to the tokenizer.

    Returns:
        The tokenized prompts with an added ``"labels"`` entry holding the
        target token ids.
    """
    prompts = list(sample["prompt"])

    # Source side.
    model_inputs = tokenizer(prompts, max_length=max_source_length, padding=padding, truncation=True)

    # Target side, tokenized through the `text_target` keyword argument.
    targets = tokenizer(text_target=sample["relation"], max_length=max_target_length, padding=padding, truncation=True)
    label_ids = targets["input_ids"]

    # When padding to a fixed length, mask the pad positions with -100 so they
    # are ignored by the loss.
    if padding == "max_length":
        label_ids = [
            [-100 if token_id == tokenizer.pad_token_id else token_id for token_id in row]
            for row in label_ids
        ]

    model_inputs["labels"] = label_ids
    return model_inputs

# huggingface hub model id
def load_model(model_id="google/flan-t5-base", local=False):
    """Load a seq2seq model quantized to 4-bit NF4 with float16 compute.

    Args:
        model_id: hub id or local directory passed to ``from_pretrained``.
        local: when truthy, load with ``local_files_only=True``.

    Returns:
        The loaded ``AutoModelForSeq2SeqLM`` instance (``device_map="auto"``).
    """
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=False,
    )

    load_kwargs = {"device_map": "auto", "quantization_config": quantization}
    if local:
        load_kwargs["local_files_only"] = True

    return AutoModelForSeq2SeqLM.from_pretrained(model_id, **load_kwargs)


def preprocess_logits_for_metrics(logits, labels):
    """Reduce logits to predicted ids by taking the argmax over the last axis.

    If ``logits`` is a tuple, only its first element is used. ``labels`` is
    accepted but not used.
    """
    scores = logits[0] if isinstance(logits, tuple) else logits
    return scores.argmax(dim=-1)
# helper function to postprocess text
def postprocess_text(labels, preds):
    """Reduce decoded texts to the answer that follows the last ``Answer:`` marker.

    Newlines are removed first and the result is stripped of surrounding
    whitespace.

    Note the return order is ``(preds, labels)``, the reverse of the
    parameter order.
    """
    def _answer_of(text):
        return text.replace('\n','').split('Answer:')[-1].strip()

    cleaned_preds = [_answer_of(candidate) for candidate in preds]
    cleaned_labels = [_answer_of(reference) for reference in labels]
    return cleaned_preds, cleaned_labels



def _rouge_lines(text):
    """Put each sentence of ``text`` on its own line (newline-separated sentences)."""
    text = text.strip()
    try:
        return "\n".join(sent_tokenize(text))
    except LookupError:
        # The NLTK sentence-tokenizer data is not installed; score the text as a
        # single line instead of failing the whole evaluation.
        return text


def compute_metrics(eval_preds):
    """Compute ROUGE plus exact-match classification metrics for an evaluation pass.

    Reads the module-level ``tokenizer``, ``metric`` and ``tasks_path``.

    Args:
        eval_preds: ``(predictions, label_ids)``; predictions may be a tuple, in
            which case its first element is used.

    Returns:
        Dict of metric name -> value scaled to 0-100 and rounded to 4 decimals:
        the ROUGE scores plus ``accuracy``, ``precision``, ``recall`` and ``f1``
        (micro-averaged over the relations listed in ``tasks_path``).
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    targets_labels = read_json(tasks_path)

    # -100 marks ignored positions and cannot be decoded; map it to the pad id.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # postprocess_text takes (labels, preds) but returns (preds, labels).
    answers_pred, answers_true = postprocess_text(decoded_labels, decoded_preds)
    print(answers_pred)
    print(answers_true)

    precision, recall, f1, _ = precision_recall_fscore_support(
        answers_true, answers_pred, labels=targets_labels, average='micro'
    )
    accuracy = accuracy_score(answers_true, answers_pred)

    # ROUGE is computed on the full decoded texts, one sentence per line.
    result = metric.compute(
        predictions=[_rouge_lines(pred) for pred in decoded_preds],
        references=[_rouge_lines(label) for label in decoded_labels],
        use_stemmer=True,
    )
    result = {key: value * 100 for key, value in result.items()}
    result.update(
        {
            "accuracy": accuracy * 100,
            "precision": precision * 100,
            "recall": recall * 100,
            "f1": f1 * 100,
        }
    )

    return {k: round(float(v), 4) for k, v in result.items()}

def main(model_id, data_path, tasks_path, task_id, local):
    """Fine-tune a LoRA-wrapped, quantized seq2seq model on one task.

    Reads the module-level ``lora_config`` and updates the module-level
    ``max_source_length`` / ``max_target_length`` that ``preprocess_function``
    uses.

    Args:
        model_id: hub id or local directory of the model to start from.
        data_path: dataset location passed to ``set_dataset``; it must provide
            ``train`` and ``validation`` splits with ``prompt`` and ``relation``
            columns.
        tasks_path: JSON file with the relation labels of the task (printed only).
        task_id: task name, used to build the output directory.
        local: forwarded to ``load_model`` (load from local files only).

    Returns:
        ``(merged_model, tokenizer, trainer)`` where ``merged_model`` is the
        result of ``merge_and_unload()`` on the trained PEFT model.
    """
    # preprocess_function reads these two names from module scope, so they have
    # to be assigned as globals; as plain locals the lengths computed below would
    # never reach it.
    global max_source_length, max_target_length

    targets_labels = read_json(tasks_path)
    print(targets_labels)

    dataset = set_dataset(data_path)
    model = load_model(model_id, local)
    model = get_peft_model(model, lora_config)

    tokenizer = set_tokenizer(model_id="google/flan-t5-base")

    # Lengths are measured over train + validation together.
    combined = concatenate_datasets([dataset["train"], dataset["validation"]])

    # The maximum total input sequence length after tokenization.
    # Sequences longer than this will be truncated, sequences shorter will be padded.
    tokenized_inputs = combined.map(lambda x: tokenizer(x["prompt"], truncation=True), batched=True, remove_columns=["prompt", "relation"])
    max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
    print(f"Max source length: {max_source_length}")

    # The maximum total sequence length for target text after tokenization.
    # Sequences longer than this will be truncated, sequences shorter will be padded.
    tokenized_targets = combined.map(lambda x: tokenizer(x["relation"], truncation=True), batched=True, remove_columns=["prompt", "relation"])
    max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])
    print(f"Max target length: {max_target_length}")

    tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["prompt", "relation"])
    print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

    # we want to ignore tokenizer pad token in the loss
    label_pad_token_id = -100
    # Data collator
    data_collator = DataCollatorForSeq2Seq(
        tokenizer,
        model=model,
        label_pad_token_id=label_pad_token_id,
        pad_to_multiple_of=8
    )

    # Output directory (also used as the hub repository id)
    repository_id = f"{model_id}-{task_id}"

    training_args = Seq2SeqTrainingArguments(
        output_dir=repository_id,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        predict_with_generate=True,
        fp16=False, # Overflows with fp16
        learning_rate=1e-3,
        num_train_epochs=1,
        do_eval=True,
        # logging & evaluation strategies
        logging_dir=f"{repository_id}/logs",
        logging_strategy="steps",
        logging_steps=500,
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        # push to hub parameters
        report_to="tensorboard",
        push_to_hub=False,
        hub_strategy="every_save",
        hub_model_id=repository_id,
        hub_token=HfFolder.get_token(),
        lr_scheduler_type = "cosine_with_restarts",
        lr_scheduler_kwargs = {"num_cycles": 1},
        remove_unused_columns=False
    )

    # Create Trainer instance
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        compute_metrics=compute_metrics,
    )

    # Train, save the adapter, then merge the LoRA weights into the base model.
    trainer.train()
    trainer.save_model()
    trainer.model.save_pretrained(repository_id)
    merged_model = model.merge_and_unload()

    return merged_model, tokenizer, trainer
def read_json(path):
    """Return the decoded content of the UTF-8 JSON file located at ``path``."""
    with open(path, 'r', encoding="utf-8") as json_file:
        content = json.load(json_file)
        return content

def write_json(data, path):
    """Dump ``data`` to ``path`` as UTF-8 JSON, indented by 4, non-ASCII kept as-is."""
    dump_options = {"indent": 4, "ensure_ascii": False}
    with open(path, mode='w', encoding="utf-8") as out_file:
        json.dump(data, out_file, **dump_options)

def get_prediction(model,tokenizer, prompt, length=250,stype='greedy'):
    """Generate a completion for ``prompt`` and return the decoded text(s).

    Args:
        model: model exposing ``device`` and ``generate``.
        tokenizer: tokenizer used to encode the prompt and decode the output.
        prompt: input text (or list of texts).
        length: maximum number of new tokens to generate.
        stype: accepted for backward compatibility; not used.

    Returns:
        List of decoded strings, one per generated sequence.
    """
    # Truncation is requested explicitly so that max_length is actually applied.
    encoded = tokenizer(prompt, add_special_tokens=True, max_length=4096, truncation=True, return_tensors="pt")
    # Follow the model's own device instead of assuming a GPU is present.
    inputs = encoded.input_ids.to(model.device)

    outputs = model.generate(inputs, max_new_tokens=length)

    return tokenizer.batch_decode(outputs, skip_special_tokens=True)






In [19]:
def read_json(path):
    """Read a JSON file from the given path.

    The file is decoded as UTF-8, matching what ``write_json`` produces.

    Args:
        path: location of the JSON file.

    Returns:
        The decoded JSON content.
    """
    with open(path, 'r', encoding="utf-8") as f:
        data = json.load(f)
    return data

def write_json(data, path):
    """Write ``data`` as a JSON file to the given path.

    Missing parent directories are created. The file is written as UTF-8 with
    an indent of 4 and non-ASCII characters left unescaped.

    Args:
        data: JSON-serializable object.
        path: destination file; may be a bare file name.
    """
    parent = os.path.dirname(path)
    # A bare file name has no directory part, and makedirs('') would raise.
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(path, 'w', encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)

In [20]:
##kmeans
from sklearn.cluster import KMeans
import numpy as np

# Send the data embeddings grouped by relation type.
def sample_selection_kmeans(data, memory_size):
    """Pick representative samples by K-Means clustering of their embeddings.

    One sample is selected per cluster: the one closest to the cluster centre.

    Args:
        data: list of dicts with ``embedding``, ``prompt`` and ``relation``
            keys. Each embedding is indexed as ``emb[:, :k]``, so it must have
            at least two axes.
        memory_size: number of samples to keep; capped at ``len(data)``.

    Returns:
        ``(mem_set, proto_set, relations, selected_samples)``: the selected
        flattened embeddings as an array, their prompts, their relations, and a
        list of ``{"prompt", "relation"}`` dicts. All four are empty when
        ``data`` is empty or ``memory_size`` is not positive.
    """
    # Nothing to cluster: K-Means cannot be fitted with zero samples or clusters.
    if not data or memory_size <= 0:
        return np.array([]), [], [], []

    embeddings_array = [item['embedding'] for item in data]
    prompt_array = [item['prompt'] for item in data]
    relation_array = [item['relation'] for item in data]

    # Embeddings can differ in length along axis 1; cut them all to the shortest
    # one so the flattened vectors have equal size.
    min_dim = min(emb.shape[1] for emb in embeddings_array)

    reshaped_embeddings = [emb[:, :min_dim].reshape(-1) for emb in embeddings_array]
    embeddings_array = np.vstack(reshaped_embeddings)

    num_clusters = min(memory_size, len(embeddings_array))

    # fit_transform returns, for every sample, its distance to each cluster centre.
    distances = KMeans(n_clusters=num_clusters, random_state=0).fit_transform(embeddings_array)

    mem_set = []
    proto_set = []
    relations = []
    selected_samples = []

    for i in range(num_clusters):
        # Sample closest to the centre of cluster i.
        sel_index = np.argmin(distances[:,i])
        mem_set.append(embeddings_array[sel_index])
        proto = prompt_array[sel_index]
        proto_set.append(proto)
        relations.append(relation_array[sel_index])
        print(relation_array[sel_index])
        selected_samples.append({"prompt":proto,"relation":relation_array[sel_index]})

    mem_set = np.array(mem_set)

    return mem_set, proto_set, relations, selected_samples

In [10]:
def compute_embedding(model, tokenizer, input_path):
  """Encode every prompt of a JSON data file with the model's encoder.

  Args:
      model: model exposing ``device`` and an ``encoder`` submodule.
      tokenizer: tokenizer used to encode the prompts.
      input_path: JSON file holding a list of items with ``prompt`` and
          ``relation`` keys.

  Returns:
      List of dicts with ``prompt``, ``relation`` and ``embedding`` (the
      encoder's ``last_hidden_state`` as a NumPy array).
  """
  data = read_json(input_path)
  # Follow the model's own device instead of assuming a GPU is present.
  device = model.device
  embeddings = []
  # Inference only: skip autograd bookkeeping for the encoder forward passes.
  with torch.no_grad():
    for item in data:
      prompt = item['prompt']
      relation = item['relation']
      inputs = tokenizer(prompt, add_special_tokens=True, max_length=4096, truncation=True, return_tensors="pt").input_ids.to(device)
      encoded = model.encoder(inputs)
      # Move the hidden states to the CPU before converting to NumPy.
      hidden = encoded['last_hidden_state'].detach().cpu().numpy()
      embeddings.append({"prompt": prompt, "relation": relation, "embedding": hidden})
  return embeddings


In [11]:
def select_samples(model, tokenizer, memory_size, train_data_path, tasks_path):
  """Select memory samples for the most recent relations of a task.

  The relations are read from ``tasks_path``. The last five are used, or all
  six when the file lists exactly six. They are processed from last to first,
  and for each one up to ``memory_size`` training samples are chosen with
  ``sample_selection_kmeans``.

  Args:
      model: model used to embed the training prompts.
      tokenizer: tokenizer matching ``model``.
      memory_size: number of samples to keep per relation.
      train_data_path: JSON file with the training items.
      tasks_path: JSON file with the list of relation names.

  Returns:
      List of ``{"prompt", "relation"}`` dicts, grouped by relation.
  """
  relations = read_json(tasks_path)
  embeddings = compute_embedding(model, tokenizer, train_data_path)

  # Five relations per task, except that a six-relation list is used in full.
  window = 6 if len(relations) == 6 else 5
  # Slicing tolerates lists shorter than the window; reversed to go last-to-first.
  recent_relations = relations[-window:][::-1]

  all_selected_samples = []
  for relation in recent_relations:
    relation_embeddings = [emb for emb in embeddings if emb['relation'] == relation]
    if not relation_embeddings:
      # No training example for this relation, so there is nothing to cluster.
      continue
    _, _, _, selected_samples = sample_selection_kmeans(relation_embeddings, memory_size)
    all_selected_samples.extend(selected_samples)

  return all_selected_samples


In [12]:
def evaluate_model(experiment_id, task_id, model, tokenizer, current_task=True):
    """Run the model on test data and store its predictions and accuracy.

    Args:
        experiment_id: run number used in the input and output paths.
        task_id: with ``current_task=True`` the task directory name (e.g.
            ``"task1"``); otherwise an int, and the test sets of tasks
            ``1..task_id`` are evaluated together.
        model: model passed to ``get_prediction``.
        tokenizer: tokenizer passed to ``get_prediction``.
        current_task: evaluate only the given task (True) or all tasks seen so
            far (False).

    Returns:
        The exact-match accuracy over the evaluated items. Predictions and the
        accuracy are also written as JSON under
        ``KMmeans_CRE_tacred_<experiment_id>/``.
    """
    # Same directory the models and logs of this experiment are saved to.
    out_dir = "KMmeans_CRE_tacred_{0}".format(experiment_id)
    if current_task:
        input_path = "tacred/5way5shot-test/run{0}/{1}/test.json".format(experiment_id, task_id)
        data = read_json(input_path)
        out_pred_path = "{0}/task_{1}_current_task_pred.json".format(out_dir, task_id)
        out_acc_path = "{0}/task_{1}_current_task_result.json".format(out_dir, task_id)
    else:
        data = []
        for i in range(1, task_id+1):
            input_path = "tacred/5way5shot-test/run{0}/task{1}/test.json".format(experiment_id, i)
            data.extend(read_json(input_path))
        out_pred_path = "{0}/task_{1}_seen_task.json".format(out_dir, task_id)
        out_acc_path = "{0}/task_{1}_seen_task_result.json".format(out_dir, task_id)

    responses = []
    relations = []
    for j, item in enumerate(data):
        relations.append(item['relation'])

        response = get_prediction(model, tokenizer, item['prompt'])
        print('test:', j)
        if len(response) == 0:
            # Record an empty prediction so every item keeps the same record
            # shape and simply counts as a miss.
            print("No response")
            prediction = ""
        else:
            prediction = response[0]
            print(prediction)
        responses.append({"predict": prediction})

    preds = [line['predict'] for line in responses]
    acc = accuracy_score(relations, preds)

    write_json(responses, out_pred_path)
    write_json([{"acc":acc}], out_acc_path)
    return acc

In [13]:
from datetime import datetime


In [21]:
# Accumulates one line of timing information per training run; written out with write_json.
logs = ""

In [None]:

# Driver: for each experiment, train on task 1, then continue on tasks 2..10,
# each followed by a second training pass on the stored memory samples.
# m is the memory size passed to select_samples (samples kept per relation);
# sample selection and memory training only run when m > 0.
m=1
for experiment_id in range(1,2):

  print("Experiment: {0}".format(experiment_id))

  # NOTE(review): the path templates below are not consistent across this cell —
  # 'run{0}' here vs 'run_{0}' for the later tasks, and 'relations/run_{0}' here vs
  # 'relations/run{0}' in the loop. Confirm against the extracted archives.
  dataset_path = 'tacred/5way5shot/run{0}/task1/'.format(experiment_id)
  tasks_path = "relations/run_{0}/task1.json".format(experiment_id)
  model_id="google/flan-t5-base"
  task_id = "task1"
  # The names below are module-level state: tasks_path, tokenizer and metric are
  # read by compute_metrics, the max_* lengths by preprocess_function, and
  # lora_config by main.
  targets_labels = []
  max_source_length = None
  max_target_length = None
  tokenizer = set_tokenizer()
  print(dataset_path)
  lora_config = LoraConfig(
                  # the task to train for (sequence-to-sequence language modeling in this case)
                  task_type=TaskType.SEQ_2_SEQ_LM,
                  # the dimension of the low-rank matrices
                  r=4,
                  # the scaling factor for the low-rank matrices
                  lora_alpha=32,
                  # the dropout probability of the LoRA layers
                  lora_dropout=0.01,
                  target_modules=["k","q","v","o"]
                  )

  metric = evaluate.load("rouge")
  start_time = datetime.now()
  model, tokenizer, trainer = main(model_id, dataset_path, tasks_path, task_id, False)
  end_time = datetime.now()
  train_time = 'Base Train. Experiment Id: {0}. Task Id: {1}. Duration: {2} \n'.format(experiment_id, task_id, end_time - start_time)
  logs += train_time

  # The merged task-1 model becomes the starting point for task 2.
  model.save_pretrained("KMmeans_CRE_tacred_{0}/FSCRE_5_1/".format(experiment_id), from_pt=True)
  ## evaluate model
  evaluate_model(experiment_id, task_id, model, tokenizer, current_task=True)
  if m >0:
      train_data_path = dataset_path+"train_1.json"
      all_selected_samples = select_samples(model, tokenizer, m, train_data_path, tasks_path)

      # Copy the task-1 memory samples into the memory folder of every later task (2..10).
      for k in range(2, 11):
        outpath_selected_samples = 'tacred/5way5shot/memory/run_{0}/task{1}/train_2.json'.format(experiment_id,k)
        write_json(all_selected_samples, outpath_selected_samples)

  # Tasks 2..10: i is the index of the previous task, i+1 the task being trained.
  for i in range(1, 10):
    targets_labels = []
    max_source_length = None
    max_target_length = None
    tokenizer = set_tokenizer()
    metric = evaluate.load("rouge")

    dataset_path = 'tacred/5way5shot/run_{0}/task{1}/'.format(experiment_id,i+1)
    tasks_path = "relations/run{0}/task{1}.json".format(experiment_id,i+1)

    # Continue from the model saved after the previous task.
    base_model_id="KMmeans_CRE_tacred_{0}/FSCRE_5_{1}/".format(experiment_id, i)
    task_id = "task{0}".format(i+1)

    print(base_model_id)
    start_time = datetime.now()
    model, tokenizer, trainer = main(base_model_id, dataset_path, tasks_path, task_id, True)
    end_time = datetime.now()
    train_time = 'Base Train. Experiment Id: {0}. Task Id: {1}. Duration: {2} \n'.format(experiment_id, task_id, end_time - start_time)
    logs += train_time
    model.save_pretrained("KMmeans_CRE_tacred_{0}/FSCRE_5_{1}/".format(experiment_id, i+1), from_pt=True)
    # logs is a plain string, so this file holds a single JSON string despite the .txt name.
    write_json(logs, "KMmeans_CRE_tacred_{0}/logs.txt".format(experiment_id))
    if m > 0:
        # No samples are selected after the last task (i == 9): no later task would use them.
        if i <9:
          train_data_path = dataset_path+"train_1.json"
          all_selected_samples = select_samples(model, tokenizer,m, train_data_path, tasks_path)
          # Store this task's memory samples in the memory folder of every later task.
          for j in range(i+2, 11):
            outpath_selected_samples = 'tacred/5way5shot/memory/run_{0}/task{1}/train_{2}.json'.format(experiment_id, j, i+2)
            write_json(all_selected_samples, outpath_selected_samples)

        ########################### Memory Train ######################################
        targets_labels = []
        max_source_length = None
        max_target_length = None
        tokenizer = set_tokenizer()
        metric = evaluate.load("rouge")

        dataset_path = 'tacred/5way5shot/memory/run_{0}/task{1}/'.format(experiment_id,i+1)
        tasks_path = "relations/run_{0}/task{1}.json".format(experiment_id,i+1)

        # Start from the model just saved for this task; the result overwrites it below.
        base_model_id = "KMmeans_CRE_tacred_{0}/FSCRE_5_{1}/".format(experiment_id, i+1)
        task_id = "task{0}".format(i+1)

        print(base_model_id)
        start_time = datetime.now()
        model, tokenizer, trainer = main(base_model_id, dataset_path, tasks_path, task_id, True)
        end_time = datetime.now()

        train_time = 'Memory Train. Experiment Id: {0}. Task Id: {1}. Duration: {2} \n'.format(experiment_id, task_id, end_time - start_time)
        logs += train_time
        model.save_pretrained("KMmeans_CRE_tacred_{0}/FSCRE_5_{1}/".format(experiment_id, i+1), from_pt=True)
        ### evaluate model
        # Evaluated on the test sets of all tasks seen so far (1..i+1).
        evaluate_model(experiment_id, i+1, model, tokenizer, current_task=False)
        write_json(logs, "KMmeans_CRE_tacred_{0}/logs.txt".format(experiment_id))


In [None]:
# Colab only: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# NOTE(review): the recorded run of this cell failed with FileNotFoundError (no data
# file found under this path) — confirm the directory layout before re-running.
set_dataset("tacred/5way5shot/train/run1/task1/")

FileNotFoundError: Couldn't find any data file at /content/tacred/5way5shot/train/run1/task1.

In [None]:
# Persist the accumulated timing log; `experiment_id` is the value left over from the training loop cell.
write_json(logs, "KMmeans_CRE_tacred_{0}/logs.txt".format(experiment_id))