### General

In [1]:
import os
import sys

from google.colab import drive

# Mount Google Drive so the project folder is reachable from the Colab VM.
drive.mount("/content/drive")

# Root directory of this project inside your Google Drive; edit to match
# your own folder layout.
path = "/content/drive/My Drive/Project"

# Run from the project root (relative data paths below depend on it) and
# make the project's own modules importable.
os.chdir(path)
sys.path.append(path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import torch
import numpy as np
import pandas as pd
import re
# import json
# import random
# import copy
import h5py
# import random
# import math
# from tqdm import tqdm
# tqdm.pandas()

In [3]:
!pip install SentencePiece



In [4]:
!pip install transformers
!pip install datasets



### Get the pretraining dataset

In [None]:
from datasets import load_dataset

# Dataset identifier passed to `load_dataset`.
dataset_name = 'ywchoi/pubmed_abstract_0'
abstracts = load_dataset(dataset_name)

# Pull the 'text' column out of each of the three splits.
abstracts_train, abstracts_validation, abstracts_test = (
    abstracts[split]['text'] for split in ('train', 'validation', 'test')
)

In [None]:
'''
Split each whole abstract into shorter items of a few sentences each.
'''

def split_text_into_items(text, num_sentences_per_item=3):
    """Split a text into items of consecutive sentences.

    Sentences are detected as spans ending in '.', '!' or '?' followed by
    whitespace. Consecutive sentences are grouped and re-joined with a single
    space; the last item may hold fewer sentences than requested.

    Args:
        text: The text to split. Empty or ``None`` yields an empty list.
        num_sentences_per_item: Maximum number of sentences per item; must be
            a positive integer.

    Returns:
        A list of strings, each containing up to ``num_sentences_per_item``
        sentences. Empty or whitespace-only fragments are never returned.

    Raises:
        ValueError: If ``num_sentences_per_item`` is less than 1.
    """
    if num_sentences_per_item < 1:
        raise ValueError("num_sentences_per_item must be a positive integer")
    if not text:
        return []

    # Split after sentence-ending punctuation, consuming the whole run of
    # whitespace so that fragments carry no leading blanks. Stripping first
    # (and dropping empty fragments) keeps trailing whitespace from producing
    # an empty "sentence".
    sentences = [s for s in re.split(r'(?<=[.!?])\s+', text.strip()) if s]

    return [
        ' '.join(sentences[start:start + num_sentences_per_item])
        for start in range(0, len(sentences), num_sentences_per_item)
    ]


def split_dataset(abstracts):
    """Split every abstract into three-sentence items and flatten the result.

    Args:
        abstracts: Iterable of abstract strings.

    Returns:
        A single flat list with the items of all abstracts, in input order.
    """
    return [
        item
        for abstract in abstracts
        for item in split_text_into_items(abstract, num_sentences_per_item=3)
    ]

# Turn each split's abstracts into a flat list of short multi-sentence items.
train_dataset, val_dataset, test_dataset = map(
    split_dataset,
    (abstracts_train, abstracts_validation, abstracts_test),
)

# Report how many items each split produced.
print(f"pretrain train dataset number {len(train_dataset)}")
print(f"pretrain val dataset number {len(val_dataset)}")
print(f"pretrain test dataset number {len(test_dataset)}")

pretrain train dataset number 5660404
pretrain val dataset number 115310
pretrain test dataset number 116521


### Pretrain the autoencoder

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Checkpoint used for both the tokenizer and the sequence-to-sequence model.
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
#data loader
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one text item per index, on access.

    Args:
        text: Sized, indexable collection of text items.
        tokenizer: Callable invoked as
            ``tokenizer(item, max_length=..., padding="max_length",
            truncation=True)``; must return a mapping whose values
            ``torch.tensor`` can convert.
        max_length: Length every item is padded or truncated to.
    """

    def __init__(self, text, tokenizer, max_length):
        self.text = text
        self.tokenizer = tokenizer
        self.max_length = max_length

    def _text_to_encoding(self, item):
        """Tokenize ``item``, padding/truncating it to ``self.max_length``."""
        return self.tokenizer(item,
                              max_length=self.max_length,
                              padding="max_length",
                              truncation=True)

    def _text_to_item(self, text):
        """Return the encoding of ``text``, or ``None`` when ``text`` is None.

        Tokenizer errors are deliberately not caught: hiding them would only
        postpone the failure to ``__getitem__`` with a less useful message.
        """
        if text is None:
            return None
        return self._text_to_encoding(text)

    def __len__(self):
        """Number of text items in the dataset."""
        return len(self.text)

    def __getitem__(self, _id):
        """Return the tokenized item at ``_id`` as a dict of tensors.

        Raises:
            ValueError: If the text stored at ``_id`` is ``None``.
        """
        text_encodings = self._text_to_item(self.text[_id])
        if text_encodings is None:
            raise ValueError(f"text at index {_id} is None and cannot be tokenized")

        return {key: torch.tensor(value) for key, value in text_encodings.items()}

batch_size = 16
# Every item is padded/truncated to this many tokens.
# NOTE(review): items are only ~3 sentences long, so padding each one to 1024
# tokens is likely far more than needed — confirm this value is intended.
max_length = 1024

# Use the single `max_length` setting for all splits so they cannot drift apart.
train_Dataset = TextDataset(train_dataset, tokenizer, max_length)
val_Dataset = TextDataset(val_dataset, tokenizer, max_length)
test_Dataset = TextDataset(test_dataset, tokenizer, max_length)


# Only the training data is shuffled; evaluation order is kept fixed so that
# validation/test runs are repeatable.
train_loader = DataLoader(train_Dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_Dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_Dataset, batch_size=batch_size)


In [None]:
# train
# transformers' own AdamW is deprecated in favour of the PyTorch optimizer.
from torch.optim import AdamW

# weight_decay=0.0 matches the default of the transformers implementation
# (torch.optim.AdamW defaults to 0.01).
optim = AdamW(model.parameters(), lr=5e-5, weight_decay=0.0)

device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
model = model.to(device)

num_epochs = 1


def _reconstruction_loss(batch):
    """Return the autoencoding (reconstruct-the-input) loss for one batch.

    The input ids double as targets. They are passed as `labels` so that the
    model derives the decoder inputs by shifting the targets right; feeding
    the unshifted ids as `decoder_input_ids` would show the decoder each
    token at the very position where it has to predict it.
    """
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch["attention_mask"].to(device)

    # Positions that are padding (attention_mask == 0) get label -100, the
    # value the model's loss ignores, so padding does not dominate the loss.
    labels = input_ids.masked_fill(attention_mask == 0, -100)

    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    return outputs.loss


for epoch in range(num_epochs):
    model.train()
    train_loss_sum, train_batches = 0.0, 0
    for batch in train_loader:
        optim.zero_grad()

        # forward + loss
        train_loss = _reconstruction_loss(batch)
        train_loss.backward()
        optim.step()

        train_loss_sum += train_loss.item()
        train_batches += 1

    model.eval()
    val_loss_sum, val_batches = 0.0, 0
    with torch.no_grad():
        for batch in val_loader:
            val_loss = _reconstruction_loss(batch)
            val_loss_sum += val_loss.item()
            val_batches += 1

    # Report the mean over all batches of the epoch rather than whatever the
    # last batch happened to produce; max(..., 1) guards against empty loaders.
    avg_train_loss = train_loss_sum / max(train_batches, 1)
    avg_val_loss = val_loss_sum / max(val_batches, 1)

    if epoch % 10 == 0:
        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss}, Val Loss: {avg_val_loss}")

### Encode all memories

In [5]:
# Load the encoder checkpoint and its tokenizer directly from the Hub.
# Note: this rebinds `tokenizer` and `model`, replacing the T5 objects
# assigned to the same names earlier in the notebook.

from transformers import AutoModel, AutoTokenizer

model_name = "malteos/PubMedNCL"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [6]:
# Prefer the first CUDA GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

In [10]:
def encode_embeding(tokenizer, model, data, device, batch_size=32):
    """Encode texts into one embedding vector each.

    Args:
        tokenizer: Callable invoked as ``tokenizer(texts, padding=True,
            truncation=True, return_tensors="pt", max_length=512)``; its
            result must support ``.to(device)`` and ``**`` unpacking.
        model: Callable taking the tokenizer output as keyword arguments and
            returning an object with a ``last_hidden_state`` tensor.
        data: Either a list of texts, or an object supporting
            ``.iloc[:, 1].tolist()`` (e.g. a DataFrame) whose second column
            holds the texts.
        device: Device the tokenized inputs are moved to.
        batch_size: Number of texts encoded per forward pass. Encoding in
            batches keeps memory bounded for large inputs.

    Returns:
        A tensor with one row per input text, taken from position 0 of
        ``last_hidden_state`` (the first token), in input order. The result
        carries no autograd graph.

    Raises:
        ValueError: If ``batch_size`` is less than 1 or ``data`` is empty.
    """
    if not isinstance(data, list):
        data = data.iloc[:, 1].tolist()

    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if not data:
        raise ValueError("data must contain at least one text to encode")

    chunks = []
    # Inference only: skip autograd bookkeeping so activations are freed
    # after every batch.
    with torch.no_grad():
        for start in range(0, len(data), batch_size):
            texts = data[start:start + batch_size]

            # preprocess the input
            inputs = tokenizer(texts, padding=True, truncation=True,
                               return_tensors="pt", max_length=512).to(device)
            # inference
            result = model(**inputs)

            # take the first token ([CLS] token) of each sequence as the embedding
            chunks.append(result.last_hidden_state[:, 0, :])

    return torch.cat(chunks, dim=0)

In [8]:
# load data
# Paths are relative, so they resolve against the current working directory.
# NOTE(review): encode_embeding reads the second column (index 1) of each
# frame as the text to encode — confirm these CSVs are laid out that way.
triplets_memory = pd.read_csv('data/memories/triplets_memory.csv')
syn_memory = pd.read_csv('data/memories/syn_memory.csv')
clusters_memory = pd.read_csv('data/memories/clusters_memory.csv')

In [None]:
# encode
# Run every memory table through the same tokenizer/model pair, producing one
# embedding tensor per table.
t_embedings, s_embedings, c_embedings = (
    encode_embeding(tokenizer, model, memory, device)
    for memory in (triplets_memory, syn_memory, clusters_memory)
)