In [1]:
import torch.nn as nn 
from torch.utils.data import Dataset
from torch.utils.data import dataloader

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# load dataset
import pandas as pd 
df = pd.read_csv("../scifact/train.source_rl", sep = "\t", names = ["source"])
df["source"].to_list()[:10]

['Type 1 Diabetes is associated with subtle perturbations in T reg development.',
 'In transgenic mice harboring green florescent protein under the control of the Sox2 promoter, more than 50 percent of the cells with green florescent colocalize with cell proliferation markers.',
 'Oxidative DNA damage activates STING signalling.',
 'ATM and Rad3 related protein are critical for sensing DNA damage.',
 'Assessing treatment adherence is more beneficial to clinical practice than measuring routine outcomes.',
 'N348I mutations cause resistance to nevirapine.',
 'Cellular clocks are not predictive of mitosis timing in NIH 3T3 cells.',
 'Acute ablation of KRAS causes severe growth impairment.',
 "The World Health Organization's (WHO) data collection process is biased downward by unequal selection of larger outbreaks.",
 'CD44v6 is not associated with constitutive and reprogrammed cancer stem cells driving cancer metastasis.']

In [3]:
# test the gpt tokenizer
from transformers import GPT2Tokenizer, GPT2LMHeadModel
prep_txt = "hello world"
modelname = "gpt2"

# load tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(modelname, pad_token='<|pad|>')
tokenizer.pad_token = tokenizer.eos_token # to avoid an error

model = GPT2LMHeadModel.from_pretrained("gpt2",pad_token_id=tokenizer.eos_token_id)


2023-06-03 10:26:08.086388: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# We need to pad our input tokens to max length of the sequence
encode_tokenizer = tokenizer(df["source"][:10].to_list(), truncation=True,max_length=40, padding="max_length",return_tensors="pt")
print(df["source"][:10].to_list())
print(encode_tokenizer['input_ids'].shape)

['Type 1 Diabetes is associated with subtle perturbations in T reg development.', 'In transgenic mice harboring green florescent protein under the control of the Sox2 promoter, more than 50 percent of the cells with green florescent colocalize with cell proliferation markers.', 'Oxidative DNA damage activates STING signalling.', 'ATM and Rad3 related protein are critical for sensing DNA damage.', 'Assessing treatment adherence is more beneficial to clinical practice than measuring routine outcomes.', 'N348I mutations cause resistance to nevirapine.', 'Cellular clocks are not predictive of mitosis timing in NIH 3T3 cells.', 'Acute ablation of KRAS causes severe growth impairment.', "The World Health Organization's (WHO) data collection process is biased downward by unequal selection of larger outbreaks.", 'CD44v6 is not associated with constitutive and reprogrammed cancer stem cells driving cancer metastasis.']
torch.Size([10, 40])


  encode_tokenizer = tokenizer(df["source"][:10].to_list(), truncation=True,max_length=40, padding="max_length",return_tensors="pt")
  print(df["source"][:10].to_list())


## max_length or max_new_tokens

max_length means we generate the whole sentence with length : max_length

max_new_tokens means generated tokens in addition to the max_length

In [9]:

output_sequences = model.generate(
    input_ids=encode_tokenizer['input_ids'],
    attention_mask=encode_tokenizer['attention_mask'],
    do_sample=False, # disable sampling to test if batching affects output
    early_stopping=True,
    max_length=60,
    remove_invalid_values = True,
)
output_sequences.size()

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


torch.Size([10, 60])

In [16]:
decoder_text = tokenizer.batch_decode(output_sequences,skip_special_tokens = True, clean_up_tokenization_spaces = True)
for item in decoder_text:
    print(item.replace('\n', ""))

Type 1 Diabetes is associated with subtle perturbations in T reg development.T reg development is a major cause of diabetes mellitus (DM). It is characterized by
In transgenic mice harboring green florescent protein under the control of the Sox2 promoter, more than 50 percent of the cells with green florescent colocalize with cell proliferation markers.The green florescent protein is expressed in the nucleus of the cell, and it is expressed in the
Oxidative DNA damage activates STING signalling.The study was published in the journal Nature Communications.The researchers found that the enzyme
ATM and Rad3 related protein are critical for sensing DNA damage.The researchers found that the protein was able to detect DNA damage in the presence of a high
Assessing treatment adherence is more beneficial to clinical practice than measuring routine outcomes.The study was conducted in the United States, and the results were reported in the Journal of
N348I mutations cause resistance to nevirapin

In [14]:
sentences = ["Hello, my dog is a little",
            "Hello, my dog is", # use different length sentences to test batching
            ]
inputs = tokenizer(sentences,truncation=True,max_length=20, padding="max_length",return_tensors="pt")

In [15]:

output_sequences = model.generate(
    input_ids=inputs['input_ids'],
    attention_mask=inputs['attention_mask'],
    do_sample=False, # disable sampling to test if batching affects output
    # early_stopping=True,
    max_new_tokens=10,
    remove_invalid_values = True,
)

tokenizer.batch_decode(output_sequences,skip_special_tokens = True)
# for i in range(len(sentences)):
#     print(tokenizer.decode(output_sequences[i]), "**********")

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


["Hello, my dog is a little\n\nlittle bit shy. I'm not sure",
 'Hello, my dog is\n\na little bit\n\ndifferent.\n']

# Lets load the dataset

In [34]:
df_source = pd.read_csv("../scifact/train.source",sep = "\t",names = ["source"])
df_target = pd.read_csv("../scifact/train.target",sep = "\t",names = ["target"])
df_train = df_source["source"] + "<|pad|>" + df_target["target"]
for item in df_train:
    print(item)

Type 1 Diabetes is associated with subtle perturbations in T reg development.<|pad|>Autoimmune diseases are thought to result from imbalances in normal immune physiology and regulation. Here, we show that autoimmune disease susceptibility and resistance alleles on mouse chromosome 3 (Idd3) correlate with differential expression of the key immunoregulatory cytokine interleukin-2 (IL-2). In order to test directly that an approximately twofold reduction in IL-2 underpins the Idd3-linked destabilization of immune homeostasis, we show that engineered haplodeficiency of Il2 gene expression not only reduces T cell IL-2 production by twofold but also mimics the autoimmune dysregulatory effects of the naturally occurring susceptibility alleles of Il2. Reduced IL-2 production achieved by either genetic mechanism correlates with reduced function of CD4+ CD25+ regulatory T cells, which are critical for maintaining immune homeostasis.
In transgenic mice harboring green florescent protein under the 

In [35]:
len(df_source)

735

In [36]:
len(df_target)

735

In [55]:
# get the dataset
import pandas as pd 
import torch
class S2Sdataset(Dataset):
    def __init__(self,max_length = 20, data_type = "train"):
        self.input_ids = []
        self.attn_masks = []

        df_source = pd.read_csv("../scifact/train.source",sep = "\t",names = ["source"])
        df_target = pd.read_csv("../scifact/train.target",sep = "\t",names = ["target"])
        self.df_train =  df_source["source"] + "<|pad|>" + df_target["target"]
        self.max_length = max_length
    
    def __len__(self):
        return len(self.df_train)
    def __getitem__(self, index):
        item = self.df_train[index]
        encodings_dict = tokenizer(item, truncation=True,
                                       max_length = self.max_length, padding="max_length")
        return torch.tensor(encodings_dict['input_ids']), torch.tensor(encodings_dict['attention_mask'])

In [56]:
train_dataset = S2Sdataset(data_type = "train")
test_dataset = S2Sdataset(data_type = "test")

# training the model

In [58]:
# creating the train arguments
from transformers import TrainingArguments, Trainer
training_args = TrainingArguments(output_dir='results', num_train_epochs=10, logging_steps=10,
                                 load_best_model_at_end=True, save_strategy="epoch", evaluation_strategy="epoch",
                                 per_device_train_batch_size=2, per_device_eval_batch_size=2,
                                 warmup_steps=100, weight_decay=0.01, logging_dir='logs')

In [59]:
model = GPT2LMHeadModel.from_pretrained("gpt2").cuda()
model.resize_token_embeddings(len(tokenizer))

Embedding(50258, 768)

In [60]:
# start training
Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=test_dataset,
        data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                    'attention_mask': torch.stack([f[1] for f in data]),
                                    'labels': torch.stack([f[0] for f in data])}).train()

***** Running training *****
  Num examples = 735
  Num Epochs = 10
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 1840
  Number of trainable parameters = 124440576
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mzhansu[0m. Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 735
  Batch size = 4
  Num examples = 735
  Batch size = 4
Saving model checkpoint to results/checkpoint-184
Configuration saved in results/checkpoint-184/config.json
Model weights saved in results/checkpoint-184/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 735
  Batch size = 4
Saving model checkpoint to results/checkpoint-368
Configuration saved in results/checkpoint-368/config.json
Model weights saved in results/checkpoint-368/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 735
  Batch size = 4
Saving model checkpoint to results/checkpoint-552
Configuration saved in results/checkpoint-552/config.json
Model weights saved in results/checkpoint-552/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 735
  Batch size = 4
Saving model checkpoint to results/checkpoint-736
Configuration saved in results/checkpoint-736/config.json
Model weights saved in results/checkpoint-736/pytorch_model.bi

In [61]:
# Test 

model.eval()

text = "A deficiency of vitamin B12 increases blood levels of homocysteine."
prompt = text

generated = tokenizer(f"{prompt}", return_tensors="pt").input_ids.cuda()

# perform prediction
sample_outputs = model.generate(generated, do_sample=False, top_k=50, max_length=50, top_p=0.90, 
        temperature=0, num_return_sequences=0)
# decode the predicted tokens into texts
predicted_text  = tokenizer.decode(sample_outputs[0], skip_special_tokens=True)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [62]:
predicted_text

'A deficiency of vitamin B12 increases blood levels of homocysteine. <|pad|> BACKGROUND Homocysteine is a general endocrine disorder characterized by elevated blood levels of homocysteine. <|pad|> BACKGROUND Homocysteine deficiency is a severe'