In [1]:
import os
import datasets
# import logging
import torch
import tensorflow as tf
import pandas as pd
# import random
import transformers
import jsonlines

from utilities import tokenize_and_split_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer

# logger = logging.getLogger(__name__)

  from .autonotebook import tqdm as notebook_tqdm


In [30]:
import torch

def check_gpu():
    if torch.cuda.is_available():
        print(f"CUDA is available. PyTorch version: {torch.__version__}")
        print(f"Number of GPUs: {torch.cuda.device_count()}")
        for i in range(torch.cuda.device_count()):
            print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
            print(f"  Memory Allocated: {torch.cuda.memory_allocated(i) / 1024 ** 2:.2f} MB")
            print(f"  Memory Cached: {torch.cuda.memory_reserved(i) / 1024 ** 2:.2f} MB")
    else:
        print("CUDA is not available. Using CPU.")

check_gpu()

CUDA is available. PyTorch version: 2.3.0+cu118
Number of GPUs: 1
GPU 0: NVIDIA GeForce RTX 3070 Ti Laptop GPU
  Memory Allocated: 1471.26 MB
  Memory Cached: 1586.00 MB


In [4]:
dataset_path = 'C:\\Users\\prudh\\Desktop\\Python\\drug-interaction\\data\\drug_interaction1.jsonl'

model_name = 'drug_info'
base_model = AutoModelForCausalLM.from_pretrained(model_name)

In [5]:
training_config = {
    'model' : {
        'pretrained_name' : model_name,
        'max_length' : 512
    },
    'datasets' : {
        'path' : dataset_path
    },
    'verbose' : True
}

In [6]:
def tokenize_fn(examples,tokenizer, max_length):
    texts = []
    for i in range(len(examples["input1"])):
        text = examples["input1"][i] + examples["input2"][i] + examples["output"][i]
        texts.append(text)
    
    tokenized_inputs = tokenizer(
        texts,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=max_length
    )

    return tokenized_inputs


def tokenize_and_split_dataset(training_config, tokenizer):
    dataset = datasets.load_dataset('json', data_files=training_config['datasets']['path'])

    max_length = training_config['model']['max_length']
    tokenized_dataset = dataset.map(lambda examples : tokenize_fn(examples, tokenizer, max_length), batched=True, drop_last_batch=True, batch_size=400)

    tokenized_dataset = tokenized_dataset['train'].add_column('labels', tokenized_dataset['train']['input_ids'])

    # split_dataset = tokenized_dataset.train_test_split(test_size=0.1,shuffle=True)

    return tokenized_dataset #split_dataset['train'], split_dataset['test']

In [8]:
tokenizer = AutoTokenizer.from_pretrained('openai-community/gpt2')
tokenizer.pad_token = tokenizer.eos_token
dataset = tokenize_and_split_dataset(training_config, tokenizer)
print('completed')
print(dataset)

Generating train split: 57398 examples [00:00, 1106292.15 examples/s]
Map: 100%|██████████| 57200/57200 [00:03<00:00, 17679.71 examples/s]


completed
Dataset({
    features: ['input1', 'input2', 'output', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 57200
})


In [9]:
def inference(text1, text2, model, tokenizer, max_input_tokens = 100, max_output_tokens = 512):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    model.to(device)
    input_ids = tokenizer.encode(
        text1 + text2,
        return_tensors='pt',
        truncation=True,
        max_length = max_input_tokens,
        padding=True
    ).to(device)

    generated_tokens_with_prompt = model.generate(
        input_ids=input_ids,
        max_length = max_output_tokens
    ).to(device)

    generated_text_with_prompt = tokenizer.batch_decode(generated_tokens_with_prompt, skip_special_tokens=True)

    generated_text_answer = generated_text_with_prompt[0]

    return generated_text_answer

In [11]:
inference('paracetamol', 'aspirin', base_model, tokenizer)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'paracetamolaspirin 100mg Tablet Thank you for providing the drug name: Paracetamolaspirin 100mg Tablet.\n\nParacetamolaspirin 100mg Tablet is categorized under the sub-category Anti Ulcerants H2 Antagonists and it contains the active salt composition of \nPantoprazole (100mg). This medicine is manufactured by Torrent Pharmaceuticals Ltd. \n\nDescription: \nParacetamolaspirin 100mg Tablet is a medicine that reduces the amount of excess acid make by your stomach. It is used for treating and preventing heartburn, indigestion and other symptoms caused by too much acid in the stomach. It is also used to treat and prevent stomach ulcers, reflux disease and some other rare conditions.Paracetamolaspirin 100mg Tablet is also commonly prescribed to prevent stomach ulcers and heartburn caused by the use of painkillers. It can be taken with or without food. How much you need, and how often you take it will depend on what you are being treated for. Follow the advice of your doctor while taking thi

In [12]:
max_steps = len(dataset)*2
trained_model_name = f'drug_interaction_1_{max_steps}_steps'
output_dir = 'output/saved_models/' + trained_model_name

In [16]:
training_args = TrainingArguments(

    learning_rate=1.0e-5,
    num_train_epochs=2,
    max_steps=max_steps,
    per_device_train_batch_size=1,
    output_dir=output_dir,
    
    overwrite_output_dir=False,
    disable_tqdm=True,
    save_steps=50000,
    warmup_steps=1,
    logging_strategy='steps',
    logging_steps=1,
    optim='adafactor',
    gradient_accumulation_steps=4,
    gradient_checkpointing=False,

    save_total_limit=1,
    metric_for_best_model='eval_loss',
    greater_is_better=False
)

In [17]:
trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer
)

max_steps is given, it will override any value given in num_train_epochs


In [18]:
training_output = trainer.train()

{'loss': 1.0708, 'grad_norm': 6.600746154785156, 'learning_rate': 1e-05, 'epoch': 6.993006993006993e-05}
{'loss': 0.9756, 'grad_norm': 7.270636081695557, 'learning_rate': 9.999912586648485e-06, 'epoch': 0.00013986013986013986}
{'loss': 1.227, 'grad_norm': 8.123656272888184, 'learning_rate': 9.99982517329697e-06, 'epoch': 0.0002097902097902098}
{'loss': 1.0997, 'grad_norm': 7.270029544830322, 'learning_rate': 9.999737759945454e-06, 'epoch': 0.0002797202797202797}
{'loss': 1.1164, 'grad_norm': 7.87039852142334, 'learning_rate': 9.99965034659394e-06, 'epoch': 0.00034965034965034965}
{'loss': 0.9523, 'grad_norm': 6.3647780418396, 'learning_rate': 9.999562933242423e-06, 'epoch': 0.0004195804195804196}
{'loss': 0.9724, 'grad_norm': 7.388791084289551, 'learning_rate': 9.99947551989091e-06, 'epoch': 0.0004895104895104895}
{'loss': 0.6365, 'grad_norm': 5.443972110748291, 'learning_rate': 9.999388106539394e-06, 'epoch': 0.0005594405594405594}
{'loss': 0.7907, 'grad_norm': 5.8135986328125, 'learn

KeyboardInterrupt: 

In [19]:
base_model.save_pretrained('drug_interaction')

In [10]:
dataset = datasets.load_dataset('json', data_files='data/drug_interaction.jsonl')

In [20]:
tokenizer = AutoTokenizer.from_pretrained('openai-community/gpt2')
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained('drug_interaction')

In [28]:
test_output = inference(dataset[23434]['input1'], dataset[23434]['input2'], model, tokenizer)
print(test_output)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Oflomac 200 TabletTizanidineDrug1: Oflomac 200 Tablet
Drug2: Tizanidine

Side effects are Nausea,Stomach pain,Diarrhea

Drug Ineraction effect can be LIFE-THREATENING



In [23]:
dataset[1]['input1'], dataset[1]['input2']

('Human Insulatard 40IU/ml Suspension for Injection', 'Captopril')

In [29]:
print(dataset[23434]['output'])

Drug1: Oflomac 200 Tablet
Drug2: Tizanidine

Side effects are Nausea,Stomach pain,Diarrhea

Drug Ineraction effect can be LIFE-THREATENING

