In [1]:
import os
import datasets
# import logging
import torch
import tensorflow as tf
import pandas as pd
# import random
import transformers
import jsonlines

from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer

# logger = logging.getLogger(__name__)

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
import torch

def check_gpu():
    if torch.cuda.is_available():
        print(f"CUDA is available. PyTorch version: {torch.__version__}")
        print(f"Number of GPUs: {torch.cuda.device_count()}")
        for i in range(torch.cuda.device_count()):
            print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
            print(f"  Memory Allocated: {torch.cuda.memory_allocated(i) / 1024 ** 2:.2f} MB")
            print(f"  Memory Cached: {torch.cuda.memory_reserved(i) / 1024 ** 2:.2f} MB")
    else:
        print("CUDA is not available. Using CPU.")

check_gpu()

CUDA is available. PyTorch version: 2.3.0+cu118
Number of GPUs: 1
GPU 0: NVIDIA GeForce RTX 3070 Ti Laptop GPU
  Memory Allocated: 997.24 MB
  Memory Cached: 4104.00 MB


In [4]:
torch.cuda.empty_cache()


In [2]:
dataset_path = 'C:\\Users\\prudh\\Desktop\\Python\\drug-interaction\\data\\final_data1.jsonl'

model_name = 'fine_tuned_on_train_dataset'
base_model = AutoModelForCausalLM.from_pretrained(model_name)

In [3]:
training_config = {
    'model' : {
        'pretrained_name' : model_name,
        'max_length' : 1024
    },
    'datasets' : {
        'path' : dataset_path
    },
    'verbose' : True
}

In [5]:
tokenizer = AutoTokenizer.from_pretrained('openai-community/gpt2')
tokenizer.pad_token = tokenizer.eos_token
dataset = tokenize_and_split_dataset(training_config, tokenizer)
print('completed')
print(dataset)

Map: 100%|██████████| 38600/38600 [00:19<00:00, 1982.88 examples/s]


completed
Dataset({
    features: ['input', 'output', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 38600
})


In [4]:
import datasets
from transformers import AutoTokenizer


def tokenize_fn(examples,tokenizer, max_length):
    texts = []
    for i in range(len(examples["input"])):
        text = examples["input"][i] + examples["output"][i]
        texts.append(text)
    
    tokenized_inputs = tokenizer(
        texts,
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        max_length=max_length
    )

    return tokenized_inputs


def tokenize_and_split_dataset(training_config, tokenizer):
    dataset = datasets.load_dataset('json', data_files=training_config['datasets']['path'])

    max_length = training_config['model']['max_length']
    tokenized_dataset = dataset.map(lambda examples : tokenize_fn(examples, tokenizer, max_length), batched=True, drop_last_batch=True, batch_size=200)

    tokenized_dataset = tokenized_dataset['train'].add_column('labels', tokenized_dataset['train']['input_ids'])

    # split_dataset = tokenized_dataset.train_test_split(test_size=0.1,shuffle=True)

    return tokenized_dataset#split_dataset['train'], split_dataset['test']

In [6]:
def inference(text, model, tokenizer, max_input_tokens = 100, max_output_tokens = 1000):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    model.to(device)
    input_ids = tokenizer.encode(
        text,
        return_tensors='pt',
        truncation=True,
        max_length = max_input_tokens,
        padding=True
    ).to(device)

    generated_tokens_with_prompt = model.generate(
        input_ids=input_ids,
        max_length = max_output_tokens
    ).to(device)

    generated_text_with_prompt = tokenizer.batch_decode(generated_tokens_with_prompt, skip_special_tokens=True)

    generated_text_answer = generated_text_with_prompt[0][len(text):]

    return generated_text_answer

In [7]:
dataset[0]['input']

'Human Insulatard 40IU/ml Suspension for Injection'

In [8]:
inference(dataset[0]['input'], base_model, tokenizer)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


' Thank you for providing the drug name: Insulatard 40IU/ml Suspension.\n\nInsulatard 40IU/ml Suspension is categorized under the sub-category Non Narcotics And Anti Pyretics and it contains the active salt composition of \nParacetamol (40IU/ml). This medicine is manufactured by Sun Pharmaceutical Industries Ltd. \n\nDescription: \nInsulatard 40IU/ml Suspension is a medicine used to relieve pain and to reduce fever. It is used to treat many conditions such as headache, body ache, toothache and common cold. It works by inhibiting the release of certain chemical that cause pain and fever.Insulatard 40IU/ml Suspension may be prescribed alone or in combination with another medicine. You should take it regularly as advised by your doctor. It is usually best taken with food otherwise it may upset your stomach. Do not take more or use it for longer than recommended. Side effects are rare if this medicine is used correctly\xa0but this medicine may\xa0cause stomach pain, nausea, and vomiting in

In [9]:
max_steps = len(dataset)*2
trained_model_name = f'drug_interaction_{max_steps}_steps'
output_dir = 'output/saved_models/' + trained_model_name

In [29]:
training_args = TrainingArguments(

    learning_rate=1.0e-5,
    num_train_epochs=2,
    max_steps=max_steps,
    per_device_train_batch_size=1,
    output_dir=output_dir,
    
    overwrite_output_dir=False,
    disable_tqdm=True,
    save_steps=50000,
    warmup_steps=1,
    logging_strategy='steps',
    logging_steps=1,
    optim='adafactor',
    gradient_accumulation_steps=4,
    gradient_checkpointing=False,

    save_total_limit=1,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    # auto_find_batch_size=True
)

In [30]:
trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer
)

max_steps is given, it will override any value given in num_train_epochs


In [31]:
training_output = trainer.train()

{'loss': 0.194, 'grad_norm': 1.0638036727905273, 'learning_rate': 1e-05, 'epoch': 0.00010362694300518135}
{'loss': 0.0104, 'grad_norm': 0.18217070400714874, 'learning_rate': 9.999870464643325e-06, 'epoch': 0.0002072538860103627}
{'loss': 0.0421, 'grad_norm': 0.4560261070728302, 'learning_rate': 9.99974092928665e-06, 'epoch': 0.000310880829015544}
{'loss': 0.0176, 'grad_norm': 0.36126184463500977, 'learning_rate': 9.999611393929974e-06, 'epoch': 0.0004145077720207254}
{'loss': 0.1669, 'grad_norm': 0.8861557245254517, 'learning_rate': 9.9994818585733e-06, 'epoch': 0.0005181347150259067}
{'loss': 0.0267, 'grad_norm': 0.29994142055511475, 'learning_rate': 9.999352323216622e-06, 'epoch': 0.000621761658031088}
{'loss': 0.0452, 'grad_norm': 0.4248092472553253, 'learning_rate': 9.999222787859946e-06, 'epoch': 0.0007253886010362695}
{'loss': 0.1058, 'grad_norm': 0.7091189622879028, 'learning_rate': 9.99909325250327e-06, 'epoch': 0.0008290155440414508}
{'loss': 0.1524, 'grad_norm': 0.79281294345

KeyboardInterrupt: 

In [32]:
base_model.save_pretrained('drug_info')

In [46]:
s_model = AutoModelForCausalLM.from_pretrained('drug_info')

In [50]:
test_output = inference(dataset[13453]['input'], s_model, tokenizer)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [51]:
print(test_output)

 Thank you for providing the drug name: Azithral 500 Tablet.

Azithral 500 Tablet is categorized under the sub-category Macrolides And Similar Types and it contains the active salt composition of 
Azithromycin (500mg). This medicine is manufactured by Alembic Pharmaceuticals Ltd. 

Description: 
Azithral 500 Tablet is an antibiotic used to treat various types of bacterial infections of the respiratory tract, ear, nose, throat, lungs, skin, and eye in adults and children. It is also effective in typhoid fever and some sexually transmitted diseases like gonorrhea.Azithral 500 Tablet is a broad-spectrum type of antibiotic effective in killing many types of gram-positive bacteria, some types of gram-negative bacteria and other microorganisms. This medicine is taken orally, preferably either one hour before or 2 hours after a meal. It should be used regularly at evenly spaced time intervals as prescribed by your doctor. Do not skip any doses and finish the full course of treatment even if y

In [52]:
print(dataset[13453]['output'])

 Thank you for providing the drug name: Azithral 500 Tablet.

Azithral 500 Tablet is categorized under the sub-category Macrolides And Similar Types and it contains the active salt composition of 
Azithromycin (500mg). This medicine is manufactured by Alembic Pharmaceuticals Ltd. 

Description: 
Azithral 500 Tablet is an antibiotic used to treat various types of bacterial infections of the respiratory tract, ear, nose, throat, lungs, skin, and eye in adults and children. It is also effective in typhoid fever and some sexually transmitted diseases like gonorrhea.Azithral 500 Tablet is a broad-spectrum type of antibiotic effective in killing many types of gram-positive bacteria, some types of gram-negative bacteria and other microorganisms. This medicine is taken orally, preferably either one hour before or 2 hours after a meal. It should be used regularly at evenly spaced time intervals as prescribed by your doctor. Do not skip any doses and finish the full course of treatment even if y

In [53]:
print(dataset[13453]['input'])

Azithral 500 Tablet
