# Fine-tuning

In [1]:
from datasets import Dataset, DatasetDict, load_dataset
from uuid import uuid4
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, DefaultDataCollator, TrainingArguments, Trainer, pipeline
import evaluate
import json
import numpy as np

In [202]:
# dataset = load_dataset("squad_v2", split="train[:5000]")
# dataset = dataset.train_test_split(test_size=0.15)
# dataset
dataset = load_dataset("csv", data_files="medibert.csv", split="train")
dataset = dataset.train_test_split(test_size=0.15)
dataset['test'].to_dict()

{'context': ['"CVS pharmacy PSC 1578, Box 2410, APO AA 02400 TEL 6365762933 Rx 659547 QTY: 30 REFILLS: 0 by 03/15/2026 PRSCBR: R Udom DATE FILLED: 11/27/2025 DISCARD AFTER: 01/03/2027 RPH: S Demetriou MFR: LifeCare Labs JAYA Remeron Jaya lyer 014 Strickland Row Apt 996, Adkinsmouth, WY 20363 Remeron MORNING MIDDAY Generic equivalent of: Lorem sed enim commodo Take 1 tablet once a day EVENING BEDTIME A PHARMACY ADVICE Important Information • Lorem ipsum dolor sit amet, consectetur adipiscing elit • Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua • Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat CAUTION Federal law prohibits the transfer of this drag to any person other than the patient for whom it was prescribed',
  '* CVS pharmacy PSC 1578, Box 2410, APO AA 02400 TEL 6365762933 Rx 117453 QTY: 30 REFILLS: 6 by 01/27/1986 PRSCBR: V Katsarou DATE FILLED: 04/11/1985 DISCARD AFTER: 10/12/1986 RPH: A Haile MFR: Glob

In [139]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [155]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        # answer = answers[i]
        answer = json.loads(answers[i].strip("[]"))
        # print(answer)
        start_char = answer["answer_start"]
        end_char = answer["answer_start"] + len(str(answer["text"]))
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        # print('off', offset)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs


In [162]:
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)

In [159]:
data_collator = DefaultDataCollator()

In [169]:
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")
model.to("mps" if torch.backends.mps.is_available() else "cpu")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to

DistilBertForQuestionAnswering(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
      

In [193]:
training_args = TrainingArguments(
    output_dir='medibert-base-uncased',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    push_to_hub=False,
    use_mps_device=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [194]:
trainer.train()



  0%|          | 0/5420 [00:00<?, ?it/s]

{'loss': 0.2789, 'learning_rate': 1.81549815498155e-05, 'epoch': 0.92}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.29785284399986267, 'eval_runtime': 16.6821, 'eval_samples_per_second': 91.655, 'eval_steps_per_second': 5.755, 'epoch': 1.0}
{'loss': 0.2398, 'learning_rate': 1.6309963099630997e-05, 'epoch': 1.85}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.2600816488265991, 'eval_runtime': 16.9393, 'eval_samples_per_second': 90.263, 'eval_steps_per_second': 5.667, 'epoch': 2.0}
{'loss': 0.2248, 'learning_rate': 1.4464944649446495e-05, 'epoch': 2.77}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.2641700208187103, 'eval_runtime': 16.7391, 'eval_samples_per_second': 91.343, 'eval_steps_per_second': 5.735, 'epoch': 3.0}
{'loss': 0.2317, 'learning_rate': 1.2619926199261994e-05, 'epoch': 3.69}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.2539921998977661, 'eval_runtime': 16.5424, 'eval_samples_per_second': 92.429, 'eval_steps_per_second': 5.803, 'epoch': 4.0}
{'loss': 0.2013, 'learning_rate': 1.0774907749077492e-05, 'epoch': 4.61}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.24775446951389313, 'eval_runtime': 16.8914, 'eval_samples_per_second': 90.52, 'eval_steps_per_second': 5.683, 'epoch': 5.0}
{'loss': 0.204, 'learning_rate': 8.92988929889299e-06, 'epoch': 5.54}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.2334178239107132, 'eval_runtime': 16.3542, 'eval_samples_per_second': 93.493, 'eval_steps_per_second': 5.87, 'epoch': 6.0}
{'loss': 0.1922, 'learning_rate': 7.084870848708487e-06, 'epoch': 6.46}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.2587943971157074, 'eval_runtime': 16.4627, 'eval_samples_per_second': 92.877, 'eval_steps_per_second': 5.831, 'epoch': 7.0}
{'loss': 0.1767, 'learning_rate': 5.2398523985239855e-06, 'epoch': 7.38}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.30416813492774963, 'eval_runtime': 16.5856, 'eval_samples_per_second': 92.188, 'eval_steps_per_second': 5.788, 'epoch': 8.0}
{'loss': 0.1867, 'learning_rate': 3.3948339483394836e-06, 'epoch': 8.3}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.26886478066444397, 'eval_runtime': 33.7757, 'eval_samples_per_second': 45.269, 'eval_steps_per_second': 2.842, 'epoch': 9.0}
{'loss': 0.1727, 'learning_rate': 1.5498154981549817e-06, 'epoch': 9.23}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.28341200947761536, 'eval_runtime': 16.5101, 'eval_samples_per_second': 92.61, 'eval_steps_per_second': 5.815, 'epoch': 10.0}
{'train_runtime': 6075.8394, 'train_samples_per_second': 14.255, 'train_steps_per_second': 0.892, 'train_loss': 0.20730101525563596, 'epoch': 10.0}


TrainOutput(global_step=5420, training_loss=0.20730101525563596, metrics={'train_runtime': 6075.8394, 'train_samples_per_second': 14.255, 'train_steps_per_second': 0.892, 'train_loss': 0.20730101525563596, 'epoch': 10.0})

In [195]:
# trainer.push_to_hub("medibert-base-uncased")
trainer.save_model("medibert-base-uncased")

In [1]:
context = "Lorem ipsum dolor sit. amet, consectetur. adipiscing elit.. Sed do eiusmod tempor. incididunt ut labore et. dolore magna aliqua.. Ut enim ad minim veniam,. quis nostrud exercitation. ullamco laboris nisi ut. aliquip ex ea commodo. consequat.. JOHN DOE. 236 PAULA COMMON SUITE 448, NORTH DAVID, FL 58836. ASPIRIN. Lorem sed enim commodo oij aoiwjef pijzxmm cjlaj goia we oigjaorw. TAKE ONE PILL AFTER MEALS. DATE: 10/31/2032. WIC#957754. Patent Pending. Rx 223517-606. EXPIRATION DATE 06/26/2033. QTY 30. 2 by 01/04/2033. Walgreens. 7921 DAKOTA CENTER, LAKE YVONNEVIEW, PW 87589. QXN/YAK/YAK/YAK. (085) 560-301. CAUTION FEDERAL LAW PROHIBITS THE TRANSFER OF THIS DRUG TO. ANY PERSON OTHER THAN THE PATIENT OR WHOM IT WAS PRESCRIBED - RX ONL. Walgreens"

In [3]:
qa = pipeline("question-answering", model="medibert-base-uncased/")

In [4]:
qa(question="What is the medicationName?", context=context)

{'score': 0.003522496437653899, 'start': 12, 'end': 17, 'answer': 'dolor'}

In [189]:
metric = evaluate.load('accuracy')

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [192]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

trainer.evaluate()

  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.31557196378707886,
 'eval_runtime': 16.6209,
 'eval_samples_per_second': 91.992,
 'eval_steps_per_second': 5.776,
 'epoch': 3.0}

In [3]:
model = AutoModelForQuestionAnswering.from_pretrained("medibert-base-uncased")

In [8]:
model.push_to_hub("pr28416/medibert-base-uncased", create_pr=1)

CommitInfo(commit_url='https://huggingface.co/pr28416/medibert-base-uncased/commit/5f6cdf8296e9328e76f36f7332ddbb4c6834aa8d', commit_message='Upload DistilBertForQuestionAnswering', commit_description='', oid='5f6cdf8296e9328e76f36f7332ddbb4c6834aa8d', pr_url='https://huggingface.co/pr28416/medibert-base-uncased/discussions/1', pr_revision='refs/pr/1', pr_num=1)

# Data wrangling

In [90]:
df = pd.read_csv('medtable.csv')

In [97]:
ndf = pd.DataFrame(columns=["context", "question", "answers", "title", "id"])
bigjson = {}
# keys = ["train", "validation", "test"]
jsn = []
# Iterate through rows of dataframe
for index, row in df.iterrows():
    # print(row)
    # if index == 850:
    #     bigjson["train"] = jsn
    #     jsn = {"id": [], "title": [], "context": [], "question": [], "answers": []}
    # elif index == 925:
    #     bigjson["validation"] = jsn
    #     jsn = {"id": [], "title": [], "context": [], "question": [], "answers": []}
    # if index == 0:
    #     continue
    # if index == 1:
    #     break

    

    for key, value in row.to_dict().items():
        if key in {"Input", "doctor", "rph", "rxNumber", "pharmacyPhoneNumber", "pharmacyAddress"}:
            continue
        nd = {}
        nd['context'] = row["Input"]
        nd['id'] = uuid4().hex
        nd['question'] = f"What is the {key}?"
        nd['title'] = key

        idx = row["Input"].find(str(value))
        idx = row["Input"].find(value[:len(str(value))//2]) if idx == -1 else idx
        nd['answers'] = json.dumps([{"answer_start": idx, "text": value}])

        # print(nd)
        # ndf.append(nd, ignore_index=True)
        ndf = pd.concat([ndf, pd.DataFrame(nd, index=[0])], ignore_index=True)
        ndf.reset_index()
    
    # jsn.append(nd)
    
    # print(row.to_dict())
# bigjson["test"] = jsn
# bigjson = {"data": jsn}
ndf.head()
ndf.to_csv("medibert.csv", index=False)

In [85]:
with open("out.json", "w") as f:
    # f.write("[\n")
    # for idx, line in enumerate(bigjson):
    #     f.write(json.dumps(line))
    #     if idx != len(bigjson) - 1:
    #         f.write(",")
    #     f.write("\n")
    # f.write("]\n")
    f.write(json.dumps(bigjson, indent=4))
    f.write("\n")

In [84]:
# Format:
# {
#     "id": ID,
#     "title": String (CVS),
#     "question": String,
#     "context": String,
#     "answers": {

#     }
# }
squad.to_dict()

{'id': ['56be85543aeaaa14008c9063',
  '56be85543aeaaa14008c9065',
  '56be85543aeaaa14008c9066',
  '56bf6b0f3aeaaa14008c9601',
  '56bf6b0f3aeaaa14008c9602',
  '56bf6b0f3aeaaa14008c9603',
  '56bf6b0f3aeaaa14008c9604',
  '56bf6b0f3aeaaa14008c9605',
  '56d43c5f2ccc5a1400d830a9',
  '56d43c5f2ccc5a1400d830aa',
  '56d43c5f2ccc5a1400d830ab',
  '56d43c5f2ccc5a1400d830ac',
  '56d43c5f2ccc5a1400d830ad',
  '56d43ce42ccc5a1400d830b4',
  '56d43ce42ccc5a1400d830b5',
  '56be86cf3aeaaa14008c9076',
  '56be86cf3aeaaa14008c9078',
  '56be86cf3aeaaa14008c9079',
  '56bf6e823aeaaa14008c9627',
  '56bf6e823aeaaa14008c9629',
  '56bf6e823aeaaa14008c962a',
  '56bf6e823aeaaa14008c962b',
  '56d43da72ccc5a1400d830bd',
  '56d43da72ccc5a1400d830be',
  '56d43da72ccc5a1400d830bf',
  '56d43da72ccc5a1400d830c0',
  '56d43da72ccc5a1400d830c1',
  '56be88473aeaaa14008c9080',
  '56be88473aeaaa14008c9083',
  '56be88473aeaaa14008c9084',
  '56bf725c3aeaaa14008c9643',
  '56bf725c3aeaaa14008c9644',
  '56bf725c3aeaaa14008c9645',
  '5

In [40]:
# df = pd.DataFrame(pd.read_csv("medtable.csv"))
# print(df.loc[0])
# df = pd.read_csv("https://huggingface.co/datasets/imodels/credit-card/raw/main/train.csv")

In [41]:
# endpoint = 'deepset/roberta-base-squad2'
# tokeinzer = AutoTokenizer.from_pretrained(endpoint)
# model = AutoModelForQuestionAnswering.from_pretrained(endpoint)

In [42]:
# dataset = Dataset.from_pandas(df)

In [43]:
# dataset

In [44]:
# train_testvalid = dataset.train_test_split(test_size=0.1)
# test_valid = train_testvalid['test'].train_test_split(test_size=0.5)
# dataset = DatasetDict({
#     'train': train_testvalid['train'],
#     'test': test_valid['test'],
#     'valid': test_valid['train']})

In [45]:
# labels = [label for label in dataset['train'].features.keys() if label != 'Input']
# id2label = {idx:label for idx, label in enumerate(labels)}
# label2id = {label:idx for idx, label in enumerate(labels)}
# labels

In [46]:
# from transformers import AutoTokenizer
# import numpy as np

# tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# def preprocess_data(examples):
#     text = examples['Tweet']
#     encoding = tokenizer(text, padding='max_length', truncation=True, max_length=128)
#     labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
#     labels_matrix = np.zeros((len(text), len(labels)))
#     for idx, label in enumerate(labels):
#         labels_matrix[:, idx] = labels_batch[label]
#     encoding["labels"] = labels_matrix.tolist()
#     return encoding

In [47]:
# encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)