In [9]:
!pip install transformers datasets evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
import os
from datasets import load_dataset
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
from transformers.pipelines.pt_utils import KeyDataset
from transformers import pipeline

In [12]:
if not torch.cuda.is_available():
    print("Please switch to a GPU machine before running this notebook.")


### Loading dataset

In [3]:
import pandas as pd
import os

In [6]:
df = pd.read_csv("/Users/shobanasiranjeevilu/Downloads/Healthcare_LLM/data/disease_with_symptoms.csv")[['Disease','Preprocessed_Symptoms']]

In [7]:

unique_labels = sorted(df['Disease'].unique())

id2label = {id: label for id, label in enumerate(unique_labels)}
label2id = {label: id for id, label in id2label.items()}

In [8]:
df['labels'] = df['Disease'].map(label2id)

In [10]:
df

Unnamed: 0,Disease,Preprocessed_Symptoms,labels
0,Attention deficit hyperactivity disorder,inattention carelessness hyperactivity executi...,47
1,HIV/AIDS,early flulike illness later large lymph node f...,167
2,Abnormal uterine bleeding,irregular abnormally frequent prolonged excess...,0
3,Paracetamol poisoning,early non specific feeling tired abdominal pai...,307
4,Acne,blackhead whitehead pimple oily skin scarring,1
...,...,...,...
525,Xerostomia,hyposalivation may give following sign symptom,418
526,Viral hemorrhagic fever,sign symptom vhf include definition fever blee...,408
527,Zollinger–Ellison syndrome,patient zollinger–ellison syndrome may experie...,419
528,Haemophilia B,easy bruising,169


In [None]:

# First, split into training and temp (which will be split into validation and testing)
train_df, test_df = train_test_split(df[['labels','Preprocessed_Symptoms']], test_size=0.1, random_state=42)


# Writing each DataFrame to a CSV file
train_df.to_csv('train.csv', index=False)
test_df.to_csv('test.csv', index=False)


In [17]:
disease_dataset = load_dataset(
    "csv",
    data_files={
        "train": "train.csv",
        "test": "test.csv",
    },
)

print(disease_dataset)

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'Preprocessed_Symptoms'],
        num_rows: 477
    })
    test: Dataset({
        features: ['labels', 'Preprocessed_Symptoms'],
        num_rows: 53
    })
})


In [18]:
disease_dataset['train'][0]

{'labels': 366,
 'Preprocessed_Symptoms': 'people spastic type cp typically muscle tight stiff due high muscle tone symptom spastic cerebral palsy vary disability affect individual differently however typically appear infancy early childhood child diagnosed first two year life main indicator spastic cerebral palsy delay reaching motor milestone following common early sign though presence listed symptom definitively mean child spastic cp'}

### Tokenization of dataset

In [19]:
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased",padding=True, truncation=True)

In [20]:
def preprocess_function(examples):
    return tokenizer(examples["Preprocessed_Symptoms"], truncation=True)

In [21]:
tokenized_disease_data = disease_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/477 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/53 [00:00<?, ? examples/s]

In [22]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### Fine tuning distilbert Classification model

In [25]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=420 ,id2label=id2label, label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# evaluation metric definition
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [None]:
## setting traning arguments
training_args = TrainingArguments(
    output_dir="myproject",
    learning_rate=2e-5,
    num_train_epochs=70,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    push_to_hub=False,
)



trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_disease_data["train"],
    eval_dataset=tokenized_disease_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

## Fine tuning the model
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,6.841188,0.358491
2,No log,7.038105,0.358491
3,No log,7.230175,0.358491
4,No log,7.3315,0.358491
5,No log,7.509328,0.358491
6,No log,7.653702,0.358491
7,No log,7.777401,0.358491
8,No log,7.845932,0.358491
9,2.412600,7.968329,0.358491
10,2.412600,8.072684,0.358491


Checkpoint destination directory myproject/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory myproject/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory myproject/checkpoint-1500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=4200, training_loss=0.6137873322623116, metrics={'train_runtime': 150.3379, 'train_samples_per_second': 222.1, 'train_steps_per_second': 27.937, 'total_flos': 750466249730640.0, 'train_loss': 0.6137873322623116, 'epoch': 70.0})

### Saving fine tuned model to hugging face

In [30]:
trainer.push_to_hub("shobana/myproject")

model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

events.out.tfevents.1714434596.ip-10-192-12-195.2161.1:   0%|          | 0.00/54.8k [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

events.out.tfevents.1714434521.ip-10-192-12-195.2161.0:   0%|          | 0.00/41.0k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.66k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/shobana/myproject/commit/ba5c0ed69dc3266b3fe07509e9f4eef6e53e8a34', commit_message='shobana/myproject', commit_description='', oid='ba5c0ed69dc3266b3fe07509e9f4eef6e53e8a34', pr_url=None, pr_revision=None, pr_num=None)

### Inferencing on test data

In [13]:
test_dataset = load_dataset(
    "csv",
    data_files={
        "test": "test.csv",
    },
    split= "test[:52]"
)

In [17]:


classifier = pipeline("text-classification", model="shobana/myproject")

total_accuracy = 0
final_results = []
for out in classifier(KeyDataset(test_dataset, "Preprocessed_Symptoms")):
    final_results.append(out)
    total_accuracy+=out['score']



In [19]:
final_results

[{'label': 'Type 2 diabetes', 'score': 0.15508708357810974},
 {'label': 'Cutaneous squamous-cell carcinoma', 'score': 0.06562843173742294},
 {'label': 'Renal cell carcinoma', 'score': 0.08458136767148972},
 {'label': 'Vomiting', 'score': 0.9090529680252075},
 {'label': 'Anaplastic thyroid cancer', 'score': 0.09260102361440659},
 {'label': 'Ear pain', 'score': 0.09686202555894852},
 {'label': "Paget's disease of the breast", 'score': 0.04266057908535004},
 {'label': 'Chronic obstructive pulmonary disease',
  'score': 0.19403842091560364},
 {'label': 'Histoplasmosis', 'score': 0.04510360211133957},
 {'label': 'Cluster headache', 'score': 0.9783090949058533},
 {'label': 'Dermatomyositis', 'score': 0.20889967679977417},
 {'label': 'Fungal infection', 'score': 0.0639975368976593},
 {'label': 'Chronic obstructive pulmonary disease',
  'score': 0.36362871527671814},
 {'label': 'Melanoma', 'score': 0.9756152033805847},
 {'label': 'Soft-tissue sarcoma', 'score': 0.10207521170377731},
 {'label':

In [22]:
print("Test accuracy on the Language model is",total_accuracy/len(test_dataset)*100)

Test accuracy on the Language model is 44.764273803537854
