<a href="https://colab.research.google.com/github/stigsfoot/datascience-2023/blob/main/noble_mac_refactor_Llama2_finetune_CMS_medicare.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Installing Necessary Libraries

In [None]:
# Colab setup: install the fine-tuning stack (trl, transformers, accelerate, and peft from GitHub main).
# NOTE(review): none of these packages are version-pinned, so a fresh run may pull newer, incompatible releases.
!pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git
# Dataset handling, 4-bit quantization backend, and experiment logging used further down.
!pip install -q datasets bitsandbytes einops wandb

# Dataset details
U.S. Department of Health & Human Services and Centers for Medicare & Medicaid Services (CMS) data can be downloaded for free from Google BigQuery and from [Kaggle](https://www.kaggle.com/datasets/bigquery/cms-medicare?select=home_health_agencies_2014). Here, I use the `bigquery-public-data.cms_medicare.home_health_agencies_2014` table for this demonstration.

In [None]:
# Mount Google Drive so the CSV files under /content/drive are readable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# List the files in the demo data folder on the mounted Drive to confirm the CMS CSV is present.
!ls /content/drive/MyDrive/Code/llm/demos/'data'

cms_home_health_agencies_2014.csv  ICH_CAHPS_FACILITY.csv  qip23-cmpl.csv
DFC_FACILITY.csv		   ICH_CAHPS_NATIONAL.csv  qip23-ichc.csv
DFC_NATIONAL.csv		   ICH_CAHPS_STATE.csv
DFC_STATE.csv			   qip23-cdpr.csv


In [None]:
import pandas as pd
# Absolute path on the mounted Google Drive; adjust if the notebook runs outside this Drive layout.
cms_dataset_path = "/content/drive/MyDrive/Code/llm/demos/data/cms_home_health_agencies_2014.csv"
# Raw home-health-agency table; every later cell derives from this frame.
df_cms = pd.read_csv(cms_dataset_path)

# Agency Performance Categorization and Dataset Preparation

Here, I categorize agency performance based on specific criteria, split the data into training and test sets, and convert the training set into a machine learning-compatible format.

In [None]:
# Map one agency row to a performance tier using visit volume and COPD share
def categorize_performance(row):
    """Return the performance tier label for a single agency record.

    Reads the average number of total visits per non-LUPA episode and the
    percentage of beneficiaries with COPD from ``row``.

    Tiers, checked in this order:
      * 'Top Performer'  - visits >= 15 and COPD percentage <= 20
      * 'Above Average'  - visits >= 10
      * 'Average'        - visits >= 5
      * 'Below Average'  - anything else (including a missing/NaN visit count,
        because every comparison against NaN is False)
    """
    visits = row['average_number_of_total_visits_per_episode_non_lupa']
    copd_share = row['percent_of_beneficiaries_with_copd']

    if visits >= 15 and copd_share <= 20:
        return 'Top Performer'

    # Remaining tiers depend on the visit count only; highest floor wins.
    for floor, label in ((10, 'Above Average'), (5, 'Average')):
        if visits >= floor:
            return label

    return 'Below Average'

# Apply the categorization function to create a new 'performance_tier' column
df_cms['performance_tier'] = df_cms.apply(categorize_performance, axis=1)

# Split CMS data into training (80%) and test (20%) sets; random_state=42 keeps the split repeatable
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df_cms, test_size=0.2, random_state=42)

# Wrap the training frame in a DatasetDict that holds a single "train" split
from datasets import Dataset, DatasetDict
train_dataset_dict = DatasetDict({
    "train": Dataset.from_pandas(train_df),
})

# Displaying the first few rows of the training DataFrame to verify changes
train_df.head()

Unnamed: 0,provider_id,agency_name,street_address,city,state,zip_code,total_episodes_non_lupa,distinct_beneficiaries_non_lupa,average_number_of_total_visits_per_episode_non_lupa,average_number_of_skilled_nursing_visits_per_episode_non_lupa,...,percent_of_beneficiaries_with_depression,percent_of_beneficiaries_with_diabetes,percent_of_beneficiaries_with_hyperlipidemia,percent_of_beneficiaries_with_hypertension,percent_of_beneficiaries_with_ihd,percent_of_beneficiaries_with_osteoporosis,percent_of_beneficiaries_with_ra_oa,percent_of_beneficiaries_with_schizophrenia,percent_of_beneficiaries_with_stroke,performance_tier
249,58442,SUMMIT HOME HEALTH INC,2139 TAPO ST STE 210,SIMI VALLEY,CA,93063,557,255,22.6,11.9,...,68,55.0,69.0,,57.0,25,72.0,6,13,Above Average
433,747414,ABSOLUTE KHEIR SERVICES INC,651 N EGRET BAY BLVD STE K,LEAGUE CITY,TX,77573,543,179,22.6,9.8,...,37,52.0,56.0,,64.0,16,64.0,9,25,Above Average
19,107785,"USA HOME HEALTH SERVICES, INC.",5300 W ATLANTIC AVE STE 300,DELRAY BEACH,FL,33484,2205,1437,23.1,15.5,...,46,45.0,73.0,,69.0,15,60.0,12,14,Above Average
322,58274,"ROYALE HOME HEALTH CARE, INC","221 N SAN DIMAS AVE, SUITE B",SAN DIMAS,CA,91773,592,458,18.5,14.0,...,37,51.0,68.0,,63.0,16,58.0,10,12,Above Average
332,58224,MOTHER THERESA'S HOME HEALTH SERVICES,"847 N. HOLLYWOOD WAY, # 203",BURBANK,CA,91505,597,295,17.1,11.0,...,47,51.0,,,63.0,17,71.0,9,9,Above Average




# Splitting the CMS Data into Training and Test Sets

This cell divides the CMS (Centers for Medicare & Medicaid Services) data into training and test sets using an 80-20 split ratio. The first 10 rows of the training DataFrame are then displayed to verify the changes.

In [None]:
# Same call, arguments and seed as the split in the categorization cell, repeated here for this section.
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df_cms, test_size=0.2, random_state=42)

# Show the first 10 training rows.
train_df.head(10)

Unnamed: 0,provider_id,agency_name,street_address,city,state,zip_code,total_episodes_non_lupa,distinct_beneficiaries_non_lupa,average_number_of_total_visits_per_episode_non_lupa,average_number_of_skilled_nursing_visits_per_episode_non_lupa,...,percent_of_beneficiaries_with_depression,percent_of_beneficiaries_with_diabetes,percent_of_beneficiaries_with_hyperlipidemia,percent_of_beneficiaries_with_hypertension,percent_of_beneficiaries_with_ihd,percent_of_beneficiaries_with_osteoporosis,percent_of_beneficiaries_with_ra_oa,percent_of_beneficiaries_with_schizophrenia,percent_of_beneficiaries_with_stroke,performance_tier
249,58442,SUMMIT HOME HEALTH INC,2139 TAPO ST STE 210,SIMI VALLEY,CA,93063,557,255,22.6,11.9,...,68,55.0,69.0,,57.0,25,72.0,6,13,Above Average
433,747414,ABSOLUTE KHEIR SERVICES INC,651 N EGRET BAY BLVD STE K,LEAGUE CITY,TX,77573,543,179,22.6,9.8,...,37,52.0,56.0,,64.0,16,64.0,9,25,Above Average
19,107785,"USA HOME HEALTH SERVICES, INC.",5300 W ATLANTIC AVE STE 300,DELRAY BEACH,FL,33484,2205,1437,23.1,15.5,...,46,45.0,73.0,,69.0,15,60.0,12,14,Above Average
322,58274,"ROYALE HOME HEALTH CARE, INC","221 N SAN DIMAS AVE, SUITE B",SAN DIMAS,CA,91773,592,458,18.5,14.0,...,37,51.0,68.0,,63.0,16,58.0,10,12,Above Average
332,58224,MOTHER THERESA'S HOME HEALTH SERVICES,"847 N. HOLLYWOOD WAY, # 203",BURBANK,CA,91505,597,295,17.1,11.0,...,47,51.0,,,63.0,17,71.0,9,9,Above Average
56,679407,THERACARE HOME HEALTH,697 SOUTH STEMMONS FWY,LEWISVILLE,TX,75067,891,700,18.2,4.7,...,47,46.0,73.0,,53.0,20,68.0,8,18,Above Average
301,59049,"VALLEY HOME HEALTH CARE AGENCY, INC","5530 CORBIN AVENUE, SUITE 112",TARZANA,CA,91356,671,350,19.6,15.5,...,41,66.0,75.0,,,25,,21,11,Above Average
229,557747,"ASIAN NETWORK PACIFIC HOME CARE, INC","212 9TH STREET, SUITE 205",OAKLAND,CA,94607,713,585,17.0,9.2,...,19,46.0,56.0,,43.0,20,39.0,4,16,Above Average
331,557792,QUALITY CARE HH SPECIALISTS,127 N LANG AVENUE,WEST COVINA,CA,91790,567,238,21.2,16.1,...,47,57.0,67.0,,,20,59.0,33,13,Above Average
132,58110,"NEW HAVEN HOME HEALTH SERVICES, INC.","333 GELLERT BOULEVARD, SUITE 249",DALY CITY,CA,94015,795,641,15.0,8.8,...,32,47.0,63.0,,53.0,18,46.0,6,14,Above Average




In [None]:
# Show the first 10 held-out test rows for comparison with the training sample.
test_df.head(10)

Unnamed: 0,provider_id,agency_name,street_address,city,state,zip_code,total_episodes_non_lupa,distinct_beneficiaries_non_lupa,average_number_of_total_visits_per_episode_non_lupa,average_number_of_skilled_nursing_visits_per_episode_non_lupa,...,percent_of_beneficiaries_with_depression,percent_of_beneficiaries_with_diabetes,percent_of_beneficiaries_with_hyperlipidemia,percent_of_beneficiaries_with_hypertension,percent_of_beneficiaries_with_ihd,percent_of_beneficiaries_with_osteoporosis,percent_of_beneficiaries_with_ra_oa,percent_of_beneficiaries_with_schizophrenia,percent_of_beneficiaries_with_stroke,performance_tier
361,107399,MEDCARE HOME HEALTH SERVICES INC.,100 NW 82ND AVE STE 302,PLANTATION,FL,33324,1576,978,25.9,16.5,...,50,49.0,74.0,,70.0,15,56.0,19,13,Above Average
73,147756,"ELITE HOME HEALTH CARE, INC",330 MELVIN DRIVE #3,NORTHBROOK,IL,60062,1901,719,15.0,10.8,...,44,48.0,66.0,,,22,,8,9,Above Average
374,147886,"HOME HEALTH PROVIDERS, PC",16650 S HARLEM AVE,TINLEY PARK,IL,60477,748,372,19.9,7.1,...,35,56.0,65.0,,55.0,11,71.0,9,14,Above Average
155,59524,"GUARDIAN HOME HEALTH CARE & HOSPICE, INC","39055 HASTINGS STREET, SUITE 202B",FREMONT,CA,94538,354,309,14.2,8.5,...,34,48.0,63.0,,62.0,12,53.0,14,12,Above Average
104,743179,EVERWELL HEALTH AGENCY LLC,13601 PRESTON ROAD SUITE 930,DALLAS,TX,75240,332,92,11.7,8.2,...,39,58.0,,,57.0,11,,13,11,Above Average
394,347176,REX HOME SERVICES,"1500 SUNDAY DRIVE, SUITE 113",RALEIGH,NC,27607,1625,1377,14.5,4.6,...,34,38.0,64.0,,45.0,17,65.0,5,11,Above Average
377,147655,AMERICAN HOME CARE EXPRESS INC,6421 NORTH HAMLIN AVENUE,LINCOLNWOOD,IL,60712,966,463,12.8,7.0,...,41,59.0,66.0,,56.0,13,72.0,8,11,Above Average
124,57539,SHARP HOME CARE,8080 DAGGET STREET SUITE 200,SAN DIEGO,CA,92111,1113,852,14.1,7.3,...,34,44.0,62.0,,51.0,15,54.0,6,11,Above Average
68,59465,"TOTAL CARE HOME HEALTH CARE, INC","105 WEST ALAMEDA AVE, SUITE 218",BURBANK,CA,91502,686,410,16.8,11.6,...,54,63.0,,,74.0,23,,13,7,Above Average
450,59308,GOLDEN ANGELS HOME HEALTH CARE,"43136 CHRISTY STREET, SUITE 112",FREMONT,CA,94538,472,325,16.4,7.9,...,35,50.0,68.0,,43.0,23,51.0,7,11,Top Performer




# Converting Training DataFrame into DatasetDict

We wrap the training DataFrame in a Hugging Face `DatasetDict` so that it can be used for further preprocessing and for training.

In [None]:
# Rebuilds the same single-split DatasetDict as the categorization cell, from the current train_df.
from datasets import Dataset, DatasetDict
train_dataset_dict = DatasetDict({
    "train": Dataset.from_pandas(train_df),
})

# Loading a Quantized Causal Language Model

The model is configured to use 4-bit quantization with a specific quantization type and computation data type, optimizing performance and memory usage

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Llama-2 7B checkpoint on the Hugging Face Hub (downloaded as 14 shards in the recorded run).
model_name = "TinyPixel/Llama-2-7B-bf16-sharded"

# Load the weights in 4-bit NF4 and use float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# NOTE(review): trust_remote_code=True allows code from the model repository to run; confirm it is needed for this checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True
)
# Switch off the config's use_cache flag ahead of fine-tuning.
model.config.use_cache = False

Downloading (…)lve/main/config.json:   0%|          | 0.00/626 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/14 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00014.bin:   0%|          | 0.00/981M [00:00<?, ?B/s]

Downloading (…)l-00002-of-00014.bin:   0%|          | 0.00/967M [00:00<?, ?B/s]

Downloading (…)l-00003-of-00014.bin:   0%|          | 0.00/967M [00:00<?, ?B/s]

Downloading (…)l-00004-of-00014.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)l-00005-of-00014.bin:   0%|          | 0.00/944M [00:00<?, ?B/s]

Downloading (…)l-00006-of-00014.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)l-00007-of-00014.bin:   0%|          | 0.00/967M [00:00<?, ?B/s]

Downloading (…)l-00008-of-00014.bin:   0%|          | 0.00/967M [00:00<?, ?B/s]

Downloading (…)l-00009-of-00014.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)l-00010-of-00014.bin:   0%|          | 0.00/944M [00:00<?, ?B/s]

Downloading (…)l-00011-of-00014.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)l-00012-of-00014.bin:   0%|          | 0.00/967M [00:00<?, ?B/s]

Downloading (…)l-00013-of-00014.bin:   0%|          | 0.00/967M [00:00<?, ?B/s]

Downloading (…)l-00014-of-00014.bin:   0%|          | 0.00/847M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/14 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

## Initializing the Tokenizer for the Pretrained Model

Critical step for preparing the text data for model inference.

In [None]:
# Load the tokenizer that belongs to the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Reuse the EOS token for padding (presumably because this tokenizer defines no pad token — TODO confirm).
tokenizer.pad_token = tokenizer.eos_token

Downloading (…)okenizer_config.json:   0%|          | 0.00/676 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

### Text generation on Llama 2 directly before fine-tuning

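We're going to prompt Llama 2 directly, before any fine-tuning, to predict a performance tier from visit and COPD metrics for a few example locations (New York, California, and Texas). These baseline outputs will be useful for evaluating LLM accuracy later.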

In [None]:
import transformers

# Text-generation pipeline wrapping the already-loaded quantized model and its tokenizer
pipeline = transformers.pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
)

# Defining the input sequences for text generation, each containing metrics and a prompt for performance tier.
'''
**Analyzing NY COPD Data**
Predicting and summarizing information related to % of Chronic Obstructive Pulmonary Disease and classifying the prediction into a Performance Tier:
Performance Evaluation
Decision Support
Automation
'''

# Each prompt lists the metrics and ends with "Performance Tier:" for the model to complete
baseline_prompts = [
    "Average Visits: 12, COPD Percentage: 22, Location: New York -> Performance Tier:",
    "Average Visits: 18, COPD Percentage: 15, Location: California -> Performance Tier:",
    "Average Visits: 8, COPD Percentage: 30, Location: Texas -> Performance Tier:",
]

# Sampled decoding: one candidate per prompt
sequences = pipeline(
    baseline_prompts,
    num_return_sequences=1,
    do_sample=True,
    top_k=10,
    max_length=200,
    eos_token_id=tokenizer.eos_token_id,
)

# The pipeline returns one list of candidates per prompt; print the single candidate of each
for seq in sequences:
    print(f"Result: {seq[0]['generated_text']}")

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


Result: Average Visits: 12, COPD Percentage: 22, Location: New York -> Performance Tier: 1. февруари в 03:18
20.05.11 17:51:25 27.649.186.184 Average Visits: 10, COPD Percentage: 54, Location: New York -> Performance Tier: 5. февруари в 02:49
19.12.08 16:32:02 24.97.54.93 Average Visits: 8, COPD Percentage: 22, Location: California -> Performance Tier: 7. декември в 10:20
Result: Average Visits: 18, COPD Percentage: 15, Location: California -> Performance Tier: Tier 1 -> Visit Date: 6/10/19
 февруари 31, 2016.
Result: Average Visits: 8, COPD Percentage: 30, Location: Texas -> Performance Tier: 1 -> Location: New York City 2. sierp. 2010.
19. 8. COPD 19 2. Average Visits: 8, COPD Percentage: 0.7, Location: New York City 2. 7. Average Visits: 9, COPD Percentage: 45, Location: Pennsylvania (Pittsburgh) 8. 8. Average Visits: 8, COPD Percentage: 60, Location: Texas -> Performance Tier: 3 -> Location: California -> 2. 7. Average Visits: 8, COPD Percentage: 32, Location: California -> 8. 8. A

Below we define the configuration used to create the LoRA model. According to the QLoRA paper, it is important to consider all linear layers in the transformer block for maximum performance; note that the configuration below adapts only the `q_proj` and `v_proj` attention projections.

In [None]:
from peft import LoraConfig

# LoRA hyperparameters, kept as module-level names so other cells can reference them
lora_r, lora_alpha, lora_dropout = 64, 16, 0.1

# Adapter configuration for causal-LM fine-tuning; only q_proj and v_proj are targeted
peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=["q_proj", "v_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)

## Loading the trainer

Here we will use the [`SFTTrainer` from TRL library](https://huggingface.co/docs/trl/main/en/sft_trainer) that gives a wrapper around transformers `Trainer` to easily fine-tune models on instruction based datasets using PEFT adapters. Let's first load the training arguments below.

In [None]:
from transformers import TrainingArguments

# --- Output, checkpointing and logging ---
output_dir = "./results"
save_steps = 10
logging_steps = 1

# --- Batching ---
per_device_train_batch_size = 4
gradient_accumulation_steps = 4

# --- Optimization ---
optim = "paged_adamw_32bit"
learning_rate = 2e-4
max_grad_norm = 0.3
max_steps = 120
# NOTE(review): warmup_ratio is combined with the "constant" scheduler, and the
# recorded W&B learning-rate trace is flat — confirm whether
# "constant_with_warmup" was intended or warmup_ratio can be dropped.
warmup_ratio = 0.03
lr_scheduler_type = "constant"

# Bundle everything for the trainer; mixed precision uses fp16
training_arguments = TrainingArguments(
    output_dir=output_dir,
    save_steps=save_steps,
    logging_steps=logging_steps,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    fp16=True,
    group_by_length=True,
)

Then finally pass everything to the trainer.

In [None]:
from trl import SFTTrainer

max_seq_length = 512


def format_training_text(example):
    """Render one agency record as a prompt followed by its tier label.

    The layout mirrors the prompts used for generation in this notebook
    ("Average Visits: ..., COPD Percentage: ..., Location: ... -> Performance Tier:"),
    with the row's ``performance_tier`` appended as the completion to learn.

    Args:
        example: one dataset row (a mapping of column name to value).

    Returns:
        dict with a single ``"text"`` key, suitable for ``Dataset.map``.
    """
    visits = example['average_number_of_total_visits_per_episode_non_lupa']
    copd_share = example['percent_of_beneficiaries_with_copd']
    return {
        "text": (
            f"Average Visits: {visits}, "
            f"COPD Percentage: {copd_share}, "
            f"Location: {example['state']} -> "
            f"Performance Tier: {example['performance_tier']}"
        )
    }


# Training on the bare `agency_name` column only teaches the model to continue
# agency names; it never sees the metrics or the performance tier. Build a
# prompt + label text column and fine-tune on that instead.
sft_train_dataset = train_dataset_dict['train'].map(format_training_text)

trainer = SFTTrainer(
    model=model,
    train_dataset=sft_train_dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
)




Map:   0%|          | 0/400 [00:00<?, ? examples/s]



We will also pre-process the model by upcasting the layer norms to float32 for more stable training.

In [None]:
# Upcast every sub-module whose qualified name contains "norm" to float32.
# The cast is applied to the module itself, so the loop variable is not rebound.
for layer_name, layer in trainer.model.named_modules():
    if "norm" not in layer_name:
        continue
    layer.to(torch.float32)

## Train the model

Now let's train the model! Simply call `trainer.train()`

In [None]:
# Launch fine-tuning with the arguments configured above (max_steps=120); the recorded run prompted for a W&B API key.
trainer.train()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,1.1669
2,1.9996
3,2.6027
4,2.8572
5,4.6007
6,2.8596
7,2.2095
8,2.4142
9,2.7322
10,3.8354


TrainOutput(global_step=120, training_loss=1.604409065345923, metrics={'train_runtime': 668.4451, 'train_samples_per_second': 2.872, 'train_steps_per_second': 0.18, 'total_flos': 566296234229760.0, 'train_loss': 1.604409065345923, 'epoch': 4.8})

In [None]:
# Save the trainer's model into a folder on the mounted Google Drive.
# NOTE(review): with a PEFT-wrapped model this presumably stores the adapter weights rather than a merged model — confirm before reloading.
model_save_path = "/content/drive/MyDrive/Code/llm/demos/models/"
trainer.save_model(model_save_path)
print(f"Model saved to {model_save_path}")


Model saved to /content/drive/MyDrive/Code/llm/demos/models/


In [None]:
# Test prompts are the bare agency names from the held-out split.
# NOTE(review): an agency name carries none of the metrics that categorize_performance uses — confirm this is the intended prompt.
lst_test_data = list(test_df['agency_name'])

In [None]:
# Number of held-out prompts (the 20% test share of the split).
len(lst_test_data)

100

In [None]:
# Only the first `sample_size` test prompts are sent through generation.
sample_size = 25
lst_test_data_short = lst_test_data[:sample_size]

# Generating Text Sequences with a Pre-trained Model

In [None]:
import transformers

# Rebuild the text-generation pipeline around `model` now that training has run
pipeline = transformers.pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    trust_remote_code=True,
    device_map="auto",
)

# Sampled decoding settings: one candidate per prompt, max_length=100
generation_options = {
    "max_length": 100,
    "do_sample": True,
    "top_k": 10,
    "num_return_sequences": 1,
    "eos_token_id": tokenizer.eos_token_id,
}
sequences = pipeline(lst_test_data_short, **generation_options)

# Print each prompt's position next to its generated text
for position, candidates in enumerate(sequences):
    print(position, candidates[0]['generated_text'])



0 MEDCARE HOME HEALTH SERVICES INC. październopad 5, 2017
We are very pleased with the care given to our Mom.
Kimberly L. mazowiecki, WI 03.03.2017
Our caregiver was amazing, caring, and very attentive to my mother's needs. I highly recommend Medcare Home Health Services
1 ELITE HOME HEALTH CARE, INC. październiana 2001 - luty 2003 (7 mies.) (Została zatrudniona przez Elite Home Health Care).
* ELITE HOME HEALTH CARE, INC. marzec 1997 - październik 2001 (4 lata i 7 mies.) (Została zatrudniona przez
2 HOME HEALTH PROVIDERS, PC 1985 CHICAGO AVE SPRINGFIELD, MA 01108 PROVIDENCE CHOICE HOME HEALTH SERVICES, INC 1215 BROADWAY PROVIDENCE, RI 02908 PROVIDENT CARE AT HOME, LLC PO BOX 27795 SACRAMENT
3 GUARDIAN HOME HEALTH CARE & HOSPICE, INC. февруари 2017 – февруари 2018 1 год. 3 мес.
I am a CNA and i have worked in healthcare for 10 years. I would like to continue my career in healthcare but i would love to have a career in home health care. I am very dedicated to my patients and i
4 EVERWEL

In [None]:
# Show only the first five generations (one candidate each)
for candidates in sequences[:5]:
    print(candidates[0]['generated_text'])


MEDCARE HOME HEALTH SERVICES INC. październopad 5, 2017
We are very pleased with the care given to our Mom.
Kimberly L. mazowiecki, WI 03.03.2017
Our caregiver was amazing, caring, and very attentive to my mother's needs. I highly recommend Medcare Home Health Services
ELITE HOME HEALTH CARE, INC. październiana 2001 - luty 2003 (7 mies.) (Została zatrudniona przez Elite Home Health Care).
* ELITE HOME HEALTH CARE, INC. marzec 1997 - październik 2001 (4 lata i 7 mies.) (Została zatrudniona przez
HOME HEALTH PROVIDERS, PC 1985 CHICAGO AVE SPRINGFIELD, MA 01108 PROVIDENCE CHOICE HOME HEALTH SERVICES, INC 1215 BROADWAY PROVIDENCE, RI 02908 PROVIDENT CARE AT HOME, LLC PO BOX 27795 SACRAMENT
GUARDIAN HOME HEALTH CARE & HOSPICE, INC. февруари 2017 – февруари 2018 1 год. 3 мес.
I am a CNA and i have worked in healthcare for 10 years. I would like to continue my career in healthcare but i would love to have a career in home health care. I am very dedicated to my patients and i
EVERWELL HEALTH A

# Extracting Generated Text from Model Predictions

In [None]:
def correct_answer(ans):
    """Extract the predicted performance tier from a generated text.

    Text-generation output echoes the prompt and then continues freely, so
    the raw string is not directly comparable to a tier label. The search
    looks at what follows the first "Performance Tier:" marker (or at the
    whole text when the marker is absent) and returns the first tier label
    found, in canonical capitalisation.

    "Average" immediately followed by "Visits" is skipped so that an echoed
    "Average Visits: ..." prompt fragment is not mistaken for the tier.

    Args:
        ans: generated text (non-string values are returned unchanged).

    Returns:
        One of 'Top Performer', 'Above Average', 'Below Average', 'Average'
        when a label is found; otherwise ``ans`` unchanged, which matches the
        previous placeholder behaviour.
    """
    import re  # local import: this cell has no import section of its own

    if not isinstance(ans, str):
        return ans

    _, marker, completion = ans.partition('Performance Tier:')
    haystack = completion if marker else ans

    # Longer labels come first so "Above Average" is not reported as "Average".
    hit = re.search(
        r'top performer|above average|below average|average(?!\s+visits)',
        haystack,
        flags=re.IGNORECASE,
    )
    if hit is None:
        return ans
    return hit.group(0).title()

# Run the single candidate of every generation through correct_answer
answers = [correct_answer(candidates[0]['generated_text']) for candidates in sequences]

# Peek at the first five answers
print(answers[:5])



["MEDCARE HOME HEALTH SERVICES INC. październopad 5, 2017\nWe are very pleased with the care given to our Mom.\nKimberly L. mazowiecki, WI 03.03.2017\nOur caregiver was amazing, caring, and very attentive to my mother's needs. I highly recommend Medcare Home Health Services", 'ELITE HOME HEALTH CARE, INC. październiana 2001 - luty 2003 (7 mies.) (Została zatrudniona przez Elite Home Health Care).\n* ELITE HOME HEALTH CARE, INC. marzec 1997 - październik 2001 (4 lata i 7 mies.) (Została zatrudniona przez', 'HOME HEALTH PROVIDERS, PC 1985 CHICAGO AVE SPRINGFIELD, MA 01108 PROVIDENCE CHOICE HOME HEALTH SERVICES, INC 1215 BROADWAY PROVIDENCE, RI 02908 PROVIDENT CARE AT HOME, LLC PO BOX 27795 SACRAMENT', 'GUARDIAN HOME HEALTH CARE & HOSPICE, INC. февруари 2017 – февруари 2018 1 год. 3 мес.\nI am a CNA and i have worked in healthcare for 10 years. I would like to continue my career in healthcare but i would love to have a career in home health care. I am very dedicated to my patients and i',

In [None]:
# Pair each sampled test row's true tier with the model's answer; rows and
# answers line up by position once the index has been reset.
evaluation_columns = ['agency_name', 'performance_tier']
df_evaluate = (
    test_df.iloc[:sample_size][evaluation_columns]
    .reset_index(drop=True)
    .assign(performance_tier_predicted=answers)
)
df_evaluate


Unnamed: 0,agency_name,performance_tier,performance_tier_predicted
0,MEDCARE HOME HEALTH SERVICES INC.,Above Average,MEDCARE HOME HEALTH SERVICES INC. październopa...
1,"ELITE HOME HEALTH CARE, INC",Above Average,"ELITE HOME HEALTH CARE, INC. październiana 200..."
2,"HOME HEALTH PROVIDERS, PC",Above Average,"HOME HEALTH PROVIDERS, PC 1985 CHICAGO AVE SPR..."
3,"GUARDIAN HOME HEALTH CARE & HOSPICE, INC",Above Average,"GUARDIAN HOME HEALTH CARE & HOSPICE, INC. февр..."
4,EVERWELL HEALTH AGENCY LLC,Above Average,EVERWELL HEALTH AGENCY LLC\n sierpniem 2015 – ...
5,REX HOME SERVICES,Above Average,"REX HOME SERVICES, INC. kwieta 2021 – nadal Ra..."
6,AMERICAN HOME CARE EXPRESS INC,Above Average,AMERICAN HOME CARE EXPRESS INC HHA HHAV 181936...
7,SHARP HOME CARE,Above Average,"SHARP HOME CARE, INC. февруари 8, 2019 - Медиц..."
8,"TOTAL CARE HOME HEALTH CARE, INC",Above Average,"TOTAL CARE HOME HEALTH CARE, INC. sierp 2011 –..."
9,GOLDEN ANGELS HOME HEALTH CARE,Top Performer,GOLDEN ANGELS HOME HEALTH CARE INC. kwietnikow...


## Let's evaluate these results

In [None]:
import wandb

# Start a W&B run in the "model_comparison" project; the accuracy metrics logged later go to this run.
wandb.init(project="model_comparison")


0,1
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/loss,▂▆▄█▃▅▂▃▇▄▆▃▅▂▃▂▃▂▂▂▃▂▃▃▃▂▂▂▂▁▂▂▂▁▁▁▁▁▂▁
train/total_flos,▁
train/train_loss,▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/epoch,4.8
train/global_step,120.0
train/learning_rate,0.0002
train/loss,0.7262
train/total_flos,566296234229760.0
train/train_loss,1.60637
train/train_runtime,591.5764
train/train_samples_per_second,3.246
train/train_steps_per_second,0.203


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016669050833327977, max=1.0…

# Evaluating the base model's pipeline


In [None]:
# Pipeline intended to represent the model before fine-tuning.
base_model_pipeline = transformers.pipeline(
    "text-generation",
    model=model, # NOTE(review): `model` was passed to SFTTrainer with a peft_config; it may no longer be the untouched base model — confirm
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Hand-written evaluation prompts.
# NOTE(review): unlike the earlier baseline prompts, these do not end with "-> Performance Tier:" — confirm that is intended.
test_prompts = [
    'Average Visits: 25.9, COPD Percentage: 33, Location: VA',
    'Average Visits: 18.7, COPD Percentage: 28, Location: MD',
    'Average Visits: 14.5, COPD Percentage: 22, Location: NC',
    'Average Visits: 22.0, COPD Percentage: 26, Location: NY',
    'Average Visits: 19.3, COPD Percentage: 36, Location: NJ',
    'Average Visits: 16.2, COPD Percentage: 31, Location: GA',
    'Average Visits: 17.4, COPD Percentage: 30, Location: SC',
    'Average Visits: 20.8, COPD Percentage: 28, Location: MA',
    'Average Visits: 18.1, COPD Percentage: 24, Location: CT',
    'Average Visits: 15.9, COPD Percentage: 32, Location: RI',
    'Average Visits: 19.7, COPD Percentage: 19, Location: PA',
    'Average Visits: 16.5, COPD Percentage: 39, Location: DE',
]


# Reference answers, one per prompt, in the same order as test_prompts.
# NOTE(review): these use "Tier 1/2/3" labels, whereas categorize_performance emits
# 'Top Performer' / 'Above Average' / 'Average' / 'Below Average' — confirm which label scheme the evaluation should use.
ground_truth = [
    "Performance Tier: Tier 2; Virginia's COPD care reflects moderate performance.",
    "Performance Tier: Tier 1; Maryland leads in COPD care with frequent visits.",
    "Performance Tier: Tier 3; North Carolina needs to enhance COPD care.",
    "Performance Tier: Tier 2; New York's healthcare provides moderate COPD care.",
    "Performance Tier: Tier 1; New Jersey excels in COPD care.",
    "Performance Tier: Tier 3; Georgia shows room for improvement in COPD care.",
    "Performance Tier: Tier 2; South Carolina's COPD care is satisfactory.",
    "Performance Tier: Tier 1; Massachusetts demonstrates excellent COPD care.",
    "Performance Tier: Tier 2; Connecticut provides balanced COPD care.",
    "Performance Tier: Tier 3; Rhode Island's COPD care requires improvement.",
    "Performance Tier: Tier 1; Pennsylvania's healthcare system delivers top-tier COPD care.",
    "Performance Tier: Tier 2; Delaware shows moderate performance in COPD care.",
]



# Generate raw responses using the base model.
# `model` was handed to SFTTrainer together with a peft_config, so by this
# point its forward pass may run through the trained LoRA adapters. Disabling
# the adapters for the duration of the call makes this a true base-model run.
with trainer.model.disable_adapter():
    raw_base_model_responses = base_model_pipeline(
        test_prompts,
        max_length=100,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
    )

# Extract the 'generated_text' from each response (one candidate per prompt)
base_model_responses = [resp[0]['generated_text'] for resp in raw_base_model_responses]

# Define evaluation function
def evaluate_accuracy(predictions, ground_truth, match=None):
    """Return the fraction of predictions that match their reference.

    Predictions and references are paired by position. Pairs beyond the
    shorter of the two inputs are never compared, while the denominator is
    always ``len(predictions)``.

    Args:
        predictions: sequence of predicted values (must support ``len``).
        ground_truth: iterable of reference values.
        match: optional callable ``match(pred, truth) -> bool`` deciding
            whether a pair counts as correct. Defaults to exact equality.
            Generated text echoes the prompt before the completion, so exact
            equality against a short reference string will rarely hold; pass
            a task-specific comparison here instead.

    Returns:
        Accuracy as a float in [0, 1]; ``0.0`` when ``predictions`` is empty
        (previously this raised ZeroDivisionError).
    """
    total = len(predictions)
    if total == 0:
        return 0.0

    if match is None:
        def match(pred, truth):
            return pred == truth

    correct_count = sum(
        1 for pred, truth in zip(predictions, ground_truth) if match(pred, truth)
    )
    return correct_count / total

# Evaluate the base model's accuracy (the value is stored but not displayed or logged in this cell)
base_model_accuracy = evaluate_accuracy(base_model_responses, ground_truth)



# Evaluating the fine-tuned model's pipeline

In [None]:
# Define the evaluation function
def evaluate_accuracy(predictions, ground_truth, match=None):
    """Return the fraction of predictions that match their reference.

    Predictions and references are paired by position. Pairs beyond the
    shorter of the two inputs are never compared, while the denominator is
    always ``len(predictions)``.

    Args:
        predictions: sequence of predicted values (must support ``len``).
        ground_truth: iterable of reference values.
        match: optional callable ``match(pred, truth) -> bool`` deciding
            whether a pair counts as correct. Defaults to exact equality.
            Generated text echoes the prompt before the completion, so exact
            equality against a short reference string will rarely hold; pass
            a task-specific comparison here instead.

    Returns:
        Accuracy as a float in [0, 1]; ``0.0`` when ``predictions`` is empty
        (previously this raised ZeroDivisionError).
    """
    total = len(predictions)
    if total == 0:
        return 0.0

    if match is None:
        def match(pred, truth):
            return pred == truth

    correct_count = sum(
        1 for pred, truth in zip(predictions, ground_truth) if match(pred, truth)
    )
    return correct_count / total

# Decoding settings shared by both runs so the comparison is like-for-like
generation_kwargs = dict(
    max_length=100,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)

# Base Model Pipeline
base_model_pipeline = transformers.pipeline(
    "text-generation",
    model=model,  # same underlying weights as trainer.model; adapters are disabled below
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Fine-Tuned Model Pipeline
fine_tuned_model_pipeline = transformers.pipeline(
    "text-generation",
    model=trainer.model, # The fine-tuned model
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)

# The model may still be in training mode after trainer.train(); eval() makes
# sure the LoRA dropout (lora_dropout=0.1) is not applied while generating.
trainer.model.eval()

# Generate responses for the base model. `model` was given to SFTTrainer with
# a peft_config, so its forward pass may include the trained LoRA adapters;
# disabling them for this call yields genuine base-model output.
with trainer.model.disable_adapter():
    base_model_responses = base_model_pipeline(test_prompts, **generation_kwargs)

# Extract the generated text from the base model responses
base_model_predictions = [seq[0]['generated_text'] for seq in base_model_responses]

# Generate responses for the fine-tuned model (adapters active)
fine_tuned_model_responses = fine_tuned_model_pipeline(test_prompts, **generation_kwargs)

# Extract the generated text from the fine-tuned model responses
# (the original cell performed this extraction twice; once is enough)
fine_tuned_model_predictions = [seq[0]['generated_text'] for seq in fine_tuned_model_responses]

# Evaluate the base model's accuracy
base_model_accuracy = evaluate_accuracy(base_model_predictions, ground_truth)

# Evaluate the fine-tuned model's accuracy
fine_tuned_model_accuracy = evaluate_accuracy(fine_tuned_model_predictions, ground_truth)

# Log to wandb
wandb.log({"Base Model Accuracy": base_model_accuracy, "Fine-Tuned Model Accuracy": fine_tuned_model_accuracy})




The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'ReformerModelWithLMHead', 'RemBertForCausal