In [6]:
from datasets import load_dataset

ds = load_dataset("cbasu/Med-EASi")

print(ds)
print("Train dataset sample: ")
ds['train'][0]

DatasetDict({
    train: Dataset({
        features: ['Expert', 'Simple', 'Annotation', 'sim', 'sentence_sim', 'compression', 'expert_fk_grade', 'expert_ari', 'layman_fk_grade', 'layman_ari', 'umls_expert', 'umls_layman', 'expert_terms', 'layman_terms', 'idx'],
        num_rows: 1397
    })
    validation: Dataset({
        features: ['Expert', 'Simple', 'Annotation', 'sim', 'sentence_sim', 'compression', 'expert_fk_grade', 'expert_ari', 'layman_fk_grade', 'layman_ari', 'umls_expert', 'umls_layman', 'expert_terms', 'layman_terms', 'idx'],
        num_rows: 196
    })
    test: Dataset({
        features: ['Expert', 'Simple', 'Annotation', 'sim', 'sentence_sim', 'compression', 'expert_fk_grade', 'expert_ari', 'layman_fk_grade', 'layman_ari', 'umls_expert', 'umls_layman', 'expert_terms', 'layman_terms', 'idx'],
        num_rows: 300
    })
})
Train dataset sample: 


{'Expert': '75-90 % of the affected people have mild intellectual disability.',
 'Simple': "People with syndromic intellectual disabilities may have a `` typical look. ''",
 'Annotation': "<del>75-90 % of the</del> <rep>affected people  have mild intellectual disability.<by>People with syndromic intellectual disabilities</rep> <ins>may have a `` typical look. ''</ins>",
 'sim': 0.48951049,
 'sentence_sim': 0.639872432,
 'compression': 1.2,
 'expert_fk_grade': 12.7,
 'expert_ari': 12.4,
 'layman_fk_grade': 13.1,
 'layman_ari': 15.1,
 'umls_expert': "[[{'start': 41, 'end': 64, 'ngram': 'intellectual disability', 'term': 'intellectual disability', 'cui': 'C3714756', 'similarity': 1.0, 'semtypes': {'T048'}, 'preferred': 1, 'preferred_term': None}, {'start': 41, 'end': 64, 'ngram': 'intellectual disability', 'term': 'Intellectual disability', 'cui': 'C3714756', 'similarity': 0.9090909090909091, 'semtypes': {'T048'}, 'preferred': 0, 'preferred_term': None}, {'start': 41, 'end': 64, 'ngram': 

In [7]:
from transformers import AutoTokenizer


tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")


def preprocess_and_tokenizer(batch):
    inputs = batch['Expert']
    targets = batch['Simple']

    model_inputs = tokenizer(inputs, truncation=True, max_length=256)
    labels = tokenizer(text_target=targets, truncation=True, max_length=256)["input_ids"]

    model_inputs["labels"] = labels
    return model_inputs


col_names = ds["train"].column_names
tokenized_ds = ds.map(preprocess_and_tokenizer, batched=True, remove_columns=col_names)

tokenized_ds

Map:   0%|          | 0/196 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1397
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 196
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 300
    })
})

In [3]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

model_name = "google/flan-t5-base"

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir="checkpoints/flan_t5_baseline",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    learning_rate=3e-5,
    num_train_epochs=5,
    warmup_steps=1000,
    logging_steps=50,
    save_steps=500,
    eval_steps=500,
    load_best_model_at_end=True,
    tpu_num_cores=8,
    predict_with_generate=True,
)


trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator
)


trainer.train()

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  trainer = Seq2SeqTrainer(
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mnaluru[0m ([33mnaluru-george-mason-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,1.5938,1.883344
2,1.5594,1.832589
3,1.4708,1.795272
4,1.3386,1.776446
5,1.287,1.767708


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


TrainOutput(global_step=875, training_loss=1.4866534641810827, metrics={'train_runtime': 428.5582, 'train_samples_per_second': 16.299, 'train_steps_per_second': 2.042, 'total_flos': 691261209879552.0, 'train_loss': 1.4866534641810827, 'epoch': 5.0})

In [4]:
import transformers
print(transformers.__version__)

4.57.2


In [5]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_path = "/content/checkpoints/flan_t5_baseline/checkpoint-875"   # or your training output directory
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

In [6]:
test_dataset = ds["test"]


In [None]:
def generate_predictions(batch):
    inputs = tokenizer(
        [text for text in batch["Expert"] ],
        max_length=512,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )

    # Adjusting generation parameters for simplfiication
    outputs = model.generate(
        input_ids=inputs["input_ids"].to(model.device),
        attention_mask=inputs["attention_mask"].to(model.device),
        max_length=180, 
        do_sample=True, 
        top_k=70,       
        top_p=0.99,      
        temperature=0.8 
    )

    batch["pred"] = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return batch

In [8]:
predictions = test_dataset.map(generate_predictions, batched=True, batch_size=8)


Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [5]:
!pip install evaluate
!pip install rouge_score
!pip install sacrebleu sacremoses



In [6]:
import evaluate

sari_metric = evaluate.load("sari")
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")


In [11]:
sari_score = sari_metric.compute(
    sources=predictions["Expert"],
    predictions=predictions["pred"],
    references=[[ref] for ref in predictions["Simple"]]
)

bleu_score = bleu_metric.compute(
    predictions=predictions["pred"],
    references=[[ref] for ref in predictions["Simple"]]
)

rouge_score = rouge_metric.compute(
    predictions=predictions["pred"],
    references=predictions["Simple"]
)


In [12]:
print("SARI:", sari_score)
print("BLEU:", bleu_score)
print("ROUGE:", rouge_score)


SARI: {'sari': 42.69897415291706}
BLEU: {'bleu': 0.24544830886712846, 'precisions': [0.520539639229953, 0.32666349055105603, 0.2624645656161414, 0.22358722358722358], 'brevity_penalty': 0.7765904173464058, 'length_ratio': 0.7981851179673322, 'translation_length': 6597, 'reference_length': 8265}
ROUGE: {'rouge1': np.float64(0.46119808423683206), 'rouge2': np.float64(0.3077546441574093), 'rougeL': np.float64(0.43113544857539055), 'rougeLsum': np.float64(0.43072842362367125)}


In [13]:
import pandas as pd

df = pd.DataFrame({
    "expert": predictions["Expert"],
    "reference_simple": predictions["Simple"],
    "model_simple": predictions["pred"]
})

df.to_csv("model_outputs.csv", index=False)

In [None]:
# Trying an instruction based tuning for the Flan-T5-Base Model by adding the 'simplify' keyword and others

from transformers import AutoTokenizer


tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")


def preprocess_and_tokenizer(batch):
    inputs = ["simplify: " + text for text in batch['Expert']]
    targets = batch['Simple']

    model_inputs = tokenizer(inputs, truncation=True, max_length=256)
    labels = tokenizer(text_target=targets, truncation=True, max_length=256)["input_ids"]

    model_inputs["labels"] = labels
    return model_inputs


col_names = ds["train"].column_names
tokenized_ds = ds.map(preprocess_and_tokenizer, batched=True, remove_columns=col_names)

tokenized_ds

Map:   0%|          | 0/1397 [00:00<?, ? examples/s]

Map:   0%|          | 0/196 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1397
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 196
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 300
    })
})

In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

model_name = "google/flan-t5-base"

#Reloading the flan-t5-base model
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir="checkpoints/flan_t5_baseline_with_prefix",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    learning_rate=3e-5,
    num_train_epochs=5,
    warmup_steps=1000,
    logging_steps=50,
    save_steps=500,
    eval_steps=500,
    load_best_model_at_end=True,
    tpu_num_cores=8,
    predict_with_generate=True,
)

# Re-instantiating the Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Starts training process
trainer.train()

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss
1,1.5637,1.868734
2,1.5348,1.821129
3,1.4851,1.792567
4,1.3531,1.77838
5,1.2936,1.764031


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


TrainOutput(global_step=875, training_loss=1.477281537737165, metrics={'train_runtime': 574.0128, 'train_samples_per_second': 12.169, 'train_steps_per_second': 1.524, 'total_flos': 709944921234432.0, 'train_loss': 1.477281537737165, 'epoch': 5.0})

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_path = "checkpoints/flan_t5_baseline_with_prefix/checkpoint-875" # Updated checkpoint path
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

# Generates predictions using the newly trained model
predictions_with_prefix = ds["test"].map(generate_predictions, batched=True, batch_size=8)

predictions_with_prefix

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Dataset({
    features: ['Expert', 'Simple', 'Annotation', 'sim', 'sentence_sim', 'compression', 'expert_fk_grade', 'expert_ari', 'layman_fk_grade', 'layman_ari', 'umls_expert', 'umls_layman', 'expert_terms', 'layman_terms', 'idx', 'pred'],
    num_rows: 300
})

In [33]:
!pip install textstat

Collecting textstat
  Downloading textstat-0.7.11-py3-none-any.whl.metadata (15 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.2-py3-none-any.whl.metadata (3.2 kB)
Downloading textstat-0.7.11-py3-none-any.whl (176 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.4/176.4 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyphen-0.17.2-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m39.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.17.2 textstat-0.7.11


In [None]:
import evaluate
import textstat

# Calculates the SARI, BLEU, and ROUGE scores for the new predictions
sari_metric_with_prefix = evaluate.load("sari")
bleu_metric_with_prefix = evaluate.load("bleu")
rouge_metric_with_prefix = evaluate.load("rouge")

sari_score_with_prefix = sari_metric_with_prefix.compute(
    sources=predictions_with_prefix["Expert"],
    predictions=predictions_with_prefix["pred"],
    references=[[ref] for ref in predictions_with_prefix["Simple"]]
)

bleu_score_with_prefix = bleu_metric_with_prefix.compute(
    predictions=predictions_with_prefix["pred"],
    references=[[ref] for ref in predictions_with_prefix["Simple"]]
)

rouge_score_with_prefix = rouge_metric_with_prefix.compute(
    predictions=predictions_with_prefix["pred"],
    references=predictions_with_prefix["Simple"]
)

print("SARI (with prefix):", sari_score_with_prefix)
print("BLEU (with prefix):", bleu_score_with_prefix)
print("ROUGE (with prefix):", rouge_score_with_prefix)

# Calculates the readability scores for the newly generated simplified texts
flesch_kincaid_grades = [textstat.flesch_kincaid_grade(text) for text in predictions_with_prefix["pred"]]
automated_readability_indices = [textstat.automated_readability_index(text) for text in predictions_with_prefix["pred"]]

print("\nAverage Flesch-Kincaid Grade Level (with prefix):", sum(flesch_kincaid_grades) / len(flesch_kincaid_grades))
print("Average Automated Readability Index (with prefix):", sum(automated_readability_indices) / len(automated_readability_indices))

SARI (with prefix): {'sari': 38.337342550116546}
BLEU (with prefix): {'bleu': 0.1431582646998161, 'precisions': [0.44239791485664637, 0.22859761686526123, 0.17032007759456838, 0.13956360642239604], 'brevity_penalty': 0.6465255993042578, 'length_ratio': 0.6963097398669087, 'translation_length': 5755, 'reference_length': 8265}
ROUGE (with prefix): {'rouge1': np.float64(0.3402923199680302), 'rouge2': np.float64(0.18759359208978965), 'rougeL': np.float64(0.3010766135976525), 'rougeLsum': np.float64(0.30152520436472463)}

Average Flesch-Kincaid Grade Level (with prefix): 10.542654696850299
Average Automated Readability Index (with prefix): 10.481627800394959


In [None]:
# Trying an instruction based tuning for the Flan-T5-Base Model by adding the 'simplfy for children'

from transformers import AutoTokenizer


tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")


def preprocess_and_tokenizer(batch):
    inputs = ["Simplify for a 5th-grade reading level: " + text for text in batch['Expert']]
    targets = batch['Simple']

    model_inputs = tokenizer(inputs, truncation=True, max_length=256)
    labels = tokenizer(text_target=targets, truncation=True, max_length=256)["input_ids"]

    model_inputs["labels"] = labels
    return model_inputs


col_names = ds["train"].column_names
tokenized_ds = ds.map(preprocess_and_tokenizer, batched=True, remove_columns=col_names)

tokenized_ds

Map:   0%|          | 0/1397 [00:00<?, ? examples/s]

Map:   0%|          | 0/196 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1397
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 196
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 300
    })
})

In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

model_name = "google/flan-t5-base"

# Reloads the flan-t5-base model
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir="checkpoints/flan_t5_grade_level_prefix",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    learning_rate=3e-5,
    num_train_epochs=5,
    warmup_steps=1000,
    logging_steps=50,
    save_steps=500,
    eval_steps=500,
    load_best_model_at_end=True,
    tpu_num_cores=8,
    predict_with_generate=True,
)

# Re-instantiating the Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Starts the training process
trainer.train()

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  trainer = Seq2SeqTrainer(
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mnaluru[0m ([33mnaluru-george-mason-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,1.5356,1.864665
2,1.5319,1.82616
3,1.4558,1.790924
4,1.3411,1.776245
5,1.2864,1.762284


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


TrainOutput(global_step=875, training_loss=1.4698741237095425, metrics={'train_runtime': 497.7642, 'train_samples_per_second': 14.033, 'train_steps_per_second': 1.758, 'total_flos': 812705333686272.0, 'train_loss': 1.4698741237095425, 'epoch': 5.0})

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_path = "checkpoints/flan_t5_grade_level_prefix/checkpoint-875"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

predictions_with_grade_prefix = ds["test"].map(generate_predictions, batched=True, batch_size=8)

predictions_with_grade_prefix

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Dataset({
    features: ['Expert', 'Simple', 'Annotation', 'sim', 'sentence_sim', 'compression', 'expert_fk_grade', 'expert_ari', 'layman_fk_grade', 'layman_ari', 'umls_expert', 'umls_layman', 'expert_terms', 'layman_terms', 'idx', 'pred'],
    num_rows: 300
})

In [None]:
import evaluate

# Calculates the SARI, BLEU, and ROUGE scores for the new predictions
sari_metric_with_grade_prefix = evaluate.load("sari")
bleu_metric_with_grade_prefix = evaluate.load("bleu")
rouge_metric_with_grade_prefix = evaluate.load("rouge")

sari_score_with_grade_prefix = sari_metric_with_grade_prefix.compute(
    sources=predictions_with_grade_prefix["Expert"],
    predictions=predictions_with_grade_prefix["pred"],
    references=[[ref] for ref in predictions_with_grade_prefix["Simple"]]
)

bleu_score_with_grade_prefix = bleu_metric_with_grade_prefix.compute(
    predictions=predictions_with_grade_prefix["pred"],
    references=[[ref] for ref in predictions_with_grade_prefix["Simple"]]
)

rouge_score_with_grade_prefix = rouge_metric_with_grade_prefix.compute(
    predictions=predictions_with_grade_prefix["pred"],
    references=predictions_with_grade_prefix["Simple"]
)

print("SARI (with grade-level prefix):", sari_score_with_grade_prefix)
print("BLEU (with grade-level prefix):", bleu_score_with_grade_prefix)
print("ROUGE (with grade-level prefix):", rouge_score_with_grade_prefix)

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

SARI (with grade-level prefix): {'sari': 37.88166932103268}
BLEU (with grade-level prefix): {'bleu': 0.13852962505814245, 'precisions': [0.3969332911792602, 0.19233322270162628, 0.14093608103388056, 0.11663902708678828], 'brevity_penalty': 0.7360090940379919, 'length_ratio': 0.7653962492437991, 'translation_length': 6326, 'reference_length': 8265}
ROUGE (with grade-level prefix): {'rouge1': np.float64(0.32084159357478215), 'rouge2': np.float64(0.1625820602623287), 'rougeL': np.float64(0.2828361117053192), 'rougeLsum': np.float64(0.2820638847176238)}


In [40]:
import pandas as pd

df = pd.DataFrame({
    "expert": predictions["Expert"],
    "reference_simple": predictions["Simple"],
    "model_simple": predictions["pred"]
})

df.to_csv("model_outputs.csv", index=False)

In [15]:
!pip install huggingface_hub



In [None]:
# Trying a new model to test the output: GPT-Neo-125m

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "EleutherAI/gpt-neo-125m" # New model to test

# Loads the tokenizer
tokenizer_llama = AutoTokenizer.from_pretrained(model_id)

# Loads the model
model_llama = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
)

print("Publicly available CausalLM Tokenizer and Model loaded successfully.")

model.safetensors:   0%|          | 0.00/526M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Publicly available CausalLM Tokenizer and Model loaded successfully.


In [None]:
def preprocess_and_tokenizer_llama(batch):

    model_inputs_list = []
    labels_batch = []

    for expert_text, simple_text in zip(batch['Expert'], batch['Simple']):
        # Prepares the input and output text for the causal LM
        prompt_input = f"Expert: {expert_text} Simple: "
        target_output = f"{simple_text}{tokenizer_llama.eos_token}"

        full_text = prompt_input + target_output

        # Tokenizes the full text, ensuring padding to max_length
        tokenized_full = tokenizer_llama(
            full_text,
            truncation=True,
            max_length=256,
            padding="max_length",
            return_tensors=None
        )

        # Tokenizes the prompt_input separately to get its length for masking labels
        tokenized_prompt_only = tokenizer_llama(
            prompt_input,
            truncation=True, # Truncates if prompt is too long
            max_length=256, # Matches max_length if prompt is too long
            add_special_tokens=False
        )
        prompt_len = len(tokenized_prompt_only["input_ids"])

        # Creates and ensures the labels are padded with -100 to match the length of full_input_ids
        current_labels = [-100] * prompt_len + tokenized_full["input_ids"][prompt_len:]
        current_labels = current_labels[:len(tokenized_full["input_ids"])] # Truncate if longer than input_ids
        while len(current_labels) < len(tokenized_full["input_ids"]):
            current_labels.append(-100) # Pad if shorter than input_ids

        model_inputs_list.append({"input_ids": tokenized_full["input_ids"],
                               "attention_mask": tokenized_full["attention_mask"]})
        labels_batch.append(current_labels)

    # Returns a dict with lists of features
    return {
        "input_ids": [item["input_ids"] for item in model_inputs_list],
        "attention_mask": [item["attention_mask"] for item in model_inputs_list],
        "labels": labels_batch,
    }

# Applies the new function to the dataset
col_names = ds["train"].column_names
tokenized_ds_llama = ds.map(preprocess_and_tokenizer_llama, batched=True, remove_columns=col_names)

print(tokenized_ds_llama)

Map:   0%|          | 0/1397 [00:00<?, ? examples/s]

Map:   0%|          | 0/196 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1397
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 196
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 300
    })
})


In [None]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Ensuring the tokenizer has a padding token
if tokenizer_llama.pad_token is None:
    tokenizer_llama.pad_token = tokenizer_llama.eos_token

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer_llama, mlm=False)

# Training Arguments
training_args = TrainingArguments(
    output_dir="checkpoints/gpt_neo_causal_lm_finetuned",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    learning_rate=3e-5,
    num_train_epochs=5,
    warmup_steps=1000,
    logging_steps=50,
    save_steps=500,
    eval_steps=500,
    load_best_model_at_end=True,
    tpu_num_cores=8,
)

trainer_llama = Trainer(
    model=model_llama,
    args=training_args,
    train_dataset=tokenized_ds_llama["train"],
    eval_dataset=tokenized_ds_llama["validation"],
    tokenizer=tokenizer_llama,
    data_collator=data_collator,
)

# Trains the model
trainer_llama.train()

  trainer_llama = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.


Epoch,Training Loss,Validation Loss
1,2.5166,2.511028
2,2.3158,2.433814
3,2.1984,2.406394
4,2.0249,2.398438
5,1.8744,2.418295


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=875, training_loss=2.218644008091518, metrics={'train_runtime': 581.5582, 'train_samples_per_second': 12.011, 'train_steps_per_second': 1.505, 'total_flos': 912265787473920.0, 'train_loss': 2.218644008091518, 'epoch': 5.0})

In [None]:
def generate_predictions_llama(batch):
    inputs = [f"Expert: {text} Simple: " for text in batch["Expert"]]

    # Tokenizes the inputs
    tokenized_inputs = tokenizer_llama(
        inputs,
        truncation=True,
        max_length=256,
        padding="max_length",
        return_tensors="pt"
    )

    input_ids = tokenized_inputs["input_ids"].to(model_llama.device)
    attention_mask = tokenized_inputs["attention_mask"].to(model_llama.device)

    # Generating predictions
    outputs = model_llama.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=100,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
        num_return_sequences=1,
        pad_token_id=tokenizer_llama.pad_token_id,
    )

    # Decoding predictions and extracting only the generated part
    decoded_outputs = tokenizer_llama.batch_decode(outputs, skip_special_tokens=True)
    predictions_list = []
    for i, output in enumerate(decoded_outputs):
        # Finding the end of the prompt to extract only the generated simple text
        prompt_len = len(tokenizer_llama.encode(inputs[i], add_special_tokens=False))
        generated_text = tokenizer_llama.decode(outputs[i][prompt_len:], skip_special_tokens=True).strip()
        # Removes any lingering EOS token if it wasn't skipped fully
        generated_text = generated_text.replace(tokenizer_llama.eos_token, "").strip()
        predictions_list.append(generated_text)

    batch["pred"] = predictions_list
    return batch

# Generates the predictions using the fine-tuned GPT-Neo model
predictions_llama = ds["test"].map(generate_predictions_llama, batched=True, batch_size=8)

print(predictions_llama)

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

Dataset({
    features: ['Expert', 'Simple', 'Annotation', 'sim', 'sentence_sim', 'compression', 'expert_fk_grade', 'expert_ari', 'layman_fk_grade', 'layman_ari', 'umls_expert', 'umls_layman', 'expert_terms', 'layman_terms', 'idx', 'pred'],
    num_rows: 300
})


In [34]:
!pip install textstat

Collecting textstat
  Downloading textstat-0.7.11-py3-none-any.whl.metadata (15 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.2-py3-none-any.whl.metadata (3.2 kB)
Downloading textstat-0.7.11-py3-none-any.whl (176 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.4/176.4 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyphen-0.17.2-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m45.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.17.2 textstat-0.7.11


In [None]:
import evaluate
import textstat

# Calculates the SARI, BLEU, and ROUGE scores for the new predictions
sari_metric_llama = evaluate.load("sari")
bleu_metric_llama = evaluate.load("bleu")
rouge_metric_llama = evaluate.load("rouge")

sari_score_llama = sari_metric_llama.compute(
    sources=predictions_llama["Expert"],
    predictions=predictions_llama["pred"],
    references=[[ref] for ref in predictions_llama["Simple"]]
)

bleu_score_llama = bleu_metric_llama.compute(
    predictions=predictions_llama["pred"],
    references=[[ref] for ref in predictions_llama["Simple"]]
)

rouge_score_llama = rouge_metric_llama.compute(
    predictions=predictions_llama["pred"],
    references=predictions_llama["Simple"]
)

print("SARI (GPT-Neo):", sari_score_llama)
print("BLEU (GPT-Neo):", bleu_score_llama)
print("ROUGE (GPT-Neo):", rouge_score_llama)

# Calculates the readability scores for the newly generated simplified texts
flesch_kincaid_grades_llama = [textstat.flesch_kincaid_grade(text) for text in predictions_llama["pred"] if text.strip() != '']
automated_readability_indices_llama = [textstat.automated_readability_index(text) for text in predictions_llama["pred"] if text.strip() != '']

if flesch_kincaid_grades_llama:
    print("\nAverage Flesch-Kincaid Grade Level (GPT-Neo):", sum(flesch_kincaid_grades_llama) / len(flesch_kincaid_grades_llama))
else:
    print("\nNo valid texts to calculate Flesch-Kincaid Grade Level for GPT-Neo.")

if automated_readability_indices_llama:
    print("Average Automated Readability Index (GPT-Neo):", sum(automated_readability_indices_llama) / len(automated_readability_indices_llama))
else:
    print("No valid texts to calculate Automated Readability Index for GPT-Neo.")

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

SARI (GPT-Neo): {'sari': 35.331212056309944}
BLEU (GPT-Neo): {'bleu': 0.03857070154490733, 'precisions': [0.10129880081222942, 0.042438570653437716, 0.026542774249196266, 0.0193962952679386], 'brevity_penalty': 1.0, 'length_ratio': 3.1580157289776163, 'translation_length': 26101, 'reference_length': 8265}
ROUGE (GPT-Neo): {'rouge1': np.float64(0.15131674080510826), 'rouge2': np.float64(0.07137312434629692), 'rougeL': np.float64(0.12764703185734932), 'rougeLsum': np.float64(0.1306368861598335)}

Average Flesch-Kincaid Grade Level (GPT-Neo): 26.83795212396487
Average Automated Readability Index (GPT-Neo): 42.0669028005208


In [10]:
!pip install unsloth

Collecting unsloth
  Downloading unsloth-2025.11.4-py3-none-any.whl.metadata (64 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/64.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.3/64.3 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo>=2025.11.4 (from unsloth)
  Downloading unsloth_zoo-2025.11.5-py3-none-any.whl.metadata (32 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.35-py3-none-any.whl.metadata (12 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.33.post1-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.2 kB)
Collecting bitsandbytes!=0.46.0,!=0.48.0,>=0.45.5 (from unsloth)
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting datasets!=4.0.*,!=4.1.0,<4.4.0,>=3.4.1 (from unsloth)
  Downloading datasets-4.3.0-py3-none-any.whl.metadata (18 kB)
Collecting trl!=0.19.0,<=0.24.0,>=0.18.2 (from 

In [1]:
!pip install huggingface_hub



In [2]:
!pip install accelerate



In [3]:
!pip install bitsandbytes



In [4]:
!pip install transformers



In [5]:
!pip install evaluate



In [6]:
!pip install textstat



In [7]:
!pip install rouge_score



In [8]:
!pip install sacrebleu sacremoses



In [9]:
from datasets import load_dataset

ds = load_dataset("cbasu/Med-EASi")

print(ds)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train.csv: 0.00B [00:00, ?B/s]

validation.csv: 0.00B [00:00, ?B/s]

test.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/1397 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/196 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/300 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Expert', 'Simple', 'Annotation', 'sim', 'sentence_sim', 'compression', 'expert_fk_grade', 'expert_ari', 'layman_fk_grade', 'layman_ari', 'umls_expert', 'umls_layman', 'expert_terms', 'layman_terms', 'idx'],
        num_rows: 1397
    })
    validation: Dataset({
        features: ['Expert', 'Simple', 'Annotation', 'sim', 'sentence_sim', 'compression', 'expert_fk_grade', 'expert_ari', 'layman_fk_grade', 'layman_ari', 'umls_expert', 'umls_layman', 'expert_terms', 'layman_terms', 'idx'],
        num_rows: 196
    })
    test: Dataset({
        features: ['Expert', 'Simple', 'Annotation', 'sim', 'sentence_sim', 'compression', 'expert_fk_grade', 'expert_ari', 'layman_fk_grade', 'layman_ari', 'umls_expert', 'umls_layman', 'expert_terms', 'layman_terms', 'idx'],
        num_rows: 300
    })
})


In [None]:
# Testing with new Model: Llama-3.1-8B-Instruct model from Unsloth

from unsloth import FastLanguageModel

max_seq_length = 2048
dtype = None 
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token = ""
)

def preprocess_and_tokenizer_unsloth_llama(batch):
    messages = []
    for expert_text, simple_text in zip(batch['Expert'], batch['Simple']):
        # Llama 3.1 prompt that the model receives as input
        prompt_messages = [
            {"role": "system", "content": "You are a text simplification expert. Simplify the medical text for a 5th-grade reading level."},
            {"role": "user", "content": f"Simplify the following medical text: {expert_text}"}
        ]
        # The full conversation including the target response
        full_messages = prompt_messages + [
            {"role": "assistant", "content": simple_text}
        ]

        # Gets the tokenized input and labels
        tokenized_output = tokenizer.apply_chat_template(
            full_messages,
            tokenize=True,
            add_generation_prompt=False,
            return_tensors=None,
            max_length=max_seq_length,
            truncation=True
        )

        prompt_input_tokens = tokenizer.apply_chat_template(
            prompt_messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors=None,
            max_length=max_seq_length,
            truncation=True
        )
        prompt_len = len(prompt_input_tokens)

        labels = [-100] * prompt_len + tokenized_output[prompt_len:]

        if len(labels) > len(tokenized_output):
            labels = labels[:len(tokenized_output)]

        # Stores the processed example
        messages.append({
            "input_ids": tokenized_output,
            "labels": labels,
            "attention_mask": [1] * len(tokenized_output)
        })

    # Converts a list of dicts to dict of lists as required by  the 'map'
    return {
        "input_ids": [m["input_ids"] for m in messages],
        "labels": [m["labels"] for m in messages],
        "attention_mask": [m["attention_mask"] for m in messages],
    }

col_names = ds["train"].column_names
tokenized_ds_unsloth_llama = ds.map(preprocess_and_tokenizer_unsloth_llama, batched=True, remove_columns=col_names)

print(tokenized_ds_unsloth_llama)


==((====))==  Unsloth 2025.11.4: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

Map:   0%|          | 0/1397 [00:00<?, ? examples/s]

Map:   0%|          | 0/196 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 1397
    })
    validation: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 196
    })
    test: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 300
    })
})


In [None]:
def generate_predictions_unsloth_llama(batch):
    predictions_list = []
    for expert_text in batch["Expert"]:
        # Llama 3.1 Instruct format for the inference
        messages = [
            {"role": "system", "content": "You are a text simplification expert. Simplify the medical text for a 5th-grade reading level."},
            {"role": "user", "content": f"Simplify the following medical text: {expert_text}"}
        ]

        # Tokenize the inputs
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        inputs = tokenizer(
            [prompt],
            return_tensors="pt",
            max_length=max_seq_length, 
            truncation=True
        ).to(model.device)

        # Generates the predictions
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.7,
            num_return_sequences=1,
            pad_token_id=tokenizer.pad_token_id
        )

        # Decoding the predictions and extracting only the generated part
        decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        generated_text = decoded_outputs[0]

        # Removes the prompt from the generated text
        if prompt.strip() in generated_text.strip():
            generated_text = generated_text.replace(prompt, "").strip()

        # Cleans up any remaining special tokens or system/user tags
        generated_text = generated_text.replace("<|eot_id|>", "").strip()
        generated_text = generated_text.replace("<|start_header_id|>assistant<|end_header_id|>", "").strip()
        generated_text = generated_text.replace("<|begin_of_text|>", "").strip()

        predictions_list.append(generated_text)

    batch["pred_unsloth_llama"] = predictions_list
    return batch

# Generate the predictions using the Unsloth Llama model
predictions_unsloth_llama = ds["test"].map(generate_predictions_unsloth_llama, batched=True, batch_size=8)

print(predictions_unsloth_llama)

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Dataset({
    features: ['Expert', 'Simple', 'Annotation', 'sim', 'sentence_sim', 'compression', 'expert_fk_grade', 'expert_ari', 'layman_fk_grade', 'layman_ari', 'umls_expert', 'umls_layman', 'expert_terms', 'layman_terms', 'idx', 'pred_unsloth_llama'],
    num_rows: 300
})


In [None]:
import evaluate
import textstat

# Calculates the SARI, BLEU, and ROUGE scores for the new predictions
sari_metric_unsloth_llama = evaluate.load("sari")
bleu_metric_unsloth_llama = evaluate.load("bleu")
rouge_metric_unsloth_llama = evaluate.load("rouge")

sari_score_unsloth_llama = sari_metric_unsloth_llama.compute(
    sources=predictions_unsloth_llama["Expert"],
    predictions=predictions_unsloth_llama["pred_unsloth_llama"],
    references=[[ref] for ref in predictions_unsloth_llama["Simple"]]
)

bleu_score_unsloth_llama = bleu_metric_unsloth_llama.compute(
    predictions=predictions_unsloth_llama["pred_unsloth_llama"],
    references=[[ref] for ref in predictions_unsloth_llama["Simple"]]
)

rouge_score_unsloth_llama = rouge_metric_unsloth_llama.compute(
    predictions=predictions_unsloth_llama["pred_unsloth_llama"],
    references=predictions_unsloth_llama["Simple"]
)

print("SARI (Unsloth Llama):", sari_score_unsloth_llama)
print("BLEU (Unsloth Llama):", bleu_score_unsloth_llama)
print("ROUGE (Unsloth Llama):", rouge_score_unsloth_llama)

# Calculates the readability scores for the newly generated simplified texts
flesch_kincaid_grades_unsloth_llama = [textstat.flesch_kincaid_grade(text) for text in predictions_unsloth_llama["pred_unsloth_llama"] if text.strip() != '']
automated_readability_indices_unsloth_llama = [textstat.automated_readability_index(text) for text in predictions_unsloth_llama["pred_unsloth_llama"] if text.strip() != '']

if flesch_kincaid_grades_unsloth_llama:
    print("\nAverage Flesch-Kincaid Grade Level (Unsloth Llama):", sum(flesch_kincaid_grades_unsloth_llama) / len(flesch_kincaid_grades_unsloth_llama))
else:
    print("\nNo valid texts to calculate Flesch-Kincaid Grade Level for Unsloth Llama.")

if automated_readability_indices_unsloth_llama:
    print("Average Automated Readability Index (Unsloth Llama):", sum(automated_readability_indices_unsloth_llama) / len(automated_readability_indices_unsloth_llama))
else:
    print("No valid texts to calculate Automated Readability Index for Unsloth Llama.")

SARI (Unsloth Llama): {'sari': 48.75574280145333}
BLEU (Unsloth Llama): {'bleu': 0.08150457700525399, 'precisions': [0.1568621622425017, 0.08602021535046939, 0.06328500953360976, 0.05167832381417794], 'brevity_penalty': 1.0, 'length_ratio': 4.070296430732002, 'translation_length': 33641, 'reference_length': 8265}
ROUGE (Unsloth Llama): {'rouge1': np.float64(0.23708338416617175), 'rouge2': np.float64(0.13970770885440573), 'rougeL': np.float64(0.19855296733065997), 'rougeLsum': np.float64(0.21201691480735788)}

Average Flesch-Kincaid Grade Level (Unsloth Llama): 10.944008687742597
Average Automated Readability Index (Unsloth Llama): 12.112041346790527


In [20]:
import pandas as pd

df = pd.DataFrame({
    "expert": predictions_unsloth_llama["Expert"],
    "reference_simple": predictions_unsloth_llama["Simple"],
    "model_simple": predictions_unsloth_llama["pred_unsloth_llama"]
})

df.to_csv("model_outputs.csv", index=False)

In [None]:
def generate_predictions_unsloth_llama(batch):
    predictions_list = []
    for expert_text in batch["Expert"]:
        # Llama 3.1 Instruct format
        messages = [
            {"role": "system", "content": "You are a text simplification expert. Simplify the medical text for a 5th-grade reading level."},
            {"role": "user", "content": f"Simplify the following medical text: {expert_text}"}
        ]

        # Tokenize the inputs
        inputs_ids = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
            max_length=max_seq_length,
            truncation=True
        ).to(model.device)

        # Generates the predictions
        outputs = model.generate(
            input_ids=inputs_ids,
            attention_mask=inputs_ids.ne(tokenizer.pad_token_id).long(),
            max_new_tokens=150,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.7,
            num_return_sequences=1,
            pad_token_id=tokenizer.pad_token_id
        )

        # Extracts the newly generated tokens
        generated_tokens = outputs[0][inputs_ids.shape[-1]:]
        decoded_generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

        # Ensuring any empty string or just whitespace is handled
        if not decoded_generated_text:
            decoded_generated_text = ""

        predictions_list.append(decoded_generated_text)

    batch["pred_unsloth_llama"] = predictions_list
    return batch

# Generates the predictions using the Unsloth Llama model
predictions_unsloth_llama = ds["test"].map(generate_predictions_unsloth_llama, batched=True, batch_size=8)

print(predictions_unsloth_llama)

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Dataset({
    features: ['Expert', 'Simple', 'Annotation', 'sim', 'sentence_sim', 'compression', 'expert_fk_grade', 'expert_ari', 'layman_fk_grade', 'layman_ari', 'umls_expert', 'umls_layman', 'expert_terms', 'layman_terms', 'idx', 'pred_unsloth_llama'],
    num_rows: 300
})


**Reasoning**:
The previous `generate_predictions_unsloth_llama` function produced `predictions_unsloth_llama` successfully. Now, I need to re-compute the SARI, BLEU, and ROUGE metrics, and calculate readability scores for these cleaned predictions, as requested by the task.



In [None]:
import evaluate
import textstat

# Calculates the SARI, BLEU, and ROUGE scores for the new predictions
sari_metric_unsloth_llama = evaluate.load("sari")
bleu_metric_unsloth_llama = evaluate.load("bleu")
rouge_metric_unsloth_llama = evaluate.load("rouge")

sari_score_unsloth_llama = sari_metric_unsloth_llama.compute(
    sources=predictions_unsloth_llama["Expert"],
    predictions=predictions_unsloth_llama["pred_unsloth_llama"],
    references=[[ref] for ref in predictions_unsloth_llama["Simple"]]
)

bleu_score_unsloth_llama = bleu_metric_unsloth_llama.compute(
    predictions=predictions_unsloth_llama["pred_unsloth_llama"],
    references=[[ref] for ref in predictions_unsloth_llama["Simple"]]
)

rouge_score_unsloth_llama = rouge_metric_unsloth_llama.compute(
    predictions=predictions_unsloth_llama["pred_unsloth_llama"],
    references=predictions_unsloth_llama["Simple"]
)

print("SARI (Unsloth Llama):", sari_score_unsloth_llama)
print("BLEU (Unsloth Llama):", bleu_score_unsloth_llama)
print("ROUGE (Unsloth Llama):", rouge_score_unsloth_llama)

# Calculates the readability scores for the newly generated simplified texts
flesch_kincaid_grades_unsloth_llama = [textstat.flesch_kincaid_grade(text) for text in predictions_unsloth_llama["pred_unsloth_llama"] if text.strip() != '']
automated_readability_indices_unsloth_llama = [textstat.automated_readability_index(text) for text in predictions_unsloth_llama["pred_unsloth_llama"] if text.strip() != '']

if flesch_kincaid_grades_unsloth_llama:
    print("\nAverage Flesch-Kincaid Grade Level (Unsloth Llama):", sum(flesch_kincaid_grades_unsloth_llama) / len(flesch_kincaid_grades_unsloth_llama))
else:
    print("\nNo valid texts to calculate Flesch-Kincaid Grade Level for Unsloth Llama.")

if automated_readability_indices_unsloth_llama:
    print("Average Automated Readability Index (Unsloth Llama):", sum(automated_readability_indices_unsloth_llama) / len(automated_readability_indices_unsloth_llama))
else:
    print("No valid texts to calculate Automated Readability Index for Unsloth Llama.")

SARI (Unsloth Llama): {'sari': 44.80018313986765}
BLEU (Unsloth Llama): {'bleu': 0.27045043318661793, 'precisions': [0.5217576791808873, 0.33288770053475936, 0.26803482587064675, 0.2317351598173516], 'brevity_penalty': 0.8391705655524792, 'length_ratio': 0.8508166969147005, 'translation_length': 7032, 'reference_length': 8265}
ROUGE (Unsloth Llama): {'rouge1': np.float64(0.4804256086873544), 'rouge2': np.float64(0.3255189892030621), 'rougeL': np.float64(0.441041365116263), 'rougeLsum': np.float64(0.4415005753352552)}

Average Flesch-Kincaid Grade Level (Unsloth Llama): 13.074332309256064
Average Automated Readability Index (Unsloth Llama): 13.748279910452199


In [33]:
import pandas as pd

df = pd.DataFrame({
    "expert": predictions_unsloth_llama["Expert"],
    "reference_simple": predictions_unsloth_llama["Simple"],
    "model_simple": predictions_unsloth_llama["pred_unsloth_llama"]
})

df.to_csv("model_outputs.csv", index=False)