In [1]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType
import json


In [2]:
from google.colab import userdata

model_name = "meta-llama/Llama-3.2-1B-Instruct"
hf_token = userdata.get("TOKEN_All_acces")

device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=hf_token,
    torch_dtype=torch.float16,
    device_map={"": device},
)


`torch_dtype` is deprecated! Use `dtype` instead!


In [3]:
print("\n‚öôÔ∏è Configuring LoRA...")
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,

)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()



‚öôÔ∏è Configuring LoRA...
trainable params: 11,272,192 || all params: 1,247,086,592 || trainable%: 0.9039


In [4]:
from datasets import load_dataset, Dataset

def format_prompt(example):
    """Format wiki_medical_terms into a Llama 3 QA-style prompt."""
    term = example.get("page_title", "").strip()
    text = example.get("page_text", "").strip()

    # On v√©rifie qu'on a du contenu
    if not term or not text:
        return None

    # On prend une d√©finition courte = 1 ou 2 phrases max
    sentences = text.split(". ")
    if len(sentences) == 1:
        definition = sentences[0]
    else:
        definition = ". ".join(sentences[:2])

    definition = definition.strip()
    if not definition.endswith("."):
        definition += "."

    question = f"What is {term}?"
    answer = definition

    prompt = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>

{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

The answer is: {answer}<|eot_id|>"""

    return {"text": prompt}

print("\nüìö Loading wiki_medical_terms dataset...")
raw_ds = load_dataset("gamino/wiki_medical_terms")
raw_train = raw_ds["train"]

print("üßπ Formatting dataset...")
formatted_examples = []
for ex in raw_train:
    out = format_prompt(ex)
    if out is not None:
        formatted_examples.append(out)

# On garde 500 exemples pour rester dans l‚Äôesprit du lab
formatted_examples = formatted_examples[:500]
train_dataset = Dataset.from_list(formatted_examples)

print(f"‚úÖ Training on {len(train_dataset)} examples")



üìö Loading wiki_medical_terms dataset...
üßπ Formatting dataset...
‚úÖ Training on 500 examples


In [6]:
from datasets import Dataset

print("\nüìä Loading dataset...")
raw = load_dataset("gamino/wiki_medical_terms")
raw_train = raw["train"]   # use the train split

print("üîÑ Formatting dataset...")

formatted_examples = []
for ex in raw_train:
    out = format_prompt(ex)   # our function from the slide
    if out is not None:
        formatted_examples.append(out)

# keep only 500 examples
formatted_examples = formatted_examples[:500]

# turn back into a Hugging Face Dataset
train_dataset = Dataset.from_list(formatted_examples)

print(f"‚úÖ Training on {len(train_dataset)} examples")
print(train_dataset[0])



üìä Loading dataset...
üîÑ Formatting dataset...
‚úÖ Training on 500 examples
{'text': '<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nWhat is Paracetamol poisoning?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nThe answer is: Paracetamol poisoning, also known as acetaminophen poisoning, is caused by excessive use of the medication paracetamol (acetaminophen). Most people have few or non-specific symptoms in the first 24 hours following overdose.<|eot_id|>'}


In [7]:
train_dataset

Dataset({
    features: ['text'],
    num_rows: 500
})

Tokenize

In [8]:
def tokenize_function(examples):
    tokenized = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512,  # Shorter for Mac memory
        return_tensors="pt"
    )
    tokenized["labels"] = tokenized["input_ids"].clone()
    return tokenized

In [9]:
print("\nüîÑ Tokenizing...")

tokenized_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=train_dataset.column_names
)

print("Tokenization completed!")
print(tokenized_dataset)



üîÑ Tokenizing...


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Tokenization completed!
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 500
})


In [10]:
print("‚öôÔ∏è Setting up training...")

use_cuda = torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    warmup_steps=10,
    logging_steps=10,
    save_steps=100,
    save_total_limit=2,
    fp16=False,
    logging_dir="./logs",
    report_to="none",
)

‚öôÔ∏è Setting up training...


In [11]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)


In [12]:
trainer = Trainer(
    model=model,                      # your LoRA-wrapped Llama
    args=training_args,
    train_dataset=tokenized_dataset,  # from previous step
    data_collator=data_collator,
)

In [13]:
print("\nüöÄ Starting training...")
print("="*60)
trainer.train()
print("="*60)
print("‚úÖ Training complete!")



üöÄ Starting training...


Step,Training Loss
10,2.5392
20,1.9313
30,1.856
40,1.7267
50,1.6318
60,1.5293
70,1.5634
80,1.5547
90,1.5901
100,1.6201


‚úÖ Training complete!


In [14]:
print("\nüíæ Saving model...")

save_path = "./llama3_medical_lora"

# Save LoRA adapters (PEFT model)
model.save_pretrained(save_path)

# Save tokenizer
tokenizer.save_pretrained(save_path)

print(f"‚úÖ Model saved to: {save_path}")



üíæ Saving model...
‚úÖ Model saved to: ./llama3_medical_lora


In [15]:
# if you are running out of memory run this cell to clear memory
import gc

# Clear MPS cache
if torch.backends.mps.is_available():
    torch.mps.empty_cache()

# Clear Python garbage collection
gc.collect()

print("‚úÖ Memory cleared!")

‚úÖ Memory cleared!


In [17]:
import re
import random
import time

print("\nüìä Loading wiki_medical_terms dataset for evaluation...")
wiki_ds = load_dataset("gamino/wiki_medical_terms")
full_dataset = wiki_ds["train"]
n_total = len(full_dataset)
print(f"Total examples in wiki_medical_terms: {n_total}")

# On suppose que les 0‚Äì499 ont servi √† l'entra√Ænement
test_start = 500
test_indices = list(range(test_start, n_total))

# On prend 20 exemples al√©atoires dans la zone test
num_examples = min(20, len(test_indices))
random.seed(42)
selected_indices = random.sample(test_indices, num_examples)

print(f"\nüé≤ Randomly selected {len(selected_indices)} test examples")
print(f"Indices: {selected_indices[:5]}... (showing first 5)")


# ============================================================================
# INFERENCE FUNCTION
# ============================================================================
def get_prediction(question, max_tokens=50):
    """Generate prediction for a question using the fine-tuned model."""
    model.eval()

    prompt = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>

{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

The answer is:"""

    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=0.3,
            top_p=0.9,
            do_sample=True,
        )

    full_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # On r√©cup√®re juste ce qui suit "The answer is:"
    if "The answer is:" in full_text:
        answer = full_text.split("The answer is:", 1)[1].strip()
    else:
        answer = full_text.strip()

    return answer

# Small stop-word list for partial-match scoring
STOP_WORDS = {
    "the","a","an","of","to","and","or","is","are","in","on","for","with",
    "by","at","from","this","that","which","what","when","where","why","how"
}

def _normalize(text):
    """lowercase, remove punctuation, split, drop stop-words"""
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", " ", text)
    tokens = [t for t in text.split() if t and t not in STOP_WORDS]
    return tokens

# ============================================================================
# CHECK ACCURACY
# ============================================================================
def check_accuracy(prediction, ground_truth):
    """
    Check if prediction is correct.
    - Exact match: ground truth appears verbatim (case-insensitive)
    - Partial match: ‚â•70% of ground-truth key tokens appear in prediction
    """

    pred_l = prediction.lower()
    gt_l = ground_truth.lower()

    # 1) Exact match
    if gt_l in pred_l:
        return True, "exact_match"

    # 2) Partial match
    gt_tokens = _normalize(ground_truth)
    if not gt_tokens:
        return False, "no_match"

    pred_tokens = _normalize(prediction)
    if not pred_tokens:
        return False, "no_match"

    gt_unique = set(gt_tokens)
    common = [t for t in gt_unique if t in pred_tokens]
    match_ratio = len(common) / len(gt_unique)

    if match_ratio >= 0.7:
        return True, "partial_match"

    return False, "no_match"

# Initialize variables for evaluation
results = []
correct_exact = 0
correct_partial = 0
total = 0

start_time = time.time()

for i, idx in enumerate(selected_indices, 1):
    ex = full_dataset[idx]
    term = ex["page_title"].strip()
    text = ex["page_text"].strip()

    # M√™me logique que dans format_prompt
    sentences = text.split(". ")
    if len(sentences) == 1:
        definition = sentences[0]
    else:
        definition = ". ".join(sentences[:2])
    definition = definition.strip()
    if not definition.endswith("."):
        definition += "."

    question = f"What is {term}?"
    ground_truth = definition

    prediction = get_prediction(question)

    is_correct, match_type = check_accuracy(prediction, ground_truth)

    total += 1
    if match_type == "exact":
        correct_exact += 1
    elif match_type == "partial":
        correct_partial += 1

    accuracy = 100.0 * (correct_exact + correct_partial) / total

    results.append({
        "index": idx,
        "question": question,
        "ground_truth": ground_truth,
        "prediction": prediction,
        "correct": is_correct,
        "match_type": match_type,
    })

    print("\n" + "-"*80)
    print(f"Example {i}/{len(selected_indices)} (idx={idx})")
    print(f"Q: {question}")
    print(f"GT: {ground_truth[:120]}...")
    print(f"Pred: {prediction[:120]}...")
    print(f"‚úî Correct: {is_correct} ({match_type})")
    print(f"Running accuracy: {accuracy:.1f}% ({correct_exact + correct_partial}/{total})")

total_time = time.time() - start_time

# ============================================================================
# FINAL RESULTS
# ============================================================================
print("\n" + "="*80)
print("FINAL RESULTS")
print("="*80)

total = len(results)
total_correct = correct_exact + correct_partial
incorrect = total - total_correct

accuracy = 100.0 * total_correct / total if total > 0 else 0.0
exact_pct = 100.0 * correct_exact / total if total > 0 else 0.0
partial_pct = 100.0 * correct_partial / total if total > 0 else 0.0

total_minutes = total_time / 60.0
avg_time_sec = total_time / total if total > 0 else 0.0

print(f"\nTotal examples evaluated: {total}")
print(f"Exact matches: {correct_exact} ({exact_pct:.1f}%)")
print(f"Partial matches: {correct_partial} ({partial_pct:.1f}%)")
print(f"Total correct: {total_correct} ({accuracy:.1f}%)")
print(f"Incorrect: {incorrect} ({100-accuracy:.1f}%)\n")

print(f"Total evaluation time: {total_minutes:.1f} minutes")
print(f"Average time per example: {avg_time_sec:.1f} seconds")

print("\n" + "="*80)
print("PERFORMANCE ASSESSMENT")
print("="*80)

if accuracy >= 80:
    print("üåü EXCELLENT! Model is performing very well!")
    print("   Your fine-tuning was highly successful.")
elif accuracy >= 60:
    print("‚úÖ GOOD! Model learned successfully!")
    print("   Consider training longer or with more data for improvement.")
elif accuracy >= 40:
    print("‚ö†Ô∏è  MODERATE. Model shows some learning.")
    print("   Recommend: Train for more epochs or increase dataset size.")
elif accuracy >= 20:
    print("‚ö†Ô∏è  POOR. Model needs significant improvement.")
    print("   Recommend: Check data quality, train longer, or use more examples.")
else:
    print("‚ùå VERY POOR. Model barely learned.")
    print("   Recommend: Verify data formatting and retrain from scratch.")


# ============================================================================
# SAVE RESULTS
# ============================================================================
print("\n" + "="*80)
print("SAVING RESULTS")
print("="*80)

results_summary = {
    "dataset": "gamino/wiki_medical_terms",
    "total_examples": total,
    "exact_matches": correct_exact,
    "partial_matches": correct_partial,
    "incorrect": incorrect,
    "accuracy": accuracy,
    "exact_match_pct": exact_pct,
    "partial_match_pct": partial_pct,
    "total_time_sec": total_time,
    "avg_time_sec": avg_time_sec,
    "selected_indices": selected_indices,
    "results": results,
}

with open('evaluation_results.json', 'w') as f:
    json.dump(results_summary, f, indent=2)

print("‚úÖ Results saved to: evaluation_results.json")


üìä Loading wiki_medical_terms dataset for evaluation...


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Total examples in wiki_medical_terms: 6861

üé≤ Randomly selected 20 test examples
Indices: [5738, 1412, 704, 6574, 2753]... (showing first 5)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



--------------------------------------------------------------------------------
Example 1/20 (idx=5738)
Q: What is Stickler syndrome?
GT: Stickler syndrome (hereditary progressive arthro-ophthalmodystrophy) is a group of rare genetic disorders affecting conn...
Pred: Stickler syndrome, also known as Stickler disease, is an autosomal recessive inherited disease characterized by problems...
‚úî Correct: False (no_match)
Running accuracy: 0.0% (0/1)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



--------------------------------------------------------------------------------
Example 2/20 (idx=1412)
Q: What is Nevus lipomatosus superficialis?
GT: Nevus lipomatosus (cutaneous) superficialis (NLS or NLCS, also known as "Nevus lipomatosis of Hoffman and Zurhelle") is ...
Pred: Nevus lipomatosus superficialis (NLS) is a type of cutaneous nevus that is composed of fat cells. It is a benign conditi...
‚úî Correct: False (no_match)
Running accuracy: 0.0% (0/2)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



--------------------------------------------------------------------------------
Example 3/20 (idx=704)
Q: What is Ventricular tachycardia?
GT: Ventricular tachycardia (V-tach or VT) is a fast heart rate arising from the lower chambers of the heart. Although a few...
Pred: Ventricular tachycardia (V-tach or VT) is a fast heart rate originating in the lower chambers of the heart (the ventricl...
‚úî Correct: False (no_match)
Running accuracy: 0.0% (0/3)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



--------------------------------------------------------------------------------
Example 4/20 (idx=6574)
Q: What is Perforation?
GT: A perforation is a small hole in a thin material or web. There is usually more than one perforation in an organized fash...
Pred: A perforation is a hole made in a wall or lining of a body cavity by a medical instrument. It is a common complication o...
‚úî Correct: False (no_match)
Running accuracy: 0.0% (0/4)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



--------------------------------------------------------------------------------
Example 5/20 (idx=2753)
Q: What is Pterygium inversum unguis?
GT: Pterygium inversum unguis or ventral pterygium is characterized by the adherence of the distal portion of the nailbed to...
Pred: Pterygium inversum unguis (PIU), also known as Pterygium inversum, is a rare, autosomal recessive, genetic disorder that...
‚úî Correct: False (no_match)
Running accuracy: 0.0% (0/5)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



--------------------------------------------------------------------------------
Example 6/20 (idx=2506)
Q: What is Gigantism?
GT: Gigantism (Greek: Œ≥ŒØŒ≥Œ±œÇ, g√≠gas, "giant", plural Œ≥ŒØŒ≥Œ±ŒΩœÑŒµœÇ, g√≠gantes), also known as giantism, is a condition characterize...
Pred: Gigantism is a condition of excessive growth that occurs after the bone growth plate has closed. It typically occurs in ...
‚úî Correct: False (no_match)
Running accuracy: 0.0% (0/6)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



--------------------------------------------------------------------------------
Example 7/20 (idx=2328)
Q: What is Hypoplastic left heart syndrome?
GT: Hypoplastic left heart syndrome (HLHS) is a rare congenital heart defect in which the left side of the heart is severely...
Pred: Hypoplastic left heart syndrome (HLHS) is a rare congenital heart defect in which the left side of the heart is underdev...
‚úî Correct: False (no_match)
Running accuracy: 0.0% (0/7)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



--------------------------------------------------------------------------------
Example 8/20 (idx=1643)
Q: What is Cystic hygroma?
GT: A cystic hygroma is an abnormal growth that usually appears on a babys neck or head. It consists of one or more cysts an...
Pred: Cystic hygroma, also known as cystic hygroma syndrome, is a congenital condition in which a fluid-filled sac forms in th...
‚úî Correct: False (no_match)
Running accuracy: 0.0% (0/8)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



--------------------------------------------------------------------------------
Example 9/20 (idx=6533)
Q: What is Ocular hypertension?
GT: Ocular hypertension is the presence of elevated fluid pressure inside the eye (intraocular pressure), usually with no op...
Pred: Ocular hypertension (OHT) is a condition in which the intraocular pressure (IOP) is elevated above the normal range but ...
‚úî Correct: False (no_match)
Running accuracy: 0.0% (0/9)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



--------------------------------------------------------------------------------
Example 10/20 (idx=1339)
Q: What is Ovarian remnant syndrome?
GT: Ovarian remnant syndrome is a condition that occurs when ovarian tissue is left behind following oophorectomy, causing d...
Pred: Ovarian remnant syndrome (ORS) is a condition in which the remaining ovarian tissue after childbirth does not produce en...
‚úî Correct: False (no_match)
Running accuracy: 0.0% (0/10)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



--------------------------------------------------------------------------------
Example 11/20 (idx=6043)
Q: What is Anencephaly?
GT: Anencephaly is the absence of a major portion of the brain, skull, and scalp that occurs during embryonic development. I...
Pred: Anencephaly is a birth defect in which a baby is born without a significant portion of the brain and skull. The brain is...
‚úî Correct: False (no_match)
Running accuracy: 0.0% (0/11)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



--------------------------------------------------------------------------------
Example 12/20 (idx=6567)
Q: What is H?
GT: H, or h, is the eighth letter in the Latin alphabet, used in the modern English alphabet, the alphabets of other western...
Pred: H is the first letter of the chemical symbol for hydrogen. It is the chemical element with atomic number 1, representing...
‚úî Correct: False (no_match)
Running accuracy: 0.0% (0/12)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



--------------------------------------------------------------------------------
Example 13/20 (idx=4967)
Q: What is Cancer?
GT: Cancer is a group of diseases involving abnormal cell growth with the potential to invade or spread to other parts of th...
Pred: Cancer is a group of diseases characterized by uncontrolled cell growth in tissues. Tumors (cancerous growths) can be be...
‚úî Correct: False (no_match)
Running accuracy: 0.0% (0/13)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



--------------------------------------------------------------------------------
Example 14/20 (idx=1212)
Q: What is Winchester syndrome?
GT: Winchester syndrome is a rare hereditary connective tissue disease described in 1969, of which the main characteristics ...
Pred: Winchester syndrome is a rare genetic disease in which a male cat becomes aggressive and destructive. The syndrome is na...
‚úî Correct: False (no_match)
Running accuracy: 0.0% (0/14)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



--------------------------------------------------------------------------------
Example 15/20 (idx=5337)
Q: What is Granuloma gluteale infantum?
GT: Granuloma gluteale infantum is a cutaneous condition that appears in the anogenital region of infants as a complication ...
Pred: Granuloma gluteale infantum (GGA), also known as "stump" or "stump-like" granuloma gluteale, is a benign, non-cancerous ...
‚úî Correct: False (no_match)
Running accuracy: 0.0% (0/15)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



--------------------------------------------------------------------------------
Example 16/20 (idx=3956)
Q: What is Calcipotriol?
GT: Calcipotriol, also known as calcipotriene, is a synthetic derivative of calcitriol, a form of vitamin D.  It is used in ...
Pred: Calcipotriol, also known as calcipotriene, is a topical medication used to treat psoriasis. It is a synthetic 13-carbon ...
‚úî Correct: True (partial_match)
Running accuracy: 0.0% (0/16)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



--------------------------------------------------------------------------------
Example 17/20 (idx=760)
Q: What is Alport syndrome?
GT: Alport syndrome is a genetic disorder affecting around 1 in 5,000-10,000 children, characterized by glomerulonephritis, ...
Pred: Alport syndrome, also known as hereditary nephritis, is a group of autosomal recessive genetic disorders that affect the...
‚úî Correct: False (no_match)
Running accuracy: 0.0% (0/17)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



--------------------------------------------------------------------------------
Example 18/20 (idx=744)
Q: What is Kwashiorkor?
GT: Kwashiorkor ( KWOSH-ee-OR-kor, -‚Å†k…ôr, UK also  KWASH-) is a form of severe protein malnutrition characterized by edema a...
Pred: Kwashiorkor is a form of malnutrition that occurs when there is not enough protein in the diet, but the body needs more....
‚úî Correct: False (no_match)
Running accuracy: 0.0% (0/18)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



--------------------------------------------------------------------------------
Example 19/20 (idx=1267)
Q: What is Pseudostrabismus?
GT: Pseudostrabismus is the false appearance of crossed eyes. When the eyes are actually crossed or not completely aligned w...
Pred: Pseudostrabismus, also known as ectopias, is a condition in which the eyes appear to be positioned at different distance...
‚úî Correct: False (no_match)
Running accuracy: 0.0% (0/19)

--------------------------------------------------------------------------------
Example 20/20 (idx=2291)
Q: What is Capillaritis?
GT: Capillaritis is where the capillaries, usually of the legs or lungs, are inflamed, allowing blood cells to pass through....
Pred: Capillaritis is a form of inflammation of the capillaries. It is a symptom of various diseases, most commonly rheumatoid...
‚úî Correct: False (no_match)
Running accuracy: 0.0% (0/20)

FINAL RESULTS

Total examples evaluated: 20
Exact matches: 0 (0.0%)
Partial matches: 0 (0.0%)
T

## Part A

###Q1 ‚Äî Improving Model Performance
Augmenter le nombre d‚Äôexemples, entra√Æner plus longtemps (plus d‚Äô√©poques) et utiliser des r√©ponses mieux structur√©es permettrait au mod√®le d‚Äôapprendre des d√©finitions plus coh√©rentes.


###Q2 ‚Äî Analyzing Failure Patterns
Le mod√®le g√©n√®re des paraphrases proches mais rarement identiques au ground truth, ce qui montre qu‚Äôil s‚Äôappuie surtout sur ses connaissances internes plut√¥t que sur le fine-tuning.


###Q3 ‚Äî Data Quality vs Quantity
Quelques centaines d‚Äôexemples bien structur√©s auraient plus d‚Äôimpact que de grandes quantit√©s de textes Wikipedia bruts et in√©gaux.

# Part B

###Q4 ‚Äî Optimizing for limited resources
La quantization (INT8/INT4), des s√©quences plus courtes et LoRA permettent de r√©duire fortement l‚Äôusage m√©moire et le temps d‚Äôinf√©rence.

###Q5 ‚Äî Speed vs Accuracy Trade-offs
Des param√®tres de g√©n√©ration conservateurs (peu de tokens, temp√©rature basse) am√©liorent la vitesse mais r√©duisent la richesse des r√©ponses.

# Part C

###Q7 ‚Äî Improving Evaluation Metrics
Une m√©trique s√©mantique (similarit√© d‚Äôembeddings) serait plus adapt√©e que l‚Äôexact/partial match pour comparer des d√©finitions m√©dicales reformul√©es.

###Q8 ‚Äî Test Set Size and Confidence
√âvaluer sur un √©chantillon plus large (50‚Äì200 exemples) donnerait une mesure de performance beaucoup plus fiable que 20 exemples.

# Part D
###Q9 ‚Äî Production Considerations
Une application m√©dicale doit inclure validation humaine, filtres anti-hallucination et un corpus m√©dical v√©rifi√© pour garantir s√©curit√© et fiabilit√©.

Le mod√®le n‚Äôa presque rien appris car le dataset est bruit√©, les r√©ponses longues et le nombre d‚Äôexemples trop faible.
Les erreurs sont proches du ground truth mais formul√©es diff√©remment ‚Üí la m√©trique actuelle ne reconna√Æt pas ces proximit√©s.
Pour am√©liorer : plus d‚Äôexemples, meilleurs r√©sum√©s, plus d‚Äô√©poques, meilleure m√©trique (similarit√© s√©mantique).
Pour l‚Äôinf√©rence limit√©e : LoRA + quantization, r√©duire max_length, prompts plus courts.
Pour la production : √©viter les hallucinations, utiliser des datasets m√©dicaux fiables, ajouter validations humaines.