In [1]:
!pip install torch transformers datasets peft accelerate trl bitsandbytes nltk rouge-score scikit-learn

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting trl
  Downloading trl-0.16.1-py3-none-any.whl.metadata (12 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6

In [None]:
import os
import re
import torch
import numpy as np
from datasets import load_dataset
from sklearn.model_selection import KFold
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    BitsAndBytesConfig
)
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig, get_peft_model
from huggingface_hub import login

# Login to Hugging Face
HF_TOKEN = "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"
login(HF_TOKEN)
os.environ["HUGGINGFACE_TOKEN"] = HF_TOKEN
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

print("CUDA Available:", torch.cuda.is_available())
print("GPU Name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU found!")

# Load Model & Tokenizer
MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"
QUANT_CONFIG = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, quantization_config=QUANT_CONFIG, trust_remote_code=True, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

# Resize model embeddings if new tokens were added
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.eos_token_id
model.config.bos_token_id = tokenizer.bos_token_id

# Apply LoRA
LORA_CONFIG = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
)
model = get_peft_model(model, LORA_CONFIG)

# Load Dataset
dataset_path = "/content/250.csv"
dataset = load_dataset("csv", data_files=dataset_path, encoding="utf-8")['train']

def format_text(example):
    #instruction = "Rewrite the following sentence in Simplified Technical English without changing its meaning: "
    instruction = example["prompt"].strip()
    standard = example["standard_english"].strip()
    ste = example["ste"].strip()
    return {
        "input_text": f"{instruction} {standard}",
        "output_text": ste
    }
#dataset = dataset.map(format_text, remove_columns=["standard_english", "ste"])
dataset = dataset.map(format_text, remove_columns=["prompt", "standard_english", "ste"])

def tokenize_and_format(example):
    max_len = 384  # Adjust as needed
    # Tokenize both input and output together
    # This ensures that padding is applied to the combined sequence
    # resulting in consistent input and output lengths within a batch
    tokenized_example = tokenizer(
        example["input_text"],
        example["output_text"], # Tokenize output along with input
        truncation=True,
        padding='max_length', # Use 'max_length' for consistent padding
        max_length=max_len,
        add_special_tokens=True
    )

    # Extract input_ids, attention_mask, and labels
    input_ids = tokenized_example["input_ids"]
    attention_mask = tokenized_example["attention_mask"]

    # Create labels with -100 for padding
    labels = tokenized_example["input_ids"].copy()
    labels = [label if label != tokenizer.pad_token_id else -100 for label in labels]

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

# Tokenize Dataset
dataset = dataset.map(tokenize_and_format, remove_columns=["input_text", "output_text"])

# Metrics and Post-processing
kf = KFold(n_splits=5, shuffle=True, random_state=3407)
bleu_scores, rouge1_scores, rougeL_scores, validation_losses, training_losses = [], [], [], [], []

def post_process_output(output, inputInstruct):
    if output and not output.endswith('.'):
        output = f"{output}."
    output = re.sub(f"^{re.escape(inputInstruct)}", "", output, count=1).strip()
    sentences = re.findall(r"[^.]*\.\"?(?=\s|$)", output)
    if sentences and len(sentences[0].strip()) <= 4 and len(sentences) > 1:
        return sentences[1].strip()
    return sentences[0].strip() if sentences else inputInstruct

def compute_metrics(predictions, references):
    bleu = sentence_bleu([references], predictions, smoothing_function=SmoothingFunction().method1)
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True).score(" ".join(references), " ".join(predictions))
    return {"bleu": bleu, "rouge1": rouge['rouge1'].fmeasure, "rougeL": rouge['rougeL'].fmeasure}

# Training Loop
for fold, (train_idx, val_idx) in enumerate(kf.split(dataset)):
    print(f"Training Fold {fold + 1}")
    train_data, val_data = dataset.select(train_idx), dataset.select(val_idx)

    trainer = SFTTrainer(
        model=model,
        train_dataset=train_data,
        eval_dataset=val_data,
        data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
        args=SFTConfig(
            output_dir=f"./results_fold_{fold + 1}",
            eval_strategy="steps",
            eval_steps=50,
            label_names=["labels"],
            logging_steps=50,
            learning_rate=2e-5,
            warmup_steps=50,  # or use warmup_ratio=0.1
            lr_scheduler_type="cosine",  # or "linear"
            per_device_train_batch_size=1,
            per_device_eval_batch_size=1,
            gradient_accumulation_steps=8,
            num_train_epochs=5,
            weight_decay=0.05,
            save_total_limit=2,
            save_steps=1000,
            logging_dir=f'./logs_fold_{fold + 1}',
            fp16=True,
            dataset_text_field="input_ids",
            max_seq_length=512,
            dataset_num_proc=2,
            packing=False,
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            greater_is_better=False
        )
    )
    train_result = trainer.train()
    eval_results = trainer.evaluate()
    print("Evaluation Results:", eval_results)
    training_losses.append(train_result.training_loss)
    validation_losses.append(eval_results.get("eval_loss", 0))

    predictions, references = [], []
    for example in val_data:
        input_sentence = tokenizer.decode(
            [tok for tok, mask in zip(example["input_ids"], example["attention_mask"]) if mask == 1],
            skip_special_tokens=True
        )
        ground_truth = tokenizer.decode([t for t in example["labels"] if t != -100], skip_special_tokens=True)

        inputs = tokenizer(input_sentence, return_tensors="pt", truncation=True, padding=True, max_length=80)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        input_ids = inputs["input_ids"].to(device)
        attention_mask = inputs["attention_mask"].to(device)

        with torch.no_grad():
            output = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=100,
                do_sample=False,
                temperature=None,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
                num_return_sequences=1,
                early_stopping=True
            )

        decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
        print(f"Input: {input_sentence}")
        #print(f"Output: {decoded_output}")
        print(f"Post Process Output: {post_process_output(decoded_output, input_sentence)}")
        print(f"Ground Truth: {ground_truth}")

        predictions.append(post_process_output(decoded_output, input_sentence))
        references.append(ground_truth)

    metrics = compute_metrics(predictions, references)
    bleu_scores.append(metrics["bleu"])
    rouge1_scores.append(metrics["rouge1"])
    rougeL_scores.append(metrics["rougeL"])
    print(f"Fold {fold + 1} - BLEU: {metrics['bleu']}, ROUGE-1: {metrics['rouge1']}, ROUGE-L: {metrics['rougeL']}")

# Final Evaluation Summary
print(f"Avg BLEU: {np.mean(bleu_scores)}, Std: {np.std(bleu_scores)}")
print(f"Avg ROUGE-1: {np.mean(rouge1_scores)}, Std: {np.std(rouge1_scores)}")
print(f"Avg ROUGE-L: {np.mean(rougeL_scores)}, Std: {np.std(rougeL_scores)}")
print(f"Avg Validation Loss: {np.mean(validation_losses)}, Std: {np.std(validation_losses)}")
print(f"Avg Training Loss: {np.mean(training_losses)}, Std: {np.std(training_losses)}")

# Save Final Model
#trainer.save_model("/content/drive/MyDrive/Models/250")
print("Training complete.")


CUDA Available: True
GPU Name: Tesla T4


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Training Fold 1


Step,Training Loss,Validation Loss
50,4.1665,3.018062
100,2.0443,1.865759


Evaluation Results: {'eval_loss': 1.8415019512176514, 'eval_runtime': 11.3313, 'eval_samples_per_second': 4.413, 'eval_steps_per_second': 4.413}




Input: Identify and replace non-STE words with approved alternatives turn the control to the absolutely closed position.turn the control to the fully closed position.
Post Process Output: Question: What is the correct way to turn the control to the closed position?Question: What is the correct.
Ground Truth: Identify and replace non-STE words with approved alternatives turn the control to the absolutely closed position.turn the control to the fully closed position.
Input: Identify and replace non-STE words with approved alternatives contaminated oil must be replaced.replace the dirty oil.
Post Process Output: Question: The oil must.
Ground Truth: Identify and replace non-STE words with approved alternatives contaminated oil must be replaced.replace the dirty oil.
Input: Identify and replace non-STE words with approved alternatives replace all components that are worn beyond their maximum limits.replace all components that are worn more than the maximum limits.
Post Process Output: The 

Step,Training Loss,Validation Loss
50,1.6885,1.686556
100,1.4991,1.584984


Evaluation Results: {'eval_loss': 1.5810000896453857, 'eval_runtime': 11.3917, 'eval_samples_per_second': 4.389, 'eval_steps_per_second': 4.389}




Input: Identify and replace non-STE words with approved alternatives wipe excess compound from the gasket.remove unwanted compound from the gasket.
Post Process Output: The approved alternatives.
Ground Truth: Identify and replace non-STE words with approved alternatives wipe excess compound from the gasket.remove unwanted compound from the gasket.
Input: Identify and replace non-STE words with approved alternatives deaerate the number 1 hydraulic system.bleed the number 1 hydraulic system.
Post Process Output: The number.
Ground Truth: Identify and replace non-STE words with approved alternatives deaerate the number 1 hydraulic system.bleed the number 1 hydraulic system.
Input: Identify and replace non-STE words with approved alternatives due to the small diameter of the hole, the flow is low.because of the small diameter of the hole, the flow is low.
Post Process Output: The flow is low because of the small hole.
Ground Truth: Identify and replace non-STE words with approved alternat

Step,Training Loss,Validation Loss
50,1.3832,1.331044
100,1.1452,1.294104


Evaluation Results: {'eval_loss': 1.3011192083358765, 'eval_runtime': 11.4279, 'eval_samples_per_second': 4.375, 'eval_steps_per_second': 4.375}




Input: Identify and replace non-STE words with approved alternatives make sure that the valve is operable.make sure that the valve can operate.
Post Process Output: The information in this section.
Ground Truth: Identify and replace non-STE words with approved alternatives make sure that the valve is operable.make sure that the valve can operate.
Input: Identify and replace non-STE words with approved alternatives the horizon bar may drift slightly.the horizon bar can move a small distance.
Post Process Output: The horizon bar can move a.
Ground Truth: Identify and replace non-STE words with approved alternatives the horizon bar may drift slightly.the horizon bar can move a small distance.
Input: Identify and replace non-STE words with approved alternatives mask the first layer of tape across the cutout and the second and third layers diagonally.apply the first layer of tape across the cutout and the second and third layers diagonally.
Post Process Output: The.
Ground Truth: Identify a

Step,Training Loss,Validation Loss
50,1.0712,0.970638
100,0.864,0.998596


Evaluation Results: {'eval_loss': 1.0206701755523682, 'eval_runtime': 11.2759, 'eval_samples_per_second': 4.434, 'eval_steps_per_second': 4.434}




Input: Identify and replace non-STE words with approved alternatives ensure that there are no loose articles in the engine air intakes.make sure that there are no loose objects in the engine air intakes.
Post Process Output: The alternatives are not.
Ground Truth: Identify and replace non-STE words with approved alternatives ensure that there are no loose articles in the engine air intakes.make sure that there are no loose objects in the engine air intakes.
Input: Identify and replace non-STE words with approved alternatives apply sealant generously to the repair patch.apply a large quantity of sealant to the repair patch.
Post Process Output: The use of incorrect sealant can cause problems with the.
Ground Truth: Identify and replace non-STE words with approved alternatives apply sealant generously to the repair patch.apply a large quantity of sealant to the repair patch.
Input: Identify and replace non-STE words with approved alternatives cut along the contour to take out the damaged

Step,Training Loss,Validation Loss
50,0.7642,0.767528
100,0.5839,0.788804


Evaluation Results: {'eval_loss': 0.83261638879776, 'eval_runtime': 11.1016, 'eval_samples_per_second': 4.414, 'eval_steps_per_second': 4.414}




Input: Identify and replace non-STE words with approved alternatives check the lines for audible leaks of air.listen for air from the lines.
Post Process Output: if you hear air in the lines, replace it.
Ground Truth: Identify and replace non-STE words with approved alternatives check the lines for audible leaks of air.listen for air from the lines.
Input: Identify and replace non-STE words with approved alternatives slacken the control cables.release the tension in the control cables.
Post Process Output: deflate the control cables.
Ground Truth: Identify and replace non-STE words with approved alternatives slacken the control cables.release the tension in the control cables.
Input: Identify and replace non-STE words with approved alternatives for more information about spare parts, contact the sales department.for more information about spare parts, write to the sales department.
Post Process Output: if you do not observe these instructions, there is a risk of a.
Ground Truth: Identi