In [None]:
%%capture
# Install Unsloth and the libraries this notebook uses. The %%capture magic on
# the first line of the cell suppresses the installer output.
import os, re
if "COLAB_" not in "".join(os.environ.keys()):
    # No COLAB_* environment variable is set: plain install.
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # v is the "major.minor" prefix of the installed torch version; it selects
    # the xformers release pinned on the next line.
    import torch; v = re.match(r"[0-9]{1,}\.[0-9]{1,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.33.post1" if v=="2.9" else "0.0.32.post2" if v=="2.8" else "0.0.29.post3")
    # {xformers} is interpolated by IPython into the shell command.
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets==4.3.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
# Version pins applied in both branches.
!pip install transformers==4.56.2
!pip install --no-deps trl==0.22.2

In [None]:
# Load the base model and wrap it with LoRA adapters.
# NOTE(review): unsloth is imported before torch here; keep this order unless
# the Unsloth documentation confirms it does not matter.
from unsloth import FastModel
import torch
max_seq_length = 2048
model_name = "unsloth/Qwen3-1.7B"

# Both quantized loading modes and full fine-tuning are switched off below.
model, tokenizer = FastModel.from_pretrained(
    model_name = model_name,
    max_seq_length = max_seq_length, # Choose any for long context!
    load_in_4bit = False,  # 4-bit quantization (reduces memory) is disabled here
    load_in_8bit = False, # 8-bit loading is disabled here as well
    full_finetuning = False, # Full fine-tuning is disabled; adapters are added below
    # token = "hf_...", # use one if using gated models
)

# Attach LoRA adapters to the attention and MLP projection layers listed in
# target_modules.
model = FastModel.get_peft_model(
    model,
    r = 64, # LoRA rank. Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 64,
    lora_dropout = 0.05, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 42,
    use_rslora = True,  # Rank-stabilized LoRA is enabled
    loftq_config = None, # LoftQ is not used
)

In [None]:
import requests
import zipfile
import io
import pandas as pd
from datasets import Dataset

# Download the password-protected sample archive into memory, load its JSONL
# file into a Hugging Face Dataset and split it into train / validation sets.
url = "https://huggingface.co/Reggie/test/resolve/main/tags_finetuning_sample.zip?download=true"
password = b"unsloth" # Password in bytes

print("Downloading file to RAM...")
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=60)

# Fail fast: without the data nothing below can run, and continuing would only
# surface later as a confusing NameError on `dataset`.
if response.status_code != 200:
    raise RuntimeError(f"Download failed. Status code: {response.status_code}")

# 1. Wrap the downloaded bytes in BytesIO so ZipFile treats it like a file
with zipfile.ZipFile(io.BytesIO(response.content)) as z:

    # Find the jsonl file
    jsonl_candidates = [name for name in z.namelist() if name.endswith('.jsonl')]
    if not jsonl_candidates:
        raise FileNotFoundError("No .jsonl file found in the downloaded archive.")
    jsonl_filename = jsonl_candidates[0]
    print(f"Found {jsonl_filename}, extracting to DataFrame...")

    # 2. Open the specific file in the zip stream
    with z.open(jsonl_filename, pwd=password) as f:
        # Read directly to Pandas
        df = pd.read_json(f, lines=True)

# 3. Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Cleanup memory
del df

print("Success! Dataset loaded.")
print(dataset)

# Hold out 2% of the rows for validation; the seed makes the split repeatable.
dataset = dataset.train_test_split(test_size=0.02, seed=42)
train_dataset = dataset['train']
eval_dataset = dataset['test']

print(f"✅ Dataset split:")
print(f"   Training examples: {len(train_dataset)}")
print(f"   Validation examples: {len(eval_dataset)}")


# Chat-style training template with two positional slots: the first receives
# the user message, the second the assistant answer. The assistant turn starts
# with an empty <think></think> section, and the template already ends with
# <|im_end|>.
alpaca_prompt = """<|im_start|>user
{}<|im_end|>
<|im_start|>assistant
<think>

</think>

{}<|im_end|>"""

# End-of-sequence token as reported by the loaded tokenizer; printed so it can
# be compared with the <|im_end|> marker that closes the template.
EOS_TOKEN = tokenizer.eos_token  # End-of-sequence token
print('EOS TOKEN:', EOS_TOKEN)

def formatting_prompts_func(examples, prompt_template=None, eos_token=None):
    """
    Convert a batch of Alpaca-style records into single training strings.

    For each example:
    1. Combine instruction + input (the input is appended on a new line when
       it is non-empty)
    2. Fill the chat template with the combined instruction and the output
    3. Terminate the text with the EOS token, unless the filled template
       already ends with it (avoids emitting the terminator twice)

    Args:
        examples: Batch mapping with equally long "instruction", "input" and
            "output" sequences (the shape `Dataset.map(batched=True)` passes).
        prompt_template: Template with two positional `{}` slots. Defaults to
            the module-level `alpaca_prompt`.
        eos_token: End-of-sequence string. Defaults to the module-level
            `EOS_TOKEN`.

    Returns:
        dict with a single key "text" holding one formatted string per example.
    """
    # Resolve the defaults at call time so the notebook globals stay the
    # single source of truth when the function is used through Dataset.map.
    if prompt_template is None:
        prompt_template = alpaca_prompt
    if eos_token is None:
        eos_token = EOS_TOKEN

    instructions = examples["instruction"]
    inputs = examples["input"]
    outputs = examples["output"]

    texts = []
    for instruction, input_text, output in zip(instructions, inputs, outputs):
        # Combine instruction and input; empty or missing inputs add nothing.
        full_instruction = instruction + ("\n" + input_text if input_text else "")

        # Format as chat turns
        text = prompt_template.format(full_instruction, output)
        if not text.endswith(eos_token):
            text += eos_token
        texts.append(text)

    return {"text": texts}

# Apply formatting to both train and validation sets. With batched=True the
# function receives a batch of columns and the "text" list it returns becomes
# a new column.
train_dataset = train_dataset.map(
    formatting_prompts_func,
    batched=True,
)

eval_dataset = eval_dataset.map(
    formatting_prompts_func,
    batched=True,
)

# Show one formatted record so the final training text can be checked by eye.
# (The emoji below were stored as mis-decoded bytes and are restored here.)
print("✅ Dataset formatted for Qwen 3 chat!")
print("\n📝 Formatted Example:")
print("-" * 80)
print(train_dataset[0]['text'])
print("-" * 80)

In [None]:
import math

from transformers import TrainingArguments
from trl import SFTTrainer
from unsloth import is_bfloat16_supported

# Training configuration
training_args = TrainingArguments(
    # Output and logging
    output_dir="./outputs",              # Where to save model checkpoints
    logging_dir="./logs",                # Where to save logs
    logging_steps=50,                    # Log every 50 steps

    # Training hyperparameters
    num_train_epochs=3,                  # Train for 3 epochs
    per_device_train_batch_size=64,      # 64 examples per device per step
    gradient_accumulation_steps=1,       # No accumulation (effective batch = 64 per device)
    learning_rate=1e-4,                  # Peak learning rate
    weight_decay=0.01,                   # Weight decay

    # Learning rate schedule
    lr_scheduler_type="cosine",          # Cosine decay schedule
    warmup_steps=100,                    # Warm up over the first 100 steps

    # Optimization
    optim="adamw_torch",                 # PyTorch AdamW (not the 8-bit variant)
    fp16 = not is_bfloat16_supported(),  # Fall back to fp16 only when bf16 is unavailable
    bf16 = is_bfloat16_supported(),      # Prefer bf16 mixed precision

    # Evaluation
    eval_strategy="steps",               # Evaluate during training
    eval_steps=250,                      # Evaluate every 250 steps
    per_device_eval_batch_size=64,       # Batch size for evaluation

    # Checkpointing
    save_strategy="steps",               # Save checkpoints
    # load_best_model_at_end requires save_steps to be a round multiple of
    # eval_steps; the previous value (100) is not a multiple of 250.
    save_steps=250,                      # Save every 250 steps
    save_total_limit=5,                  # Keep at most 5 checkpoints on disk
    load_best_model_at_end=True,         # Load best checkpoint at end
    metric_for_best_model="eval_loss",   # Use validation loss to pick best

    # Memory optimizations
    gradient_checkpointing=True,         # Save memory (slight speed cost)
    max_grad_norm=1.0,                   # Gradient clipping (stability)

    # Reproducibility
    seed=42,

    # Disable unnecessary features
    report_to="none",                    # Don't report to wandb/tensorboard
)

effective_batch_size = (
    training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps
)

print("✅ Training configuration set!")
print("\n📊 Training Summary:")
print(f"   Epochs: {training_args.num_train_epochs}")
print(f"   Effective batch size: {effective_batch_size}")
print(f"   Learning rate: {training_args.learning_rate}")
print(f"   Warmup steps: {training_args.warmup_steps}")
print(f"   FP16 enabled: {training_args.fp16}")

# Approximate number of optimizer steps, assuming a single device. The last,
# partial batch of every epoch still costs one step, hence ceil rather than
# floor division.
steps_per_epoch = math.ceil(len(train_dataset) / effective_batch_size)
total_steps = math.ceil(steps_per_epoch * training_args.num_train_epochs)
print(f"   Total steps: ~{total_steps}")




# Callbacks used by the trainer that is created further below.
from transformers import TrainerCallback, EarlyStoppingCallback
import random

# Early-stopping callback configured with a patience of 3.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

class MemoryCallback(TrainerCallback):
    """
    Prints one randomly chosen evaluation sample (user prompt, generated
    answer and reference label) after each evaluation.

    NOTE(review): despite its name this callback does not read any VRAM or
    memory statistics; the class name is kept so existing references work.
    """

    # Text that separates the prompt from the expected answer in a sample.
    ASSISTANT_MARKER = "<|im_start|>assistant\n<think>\n\n</think>\n"

    def __init__(self, model, tokenizer, eval_dataset, **kwargs):
        """Keep references to the model, tokenizer and evaluation dataset."""
        super().__init__(**kwargs)
        self.model = model
        self.tokenizer = tokenizer
        self.eval_dataset = eval_dataset

    def on_evaluate(self, args, state, control, **kwargs):
        """Callback hook for the evaluate event: show one random sample."""
        self._show_random_sample()

    def _show_random_sample(self):
        """Selects, generates, and prints a random evaluation sample.

        Best effort by design: any error is reported on stdout and swallowed
        so that a failed spot check does not interrupt the caller.
        """
        try:
            # 1. Get random sample
            idx = random.randint(0, len(self.eval_dataset) - 1)
            sample = self.eval_dataset[idx]
            full_text = sample['text'] # Assumes 'text' field

            # 2. Split the sample into the prompt and the expected answer
            parts = full_text.split(self.ASSISTANT_MARKER)
            if len(parts) != 2:
                print(f"   (Skipping sample: unexpected format '{full_text[:50]}...')")
                return

            # Re-create the prompt up to and including the assistant marker
            input_prompt = parts[0] + self.ASSISTANT_MARKER
            ground_truth = parts[1].split("<|im_end|>")[0].strip()

            # 3. Tokenize and move to device
            inputs = self.tokenizer(input_prompt, return_tensors="pt").to(self.model.device)

            # Fall back to EOS when the tokenizer defines no padding token.
            pad_token_id = self.tokenizer.pad_token_id
            if pad_token_id is None:
                pad_token_id = self.tokenizer.eos_token_id

            # 4. Generate prediction in eval mode (no dropout), then restore
            # the previous mode so an ongoing training run is unaffected.
            was_training = self.model.training
            self.model.eval()
            try:
                with torch.no_grad():
                    outputs = self.model.generate(
                        **inputs,
                        max_new_tokens=50, # Or your desired keyword limit
                        pad_token_id=pad_token_id,
                        eos_token_id=self.tokenizer.eos_token_id
                    )
            finally:
                if was_training:
                    self.model.train()

            # 5. Decode *only* the new tokens
            # We slice the output tensor to remove the input prompt tokens
            generated_ids = outputs[0][inputs.input_ids.shape[1]:]
            generated_text = self.tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

            # 6. Print formatted comparison
            print("\n" + "="*30 + f" 🎲 Random Eval Sample (Index: {idx}) " + "="*30)
            # Show only the 'user' part of the prompt for clarity
            user_prompt = parts[0].replace('<|im_start|>user', '').strip()
            print(f"PROMPT (User Turn):\n{user_prompt}")
            print("-" * 80)
            print(f"MODEL (Generated):\n{generated_text}")
            print("-" * 80)
            print(f"TRUTH (Label):\n{ground_truth}")
            print("=" * 80)

        except Exception as e:
            print(f"   (Error during random sample generation: {e})")


# Callback that prints a random evaluation sample after every evaluation.
memory_callback = MemoryCallback(
    model=model, 
    tokenizer=tokenizer, 
    eval_dataset=eval_dataset
)

# Create trainer with SFTTrainer (Supervised Fine-Tuning)
# NOTE(review): tokenizer, dataset_text_field, max_seq_length and packing are
# passed straight to SFTTrainer; confirm that the pinned trl release, as
# patched by Unsloth, still accepts them here rather than through SFTConfig.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",  # Which field contains the formatted text
    max_seq_length=max_seq_length,
    args=training_args,
    packing=False,  # Don't pack multiple examples (our examples are short)
    callbacks=[early_stopping, memory_callback],
)

print("✅ Trainer initialized!")

In [None]:
import math

# Record the GPU baseline before training starts; the memory-statistics cell
# reads `start_gpu_memory` and `max_memory` (both in GB).
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)

trainer_stats = trainer.train()

print("\n" + "="*80)
print("🎉 Training complete!")
print("\n📊 Final Statistics:")
print(f"   Train runtime: {trainer_stats.metrics['train_runtime']:.2f} seconds")
print(f"   Train samples/second: {trainer_stats.metrics['train_samples_per_second']:.2f}")
print(f"   Final train loss: {trainer_stats.metrics['train_loss']:.4f}")

# Get validation metrics
eval_results = trainer.evaluate()
eval_loss = eval_results['eval_loss']

# Use a reported perplexity when there is one; otherwise derive it from the
# loss as exp(loss) instead of printing "N/A".
eval_perplexity = eval_results.get('eval_perplexity')
if eval_perplexity is None:
    try:
        eval_perplexity = math.exp(eval_loss)
    except OverflowError:
        eval_perplexity = float("inf")

print(f"\n📈 Validation Results:")
print(f"   Validation loss: {eval_loss:.4f}")
print(f"   Validation perplexity: {eval_perplexity:.4f}")

In [None]:
# @title Show final memory and time stats
# Peak GPU memory reserved by PyTorch so far, in GB.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)

# Total device memory in GB; computed here when no earlier cell defined it.
if "max_memory" not in globals():
    max_memory = round(
        torch.cuda.get_device_properties(0).total_memory / 1024 / 1024 / 1024, 3
    )
used_percentage = round(used_memory / max_memory * 100, 3)

# The training-only figures need a baseline taken before training started.
# It cannot be reconstructed afterwards, so they are skipped when it is missing.
has_baseline = "start_gpu_memory" in globals()
if has_baseline:
    used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
    lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
if has_baseline:
    print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
if has_baseline:
    print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
else:
    print("Peak reserved memory for training = N/A (no pre-training baseline recorded).")

In [None]:
# Enable inference mode (faster, less memory).
# FastModel is the class imported at the top of the notebook and used to load
# this model; FastLanguageModel is not imported until a later cell.
FastModel.for_inference(model)

def test_command_translation(nl_query):
    """Generate the content type and tags for one passage.

    Builds the same chat prompt layout as the training template (user turn
    followed by an assistant turn with an empty think section), samples up to
    50 new tokens from the module-level `model`, and returns only the text
    generated after the prompt.

    Args:
        nl_query: Passage text to classify.

    Returns:
        The generated answer as a stripped string.
    """
    instruction = f"List the content type of the passage and all relevant tags:\nPassage: {nl_query}"

    # Format as Qwen3 chat turn. The prompt is assembled from explicit pieces
    # so that source-code indentation cannot leak into the text sent to the
    # model.
    prompt = (
        "<|im_start|>user\n"
        f"{instruction}<|im_end|>\n"
        "<|im_start|>assistant\n"
        "<think>\n\n</think>\n\n"
    )

    # Tokenize
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

    # Generate
    outputs = model.generate(
        **inputs,
        max_new_tokens=50,
        temperature=0.1,  # Low temperature keeps sampling close to greedy
        top_p=0.9,
        do_sample=True,
    )

    # Decode only the newly generated tokens: the output sequence starts with
    # the prompt tokens, so slicing them off leaves the model response.
    prompt_length = inputs["input_ids"].shape[1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    return response.strip()

print("✅ Inference mode enabled!")


# Simple non-format test: a plain greeting instead of a tagging request.
prompt = """<|im_start|>user
Hello!<|im_end|>
<|im_start|>assistant
<think>

</think>"""

# Send the inputs to wherever the model lives instead of assuming "cuda".
inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

outputs = model.generate(
    **inputs,
    max_new_tokens=50,
    do_sample=False,
    pad_token_id=tokenizer.eos_token_id,  # same padding choice as the formatted test below
)

print("# Simple non-format test, full output:")
print(tokenizer.decode(outputs[0], skip_special_tokens=False))
print("\n" + "="*80 + "\n")
print("Without special tokens:")
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


print('Proper test with right formatting')

for text in [""""I must tell you my experience. There was a time I crisscrossed India on my motorcycle. Once again, I'm getting back on my motorcycle these days (Laughs). So I'm just riding somewhere between Madhya Pradesh and Uttar Pradesh, I don't know whether it's this state or that state. Whole night I've been riding... early morning around 6:30 I come and I park near a dhaba, tch, to have a tea or something. Motorcycles every 2000-3000 kilometers, those days these modern machines are better. You have to tighten the chain, chain becomes slack.
"I am always carrying an extra chain because I'm on the road. You just have to de-link... one link you have to take it off and put it, it's a small job but it's messy. It's greasy, oily, your hands go bad, everything. So I parked for tea in the morning. Then I saw there was one mechanic shop right there early morning. It was open. Mubarak Mechanical Works. I can't forget, this is a handwritten sign - Mubarak Mechanical Works. So I saw a young strapling (strapping?) youth, one Muslim boy. I called him. ""Hey, can you fix the chain? I just want one link to go."" He said, ""Yes, I can do it."" I said, ""Okay, do it."" So he brought his tools. I looked at him. He's got a hammer and a chisel. I said, ""What, you are going to fix with that?"" He said, ""Yes."" I said, ""Wait."" And I walked into his garage, a small little shack. I look inside, all he is got is a hammer and chisel. With this he repairs everything (Laughs). 

 Time 42:08"
I said, "No, no, you're not going to touch my motorcycle (Laughs). Because I know if you do it with hammer and chisel after that nobody else can do anything with a motorcycle, it's finished." (Laughs). I said, "No, you don't do it, I got all the tools with me" (Laughs). I will do it." Why I'm saying this is like this, you read one book, as holy as the book maybe, you read one book, you are a hammer-and-chisel mechanic. With that you try to fix everything. No, I've read nothing spiritual, because spiritual is the basis of your existence. Spiritual is not in the book. Only this life can be spiritual, a book cannot be spiritual, a teaching cannot be spiritual, something else cannot be spiritual. Only this life can be spiritual.


"""]:
    # The prompt is assembled from explicit pieces: a triple-quoted f-string
    # inside this indented loop body would put four leading spaces in front of
    # every prompt line, which the training template (alpaca_prompt) does not
    # have.
    instruction = (
        "List the content type of the passage and all relevant tags:\n"
        f"Passage: {text}"
    )

    prompt = (
        "<|im_start|>user\n"
        f"{instruction}<|im_end|>\n"
        "<|im_start|>assistant\n"
        "<think>\n\n</think>\n\n"
    )

    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

    outputs = model.generate(
        **inputs,
        max_new_tokens=50,
        do_sample=False,  # Greedy first
        pad_token_id=tokenizer.eos_token_id,
    )

    full_output = tokenizer.decode(outputs[0], skip_special_tokens=False)
    print("Full output:")
    # Print what follows the first 'Type: ' marker, or everything when the
    # marker is absent.
    print(full_output.split('Type: ', 1)[-1])

In [None]:
# Save the LoRA adapters and a merged model twice: once for the 16-bit and once
# for the 4-bit variant. The "_lora" directories are the ones the later
# comparison cells load.
# NOTE(review): the directory names say "r16" although the adapter above is
# created with r = 64; the names are kept because later cells use these paths.
for precision_label, precision_tag, merge_method in (
    ("16-bit", "16bit", "merged_16bit"),
    # Unsloth refuses a plain "merged_4bit" and asks for the explicit
    # "merged_4bit_forced" opt-in, because merging to 4-bit loses accuracy.
    ("4-bit", "4bit", "merged_4bit_forced"),
):
    save_path = f"trained_models/markup_tags_{precision_tag}_r16_qwen3-1.7_lora"
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)

    print(f"✅ LoRA adapters saved to: {save_path}")

    merged_model_path = save_path.replace('_lora', '_merged')
    model.save_pretrained_merged(
        merged_model_path,
        tokenizer,
        save_method=merge_method,
    )

    print(f"✅ {precision_label} Merged model saved to: {merged_model_path}")



In [None]:
## WITH UNSLOTH - NEEDS GPU
# Author's note: 4-bit version better than 16-bit
# NOTE(review): no measurement backing this note is visible in the notebook.
import torch
from unsloth import FastLanguageModel
import time

# Adapter directory written by the save cell above.
lora4b = r"trained_models/markup_tags_4bit_r16_qwen3-1.7_lora"

# Reload the saved adapter from disk with 4-bit loading enabled.
model4b, tokenizer4b = FastLanguageModel.from_pretrained(
    model_name=lora4b,  # Your local directory
    max_seq_length=2048,
    load_in_4bit=True,  
    local_files_only=True,
    
)

FastLanguageModel.for_inference(model4b)

# Sample passages used by the generation / timing loops below

texts = [""""I must tell you my experience. There was a time I crisscrossed India on my motorcycle. Once again, I'm getting back on my motorcycle these days (Laughs). So I'm just riding somewhere between Madhya Pradesh and Uttar Pradesh, I don't know whether it's this state or that state. Whole night I've been riding... early morning around 6:30 I come and I park near a dhaba, tch, to have a tea or something. Motorcycles every 2000-3000 kilometers, those days these modern machines are better. You have to tighten the chain, chain becomes slack.
"I am always carrying an extra chain because I'm on the road. You just have to de-link... one link you have to take it off and put it, it's a small job but it's messy. It's greasy, oily, your hands go bad, everything. So I parked for tea in the morning. Then I saw there was one mechanic shop right there early morning. It was open. Mubarak Mechanical Works. I can't forget, this is a handwritten sign - Mubarak Mechanical Works. So I saw a young strapling (strapping?) youth, one Muslim boy. I called him. ""Hey, can you fix the chain? I just want one link to go."" He said, ""Yes, I can do it."" I said, ""Okay, do it."" So he brought his tools. I looked at him. He's got a hammer and a chisel. I said, ""What, you are going to fix with that?"" He said, ""Yes."" I said, ""Wait."" And I walked into his garage, a small little shack. I look inside, all he is got is a hammer and chisel. With this he repairs everything (Laughs). 

 Time 42:08"
I said, "No, no, you're not going to touch my motorcycle (Laughs). Because I know if you do it with hammer and chisel after that nobody else can do anything with a motorcycle, it's finished." (Laughs). I said, "No, you don't do it, I got all the tools with me" (Laughs). I will do it." Why I'm saying this is like this, you read one book, as holy as the book maybe, you read one book, you are a hammer-and-chisel mechanic. With that you try to fix everything. No, I've read nothing spiritual, because spiritual is the basis of your existence. Spiritual is not in the book. Only this life can be spiritual, a book cannot be spiritual, a teaching cannot be spiritual, something else cannot be spiritual. Only this life can be spiritual.


""", """They were amazed how developed and how economically strong and so much wealth but unprotected. Their idea of a heap of gold means there would be ten men always standing there with arms. But here people put a heap of gold on the street and sold. Because society was so evolved, people understood what are the laws. If I steal yours, tomorrow you'll steal mine and the whole society will collapse after some time. So people were so wise and evolved when they came, this looked like a free-for-all bonanza. They could take what they want. Wealth was not protected, women were not protected. They were everywhere. They picked up what they want and exploited it in a most horrible way... most horrific way. Today we're trying to glorify those things also unfortunately but in a most horrific way they did this.
"When the Islamic invasions happened, this happened. They did not come as invaders. They did not come as religious crusaders. They just came as bandits. But when they saw it was so easy, because there were men involved in profound music, mathematics, astrolo... astronomy, astrology, all kinds of things but there were not too many fighting men. So it was a walk over. The land, the beautiful land that this was, the rich banks of the rivers, agriculturally rich, wealth, gold, diamonds... The first nation which mined diamonds in the world is India, all right. The greatest diamond in the world is still from India, it is sitting on the British crown (Laughs). But they saw this is like a paradise that you can take without resistance. And they took it because they were basically nomadic people, barbarian, they did their own terrible things. Even when the British came, we must understand it is not the United Kingdom's army which came, its East India Company which came. It's a business which came but they saw it's so easy to take this country. They became emperors (Laughs) over a period of time. 

 Time 26:26"
""", """"See twenty-eight years is the yoga center. This activity is on for thirty-seven years now. But largely till '98, '99 or 2000, till 2000, we were only focused on one goal, Dhyanalinga. So really our worldwide activity is only from 2001 till now. So it's actually eighteen years of activity. Well, today people estimate that we have touched over five-hundred million people, that's half a billion people. But thirty-seven years ago, when this phenomenal experience happened to me on the Chamundi hills, I sat there and simply if I sit there I am... I am like dripping ecstasy in every cell in my body. I again and again tried the next few days and weeks that I do activity something and if I simply sit, it's... I am just bursting with ecstasy. 

 Time 10:08 

 That time I thought (Laughs) that this is so simple. If you don't do anything... If you don't mess with yourself, you will be ecstatic. Ecstatic means bursting with ecstasy. The peak of your life, if only you don't mess with your psychological stuff. Then I thought this is so simple. I will make the whole world ecstatic (Laughs). Well, at that moment, I thought this is the first time I have discovered this because I had no traditional background, I had no spiritual background, I just grew up in a very westernized way. And this was bursting within me, I thought... That is the time where, you know, youth of that... that generation were so much into drugs, seeking experiences. But here I was, if I just close my eyes, I'm in the highest possible experience that a human being can be. So I thought no drug, no God, no religion, no nothing, no philosophy, if you don't mess with yourself, you're ecstatic. 

 I thought this is very simple. On that day, world's population was 5.6 billion people. I thought... I made a plan. In two-and-a-half years' time, I'll make the whole world ecstatic (Laughs)."
"Interviewer: Thirty-eight years... 

 Sadhguru: (Laughs). People think we are a great success because we've touched half a billion people. I don't think so - because for me, humanity means 7.6 billion people. Well, I might die a failure (Laughs) but a blissful failure. This is something all the young people must understand. You must... You must choose to fail because the goals that you choose are so huge; in one lifetime, you cannot do it. You setup petty goals, you finish that and you think you're a great success, what is the point of such a life? You aspire for something which cannot be fulfilled by one generation of people. But you have the privilege of setting the direction for that. You will die of failure but it's a very blissful failure (Laughs). 

 So as far as I'm concerned, I am a failure. Isha Foundation is a failure. It doesn't matter, all of us work nearly twenty hours a day, seven days of the week but all the time we know we are short of what we can do or what we should be doing. This is what drives everybody here. All the young people who are here, they have known usual pleasures of life, what others are seeking in cities and all. The only pleasure they have is they're working for a large vision. And they see the transformation in people's faces, how they come and how they leave, you must see. You should have recorded - people how they come on day one to a program of three days, when they leave, they're bursting with ecstasy. So like this millions of people seeing their faces and seeing the transformation that is being brought about, that is the only thing which keeps them going and their own transformation also of course. So these thirty-seven years, though in my view, it's a failure, the world thinks we are a great success (Laughs) because they think small. 

 Time 14:11"
"""]

# Time how long the 4-bit model needs to tag all sample passages.
timh = time.time()
for text in texts:
    # The prompt is assembled from explicit pieces: a triple-quoted f-string
    # inside this indented loop body would put four leading spaces in front of
    # every prompt line, which the training template (alpaca_prompt) does not
    # have.
    instruction = (
        "List the content type of the passage and all relevant tags:\n"
        f"Passage: {text}"
    )

    prompt = (
        "<|im_start|>user\n"
        f"{instruction}<|im_end|>\n"
        "<|im_start|>assistant\n"
        "<think>\n\n</think>\n\n"
    )

    inputs = tokenizer4b(prompt, return_tensors="pt").to("cuda")
    outputs = model4b.generate(**inputs, max_new_tokens=50,
        temperature=0.01,  # Very low temperature keeps sampling close to greedy
        top_p=0.95,
        do_sample=True,)
    print(tokenizer4b.decode(outputs[0], skip_special_tokens=True))

# Elapsed wall-clock seconds for the whole loop.
print(time.time()-timh)

In [None]:
# Adapter directory written by the save cell above.
lora16b = r"trained_models/markup_tags_16bit_r16_qwen3-1.7_lora"

# Reload the saved adapter from disk without 4-bit loading.
model16b, tokenizer16b = FastLanguageModel.from_pretrained(
    model_name=lora16b,  # Your local directory
    max_seq_length=2048,
    load_in_4bit=False,
    local_files_only=True,
)

# Switch the model that was just loaded to inference mode (this call
# previously targeted model4b, leaving model16b untouched).
FastLanguageModel.for_inference(model16b)

# Time how long the 16-bit model needs to tag all sample passages.
timh = time.time()
for text in texts:
    # The prompt is assembled from explicit pieces: a triple-quoted f-string
    # inside this indented loop body would put four leading spaces in front of
    # every prompt line, which the training template (alpaca_prompt) does not
    # have.
    instruction = (
        "List the content type of the passage and all relevant tags:\n"
        f"Passage: {text}"
    )

    prompt = (
        "<|im_start|>user\n"
        f"{instruction}<|im_end|>\n"
        "<|im_start|>assistant\n"
        "<think>\n\n</think>\n\n"
    )

    inputs = tokenizer16b(prompt, return_tensors="pt").to("cuda")
    outputs = model16b.generate(**inputs, max_new_tokens=50,
        temperature=0.01,  # Very low temperature keeps sampling close to greedy
        top_p=0.95,
        do_sample=True,)
    print(tokenizer16b.decode(outputs[0], skip_special_tokens=True))

# Elapsed wall-clock seconds for the whole loop.
print(time.time()-timh)

In [None]:
# NOTE(review): this cell repeats the preceding 16-bit benchmark cell; it is
# kept as a second timing run. Drop it if one measurement is enough.
lora16b = r"trained_models/markup_tags_16bit_r16_qwen3-1.7_lora"

# Reload the saved adapter from disk without 4-bit loading.
model16b, tokenizer16b = FastLanguageModel.from_pretrained(
    model_name=lora16b,  # Your local directory
    max_seq_length=2048,
    load_in_4bit=False,
    local_files_only=True,
)

# Switch the model that was just loaded to inference mode (this call
# previously targeted model4b, leaving model16b untouched).
FastLanguageModel.for_inference(model16b)

# Time how long the 16-bit model needs to tag all sample passages.
timh = time.time()
for text in texts:
    # The prompt is assembled from explicit pieces: a triple-quoted f-string
    # inside this indented loop body would put four leading spaces in front of
    # every prompt line, which the training template (alpaca_prompt) does not
    # have.
    instruction = (
        "List the content type of the passage and all relevant tags:\n"
        f"Passage: {text}"
    )

    prompt = (
        "<|im_start|>user\n"
        f"{instruction}<|im_end|>\n"
        "<|im_start|>assistant\n"
        "<think>\n\n</think>\n\n"
    )

    inputs = tokenizer16b(prompt, return_tensors="pt").to("cuda")
    outputs = model16b.generate(**inputs, max_new_tokens=50,
        temperature=0.01,  # Very low temperature keeps sampling close to greedy
        top_p=0.95,
        do_sample=True,)
    print(tokenizer16b.decode(outputs[0], skip_special_tokens=True))

# Elapsed wall-clock seconds for the whole loop.
print(time.time()-timh)