## 1- Install the Unsloth library

In [None]:
%%capture
# Install Unsloth and its dependencies; the %%capture magic above captures
# this cell's output instead of displaying it.
import os
# Colab is detected by looking for "COLAB_" in the concatenated names of the
# environment variables.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # On Colab: install the pinned packages with --no-deps first, then the
    # remaining packages normally, and finally unsloth itself with --no-deps.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [None]:
import pandas as pd
from datasets import Dataset

In [None]:
# !pip install -q transformers datasets

In [None]:
from google.colab import drive
# Mount Google Drive under /content/drive; the dataset and adapter paths used
# later in this notebook live below that mount point.
drive.mount("/content/drive", force_remount=True)


# 2- Choosing models

In [None]:
from unsloth import FastLanguageModel

In [None]:
from unsloth import FastLanguageModel
import torch
# Loading settings; these names are reused by later cells.
max_seq_length = 768
dtype = None  # forwarded unchanged to from_pretrained
load_in_4bit = True  # 4-bit quantization to reduce memory usage; can be False

# Reference list of pre-quantized 4-bit checkpoints (not read anywhere below).
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",  # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",  # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
]

# Load the base checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

In [None]:
# Wrap the loaded model with LoRA adapters on the listed projection modules.
model = FastLanguageModel.get_peft_model(
    model,
    r=8,  # candidate values noted by the author: 8, 16, 32, 64, 128
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=32,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=422,
    use_rslora=False,
    loftq_config=None,
)

# 3 - Loading Dataset

In [None]:
import pandas as pd
from datasets import Dataset

# Dataset locations on the mounted Drive.
train_path = "/content/drive/MyDrive/datasets/train_100k.csv"
test_path = "/content/drive/MyDrive/datasets/test_1k.csv"

# Read each CSV and keep only the two columns used to build prompts.
train_df = pd.read_csv(train_path)[["abstract", "title"]]
test_df = pd.read_csv(test_path)[["abstract", "title"]]

# Fixed-seed subsamples: 6000 training rows and 100 test rows.
train_sample = train_df.sample(n=6000, random_state=42)
test_sample = test_df.sample(n=100, random_state=42)


In [None]:
# Convert the sampled training rows into a `datasets.Dataset`.
train_dataset = Dataset.from_pandas(train_sample, preserve_index=False)

In [None]:
# Prompt template with two positional `{}` slots: the first receives the
# abstract, the second the title (left empty at inference time).
alpaca_prompt = """An instruction that describes a task, paired with an input that provides context. Write a response that completes the request.

### Instruction:
Given the abstract of a research paper, generate a concise and informative title that reflects its main idea.

### Input (abstract):
{}

### Response (title):
{}"""

# End-of-sequence token text from the tokenizer; the formatting function in
# this cell appends it to every training example.
EOS_TOKEN = tokenizer.eos_token 

# Formatting function applied to the dataset in batches.
def formatting_prompts_func(examples):
    """Fill the prompt template for every (abstract, title) pair in a batch.

    `examples` is indexed by the column names "abstract" and "title".
    Returns a dict with a single "text" list; each entry is the filled
    template followed by EOS_TOKEN.
    """
    return {
        "text": [
            alpaca_prompt.format(abstract, title) + EOS_TOKEN
            for abstract, title in zip(examples["abstract"], examples["title"])
        ]
    }

# Replace the raw "abstract"/"title" columns with the formatted "text" column.
train_dataset = train_dataset.map(
    formatting_prompts_func,
    remove_columns=["abstract", "title"],
    batched=True,
)


In [None]:
# Release the training DataFrame and its sample; test_df and test_sample are
# kept because later cells still read them.
del train_df, train_sample

In [None]:
# Tokenization helper used with Dataset.map.
def tokenize_and_truncate(examples):
    """Tokenize a batch's "text" entries to exactly `max_seq_length` tokens.

    Truncation is enabled and padding is set to "max_length", so longer
    entries are cut and shorter ones are padded to the same length.
    Returns the tokenizer's output for the batch (per the original note:
    input_ids and attention_mask).
    """
    encoded = tokenizer(
        examples["text"],
        max_length=max_seq_length,
        padding="max_length",
        truncation=True,
    )
    return encoded

# Tokenize the whole training set in batches and drop the raw "text" column.
train_dataset = train_dataset.map(
    tokenize_and_truncate,
    remove_columns=["text"],
    batched=True,
)

# 4- Training

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Align the tokenizer with the sequence length used for this run and pad on
# the right side.
tokenizer.model_max_length = max_seq_length
tokenizer.padding_side = "right"

# Build the supervised fine-tuning trainer.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    # NOTE(review): the "text" column was removed when the dataset was
    # tokenized earlier in this notebook; confirm the trainer accepts the
    # pre-tokenized dataset with this field still set.
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # -1 means "no step cap", so the run length is set by
        # num_train_epochs. Use a positive value (e.g. 500) for a short run.
        # The previous `max_steps = None` had no trailing comma (syntax
        # error) and None is not a valid step count.
        max_steps = -1,
        num_train_epochs = 1,  # one full pass over the training set
        learning_rate = 2e-4,
        # Use bf16 when the hardware supports it, otherwise fall back to fp16.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 10,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none",
    ),
)

In [None]:
# Run fine-tuning; the value returned by train() is kept in trainer_stats.
trainer_stats = trainer.train()

# 5- Inference

In [None]:
import torch

# 1. Pick a single abstract from the test DataFrame
sample_idx = 0
abstract = test_df.loc[sample_idx, "abstract"]

# 2. Build the prompt (title slot left empty so the model completes it)
prompt = alpaca_prompt.format(abstract, "")

# 3. Enable fast inference
FastLanguageModel.for_inference(model)

# 4. Tokenize & move to GPU
inputs = tokenizer(
    [prompt],
    return_tensors="pt",
    truncation=True,
    max_length=max_seq_length,
).to("cuda")

# 5. Generate up to 64 new tokens for the title
# NOTE(review): temperature/top_p/top_k only take effect when sampling is
# enabled (do_sample=True here or in the model's generation config) — confirm.
outputs = model.generate(
    **inputs,
    max_new_tokens=64,
    temperature=0.4,
    top_p=0.9,
    top_k=50,
    eos_token_id=tokenizer.eos_token_id,
    use_cache=True,
)

# 6. Decode and keep only the text after the response marker; if the marker
#    is absent, split() returns the whole decoded text.
decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
title = decoded.split("### Response (title):", 1)[-1].strip()

print("Abstract:\n", abstract)
print("\nGenerated Title:\n", title)
# Was "\Original Title:" — an invalid escape that printed a literal backslash.
print("\nOriginal Title:\n", test_df.loc[sample_idx, "title"])


# 6- Saving Models

In [None]:
# Write the tokenizer and the fine-tuned model into the same local folder.
tokenizer.save_pretrained("lora_llama31_8b_title_abstract")
model.save_pretrained("lora_llama31_8b_title_abstract")

# Optional upload to the Hugging Face Hub (disabled; "x" is a placeholder token):
# model.push_to_hub("robuno/lora_llama31_8b_title_abstract", token = "x")
# tokenizer.push_to_hub("robuno/lora_llama31_8b_title_abstract", token = "x")

Zip the saved model folder

In [None]:
import os

# Archive the saved adapter folder into a ZIP file in the working directory.

# Name of the folder to be zipped
folder_name = "lora_llama31_8b_title_abstract"

# Name of the output ZIP file
zip_file_name = "lora_llama31_8b_title_abstract.zip"

# Check if the folder exists
if os.path.exists(folder_name):
    # Run the shell command to zip the folder
    # -r: Zips the folder and everything inside it (recursive)
    !zip -r {zip_file_name} {folder_name}
    # Printed unconditionally after the shell command; the command's exit
    # status is not checked here.
    print(f"Folder '{folder_name}' successfully zipped as '{zip_file_name}'.")
else:
    print(f"Error: Folder '{folder_name}' not found.")

# 7- Using lora adapters

In [None]:
from transformers import TextStreamer

In [None]:
# Disabled branch: switch the condition to True to reload the model and
# tokenizer from the saved folder instead of reusing the ones in memory.
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=folder_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
    FastLanguageModel.for_inference(model)

# Pick one test abstract by row label.
sample_idx = 5
abstract = test_df.loc[sample_idx, "abstract"]

# Fill the template with the abstract (title slot left empty) and tokenize it.
inputs = tokenizer(
    [alpaca_prompt.format(abstract, "")],
    return_tensors="pt",
).to("cuda")

# Generate up to 64 new tokens, passing a TextStreamer as the streamer.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=64)

In [None]:
import os

# Target folder for this export; created if it does not exist yet.
output_dir = "outputs2/lora_llama31_8b_title_abstract"
os.makedirs(output_dir, exist_ok=True)

# Tokenizer and PEFT model go into the top-level folder.
tokenizer.save_pretrained(output_dir)
model.save_pretrained(output_dir)

# A second copy of the PEFT model is written into an "adapter" subfolder.
model.save_pretrained(os.path.join(output_dir, "adapter"))

# Save the underlying model into "base_model"; fall back to `model` itself
# when it does not expose get_base_model().
if hasattr(model, "get_base_model"):
    base_model = model.get_base_model()
else:
    base_model = model
base_model.save_pretrained(os.path.join(output_dir, "base_model"))

print(f">> Saved tokenizer + PEFT model to {output_dir}")

# 8 - Generate test titles

In [None]:
# !rm -rf lora_llama31_8b_title_abstract

In [None]:
# Copy the saved adapter folder from Google Drive into /content/.
!cp -r "/content/drive/MyDrive/datasets/lora_llama31_8b_title_abstract" "/content/"


In [None]:
# Upgrade bitsandbytes to the latest available release (-U).
!pip install -U bitsandbytes

In [None]:
import os

# Verify that the adapter folder is present in the working directory.
output_dir = "./lora_llama31_8b_title_abstract"

# The printed messages are Turkish: "folder not found" / "folder exists".
if not os.path.exists(output_dir):
    print(f"'{output_dir}' klasörü bulunamadı.")
else:
    print(f"'{output_dir}' klasörü mevcut.")

In [None]:
from transformers import AutoTokenizer
from peft import PeftModel
from unsloth import FastLanguageModel  

import torch
# Loading settings (same values as used for training in this notebook).
max_seq_length = 768
dtype = None
load_in_4bit = True

# Folder that holds the saved tokenizer and LoRA adapter.
output_dir = "./lora_llama31_8b_title_abstract"

# 1. Tokenizer saved next to the adapter
tokenizer = AutoTokenizer.from_pretrained(output_dir)

# 2. Base model; the tokenizer returned by this call is bound but not used
#    by the rest of this cell.
base_model, base_teokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# 3. Attach the saved PEFT adapter to the base model
# NOTE(review): PEFT is understood to inject the adapter layers into
# base_model's modules in place — confirm before treating base_model as an
# untouched baseline afterwards.
model = PeftModel.from_pretrained(base_model, output_dir)

# 4. Enable fast inference
FastLanguageModel.for_inference(model)


In [None]:
# outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)

In [None]:
# Prompt template for evaluation; the text is the same as the template used
# to format the training examples earlier in this notebook. The first `{}`
# receives the abstract and the second the title (empty at inference time).
alpaca_prompt = """An instruction that describes a task, paired with an input that provides context. Write a response that completes the request.

### Instruction:
Given the abstract of a research paper, generate a concise and informative title that reflects its main idea.

### Input (abstract):
{}

### Response (title):
{}"""

In [None]:
import torch
from tqdm.auto import tqdm

# Generate a title for every sampled test abstract twice — once with the LoRA
# adapter active (fine-tuned) and once with it disabled (base) — and write
# both columns to a CSV file.

# Marker that separates the prompt from the generated title in decoded text.
RESPONSE_MARKER = "### Response (title):"

# Generation settings shared by both runs so the comparison is like-for-like.
generation_kwargs = {
    "max_new_tokens": 64,
    "temperature": 0.9,
    "top_p": 0.9,
    "top_k": 50,
    "eos_token_id": tokenizer.eos_token_id,
    "use_cache": True,
}


def _extract_title(decoded_text):
    """Return the text after the response marker, stripped.

    When the marker is absent, the whole decoded text is returned stripped.
    """
    return decoded_text.split(RESPONSE_MARKER, 1)[-1].strip()


def _generate_title(model_inputs):
    """Generate with `model` for one tokenized prompt and return the title."""
    output_ids = model.generate(**model_inputs, **generation_kwargs)
    decoded_text = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
    return _extract_title(decoded_text)


# Enable fast inference mode
FastLanguageModel.for_inference(model)

generated_titles_finetuned = []
generated_titles_base = []

for abstract in tqdm(test_sample["abstract"].tolist(), desc="Generating titles"):

    prompt = alpaca_prompt.format(abstract, "")  # leave title empty

    inputs = tokenizer(
        [prompt],
        return_tensors="pt",
        truncation=True,
        max_length=max_seq_length,
    ).to("cuda")

    # Fine-tuned output: adapter active.
    generated_titles_finetuned.append(_generate_title(inputs))

    # Base output: the adapter layers live inside the wrapped base model, so
    # calling base_model.generate() would still apply them. Disabling the
    # adapter for this call gives the real baseline.
    with model.disable_adapter():
        generated_titles_base.append(_generate_title(inputs))

# Add to DataFrame
test_sample["generated_title_finetuned"] = generated_titles_finetuned
test_sample["generated_title_base"] = generated_titles_base

# Save to CSV
test_sample.to_csv("test_with_both_titles2.csv", index=False)


In [None]:
import pandas as pd

# Load a results CSV from the working directory and preview its first 5 rows.
# NOTE(review): the generation cell in this notebook writes
# "test_with_both_titles2.csv"; confirm this file name is the intended one.
file_path = "test_with_titles_llama31_8b.csv"
df = pd.read_csv(file_path)
print(df.head())