In [None]:
%%capture
%pip install -U accelerate 
%pip install -U peft 
%pip install -U trl 
%pip install -U bitsandbytes
%pip install -U transformers

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# bitsandbytes settings: weights are loaded in 4-bit using the "nf4" quant type,
# double quantization is switched off, and compute runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Hub identifier of the base checkpoint; the tokenizer and the model are both
# loaded from it.
model_dir = "Qwen/Qwen3-32B"

tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=True)

model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=bnb_config,
    trust_remote_code=True,
)

# Config tweaks applied right after loading; generation cells later request
# `use_cache=True` explicitly when they need it.
model.config.pretraining_tp = 1
model.config.use_cache = False

In [None]:
# Training prompt template. It has three positional `{}` slots, filled in this
# order by `formatting_prompts_func`: the paper title (under "### Question:"),
# the chain-of-thought text (between <think> and </think>), and the response.
train_prompt_style = """Below is a research question or topic from machine learning literature.
Think carefully before answering, referencing relevant theoretical frameworks and methods if possible.

### Instruction:
You are an AI researcher trained on the arXiv corpus, capable of answering or summarizing technical topics in ML and AI.
Please respond to the following research topic.

### Question:
{}

### Response:
<think>
{}
</think>
{}"""
# End-of-sequence marker taken from the tokenizer; appended to every training
# response that does not already end with it.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    """Render a batch of title/abstract pairs into training prompts.

    Args:
        examples: Batch mapping that provides parallel "title" and "abstract"
            sequences (the function is used with `dataset.map(batched=True)`).

    Returns:
        A dict with a single "text" key holding one formatted prompt per
        title/abstract pair. Each response is terminated with EOS_TOKEN, which
        is appended only when the abstract does not already end with it.
    """

    def _render(title, abstract):
        # The chain-of-thought slot is left empty for now; CoTs can be
        # generated and filled in later.
        reasoning = ""
        answer = abstract if abstract.endswith(EOS_TOKEN) else abstract + EOS_TOKEN
        return train_prompt_style.format(title, reasoning, answer)

    title_column = examples["title"]
    abstract_column = examples["abstract"]
    return {
        "text": [
            _render(title, abstract)
            for title, abstract in zip(title_column, abstract_column)
        ]
    }

In [None]:
from datasets import load_dataset

# Load the first 2,000 training rows of the ML arXiv papers dataset.
# `trust_remote_code` is deliberately not passed: current `datasets` releases
# no longer support the flag, and enabling it would allow arbitrary
# repository code to run on this machine.
# NOTE(review): assumes this dataset repo ships plain data files with no
# loading script, so nothing needs to execute - confirm on the Hub.
dataset = load_dataset(
    "CShorten/ML-ArXiv-Papers",
    split="train[0:2000]",
)

# Add a "text" column containing the fully formatted training prompt.
dataset = dataset.map(
    formatting_prompts_func,
    batched=True,
)

# Preview one formatted example. Indexing the row first avoids materialising
# the entire "text" column just to look at a single entry.
dataset[10]["text"]

In [None]:
from transformers import DataCollatorForLanguageModeling

# Batch collator for the trainer; `mlm=False` turns off masked-language-model
# masking.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

In [None]:
# Inference prompt template. It has a single `{}` slot for the question and
# ends with an opened "<think>" tag followed by a newline, with no closing tag
# and no response text, so whatever the model generates is appended after it.
inference_prompt_style = """Below is a research-level instruction paired with an input topic from the machine learning literature. 
Write a response that appropriately completes the request. 
Before answering, think carefully about the question and develop a step-by-step chain of thought grounded in theoretical and empirical understanding.

### Instruction:
You are an AI research expert with deep knowledge of machine learning, optimization, and theoretical frameworks. 
Please answer the following research question.

### Question:
{}

### Response:
<think>
"""

In [None]:
# Baseline generation (before fine-tuning) for one sample title.
question = dataset[10]['title']

# The prompt is left open-ended: it stops right after "<think>\n" so the model
# continues from there. Appending the EOS token here (as the training targets
# do at the END of the response) would tell the model the sequence is already
# finished before it has produced anything.
inputs = tokenizer(
    [inference_prompt_style.format(question)],
    return_tensors="pt"
).to("cuda")

outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    eos_token_id=tokenizer.eos_token_id,  # stop once the model emits EOS
    use_cache=True,                       # KV cache on for generation only
)

# The decoded text contains the prompt too; print only what follows the
# "### Response:" header when it is present.
response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
if "### Response:" in response:
    print(response.split("### Response:")[1].strip())
else:
    print(response.strip())

In [None]:
from peft import LoraConfig, get_peft_model

# Projection layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

# LoRA hyper-parameters for causal language modeling.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",                # causal LM fine-tuning
    r=64,                                 # rank of the low-rank update matrices
    lora_alpha=16,                        # LoRA scaling factor
    lora_dropout=0.05,                    # light dropout on the adapter path
    bias="none",                          # bias terms are left untouched
    target_modules=lora_target_modules,
)

# Wrap the quantized base model with the LoRA adapters.
model = get_peft_model(model, peft_config)

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Hyper-parameters for a single-epoch LoRA fine-tuning run.
training_arguments = TrainingArguments(
    output_dir="output",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,   # gradients are accumulated over 2 batches per step
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    logging_strategy="steps",
    logging_steps=0.2,               # a float below 1 is read as a fraction of total training steps
    warmup_steps=10,
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="none",                # no external experiment tracker
    label_names=["labels"]
)

# `model` has already been wrapped by `get_peft_model(...)` in the LoRA cell,
# so `peft_config` is NOT passed to the trainer again. Recent TRL releases
# raise a ValueError when given a PeftModel together with a `peft_config`,
# and older ones ignore the config for an already-wrapped model, so dropping
# the argument is correct in both cases.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=dataset,
    data_collator=data_collator,
)

In [None]:
import gc, torch
# Re-assert that caching is disabled on the model config before training
# (the earlier generation cell requested `use_cache=True` for its own call).
model.config.use_cache = False

# Run Python garbage collection and release PyTorch's cached CUDA memory
# before the training run starts.
gc.collect()
torch.cuda.empty_cache()

# Launch fine-tuning; the returned value is displayed as the cell output.
trainer.train()

In [None]:
# Same sample title as the baseline cell, now answered by the fine-tuned model.
question = dataset[10]['title']

# The prompt is left open-ended (ending at "<think>\n"); no EOS token is
# appended, because an EOS at the end of the prompt marks the sequence as
# finished before generation starts and does not match the training layout,
# where EOS only follows the response.
inputs = tokenizer(
    [inference_prompt_style.format(question)],
    return_tensors="pt"
).to("cuda")

outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    eos_token_id=tokenizer.eos_token_id,  # stop once the model emits EOS
    use_cache=True,                       # KV cache on for generation only
)

response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

# Print only the text after the "### Response:" header when it is present.
if "### Response:" in response:
    print(response.split("### Response:")[1].strip())
else:
    print(response.strip())

In [None]:
# A second sample title for a post-training sanity check.
question = dataset[100]['title']

# The prompt is left open-ended (ending at "<think>\n"); no EOS token is
# appended, because an EOS at the end of the prompt marks the sequence as
# finished before generation starts.
inputs = tokenizer(
    [inference_prompt_style.format(question)],
    return_tensors="pt"
).to("cuda")

outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    eos_token_id=tokenizer.eos_token_id,  # stop once the model emits EOS
    use_cache=True,                       # KV cache on for generation only
)
response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

# Guard the split the same way the other inference cells do, so a decoded
# string without the "### Response:" header prints in full instead of raising
# IndexError.
if "### Response:" in response:
    print(response.split("### Response:")[1].strip())
else:
    print(response.strip())

In [None]:
# Repository name used for both uploads below.
new_model_name = "Qwen-3-32B-ML-Expert-Reasoning"

# Upload the model and then the tokenizer under the same repository name.
# NOTE(review): `model` is the PEFT-wrapped model at this point, so this
# presumably uploads the LoRA adapter rather than merged weights - confirm.
model.push_to_hub(new_model_name)
tokenizer.push_to_hub(new_model_name)