In [1]:
%%capture
# Upgrade the fine-tuning stack inside the kernel's environment; the %%capture
# magic above silences pip's output.
# NOTE(review): every package is installed with -U and no version pin, so the
# resolved versions can differ from one run to the next — consider pinning.
%pip install -U accelerate 
%pip install -U peft 
%pip install -U trl 
%pip install -U bitsandbytes
%pip install -U transformers

In [2]:
from huggingface_hub import login
import os

# Pull the Hugging Face access token from the environment (None when the
# variable is unset) and authenticate the Hub client with it.
hf_token = os.getenv("HF_TOKEN")
login(token=hf_token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# Quantization settings: weights are loaded in 4-bit NF4, matrix multiplies run
# in bfloat16, and double quantization is left disabled.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model_dir = "Qwen/Qwen3-32B"
tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=True)

# trust_remote_code is deliberately NOT passed: the Qwen3 architecture ships
# with current transformers releases (the install cell upgrades transformers),
# so there is no reason to allow execution of Python code from the Hub repo.
model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    quantization_config=bnb_config,
    device_map="auto",  # let accelerate place the layers on the available devices
    torch_dtype=torch.bfloat16,
)

model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/728 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 17 files:   0%|          | 0/17 [00:00<?, ?it/s]

model-00001-of-00017.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00002-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00006-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00007-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00004-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00008-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00005-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00003-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00009-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00010-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00011-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00012-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00013-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00014-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00015-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00016-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00017-of-00017.safetensors:   0%|          | 0.00/3.06G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/17 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [4]:
# Prompt template used to build the training examples. The three positional
# {} slots are filled, in order, with the question, the chain-of-thought text,
# and the response (see the .format() call in formatting_prompts_func).
train_prompt_style = """Below is a research question or topic from machine learning literature.
Think carefully before answering, referencing relevant theoretical frameworks and methods if possible.

### Instruction:
You are an AI researcher trained on the arXiv corpus, capable of answering or summarizing technical topics in ML and AI.
Please respond to the following research topic.

### Question:
{}

### Response:
<think>
{}
</think>
{}"""
# End-of-sequence token string of the loaded tokenizer; appended to each
# response when the training text is formatted.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    """Render a batch of dataset rows into training prompts.

    Args:
        examples: Batched mapping with parallel "title" and "abstract" lists.

    Returns:
        A dict with a single "text" key holding one formatted prompt per row.
        The title fills the question slot, the chain-of-thought slot is left
        empty, and the abstract (terminated by EOS_TOKEN) is the response.
    """

    def _render(title, abstract):
        # Make sure the response ends with exactly one trailing EOS marker.
        answer = abstract if abstract.endswith(EOS_TOKEN) else abstract + EOS_TOKEN
        # The chain-of-thought slot stays empty; CoTs can be generated later.
        return train_prompt_style.format(title, "", answer)

    return {
        "text": [
            _render(title, abstract)
            for title, abstract in zip(examples["title"], examples["abstract"])
        ]
    }

In [5]:
from datasets import load_dataset

# Load the first 2,000 rows of the training split. The repository serves a
# plain CSV file, so trust_remote_code is not needed and is no longer passed:
# nothing from the dataset repo has to be executed to read it.
dataset = load_dataset(
    "CShorten/ML-ArXiv-Papers",
    split="train[0:2000]",
)

# Add the "text" column holding the fully formatted training prompt per row.
dataset = dataset.map(
    formatting_prompts_func,
    batched=True,
)

# Preview one formatted example. Indexing the row first reads only that row
# instead of materializing the entire "text" column to pick one element.
dataset[10]["text"]

README.md:   0%|          | 0.00/986 [00:00<?, ?B/s]

ML-Arxiv-Papers.csv:   0%|          | 0.00/147M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/117592 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

'Below is a research question or topic from machine learning literature.\nThink carefully before answering, referencing relevant theoretical frameworks and methods if possible.\n\n### Instruction:\nYou are an AI researcher trained on the arXiv corpus, capable of answering or summarizing technical topics in ML and AI.\nPlease respond to the following research topic.\n\n### Question:\nHMM Speaker Identification Using Linear and Non-linear Merging\n  Techniques\n\n### Response:\n<think>\n\n</think>\n  Speaker identification is a powerful, non-invasive and in-expensive biometric\ntechnique. The recognition accuracy, however, deteriorates when noise levels\naffect a specific band of frequency. In this paper, we present a sub-band based\nspeaker identification that intends to improve the live testing performance.\nEach frequency sub-band is processed and classified independently. We also\ncompare the linear and non-linear merging techniques for the sub-bands\nrecognizer. Support vector machi

In [6]:
from transformers import DataCollatorForLanguageModeling

# Batch collator for causal language modelling: mlm=False disables the
# masked-language-model objective.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [7]:
# Prompt template used at generation time. The single {} slot takes the
# question, and the template ends right after the opening <think> tag so the
# model writes its reasoning and answer as the continuation.
# NOTE(review): the preamble and instruction wording differ from
# train_prompt_style — confirm the mismatch is intended.
inference_prompt_style = """Below is a research-level instruction paired with an input topic from the machine learning literature. 
Write a response that appropriately completes the request. 
Before answering, think carefully about the question and develop a step-by-step chain of thought grounded in theoretical and empirical understanding.

### Instruction:
You are an AI research expert with deep knowledge of machine learning, optimization, and theoretical frameworks. 
Please answer the following research question.

### Question:
{}

### Response:
<think>
"""

In [8]:
# Baseline generation (before fine-tuning) for the title of training row 10.
question = dataset[10]['title']

# The prompt stops at the opening <think> tag so the model continues from it.
# No EOS token is appended: EOS marks the end of a finished sequence, and
# placing it after the prompt asks the model to generate past an ended text.
inputs = tokenizer(
    [inference_prompt_style.format(question)],
    return_tensors="pt"
).to(model.device)  # follow the model's placement instead of hard-coding "cuda"

outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    eos_token_id=tokenizer.eos_token_id,
    use_cache=True,  # KV cache on for generation even though the config disables it
)

response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

# Print everything after the first response header. maxsplit=1 keeps the whole
# answer even if the model repeats the header, and [-1] falls back to the full
# text when the header is absent.
print(response.split("### Response:", 1)[-1].strip())

<think>


Okay, let's see. The user wants me to answer a research question about HMM Speaker Identification using linear and non-linear merging techniques. Hmm, first, I need to recall what HMMs are in the context of speaker identification. Hidden Markov Models are probabilistic models used for modeling temporal sequences, right? In speaker identification, they might model the acoustic features of a speaker's speech patterns.

Wait, merging techniques here probably refer to combining different HMMs or their parameters to improve identification accuracy. Linear merging would be something like averaging parameters or concatenating features linearly, while non-linear merging might involve more complex transformations like neural networks or kernel methods.

I should start by explaining the basics of HMM-based speaker identification. Then, outline both linear and non-linear merging approaches. For linear merging, maybe techniques like weighted averaging of HMM parameters or linear discrimi

In [9]:
from peft import LoraConfig, get_peft_model

# LoRA adapter hyperparameters for causal language modelling: rank-64 update
# matrices scaled with alpha=16, 5% dropout for light regularization, and no
# bias reparameterization. Adapters are attached to the projection modules
# named in target_modules.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

# Wrap the loaded model so that it carries the LoRA adapters.
model = get_peft_model(model, peft_config)

In [10]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Training hyperparameters. With a per-device batch of 1 and 2 accumulation
# steps, each optimizer step sees 2 examples per device. logging_steps is a
# float below 1, which the Trainer interprets as a fraction of the total
# training steps (a log entry every 20%).
training_arguments = TrainingArguments(
    output_dir="output",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    logging_steps=0.2,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="none",
    label_names=["labels"]
)

# `model` has already been wrapped by get_peft_model(), so peft_config is not
# handed to the trainer a second time: it is redundant for an existing
# PeftModel, and some TRL releases reject a PeftModel combined with a
# peft_config outright.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=dataset,
    data_collator=data_collator,
)

Adding EOS to train dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [11]:
import gc, torch
# Free Python garbage and cached CUDA blocks before training starts.
gc.collect()
torch.cuda.empty_cache()
# Keep the KV cache off while training; the generation cells pass
# use_cache=True explicitly.
model.config.use_cache = False
# Last expression of the cell, so the returned TrainOutput is displayed.
# (A stray "3" after this call previously made the cell a SyntaxError.)
trainer.train()

Step,Training Loss
200,1.4302
400,1.3574
600,1.307
800,1.3116
1000,1.3288


TrainOutput(global_step=1000, training_loss=1.3470003051757813, metrics={'train_runtime': 1628.7311, 'train_samples_per_second': 1.228, 'train_steps_per_second': 0.614, 'total_flos': 1.1071652720898662e+17, 'train_loss': 1.3470003051757813})

In [12]:
# Generation after fine-tuning for the same title used in the baseline run.
question = dataset[10]['title']

# trainer.train() leaves the model in training mode, where the LoRA dropout
# configured earlier (0.05) is still applied; switch to eval mode so that
# generation is not perturbed by dropout.
model.eval()

# The prompt stops at the opening <think> tag so the model continues from it.
# No EOS token is appended: EOS marks the end of a finished sequence, and
# placing it after the prompt asks the model to generate past an ended text.
inputs = tokenizer(
    [inference_prompt_style.format(question)],
    return_tensors="pt"
).to(model.device)  # follow the model's placement instead of hard-coding "cuda"

outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    eos_token_id=tokenizer.eos_token_id,
    use_cache=True,  # KV cache on for generation even though the config disables it
)

response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

# Print everything after the first response header. maxsplit=1 keeps the whole
# answer even if the model repeats the header, and [-1] falls back to the full
# text when the header is absent.
print(response.split("### Response:", 1)[-1].strip())


<think>

  Speaker identification is a process of determining the speaker from a
predefined set of speakers. This is a challenging task due to the presence of
different speakers with similar characteristics and also due to the presence of
noise. In this paper, we are proposing a speaker identification system based on
Hidden Markov Models (HMMs). We have considered 11 speakers, each with 6 speech
samples. We have used linear and non-linear merging techniques to improve the
performance of the system. We have obtained a high performance rate for the
speaker identification system.


In [15]:
# Generation after fine-tuning for the title of training row 100.
question = dataset[100]['title']

# trainer.train() leaves the model in training mode, where the LoRA dropout
# configured earlier (0.05) is still applied; switch to eval mode so that
# generation is not perturbed by dropout.
model.eval()

# The prompt stops at the opening <think> tag so the model continues from it.
# No EOS token is appended: EOS marks the end of a finished sequence, and
# placing it after the prompt asks the model to generate past an ended text.
inputs = tokenizer(
    [inference_prompt_style.format(question)],
    return_tensors="pt"
).to(model.device)  # follow the model's placement instead of hard-coding "cuda"

outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    eos_token_id=tokenizer.eos_token_id,
    use_cache=True,  # KV cache on for generation even though the config disables it
)
response = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Print everything after the first response header. Unlike a bare
# split(...)[1], this cannot raise IndexError when the header is missing:
# [-1] then yields the full decoded text.
print(response[0].split("### Response:", 1)[-1])


<think>

  This paper presents a novel approach to estimating the dependence structure
of a random vector. In particular, we show that the copula of a random vector
can be estimated in a simple and efficient manner, provided that the
marginal distributions are known. We propose to use the conditional copula
density of a particular variable given the others. The conditional copula can
be estimated via a kernel approach, which is efficient in terms of both time
and space. Our approach is particularly suitable for large-scale problems and
for high-dimensional data. We demonstrate the utility of our approach on
synthetic and real-world data.



In [None]:
# Repository name on the Hugging Face Hub for the fine-tuned artifacts.
new_model_name = "Qwen-3-32B-ML-Expert-Reasoning"

# Upload the model first, then the tokenizer, to the same repository.
for artifact in (model, tokenizer):
    artifact.push_to_hub(new_model_name)

adapter_model.safetensors:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]