In [None]:
# !pip install -q -U transformers datasets bitsandbytes  trl peft  huggingface_hub

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import SFTTrainer, SFTConfig
from torch.optim import AdamW
from transformers import get_scheduler
from accelerate import PartialState
import os

from math import ceil


# local_save_path_model = "New-model-full-ddp"
# WANDB_API_KEY = "xxxxxxxxxxxx"
# os.environ["WANDB_API_KEY"] = WANDB_API_KEY

### DDP

In [None]:
# Load the arxiv-physics question/answer parquet file from the Hugging Face Hub
# and keep the first 1000 rows as a small training subset.
df = pd.read_parquet("hf://datasets/ayoubkirouane/arxiv-physics/data/train-00000-of-00001-5bba4a271402bdbb.parquet")
train_dataset = Dataset.from_pandas(df).select(range(1000))

# Add a chat-style "messages" column: one user turn (the question) followed by
# one assistant turn (the answer) per row.
train_dataset = train_dataset.add_column(
    "messages",
    [
        [{'content': question, 'role': 'user'}, {'content': answer, 'role': 'assistant'}]
        for question, answer in zip(train_dataset["question"], train_dataset["answer"])
    ],
)

# Device index this process should load the model onto (consumed by
# `load_model_and_tokenizer` through `device_map`). It must be the rank *within
# the node*: the global `process_index` exceeds the number of local GPUs as soon
# as training spans more than one machine. On a single node both are identical.
device_string = PartialState().local_process_index

In [None]:
def load_model_and_tokenizer(model_name, use_gpu = True):
    """Load a causal language model together with its tokenizer.

    Args:
        model_name: Model identifier or path handed to ``from_pretrained``.
        use_gpu: When True (default) the whole model is placed on the device
            index stored in the module-level ``device_string``. When False no
            ``device_map`` is passed, so placement is left to
            ``from_pretrained``'s default.

    Returns:
        A ``(model, tokenizer)`` tuple. The tokenizer is guaranteed to have a
        chat template and a pad token after this call.
    """
    # Load base model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Honour `use_gpu`: previously the flag was accepted but ignored and the
    # model was always mapped onto `device_string`.
    if use_gpu:
        model = AutoModelForCausalLM.from_pretrained(model_name, device_map={'': device_string})
    else:
        model = AutoModelForCausalLM.from_pretrained(model_name)

    # Fallback template, only used when the tokenizer ships without one.
    if not tokenizer.chat_template:
        tokenizer.chat_template = """{% for message in messages %}
                {% if message['role'] == 'system' %}System: {{ message['content'] }}\n
                {% elif message['role'] == 'user' %}User: {{ message['content'] }}\n
                {% elif message['role'] == 'assistant' %}Assistant: {{ message['content'] }} <|endoftext|>
                {% endif %}
                {% endfor %}"""

    # Tokenizer config: reuse EOS for padding when no pad token is defined.
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

In [None]:
# Instantiate the Qwen3-1.7B checkpoint and its tokenizer on this process's device.
model, tokenizer = load_model_and_tokenizer(model_name="unsloth/Qwen3-1.7B", use_gpu=True)

In [None]:
# Training hyper-parameters for the DDP run.
#
# `dataset_text_field` is deliberately NOT set: the dataset carries a
# conversational "messages" column (a list of role/content dicts), which
# SFTTrainer detects and renders through the tokenizer's chat template.
# `dataset_text_field` is meant to name a plain-text column; pointing it at
# "messages" is at best ignored and can break tokenization on some TRL releases.
sft_config = SFTConfig(
    learning_rate=8e-5,  # Learning rate for training.
    num_train_epochs=1,  # Number of passes over the training set.
    per_device_train_batch_size=1,  # Batch size for each device (e.g., GPU) during training.
    gradient_accumulation_steps=8,  # Micro-batches whose gradients are accumulated before each optimizer update.
    gradient_checkpointing=True,  # Trade extra compute for lower activation memory.
    gradient_checkpointing_kwargs={"use_reentrant": False},  # Must be False for DDP
    logging_steps=1,  # Log training progress at every step.
    report_to="wandb",
)

In [None]:
# Number of optimizer updates the run will perform. Under DDP each process only
# iterates over its own shard of the data, so the global batch consumed per
# micro-step is `per_device_train_batch_size * num_processes`. Ignoring the
# process count would size the schedule N times too long and the cosine decay
# would never reach its end.
world_size = PartialState().num_processes
batches_per_epoch = ceil(len(train_dataset) / (sft_config.per_device_train_batch_size * world_size))
num_update_steps_per_epoch = max(ceil(batches_per_epoch / sft_config.gradient_accumulation_steps), 1)
# `num_train_epochs` may be fractional; round up so the schedule covers the whole run.
total_training_steps = ceil(num_update_steps_per_epoch * sft_config.num_train_epochs)

# Create optimizer
optimizer = AdamW(model.parameters(), lr=sft_config.learning_rate)

# Create learning rate scheduler
lr_scheduler = get_scheduler(
    name="cosine",  # alternatives include "linear" and "constant"
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps
)

In [None]:
# Assemble the trainer: the training arguments, the model/tokenizer pair, the
# prepared dataset, and the manually built optimizer + LR scheduler (passed as
# a pair so the trainer uses them instead of creating its own).
sft_trainer = SFTTrainer(
    args=sft_config,
    model=model,
    processing_class=tokenizer,
    train_dataset=train_dataset,
    optimizers=(optimizer, lr_scheduler),
)

In [None]:
# Run the fine-tuning loop; as the last expression of the cell, the value
# returned by `train()` is displayed in the notebook output.
sft_trainer.train()

In [None]:
# Save through the trainer rather than calling `save_pretrained` on the model
# directly: in a multi-process (DDP) run every rank executes this cell, and
# `Trainer.save_model` only writes from the main process, avoiding concurrent
# writes to the same directory. It also stores the tokenizer next to the weights
# so the checkpoint can be reloaded on its own.
sft_trainer.save_model("New_Full-Qwen3-1.7B")

### FSDP

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import SFTTrainer, SFTConfig
import os

In [None]:
# Directory that receives checkpoints and the final model of the FSDP run.
LOCAL_SAVE_PATH = "New-model-full-finetune-fsdp"

# Weights & Biases credentials.
# Never hard-code the API key in the notebook: export WANDB_API_KEY in the
# shell (or run `wandb login`) before launching. The value is only read here,
# so a key already present in the environment is no longer overwritten by a
# placeholder. It is None when the variable is not set.
WANDB_API_KEY = os.environ.get("WANDB_API_KEY")

In [None]:
# Read the arxiv-physics question/answer parquet file from the Hugging Face Hub
# and keep only the first 1000 rows.
df = pd.read_parquet("hf://datasets/ayoubkirouane/arxiv-physics/data/train-00000-of-00001-5bba4a271402bdbb.parquet")
train_dataset = Dataset.from_pandas(df).select(range(1000))

# Attach a "messages" column holding, for each row, a two-turn conversation:
# the question as the user turn and the answer as the assistant turn.
train_dataset = train_dataset.add_column(
    name="messages",
    column=[
        [{'content': question, 'role': 'user'}, {'content': answer, 'role': 'assistant'}]
        for question, answer in zip(train_dataset["question"], train_dataset["answer"])
    ],
)

In [None]:
# Load model & tokenizer
def load_model_and_tokenizer(model_name):
    """Return ``(model, tokenizer)`` for ``model_name``.

    Both objects are loaded with ``trust_remote_code=True``. The tokenizer is
    given a simple System/User/Assistant chat template when it has none, and
    its pad token falls back to the EOS token when no pad token is defined.
    """
    # Template applied only when the tokenizer does not already provide one.
    fallback_template = """{% for message in messages %}
        {% if message['role'] == 'system' %}System: {{ message['content'] }}\n
        {% elif message['role'] == 'user' %}User: {{ message['content'] }}\n
        {% elif message['role'] == 'assistant' %}Assistant: {{ message['content'] }} <|endoftext|>
        {% endif %}
        {% endfor %}"""

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)

    if not tokenizer.chat_template:
        tokenizer.chat_template = fallback_template

    # Keep an existing pad token; otherwise pad with EOS.
    current_pad = tokenizer.pad_token
    tokenizer.pad_token = current_pad if current_pad else tokenizer.eos_token

    return model, tokenizer

In [None]:
# Checkpoint to fine-tune in the FSDP run.
model_name = "unsloth/llama-3.2-3b-instruct"

# Load the model and its tokenizer for that checkpoint.
model, tokenizer = load_model_and_tokenizer(model_name=model_name)

In [None]:
# Training hyper-parameters for the FSDP run.
#
# `dataset_text_field` is deliberately NOT set: the dataset carries a
# conversational "messages" column (a list of role/content dicts), which
# SFTTrainer detects and renders through the tokenizer's chat template.
# `dataset_text_field` is meant to name a plain-text column; pointing it at
# "messages" is at best ignored and can break tokenization on some TRL releases.
sft_config = SFTConfig(
    output_dir=LOCAL_SAVE_PATH,  # Checkpoints and the final model are written here.
    # NOTE(review): newer TRL releases appear to name this option `max_length` —
    # confirm against the installed version.
    max_seq_length=2048,
    learning_rate=8e-5,
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,  # Micro-batches accumulated before each optimizer update.
    gradient_checkpointing=True,  # Trade extra compute for lower activation memory.
    logging_steps=1,  # Log at every step.
    save_strategy="epoch",  # Write a checkpoint at the end of each epoch.
    report_to="wandb",
    # Fully shard the model across processes and auto-wrap it per decoder layer.
    fsdp="full_shard auto_wrap",
    fsdp_config={"fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer"},
)

In [None]:
# Assemble the FSDP trainer from the training arguments, the model/tokenizer
# pair and the prepared dataset. No optimizer is passed, so the trainer builds
# its own.
trainer = SFTTrainer(
    args=sft_config,
    model=model,
    processing_class=tokenizer,
    train_dataset=train_dataset,
)

In [None]:
# Run the fine-tuning loop; as the last expression of the cell, the value
# returned by `train()` is displayed in the notebook output.
trainer.train()

In [None]:
# Persist the fine-tuned model through the trainer. No path is passed, so the
# destination is chosen by the trainer (presumably `output_dir`, i.e.
# LOCAL_SAVE_PATH — confirm).
trainer.save_model()

In [None]:
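# fsdp_config.yaml — Accelerate configuration for the FSDP run. Save the commented
# lines below (without the leading "# ") as a separate file and pass it to
# `accelerate launch` as shown at the end of this cell.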
# compute_environment: LOCAL_MACHINE
# debug: false
# distributed_type: FSDP
# downcast_bf16: 'no'
# fsdp_config:
#   fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
#   fsdp_backward_prefetch: BACKWARD_PRE
#   fsdp_cpu_ram_efficient_loading: true
#   fsdp_forward_prefetch: false
#   fsdp_offload_params: false
#   fsdp_sharding_strategy: FULL_SHARD
#   fsdp_state_dict_type: SHARDED_STATE_DICT
#   fsdp_sync_module_states: true
#   fsdp_use_orig_params: false
# machine_rank: 0
# main_training_function: main
# mixed_precision: bf16
# num_machines: 1
# num_processes: 2
# rdzv_backend: static
# same_network: true
# tpu_env: []
# tpu_use_cluster: false
# tpu_use_sudo: false
# use_cpu: false

# accelerate launch --config_file fsdp_config.yaml train.py