In [5]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [6]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048
dtype = None 
load_in_4bit = True 


model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/mistral-7b-instruct-v0.3",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2025.3.18: Fast Mistral patching. Transformers: 4.49.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [7]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 4, 
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 8,
    lora_dropout = 0,
    bias = "none", 
    use_gradient_checkpointing = "unsloth", 
    random_state = 3407,
    use_rslora = False, 
    loftq_config = None,
)

In [10]:
import pandas as pd
import torch
from datasets import Dataset, load_dataset
from unsloth import FastLanguageModel
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')


arxiv_dataset = pd.read_csv("arxiv_final.csv")

def prepare_compscholar_data(df):
    data = []
    for _, row in df.iterrows():
        arc = row['article']
        abs = row['abstract']
        data.append({"article": str(arc), "abstract":str(abs)})
    return data

all_data = prepare_compscholar_data(arxiv_dataset)
dataset = Dataset.from_dict({
    "abstract": [item["abstract"] for item in all_data],
    "article": [item["article"] for item in all_data],

})
dataset = dataset.train_test_split(test_size=0.1)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Let's see how the `ChatML` format works by printing the 5th element

In [11]:
dataset

DatasetDict({
    train: Dataset({
        features: ['abstract', 'article'],
        num_rows: 1800
    })
    test: Dataset({
        features: ['abstract', 'article'],
        num_rows: 200
    })
})

In [12]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [18]:
def formatting_prompts_func(examples):
    instructions = ["Summarize the following research paper. Your summary should be concise, coherent, and capture the key points of the paper. Ensure that the summary is well-structured and provides a clear understanding of the paper's purpose and outcomes without unnecessary details. You should always address everything in third person, the keyword `we` becomes `this paper`."] * len(examples["article"])

    prompts = []
    for instruction, arc in zip(instructions,examples["article"]):
        prompt = f"<s>[INST] {instruction}\n [/INST] {arc}</s>"
        prompts.append(prompt)

    return {"text": prompts}

tokenized_train_dataset = dataset['train'].map(formatting_prompts_func, batched=True)

Map:   0%|          | 0/1800 [00:00<?, ? examples/s]

In [19]:
tokenized_train_dataset[5]

{'abstract': 'the z - burst model and the direct propagation of uhe proton in negligible extragalactic magnetic fields produce gamma - rays afterglows and precursors halos , respectively at gevs and tev energy band a few degree around the uhecr arrival direction . \n the possible correlation of uhecr clusters ( doublet , triplet ) with nearby bl lac sources at @xmath0 ev offer a test for this necessary gamma - uhecr trace . \n we estimate the secondary gamma energy and spectra and we suggest how to disentangle between the different scenarios . \n we show why z - burst model is still the most realistic model to explain uhecr behaviour and their correlation to known bl lac sources .',
 'article': 'the uhecr events with energy above @xmath1 ev are bounded by the primordial photon drag ( the well known gzk cut - off ) in a very narrow universe ( @xmath2 ) . because of their charge uhecr are bent and blurred by cosmic magnetic fields . however , uhecr because of their extremely rigidity mai

In [20]:
import wandb

In [21]:
wandb.login(key="xxx")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mcneuralnets[0m ([33mcneuralnets-ai4bharat[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [22]:
wandb.init(
    project="revel",
    name="run_2",
    config={
        "model_name": "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
        "max_seq_length": 2048,
        "learning_rate": 2e-4,
        "per_device_train_batch_size": 2,
        "gradient_accumulation_steps": 8,
        "warmup_steps": 5,
        "max_steps": 60,
        "weight_decay": 0.01,
        "lr_scheduler_type": "linear",
        "packing": False,
        "lora_r": 16,
        "lora_alpha": 16,
        "lora_dropout": 0,
    }
)

In [23]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = tokenized_train_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 8,
        warmup_steps = 5,
        max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "wandb", 
    ),
)

Unsloth: We found double BOS tokens - we shall remove one automatically.


Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/1800 [00:00<?, ? examples/s]

In [24]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.741 GB.
7.787 GB of memory reserved.


In [25]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,800 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 8 x 1) = 16
 "-____-"     Trainable parameters = 10,485,760/7,000,000,000 (0.15% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,3.4086
2,3.6811
3,3.4609
4,3.1991
5,3.0306
6,2.7098
7,2.4363
8,2.1616
9,2.0007
10,1.7878


In [26]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

1011.1919 seconds used for training.
16.85 minutes used for training.
Peak reserved memory = 7.787 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 52.825 %.
Peak reserved memory for training % of max memory = 0.0 %.


In [27]:
wandb.finish()

0,1
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇█████
train/grad_norm,▃▄▂▂▂▄█▄▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,▂▄▅███▇▇▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▁▁▁
train/loss,█▇▇▆▄▂▂▁▂▂▂▂▁▁▂▂▂▂▂▂▂▂▂▁▁▁▁▂▂▁▁▂▁▁▂▁▁▁▁▁

0,1
total_flos,1.10006186348544e+16
train/epoch,0.53333
train/global_step,60.0
train/grad_norm,1.02298
train/learning_rate,0.0
train/loss,1.4227
train_loss,1.72399
train_runtime,1011.1919
train_samples_per_second,0.949
train_steps_per_second,0.059


In [28]:

model.push_to_hub("deeponh/revel_2", token = "xxx") 
tokenizer.push_to_hub("deeponh/revel_2", token = "xxx") 

README.md:   0%|          | 0.00/606 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Saved model to https://huggingface.co/deeponh/revel_2


  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]