In [1]:
!pip3 install -q -U bitsandbytes==0.42.0
!pip3 install -q -U peft==0.8.2
!pip3 install -q -U trl==0.7.10
!pip3 install -q -U accelerate==0.27.1
!pip3 install -q -U datasets==2.17.0
!pip3 install -q -U transformers==4.38.0
!pip3 install -q -U huggingface_hub

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.4/183.4 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m70.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.9/150.9 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.4/103.4 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━

In [52]:
MODEL_CONFIG = "meta-llama/Llama-2-7b-chat-hf"
HEADS = 8
DIMENSIONS = 512
LAYERS = 8 #
INTERMEDIATE_SIZE= 1024
CONTEXT_LENGTH = 256
NEW_MODEL = "meta-llama/Llama-2-55M-1bit-pretrained-riyalRJ"
HUGGINGFACE_ID = "riyalRJ"


# ### Training Parameters


BATCH_SIZE = 16
LEARNING_RATE = 1.5e-4
EPOCHS = 5

In [15]:
from torch import nn
from transformers.models.llama.modeling_llama import *
from transformers import (AutoTokenizer, AutoConfig, LlamaForCausalLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments)
from datasets import load_dataset
from huggingface_hub import login
from huggingface_hub import create_repo, HfApi

In [42]:
import os
from getpass import getpass
os.environ["HF_TOKEN"] = getpass("Your Huggingface Hub Token: ")

Your Huggingface Hub Token: ··········


# Code is refered from the [paper](https://github.com/microsoft/unilm/blob/master/bitnet/The-Era-of-1-bit-LLMs__Training_Tips_Code_FAQ.pdf)

In [16]:
def activation_quant(x):
    scale = 127.0 / x.abs().max(dim=-1, keepdim=True).values.clamp_(min=1e-5)
    y = (x * scale).round().clamp_(-128, 127) / scale
    return y
def weight_quant(w):
    scale = 1.0 / w.abs().mean().clamp_(min=1e-5)
    u = (w * scale).round().clamp_(-1, 1) / scale
    return u

class BitLinear(nn.Linear):
    def forward(self, x):
        w = self.weight # a weight tensor with shape [d, k]
        x = x.to(w.device)
        RMSNorm = LlamaRMSNorm(x.shape[-1]).to(w.device)
        x_norm = RMSNorm(x)
        # A trick for implementing Straight−Through−Estimator (STE) using detach()
        x_quant = x_norm + (activation_quant(x_norm) - x_norm).detach()
        w_quant = w + (weight_quant(w) - w).detach()
        y = F.linear(x_quant, w_quant)
        return y
def convert_to_bitnet(model, copy_weights):
    for name, module in model.named_modules():
        # Replace linear layers with BitNet
        if isinstance(module, LlamaSdpaAttention) or isinstance(module, LlamaMLP):
            for child_name, child_module in module.named_children():
                if isinstance(child_module, nn.Linear):
                    bitlinear = BitLinear(child_module.in_features, child_module.out_features, child_module.bias is not None).to(device="cuda:0")
                    if copy_weights:
                        bitlinear.weight = child_module.weight
                        if child_module.bias is not None:
                            bitlinear.bias = child_module.bias
                    setattr(module, child_name, bitlinear)
        # Remove redundant input_layernorms
        elif isinstance(module, LlamaDecoderLayer):
            for child_name, child_module in module.named_children():
                if isinstance(child_module, LlamaRMSNorm) and child_name == "input_layernorm":
                    setattr(module, child_name, nn.Identity().to(device="cuda:0"))


In [9]:
data=load_dataset("mlabonne/guanaco-llama2-1k",split="train")
tokenizer=AutoTokenizer.from_pretrained(MODEL_CONFIG)



In [11]:
data

Dataset({
    features: ['text'],
    num_rows: 1000
})

In [22]:
def tokenize(element):
    outputs = tokenizer(
        element["text"],
        truncation=True,
        max_length=CONTEXT_LENGTH,
        return_overflowing_tokens=False,
        return_length=True,
    )
    return {"input_ids": outputs['input_ids']}

tokenized_data = data.map(
    tokenize, batched=True,
)

total_tokens = tokenized_data.num_rows * CONTEXT_LENGTH
print(f"Training on {total_tokens:_} tokens")

config = AutoConfig.from_pretrained(
    MODEL_CONFIG,
    vocab_size=len(tokenizer),
    n_ctx=CONTEXT_LENGTH,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Training on 256_000 tokens




config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

In [23]:
config.hidden_size = DIMENSIONS
config.max_position_embeddings = DIMENSIONS
config.num_attention_heads = HEADS
config.num_hidden_layers = LAYERS
config.num_key_value_heads = HEADS
config.intermediate_size = INTERMEDIATE_SIZE

In [24]:
model = LlamaForCausalLM(config)
convert_to_bitnet(model, copy_weights=False)
model_size = sum(t.numel() for t in model.parameters())
print(f"Model size: {model_size/1000**2:.1f}M parameters")
tokenizer.pad_token = tokenizer.eos_token

Model size: 53.7M parameters


In [25]:
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [28]:
!pip install wandb

Collecting wandb
  Downloading wandb-0.17.5-py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m40.0 MB/s[0m eta [36m0:00:00[0m
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting gitpython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-2.10.0-py2.py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.1/302.1 kB[0m [31m33.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x8

In [33]:
import wandb
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [38]:
output="result"
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=TrainingArguments(
    output_dir=output,
    per_device_train_batch_size=BATCH_SIZE,
    logging_steps=100,
    gradient_accumulation_steps=2,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    warmup_steps=0.1,
    lr_scheduler_type="cosine",
    learning_rate=LEARNING_RATE,
    # max_steps=5000,
    save_steps=0.25,
    fp16=True,
    report_to="wandb"
),
    data_collator=data_collator,
    train_dataset=tokenized_data,
)

trainer.train()

Step,Training Loss
100,7.3677


TrainOutput(global_step=155, training_loss=7.119164448399697, metrics={'train_runtime': 103.2033, 'train_samples_per_second': 48.448, 'train_steps_per_second': 1.502, 'total_flos': 282794051764224.0, 'train_loss': 7.119164448399697, 'epoch': 4.92})

In [40]:
model.save_pretrained("Llama-2-55M-1bit-pretrained-riyalRJ")
tokenizer.save_pretrained("Llama-2-55M-1bit-pretrained-riyalRJ")

('Llama-2-55M-1bit-pretrained-riyalRJ/tokenizer_config.json',
 'Llama-2-55M-1bit-pretrained-riyalRJ/special_tokens_map.json',
 'Llama-2-55M-1bit-pretrained-riyalRJ/tokenizer.model',
 'Llama-2-55M-1bit-pretrained-riyalRJ/added_tokens.json',
 'Llama-2-55M-1bit-pretrained-riyalRJ/tokenizer.json')

In [49]:
repo_name="/content/Llama-2-55M-1bit-pretrained-riyalRJ"
# Push to Hugging Face Hub
api = HfApi()
api.create_repo(repo_id="riyalRJ/Llama-2-55M-1bit-pretrained-riyalRJ",
                exist_ok=True)
api.upload_folder(
    repo_id="riyalRJ/Llama-2-55M-1bit-pretrained-riyalRJ",
    folder_path=repo_name,
    repo_type="model",
    commit_message="Initial commit",
    use_auth_token=True
)

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/215M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/riyalRJ/Llama-2-55M-1bit-pretrained-riyalRJ/commit/1eb2c2618a68576506db39beecb3bbd28e6b46f0', commit_message='Initial commit', commit_description='', oid='1eb2c2618a68576506db39beecb3bbd28e6b46f0', pr_url=None, pr_revision=None, pr_num=None)

In [50]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.models.llama.modeling_llama import *
# Load a pretrained BitNet model
model = "riyalRJ/Llama-2-55M-1bit-pretrained-riyalRJ"
tokenizer = AutoTokenizer.from_pretrained(model)
model = AutoModelForCausalLM.from_pretrained(model)



tokenizer_config.json:   0%|          | 0.00/1.76k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/683 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/215M [00:00<?, ?B/s]

Some weights of LlamaForCausalLM were not initialized from the model checkpoint at riyalRJ/Llama-2-55M-1bit-pretrained-riyalRJ and are newly initialized: ['model.layers.0.input_layernorm.weight', 'model.layers.1.input_layernorm.weight', 'model.layers.2.input_layernorm.weight', 'model.layers.3.input_layernorm.weight', 'model.layers.4.input_layernorm.weight', 'model.layers.5.input_layernorm.weight', 'model.layers.6.input_layernorm.weight', 'model.layers.7.input_layernorm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

It is a very small model therfore for better usage it is preferred to finetune on the custom dataset for good accuracy .
It is an experimentation on the the [paper](https://arxiv.org/pdf/2402.17764)