In [None]:
%pip install transformers trl accelerate torch bitsandbytes peft datasets -qU

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset
import torch

2024-10-17 08:03:50.140717: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
hf_token = "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"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", token=hf_token)
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", token=hf_token, torch_dtype="auto", device_map="auto")

In [4]:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [5]:
from datasets import load_dataset

ds = load_dataset("Open-Orca/OpenOrca", split="train")

In [6]:
import random

dataset_length = 10**5

dataset_size = len(ds)
random_indices = random.sample(range(dataset_size), dataset_length)

sampled_dataset = ds.select(random_indices)

train_size = int(0.9 * dataset_length)
train_dataset = sampled_dataset.select(range(train_size))
test_dataset = sampled_dataset.select(range(train_size, dataset_length))


In [7]:
def process_func(example):
    MAX_LENGTH = 512
    
    messages = [
        {"role": "system", "content": example["system_prompt"]},
        {"role": "user", "content": example["question"]},
        {"role": "assistant", "content": example["response"]},
    ]

    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    input_ids = tokenizer.encode(text=text.strip(), add_special_tokens=True, truncation=True, max_length=MAX_LENGTH)

    labels = input_ids[:]  

    pad_len = MAX_LENGTH - len(input_ids)
    input_ids += [tokenizer.pad_token_id] * pad_len
    labels += [tokenizer.pad_token_id] * pad_len

    labels = [(l if l != tokenizer.pad_token_id else -100) for l in labels]

    return {
        "input_ids": input_ids,
        "labels": labels,
    }


In [8]:
tokenized_train = train_dataset.map(process_func, remove_columns=train_dataset.column_names, batched=False)

Map: 100%|██████████| 90000/90000 [02:36<00:00, 576.65 examples/s]


In [9]:
tokenized_test = test_dataset.map(process_func, remove_columns=test_dataset.column_names, batched=False)

Map: 100%|██████████| 10000/10000 [00:17<00:00, 586.13 examples/s]


# EXP0: Default model (no fine-tune)

In [10]:
!export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

In [11]:
import torch
torch.cuda.empty_cache()

In [12]:
import evaluate
from transformers import TrainingArguments
from transformers import Trainer, DataCollatorForSeq2Seq

bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")
meteor_metric = evaluate.load("meteor")



def compute_metrics(eval_pred):
    logits, labels = eval_pred
    
    predictions = np.argmax(logits, axis=-1)
    seq_len = (labels != -100).sum(-1)
    loss = np.exp(logits.mean())  

    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    bleu = bleu_metric.compute(predictions=decoded_predictions, references=decoded_labels)
    rouge = rouge_metric.compute(predictions=decoded_predictions, references=decoded_labels)
    meteor = meteor_metric.compute(predictions=decoded_predictions, references=decoded_labels)

    return {
        "loss": float(loss),
        "perplexity": float(np.exp(loss)),
        "bleu": bleu["bleu"],
        "rouge": rouge["rougeL"].mid.fmeasure,
        "meteor": meteor["meteor"],
    }


args = TrainingArguments(
    output_dir="qwen_instruct_no_fine_tune",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=2,  
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    warmup_steps=1,
    logging_steps=10,
    save_strategy="epoch",
    eval_strategy="epoch",
    learning_rate=1e-4,
    lr_scheduler_type="constant",
    #fp16=True,
    eval_accumulation_steps=1
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
    compute_metrics=compute_metrics
)

  warn(
Downloading builder script: 100%|██████████| 5.94k/5.94k [00:00<00:00, 29.8MB/s]
Downloading extra modules: 4.07kB [00:00, 12.0MB/s]                   
Downloading extra modules: 100%|██████████| 3.34k/3.34k [00:00<00:00, 23.6MB/s]
Downloading builder script: 100%|██████████| 6.27k/6.27k [00:00<00:00, 33.8MB/s]
Downloading builder script: 100%|██████████| 7.02k/7.02k [00:00<00:00, 37.2MB/s]
[nltk_data] Downloading package wordnet to /home/jupyter/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/jupyter/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [27]:
trainer.model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((

In [26]:
trainer.evaluate?

[0;31mSignature:[0m
[0mtrainer[0m[0;34m.[0m[0mevaluate[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0meval_dataset[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mtorch[0m[0;34m.[0m[0mutils[0m[0;34m.[0m[0mdata[0m[0;34m.[0m[0mdataset[0m[0;34m.[0m[0mDataset[0m[0;34m,[0m [0mDict[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mtorch[0m[0;34m.[0m[0mutils[0m[0;34m.[0m[0mdata[0m[0;34m.[0m[0mdataset[0m[0;34m.[0m[0mDataset[0m[0;34m][0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mignore_keys[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmetric_key_prefix[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;34m'eval'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0mDict[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mfloat[0m[0;34m][0m[0;34m[0m[0;34m[0m

In [13]:
tokenized_test.shape, len(tokenized_test[0]['input_ids'])

((10000, 2), 512)

In [None]:
evaluation_results = trainer.evaluate(eval_dataset=tokenized_test)

for metric, value in evaluation_results.items():
    print(f"{metric}: {value:.4f}")

  4%|▎         | 177/5000 [14:17<12:27:46,  9.30s/it]

[0;31mKernelOutOfMemory[0m: Kernel ran out of memory and has been restarted. If the restart fails, restart the kernel from the Kernel menu.
If the error persists, try choosing a different configuration or optimizing your code.

In [2]:
from datasets import load_dataset
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Subset
from torchvision import models, transforms
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from transformers import ResNetModel
import pandas as pd
from typing import Literal, Union
from torch.utils.data import Dataset, DataLoader
import operator
import sys
from time import time
from tqdm import trange


class CustomResNet(nn.Module):
    def __init__(self, output_units, freeze_all = False, debug = False, resnet_version = 18):
        super(CustomResNet, self).__init__()
        if resnet_version == 18:
            self.resnet = models.resnet18(weights='ResNet18_Weights.DEFAULT')
        elif resnet_version == 50:
            self.resnet = models.resnet50(weights='ResNet50_Weights.DEFAULT')
        if freeze_all:
            for param in self.resnet.parameters():
                param.requires_grad = False
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, output_units)
        if debug:
            for k, v in self.resnet.named_parameters():
                print(k, v.shape, v.requires_grad)

    def forward(self, x):
        return self.resnet(x)



class CIFARDataset(Dataset):
    def __init__(self, dset):
        self.dset = dset

    def __getitem__(self, idx):
        return transform(self.dset[idx]['img']), self.dset[idx]['label']

    def __len__(self):
        return len(self.dset)

    

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

ds = load_dataset("uoft-cs/cifar10")
ds_train = ds["train"]

dataset = CIFARDataset(ds_train)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class Row:
    model_name: str
    epochs: int
    batch: int
    dataset_name: str
    slaves_num: str
    transfer_quantity_GB: float
    transfer_summary_time_in_memory_sec: float
    accuracy: float
    loss: float
    unfreeze: Literal[Union["full", "last_layer"]]
    mean_learning_time_by_epoch_sec: float
    gradient_compression: Literal[Union["none", "simple"]]
    meta: str
    
    
    def __init__(self, 
                 model_name,
                 epochs,
                 batch,
                 dataset_name,
                 slaves_num,
                 transfer_quantity_GB,
                 transfer_summary_time_in_memory_sec,
                 accuracy,
                 loss,
                 unfreeze,
                 mean_learning_time_by_epoch_sec,
                 gradient_compression,
                 meta=None):
        self.model_name = model_name
        self.epochs = epochs
        self.batch = batch
        self.dataset_name = dataset_name
        self.slaves_num = slaves_num
        self.transfer_quantity_GB = transfer_quantity_GB
        self.transfer_summary_time_in_memory_sec = transfer_summary_time_in_memory_sec
        self.accuracy = accuracy
        self.loss = loss
        self.unfreeze = unfreeze
        self.mean_learning_time_by_epoch_sec = mean_learning_time_by_epoch_sec
        self.gradient_compression = gradient_compression
        self.meta = meta

        
def get_nested_attr(obj, attr_path):
    for part in attr_path.split("."):
        obj = getattr(obj, part)
    return obj

def set_nested_attr(obj, attr_path, new_value):
    parts = attr_path.split(".")
    for i, part in enumerate(parts[:-1]):
        obj = getattr(obj, part)
    setattr(obj, parts[-1], new_value)
    

total_bytes = 0
total_time_sending = 0
    
def run_exp(
    model_name,
    epochs,
    batch,
    dataset_name,
    slaves_num,
    unfreeze,
    gradient_compression,
):
    
    global total_bytes, total_time_sending
    total_bytes = 0
    total_time_sending = 0
    
    BATCH_SIZE = batch
    batch_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
    
    EPOCH_COUNT = epochs
    SLAVE_COUNT = slaves_num
    FIXED_LR = 0.001
    OUT_UNITS = 10
    MASTER_NODE = 0

    criterion = nn.CrossEntropyLoss()
    master_model = CustomResNet(output_units=OUT_UNITS, resnet_version=model_name, freeze_all=unfreeze != "full").to(device)
    master_optimizer = optim.Adam(master_model.parameters(), lr=FIXED_LR)
    slave_models = [CustomResNet(output_units=OUT_UNITS, resnet_version=model_name, freeze_all=unfreeze != "full").to(device) for _ in range(SLAVE_COUNT)]

    
    
    def sync_slaves_with_master():
        global total_time_sending
        time_start_sending = time()
        with torch.no_grad():
            for models in zip(master_model.named_parameters(), *list(map(lambda x: x.named_parameters(), slave_models))):
                param_name = models[1][0]
                master_param = get_nested_attr(master_model, param_name)
                for model in slave_models:
                    new_param = nn.Parameter(master_param)
                    set_nested_attr(model, param_name, new_param)
        total_time_sending += time() - time_start_sending

        
    def quantize(gradients_raw, gradient_compression):
        if gradient_compression == "simple":
            return [torch.quantize_per_tensor(x.to("cpu"), 0.1, 10, torch.quint8) for x in gradients_raw]
        else:
            assert gradient_compression == "none"
            return gradients_raw
            
            
    def dequantize(gradients_raw, gradient_compression):
        if gradient_compression == "simple":
            return [x.dequantize() for x in gradients_raw]
        else:
            assert gradient_compression == "none"
            return gradients_raw

    def move_gradients_from_slaves_to_master():
        global total_bytes, total_time_sending
        time_start_sending = time()
        quantize_time_total = 0
        for models in zip(master_model.parameters(), *list(map(lambda x: x.parameters(), slave_models))):
            master_model_params = models[0]
            if not master_model_params.requires_grad:
                continue
            slave_models_params = models[1:]
            gradients_raw = list(map(lambda x: x.grad, slave_models_params))
            if None in gradients_raw:
                return
            quantize_time_started = time()
            gradients_raw = torch.stack(quantize(gradients_raw, gradient_compression))
            total_bytes += gradients_raw.nelement() * gradients_raw.element_size()
            gradient = torch.mean(gradients_raw, dim=0)
            master_model_params.grad = torch.stack(dequantize(gradient, gradient_compression)).to(device)
            quantize_time_total += time() - quantize_time_started

        total_time_sending += time() - time_start_sending - quantize_time_total

    start_time = time()
    for epoch in trange(EPOCH_COUNT, desc="iterating through epochs"):
        
        " 0 - master, 1-N - slaves "
        executing_node = 1

        sync_slaves_with_master()
        move_gradients_from_slaves_to_master()

        index = 0
        while index < batch_loader.__len__():
            if executing_node == MASTER_NODE:
                move_gradients_from_slaves_to_master()
                master_optimizer.step()
                sync_slaves_with_master()
            else:
                inputs, labels = batch_loader.__iter__().__next__()
                index += 1
                inputs = inputs.to(device)
                labels = labels.to(device)
                model = slave_models[executing_node - 1]
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
            executing_node = (executing_node + 1) % (SLAVE_COUNT + 1)

        dataset_test = CIFARDataset(ds["test"])
        test_loader = DataLoader(dataset_test, batch_size=BATCH_SIZE, shuffle=True)
        accuracy_on_batch = []
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = master_model(inputs)
            accuracy_on_batch.append(sum(torch.argmax(outputs, dim=1) == labels) / len(labels))

    transfer_quantity_GB = round(total_bytes / (10 ** 9) / EPOCH_COUNT, 4)
    transfer_summary_time_in_memory_sec = total_time_sending
    accuracy = round(torch.mean(torch.tensor(accuracy_on_batch)).item(), 4)
    loss = round(loss.item(), 7)
    mean_learning_time_by_epoch_sec = round((time() - start_time)  / EPOCH_COUNT, 4)
        
    return transfer_quantity_GB, transfer_summary_time_in_memory_sec, accuracy, loss, mean_learning_time_by_epoch_sec


  warn(


In [3]:
from tqdm import tqdm

exp_res = []
for attempt in tqdm(range(1), desc="iterating through attempts"):
    for model_name in [18]:
        for epochs in [10]:
            for batch in [32]:
                for dataset_name in ["uoft-cs/cifar10"]:
                    for slaves_num in [1, 5, 16]:
                        for unfreeze in ["full"]:
                            for gradient_compression in ["simple"]:
                                transfer_quantity_GB, transfer_summary_time_in_memory_sec, accuracy, loss, mean_learning_time_by_epoch_sec = run_exp(
                                    model_name,
                                    epochs,
                                    batch,
                                    dataset_name,
                                    slaves_num,
                                    unfreeze,
                                    gradient_compression,
                                )
                                row = Row(
                                    model_name,
                                    epochs,
                                    batch,
                                    dataset_name,
                                    slaves_num,
                                    transfer_quantity_GB,
                                    transfer_summary_time_in_memory_sec,
                                    accuracy,
                                    loss,
                                    unfreeze,
                                    mean_learning_time_by_epoch_sec,
                                    gradient_compression,
                                    meta=f"attempt {attempt + 1}, baseline without compression, {slaves_num} slave",
                                )
                                exp_res.append(row.__dict__)
df_res = pd.DataFrame(exp_res)
df_res

df_res.to_csv("automatic_example44.csv", index=False)

iterating through attempts:   0%|          | 0/1 [00:00<?, ?it/s]Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /tmp/xdg_cache/torch/hub/checkpoints/resnet18-f37072fd.pth

  0%|          | 0.00/44.7M [00:00<?, ?B/s][A
  2%|▏         | 1.00M/44.7M [00:00<00:04, 10.1MB/s][A
 21%|██▏       | 9.50M/44.7M [00:00<00:00, 55.5MB/s][A
 43%|████▎     | 19.0M/44.7M [00:00<00:00, 75.4MB/s][A
 64%|██████▍   | 28.6M/44.7M [00:00<00:00, 85.3MB/s][A
100%|██████████| 44.7M/44.7M [00:00<00:00, 84.2MB/s][A

iterating through epochs:   0%|          | 0/10 [00:00<?, ?it/s][A
iterating through epochs:  10%|█         | 1/10 [05:43<51:33, 343.73s/it][A
iterating through epochs:  20%|██        | 2/10 [12:08<49:01, 367.68s/it][A
iterating through epochs:  30%|███       | 3/10 [17:44<41:12, 353.15s/it][A
iterating through epochs:  40%|████      | 4/10 [23:19<34:36, 346.08s/it][A
iterating through epochs:  50%|█████     | 5/10 [29:08<28:56, 347.27s/it][A
iterating through 

In [5]:
df_res.to_csv("automatic_example4411.csv", index=False)

In [6]:
! ls

=0.26.0
EXP0_Qwen_no_fine_tune.ipynb
EXP1_Qwen_fine_tune.ipynb
EXP2_Qwen_GPTQ_no_fine_tune-Copy1.ipynb
EXP3_Qwen_GPTQ_fine_tune.ipynb
Untitled23(1).ipynb
automatic_example44.csv
automatic_example4411.csv
qwen_instruct_fine_tune_peft
qwen_instruct_no_fine_tune


In [7]:
! pwd

/home/jupyter/work/resources/llm-quantization-kashin/kashin
