In [2]:
# !pip install scipy
# !pip install bitsandbytes==0.41.3 --force-reinstall --no-deps
# !pip install git+https://github.com/huggingface/peft  --force-reinstall --no-deps
# !pip install pandas
# !pip install bitsandbytes --force-reinstall --no-deps
# !pip install accelerate

In [2]:
import os
import pandas as pd
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, AdamW, BitsAndBytesConfig
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
import tqdm

In [2]:

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("Edentns/DataVortexS-10.7B-dpo-v1.11")

# Define BitsAndBytesConfig for quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Enable loading model in 4-bit
    bnb_4bit_use_double_quant=True,  # Use double quantization for better accuracy
    bnb_4bit_quant_type="nf4",  # Quantization type
    bnb_4bit_compute_dtype=torch.float16  # Compute dtype to use during quantization
)

# Load quantized model with LoRA adjustments
model_name = "Edentns/DataVortexS-10.7B-dpo-v1.11"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # Set torch dtype as float16 for the model
    quantization_config=bnb_config  # Apply BitsAndBytesConfig
)

# Prepare model for LoRA training
lora_config = LoraConfig(
    r=4,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

# Continue with your existing training setup...

# Move model to GPU and apply DataParallel
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = torch.nn.DataParallel(model)

# Define your dataset
class ChatDataset(Dataset):
    def __init__(self, tokenizer, file_path='last_df.csv'):
        self.tokenizer = tokenizer
        self.data = pd.read_csv(file_path)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        question = self.data.iloc[idx]['Question']
        answer = self.data.iloc[idx]['Answer']
        bos_token = self.tokenizer.bos_token_id
        eos_token = self.tokenizer.eos_token_id
        encoding = self.tokenizer.encode('<usr>' + question + '<sys>' + answer, add_special_tokens=False)
        return torch.tensor([bos_token] + encoding + [eos_token], dtype=torch.long)

# DataLoader setup
def collate_fn(batch):
    return torch.nn.utils.rnn.pad_sequence(batch, batch_first=True, padding_value=tokenizer.pad_token_id)

# Assuming tokenizer, model, and device setup is done prior to this snippet
dataset = ChatDataset(tokenizer)
data_loader = DataLoader(dataset, batch_size=1, shuffle=True, collate_fn=collate_fn)

optimizer = AdamW(model.parameters(), lr=3e-5)
EPOCHS = 2
best_loss = float('inf')
PATIENCE = 2
patience_counter = 0
# Setup to save the model
model_save_path = 'models'

for epoch in range(EPOCHS):
    model.train()
    epoch_loss = 0
    progress_bar = tqdm.tqdm(data_loader, desc=f"Epoch {epoch+1}/{EPOCHS}")
    
    for batch in progress_bar:
        batch = batch.to(device)
        labels = batch.clone()
        
        optimizer.zero_grad()
        outputs = model(input_ids=batch, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        
        batch_loss = loss.item()
        epoch_loss += batch_loss
        
        # Optional: Update progress bar description with the current loss
        progress_bar.set_description(f"Epoch {epoch+1}/{EPOCHS} Loss: {batch_loss:.4f}")
    
    epoch_loss /= len(data_loader)
    print(f'Epoch {epoch+1}/{EPOCHS}, Average Loss: {epoch_loss:.4f}')
    
    # Early stopping check
    if epoch_loss < best_loss:
        best_loss = epoch_loss
        patience_counter = 0
        # Save the best model
        torch.save(model.state_dict(), f"{model_save_path}/best_model_epoch_{epoch+1}.pt")
    else:
        patience_counter += 1
    
    if patience_counter >= PATIENCE:
        print("Early stopping triggered. Training stopped.")
        break

# Optionally, save the final model state
torch.save(model.state_dict(), f"{model_save_path}/final_model.pt")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 5/5 [00:03<00:00,  1.55it/s]
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


Using 2 GPUs!


Epoch 1/2:   0%|          | 0/7430 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
Epoch 1/2 Loss: 1.0021: 100%|██████████| 7430/7430 [6:05:44<00:00,  2.95s/it]  


Epoch 1/2, Average Loss: 0.9012


Epoch 2/2 Loss: 0.4775: 100%|██████████| 7430/7430 [6:04:40<00:00,  2.94s/it]  


Epoch 2/2, Average Loss: 0.5662




CausalLMOutputWithPast(loss=<generator object gather.<locals>.gather_map.<locals>.<genexpr> at 0x7fbb3144dbd0>, logits=tensor([[[ -3.3977,  -1.2176,   4.8569,  ..., -13.0163, -10.7256,  -9.8619],
         [-12.2143,  -6.5105,   2.1810,  ...,  -8.4827, -10.9635,  -9.5268],
         [-10.0008,  -5.6851,   1.2743,  ..., -10.9109,  -9.5614, -10.2854],
         ...,
         [ -8.9077,   0.4250,   5.6404,  ..., -11.9620,  -9.9592, -11.4269],
         [-10.9127,  -2.0902,   4.9918,  ...,  -9.6915, -11.2368, -11.6481],
         [ -5.8373,   2.7522,   8.5870,  ...,  -5.4468,  -5.1905,  -9.5737]]],
       device='cuda:0', grad_fn=<GatherBackward>), past_key_values=None, hidden_states=None, attentions=None)