In [1]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    logging,
    DataCollatorForLanguageModeling,
    Trainer
)
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer

In [None]:
# Load LLaMA 7B model and tokenizer
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto"
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # 

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [139]:
dataset = load_dataset('csv', data_files='updated_file.csv')

In [140]:
dataset['train'][:5]

{'user': ["I'm considering investing in tech stocks. What do you think?",
  'I have a moderate risk tolerance and a 5-year investment horizon.',
  'What percentage of my portfolio should be in bonds?',
  'Should I consider alternative investments?',
  'I’m familiar with real estate but not commodities.'],
 'system': ['Tech stocks can offer significant growth potential, but they also come with higher volatility. Could you share more about your investment goals and risk tolerance?',
  'Given your moderate risk tolerance and 5-year horizon, a balanced approach might be wise. How about a mix of 50% in stable tech giants and 50% in growth-oriented tech stocks?',
  'For a balanced portfolio, typically 30-40% in bonds is recommended. This can provide stability against the more volatile equities. Would you like a breakdown of specific bond types?',
  'Yes, alternative investments like real estate or commodities can help diversify your portfolio. What’s your familiarity with these asset classes

In [159]:
train_test_split = dataset['train'].train_test_split(test_size=0.1)

train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

In [160]:
# Define a preprocessing function
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # 
def preprocess_function(examples):
    # Combine 'system' and 'user' into a single input string following the Llama 3.1 format
    inputs = [
        f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{sys}\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n{usr}\n<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n" 
        for sys, usr in zip(examples['system'], examples['user'])
    ]
    # Tokenize the inputs
    model_inputs = tokenizer(inputs, padding="max_length", truncation=True)
    return model_inputs

In [161]:
tokenized_dataset_train = train_dataset.map(tokenize_function, batched=True)
tokenized_dataset_train

Map:   0%|          | 0/753 [00:00<?, ? examples/s]

Dataset({
    features: ['user', 'system', 'input_ids', 'attention_mask'],
    num_rows: 753
})

In [162]:
tokenized_dataset_test = test_dataset.map(tokenize_function, batched=True)
tokenized_dataset_test

Map:   0%|          | 0/84 [00:00<?, ? examples/s]

Dataset({
    features: ['user', 'system', 'input_ids', 'attention_mask'],
    num_rows: 84
})

In [147]:
tokenized_dataset_test = tokenized_dataset_test.rename_column("input_ids", "labels")

In [148]:
tokenized_dataset_train = tokenized_dataset_train.rename_column("input_ids", "labels")

In [163]:
# setting pad token
tokenizer.pad_token = tokenizer.eos_token
# data collator
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [164]:
# Define LoRA configuration
lora_config = LoraConfig(
    r=8,  # Rank
    lora_alpha=16,  # Scaling factor
    target_modules=["q_proj", "v_proj"],  # Target mdules in the transformer layers
    lora_dropout=0.1,
)

In [165]:
# Apply LoRA configuration to the model
model = get_peft_model(model, lora_config)
# trainable parameter count
model.print_trainable_parameters()

trainable params: 3,407,872 || all params: 8,033,669,120 || trainable%: 0.0424


In [166]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",  # Changed to "steps" for better monitoring
    eval_steps=150,  # Evaluate every 150 steps
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    warmup_steps=500,
    logging_dir="./logs",
    logging_steps=10,
    fp16=True,  # Mixed precision training
    push_to_hub=False,  # Set to True if you want to push to Hugging Face Hub
    optim="adamw_8bit",  # 8-bit Adam optimizer for memory efficiency
    lr_scheduler_type="cosine",  # Cosine learning rate scheduler
    save_strategy="steps",
    save_steps=150,
    save_total_limit=5,
)

In [167]:
# Initialize SFTTrainer
sft_trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_test,
    packing=False,  # If you have multi-turn conversations, set packing=True
    args=training_args,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


NotImplementedError: Cannot copy out of meta tensor; no data! Please use torch.nn.Module.to_empty() instead of torch.nn.Module.to() when moving module from meta to a different device.

In [168]:
sft_trainer.train()

ValueError: No columns in the dataset match the model's forward method signature. The following columns have been ignored: [input_ids, user, attention_mask, system]. Please check the dataset and model. You may need to set `remove_unused_columns=False` in `TrainingArguments`.

In [78]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    warmup_steps=500,
    logging_dir="./logs",
    logging_steps=10,
    fp16=True,  # Mixed precision training
    push_to_hub=False,  # Set to True if you want to push to Hugging Face Hub
)

In [79]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_test,
    data_collator=data_collator
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [80]:
# train model
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

# renable warnings
model.config.use_cache = True

ValueError: No columns in the dataset match the model's forward method signature. The following columns have been ignored: [input_ids, attention_mask]. Please check the dataset and model. You may need to set `remove_unused_columns=False` in `TrainingArguments`.

In [81]:
model

PeftModel(
  (base_model): LoraModel(
    (model): PeftModel(
      (base_model): LoraModel(
        (model): PeftModel(
          (base_model): LoraModel(
            (model): PeftModel(
              (base_model): LoraModel(
                (model): LlamaForCausalLM(
                  (model): LlamaModel(
                    (embed_tokens): Embedding(128256, 4096)
                    (layers): ModuleList(
                      (0-31): 32 x LlamaDecoderLayer(
                        (self_attn): LlamaSdpaAttention(
                          (q_proj): lora.Linear(
                            (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                            (lora_dropout): ModuleDict(
                              (default): Dropout(p=0.1, inplace=False)
                            )
                            (lora_A): ModuleDict(
                              (default): Linear(in_features=4096, out_features=8, bias=False)
                            )
