In [1]:
import os
import torch
import zipfile
import json
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model
from accelerate import init_empty_weights
from datasets import Dataset
 
# Set environment variable to manage memory fragmentation
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 
# Specify directories and the path to the zip file
offload_dir = os.path.expanduser("llama_offload/")
 
os.makedirs(offload_dir, exist_ok=True)
 
# Extract only the specified JSON file from the zip archive
target_file = "task024_cosmosqa_answer_generation.json"
 
# Load tokenizer from Hugging Face
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
tokenizer.pad_token = tokenizer.eos_token
 
# Load the model with accelerate's offloading and device map auto-setup
with init_empty_weights():
    model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Meta-Llama-3-8B",
        device_map="auto",
        offload_folder=offload_dir,
        load_in_8bit=True,
        llm_int8_enable_fp32_cpu_offload=True
    )
 
# Configure LoRA with reduced rank
lora_config = LoraConfig(
    r=4,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
 



  from .autonotebook import tqdm as notebook_tqdm
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
  _ = torch.tensor([0], device=i)
Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.46s/it]


In [2]:
target_file = r"task024_cosmosqa_answer_generation.json"
with open(target_file, 'r', encoding='utf-8-sig') as f:
    json_data = json.load(f)

# Extract text data (assuming a structure where the data you want is under 'Instances')

instances = json_data['Instances'][0:2223]
input_texts = [str(instance['input']) for instance in instances]  # Convert to string if not already
output_texts = [str(instance['output'][0]) if instance['output'] else "" for instance in instances]  # Handle missing output
print(input_texts, output_texts)




In [3]:
 
# Convert the texts into a Hugging Face Dataset
ds = Dataset.from_dict({'input': input_texts, 'output': output_texts})

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["input"], examples["output"], truncation=True, padding="max_length", max_length=512)

# Apply tokenization and set format
tokenized_datasets = ds.map(tokenize_function, batched=True, remove_columns=["input", "output"])
tokenized_datasets.set_format("torch")

# Split dataset into train and eval
train_size = int(0.9 * len(tokenized_datasets))
train_dataset = tokenized_datasets.select(range(train_size))
eval_dataset = tokenized_datasets.select(range(train_size, len(tokenized_datasets)))

# Define data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Map: 100%|██████████| 2223/2223 [00:00<00:00, 5135.10 examples/s]


In [4]:
# Define training arguments
save_dir="finetuned-weights"
training_args = TrainingArguments(
    output_dir=save_dir,
    evaluation_strategy="steps",
    eval_steps=500,
    logging_steps=500,
    save_steps=1000,
    save_total_limit=2,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    report_to="none",
    fp16=torch.cuda.is_available(),
)
 
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)
 
# Train the model
trainer.train()
 
# Evaluate the model
with torch.no_grad():
    eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")
 
# Save the model and tokenizer

model.save_pretrained(os.path.join(save_dir, "fine-tuned-llama-lora"))
tokenizer.save_pretrained(os.path.join(save_dir, "fine-tuned-llama-lora"))
 
# Test the model on a sample input
input_text = "Hello, what is the meaning of life?"
inputs = tokenizer(input_text, return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")
 
with torch.no_grad():
    outputs = model.generate(inputs["input_ids"], max_length=50, num_return_sequences=1)
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Generated output: {output_text}")
 
# Clear CUDA cache
torch.cuda.empty_cache()

  trainer = Trainer(
  attn_output = torch.nn.functional.scaled_dot_product_attention(
  0%|          | 3/1500 [01:55<16:03:47, 38.63s/it]

KeyboardInterrupt: 