# Git clone qLeap-fft repo inside `/kaggle/working/` directory
## Ensure to have the latest branch
## Switch to quantumLeap directory

In [1]:
import os
from pathlib import Path

def ensure_working_directory():
    """
    Check if we're in the correct working directory, if not switch to it.
    Creates the directory if it doesn't exist.
    """
    target_dir = '/kaggle/working/quantumLeap'
    current_dir = os.getcwd()
    
    # Print current directory
    print(f"Current directory: {current_dir}")
    
    # Check if we need to switch directories
    if current_dir != target_dir:
        # Create directory if it doesn't exist
        Path(target_dir).mkdir(parents=True, exist_ok=True)
        
        try:
            # Change to target directory
            os.chdir(target_dir)
            print(f"Successfully switched to: {target_dir}")
        except Exception as e:
            print(f"Error switching to directory: {str(e)}")
            raise
    else:
        print("Already in correct directory")
    
    # Verify current directory
    print(f"Working directory: {os.getcwd()}")

# Call the function before your main code
ensure_working_directory()

Current directory: /root/quantumLeap
Successfully switched to: /kaggle/working/quantumLeap
Working directory: /kaggle/working/quantumLeap


In [2]:
# # ----------------------------- #
# # Part 1.1: Install and Setup Libraries
# # ----------------------------- #

# # !python -m xformers.info
# import torch
# print(f"PyTorch version: {torch.__version__}")
# print(f"CUDA available: {torch.cuda.is_available()}")
# print(f"CUDA version: {torch.version.cuda if torch.cuda.is_available() else 'Not available'}")
# %pip install uv
# !uv pip install -q unsloth wandb bitsandbytes ipywidgets nltk spacy huggingface_hub datasets --system
# # Finally, install a compatible xformers version
# !uv pip install -q xformers==0.0.27 --system  # This version is compatible with PyTorch 2.4.x
# !python -m xformers.info# # !python -m xformers.info

## Restart the session and check if unsloth is installed

In [3]:
# import unsloth

## Install Flash Attention and check version

In [4]:
# !uv pip install flash_attn --no-build-isolation -q --system
# !pip show flash_attn

# Restart again so that all the libraries are properly initialized

In [5]:
# ----------------------------- #
# Part 1.2: Import Necessary Libraries
# ----------------------------- #

# General Libraries
import os
import json
import sys
import subprocess
import argparse
import logging
import math
import random
from datetime import datetime
import re
import gc
import weakref
import multiprocessing

# Torch related
import torch
from torch import nn
import torch.distributed as dist

# Transformers related
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    Adafactor
)

# Huggingface TRL for full finetune
from trl import SFTTrainer, SFTConfig

# General huggingface libraries
import huggingface_hub
from datasets import load_dataset, Dataset
from accelerate import Accelerator


# Unsloth specificic libraries
import unsloth
from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments, FastLanguageModel

# Other Libraries
from peft import LoraConfig
import wandb
import nltk
import spacy
# from galore_torch import GaLoreAdamW, GaLoreAdafactor, GaLoreAdamW8bit

# Check and import NLTK and spacy modules
# Ensure NLTK's punkt tokenizer is available
import nltk
try:
    nltk.data.find('tokenizers/punkt')
    print('punkt was already available.')
except LookupError:
    nltk.download('punkt')
    print('punkt was not available. It has been downloaded')

# Initialize spaCy English model
try:
    nlp = spacy.load('en_core_web_sm')
    print('en_core_web_sm was already available.')
except OSError:
    print("SpaCy English model not found. Downloading...")
    os.system('python -m spacy download en_core_web_sm')
    nlp = spacy.load('en_core_web_sm')

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
punkt was already available.
en_core_web_sm was already available.


In [6]:
# ----------------------------- #
# Part 2: Configure Environment Variables & Create Main Variables
# ----------------------------- #

# max_seq_length = 4096 # Choose any! We auto support RoPE Scaling internally!
# dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
# load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
# base_model_slug = "meta-llama/Llama-3.2-1B-Instruct"
# base_model_name = "lora_model_pum"
# chunks_max_length = max_seq_length
# overlap_size = 1
# # Define your parameters
# batchSize = 2
# ga = 8
# maxSteps = 120
# lRate = 5e-5
# embLRate = 1e-5
# optim = "adamw_8bit"
# lrSchedule = "linear"
# dataset_slug = "psychology_of_unconscious"

# # Set environment variables before importing torch-related modules
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

In [7]:
# ----------------------------- #
# Part 2: Load and Clean the Text Data
# ----------------------------- #

def load_and_clean_text(file_path):
    """
    Loads text from a file and removes Project Gutenberg's license and headers/footers.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    # # Remove Project Gutenberg's license text and headers/footers
    # start_pattern = r'\*\*\* START OF THIS PROJECT GUTENBERG EBOOK.*\*\*\*'
    # end_pattern = r'\*\*\* END OF THIS PROJECT GUTENBERG EBOOK.*\*\*\*'

    # text = re.sub(f'.*{start_pattern}', '', text, flags=re.DOTALL)
    # text = re.sub(f'{end_pattern}.*', '', text, flags=re.DOTALL)
    return text.strip()

# Replace 'psychology_of_unconscious.txt' with your actual file path
file_path = '/kaggle/working/quantumLeap/data/input/Step_2_Classic_Texts_and_Ethnographies/2.1_Public_Domain_Books/Project_Gutenberg/psychology_of_unconscious.txt'
clean_text = load_and_clean_text(file_path)

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/working/quantumLeap/data/input/Step_2_Classic_Texts_and_Ethnographies/2.1_Public_Domain_Books/Project_Gutenberg/psychology_of_unconscious.txt'

In [8]:
# # ----------------------------- #
# # Part 3: Parse Text into Discourse Units
# # ----------------------------- #

def parse_discourse_units(text, overwrite=False):
    """
    Parses text into discourse units using spaCy.
    Currently splits text into sentences.
    """
    paragraphs = text.split('\n\n')
    paragraphs = [para.strip() for para in paragraphs if para.strip()]

    discourse_units = []
    for para in paragraphs:
        doc = nlp(para)
        sentences = [sent.text for sent in doc.sents]
        discourse_units.extend(sentences)

    output_path = '/kaggle/working/quantumLeap/data/preprocess/Step_2_Classic_Texts_and_Ethnographies/2.1_Public_Domain_Books/Project_Gutenberg/psychology_of_unconscious_discourse_units.json'
    if not os.path.exists(output_path) or overwrite:
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        with open(output_path, 'w') as f:
            json.dump(discourse_units, f, ensure_ascii=False, indent=4)
    else:
        with open(output_path, 'r') as f:
            discourse_units = json.load(f)

    print(f"Total Discourse Units: {len(discourse_units)}")
    return discourse_units

In [9]:
# ----------------------------- #
# Part 4: Create Chunks Using Hybrid Strategy
# ----------------------------- #

def create_chunks(discourse_units, tokenizer, max_length=4096, overlap_size=1, overwrite=False):
    """
    Creates chunks from discourse units using a sliding window with overlapping chunks.
    Optimized to work directly with token IDs and utilize efficient list operations.
    """
    chunks = []
    current_chunk_tokens = []
    current_length = 0

    for unit in discourse_units:
        unit_tokens = tokenizer.encode(unit, add_special_tokens=True)
        unit_length = len(unit_tokens)

        if current_length + unit_length <= max_length:
            current_chunk_tokens.extend(unit_tokens)
            current_length += unit_length
        else:
            # Decode and append the current chunk
            chunk_text = tokenizer.decode(
                current_chunk_tokens, skip_special_tokens=True)
            chunks.append(chunk_text)

            # Prepare overlap tokens
            overlap_tokens = current_chunk_tokens[-overlap_size:]
            current_chunk_tokens = overlap_tokens + unit_tokens
            current_length = len(current_chunk_tokens)

    # Append any remaining tokens as the last chunk
    if current_chunk_tokens:
        chunk_text = tokenizer.decode(
            current_chunk_tokens, skip_special_tokens=True)
        chunks.append(chunk_text)

    # Write or read chunks as before
    output_path = '/kaggle/working/quantumLeap/data/preprocess/Step_2_Classic_Texts_and_Ethnographies/2.1_Public_Domain_Books/Project_Gutenberg/psychology_of_unconscious_chunks.json'
    if not os.path.exists(output_path) or overwrite:
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        with open(output_path, 'w') as f:
            json.dump(discourse_units, f, ensure_ascii=False, indent=4)
    else:
        with open(output_path, 'r') as f:
            discourse_units = json.load(f)

    print(f"Total Chunks Created: {len(chunks)}")
    return chunks

In [10]:
# ----------------------------- #
# Part 5: Create and Tokenize Dataset
# ----------------------------- #

# To Do - make book titles and prompt generic so
def create_tokenized_dataset(chunks, tokenizer, max_length=1024, num_proc=2):

    # Create a Dataset object from chunks

    book_title = 'Psychology of the Unconscious by C. G. Jung'
    wikipedia_prompt = """
    Psychology Book

    ### Title: {}

    ### Article: {}
    """

    EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN

    def formatting_prompts_func(examples):
        titles = book_title
        texts = examples["text"]
        outputs = []
        for title, text in zip([book_title]*len(chunks), texts):
            text = wikipedia_prompt.format(title, text) + EOS_TOKEN
            outputs.append(text)
        return {"text": outputs, }
    pass

    # convert chunks variable to huggingface dataset

    from datasets import Dataset

    dataset = Dataset.from_dict({"text": chunks})

    dataset = dataset.map(formatting_prompts_func,
                          batched=True, num_proc=num_proc)
    # Split the dataset into training and validation sets
    split = dataset.train_test_split(test_size=0.1, seed=42)
    train_dataset = split['train']
    eval_dataset = split['test']

    print(len(dataset))
    # Find the maximum length of the text field in the entire dataset
    max_length = max(len(text) for text in dataset['text'])
    print(f"The maximum length of the text field in the dataset is: {max_length} characters")
    print(f"Training Dataset Size: {len(train_dataset)}")
#     print(f"First 5 rows of training dataset: {train_dataset[:5]}")
    print(f"Validation Dataset Size: {len(eval_dataset)}")
#     print(f"First 5 rows of validation dataset: {eval_dataset[:5]}")
    return train_dataset, eval_dataset

In [11]:
# ----------------------------- #
# Part 6: Set up environment and other important utilities
# ----------------------------- #

def setup_environment():
    """
    Initializes the Accelerator for distributed training.
    """
    return Accelerator()


def get_custom_lr_scheduler(optimizer, num_warmup_steps, num_training_steps, initial_phase_steps):
    """
    Defines a custom learning rate scheduler with warmup, constant, and linear annealing phases.
    """
    def lr_lambda(current_step):
        if current_step < num_warmup_steps:
            return current_step / num_warmup_steps  # Linear warmup
        elif current_step < initial_phase_steps:
            return 1.0  # Constant learning rate for initial phase
        else:
            # Linear annealing for the remaining steps
            return 1.0 - ((current_step - initial_phase_steps) / (num_training_steps - initial_phase_steps))

    return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)


def setup_training_steps(initial_rows, annealing_rows, batch_size, gradient_accumulation_steps, num_epochs):
    """
    Calculates total and initial training steps based on dataset size and training parameters.
    """
    total_rows = initial_rows + annealing_rows
    total_steps = (total_rows * num_epochs) // (batch_size *
                                                gradient_accumulation_steps)
    initial_steps = (initial_rows * num_epochs) // (batch_size *
                                                    gradient_accumulation_steps)
    return max(1, total_steps), max(1, initial_steps)


def print_memory_usage(step_desc):
    """
    Prints the CUDA memory summary if CUDA is available.
    """
    if torch.cuda.is_available():
        print(f"Memory Usage at {step_desc}:")
        print(torch.cuda.memory_summary())
        print("\n")
    else:
        print(f"No CUDA available at {step_desc}.\n")


def inference(model, tokenizer):
    """
    Runs inference using the trained model.
    """
    # Define sample prompts
    prompts = [
        "Explain what is medical anthropology and its importance in elevating the quality of life?",
        "Explain what are the types of Jungian archetypes and how they manifest in the human psyche?"
    ]

    for prompt in prompts:
        inputs = tokenizer(prompt, return_tensors='pt').to('cuda')
        with torch.no_grad():
            outputs = model.generate(**inputs, max_length=256)
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(f"Prompt: {prompt}\nGenerated Text: {generated_text}\n")
        
#  Login to Huggingface
from huggingface_hub import login
import os
from dotenv import load_dotenv

load_dotenv()

def setup_huggingface_access():
    """Setup Hugging Face access with token from environment or manual input"""
    # First try to get token from environment variable
    token = os.getenv('HUGGINGFACE_TOKEN')
    
    if not token:
        # If not in environment, prompt for token
        token = input("Enter your Hugging Face token: ")
        
    if token:
        try:
            login(token, add_to_git_credential=True)
            print("Successfully logged in to Hugging Face!")
        except Exception as e:
            print(f"Failed to log in to Hugging Face: {str(e)}")
            return False
    else:
        print("No Hugging Face token provided")
        return False
    
    return True

In [12]:
# ----------------------------- #
# Part 5: Load the Tokenizer and Model
# ----------------------------- #

def load_model_and_tokenizer(base_model_slug, max_seq_length=4096, dtype=None, load_in_4bit=True, device_map = "auto"):
    """
    Load and configure the model and tokenizer with specified parameters.
    
    Args:
        base_model_slug (str): The model identifier from HuggingFace
        max_seq_length (int): Maximum sequence length for the model
        dtype: Data type for model parameters
        load_in_4bit (bool): Whether to use 4-bit quantization
        
    Returns:
        tuple: (model, tokenizer)
    """
    # Check CUDA is available
    import torch
    if not torch.cuda.is_available():
        print("WARNING: CUDA is not available. This might affect performance.")
    else:
        print("CUDA available")
        
    # Check available GPUs
    num_gpus = torch.cuda.device_count()
    print(f"Number of available GPUs: {num_gpus}")
    for i in range(num_gpus):
        print(f"GPU {i}: {torch.cuda.get_device_properties(i).name}")

    # Determine optimal device map based on available GPUs
    device_map = "cuda:0" if torch.cuda.is_available() else "cpu"
    print(f"Using single device: {device_map}")

    # Model paths
    model_name = base_model_slug
    models_dir = os.path.join(os.path.dirname("~/"), "models")
    model_path = os.path.join(models_dir, model_name)

    # Create models directory if it doesn't exist
    if not os.path.exists(models_dir):
        os.makedirs(models_dir)

    # Load or download model
    try:
        if os.path.exists(model_path):
            print(f"Loading model from local path: {model_path}")
            model, tokenizer = FastLanguageModel.from_pretrained(
                model_name=base_model_slug,
                max_seq_length=max_seq_length,
                dtype=dtype,
                load_in_4bit=load_in_4bit,
                token=os.getenv('HUGGINGFACE_TOKEN'),
            )
        else:
            print(f"Downloading model from HuggingFace: {model_name}")
            model, tokenizer = FastLanguageModel.from_pretrained(
                model_name=base_model_slug,
                max_seq_length=max_seq_length,
                dtype=dtype,
                load_in_4bit=load_in_4bit,
                token=os.getenv('HUGGINGFACE_TOKEN'),
            )
    except Exception as e:
        print(f"Error loading model: {str(e)}")
        raise

    # Configure PEFT model
    model = FastLanguageModel.get_peft_model(
        model,
        r=128,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
            "embed_tokens", "lm_head",  # Add for continual pretraining
        ],
        lora_alpha=32,
        lora_dropout=0,  # Supports any, but = 0 is optimized
        bias="none",    # Supports any, but = "none" is optimized
        use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
        random_state=3407,
        use_rslora=True,   # We support rank stabilized LoRA
        loftq_config=None,  # And LoftQ
    )

    print("Model and tokenizer loaded successfully!")
    return model, tokenizer

    # Print model device distribution
    if hasattr(model, 'hf_device_map'):
        print("\nModel Device Distribution:")
        for name, device in model.hf_device_map.items():
            print(f"{name}: {device}")

    # Print memory usage per GPU
    print("\nGPU Memory Usage after model loading:")
    for i in range(num_gpus):
        memory_used = torch.cuda.memory_allocated(i) / (1024**3)
        memory_total = torch.cuda.get_device_properties(i).total_memory / (1024**3)
        print(f"GPU {i}: {memory_used:.2f}GB / {memory_total:.2f}GB")

In [13]:
import os

# Set the environment variable
os.environ["HUGGINGFACE_TOKEN"] = "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"
os.environ["WANDB_API_KEY"] = "1ca3c5e9222c2504acbc07cf7f88267006ae68c4"
# Verify it's set correctly
print(os.getenv("HUGGINGFACE_TOKEN"))
print(os.getenv("WANDB_API_KEY"))

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
1ca3c5e9222c2504acbc07cf7f88267006ae68c4


In [14]:
# ----------------------------- #
# Part 2: Configure Environment Variables & Create Main Variables
# ----------------------------- #

# Unsloth modell initialization variables
max_seq_length = 4096 # Choose any! We auto support RoPE Scaling internally!
max_length = max_seq_length
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
base_model_slug = "meta-llama/Llama-3.1-8B-Instruct"
base_model_name = "lora_model_pum"
chunks_max_length = max_seq_length
overlap_size = 1
# Define your parameters
batchSize = 2
ga = 16
maxSteps = 120
warmupSteps = 10
numTrainEpochs = 1
lRate = 5e-5
embLRate = 1e-5
optim = "adamw_8bit"
lrSchedule = "linear"
dataset_slug = "psychology_of_unconscious"

from datetime import datetime
import pytz
import wandb
# Get the current date and time in Indian Standard Time (IST)
ist = pytz.timezone('Asia/Kolkata')
current_datetime = datetime.now(ist)

# Format the datetime string
# Example format: 20240428_153045 (YYYYMMDD_HHMMSS)
formatted_datetime = current_datetime.strftime("%Y%m%d_%H%M%S")

# Define Run Name
run_name = f"""Kaggle-quantumLeap-{formatted_datetime}-{base_model_slug}-{dataset_slug}-{max_seq_length}_maxSeqLength-{max_length}_maxLength-{batchSize}_batchSize-{ga}_ga-{maxSteps}_maxSteps-{warmupSteps}_warmupSteps-{numTrainEpochs}_numTrainEpochs-{lRate}_lRate-{embLRate}_embLRate-{optim}_optim-{lrSchedule}_lrSchedule"""

# Initialize Weights & Biases
# It's recommended to set your W&B API key as an environment variable for security.
wandb.login(key=os.getenv("WANDB_API_KEY"))
wandb.init(project="Trellis-FFT-v2", name=run_name)

# Set environment variables before importing torch-related modules
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33molabs-asia[0m ([33molabs-asia-olabs-pro[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [15]:

# ----------------------------- #
# Part 9: Data Processing
# ----------------------------- #

# # Perform Inference Before Training
# inference(model, tokenizer)

# Set number of processes to use for data loading
num_cpus = multiprocessing.cpu_count()
num_proc = max(num_cpus-2, 2)  # Adjust based on prior recommendations
print(f"Number of CPU cores: {num_cpus}")
print(f"Number of processes: {num_proc}")

# Login to Hugging Face
if not setup_huggingface_access():
    raise Exception("Failed to setup Hugging Face access. Please check your token.")

# Load Model and Tokenizer
model, tokenizer = load_model_and_tokenizer(base_model_slug)
print(f"Model Device: {model.device}")

# Load and Clean Text Data
file_path = "/kaggle/working/quantumLeap/data/input/Step_2_Classic_Texts_and_Ethnographies/2.1_Public_Domain_Books/Project_Gutenberg/psychology_of_unconscious.txt"
clean_text = load_and_clean_text(file_path)

# Parse Discourse Units
discourse_units = parse_discourse_units(clean_text, overwrite=True)

# Create Chunks
chunks = create_chunks(
    discourse_units,
    tokenizer,
    max_length=max_length,
    overlap_size=overlap_size,
    overwrite=True,
)

# Create Tokenized Dataset
train_dataset, eval_dataset = create_tokenized_dataset(
    chunks, tokenizer, max_length)

# Save datasets as Hugging Face `datasets`
train_dataset.save_to_disk('/kaggle/working/quantumLeap/data/preprocess/Step_2_Classic_Texts_and_Ethnographies/2.1_Public_Domain_Books/Project_Gutenberg/train_dataset')
eval_dataset.save_to_disk('/kaggle/working/quantumLeap/data/preprocess/Step_2_Classic_Texts_and_Ethnographies/2.1_Public_Domain_Books/Project_Gutenberg/eval_dataset')

### To Do - Make the below as dynamic and as a functio
# # Uncomment following if you want to just load the data from temp directory
# from datasets import load_from_disk

# train_dataset = load_from_disk('/kaggle/working/quantumLeap/data/preprocess/Step_2_Classic_Texts_and_Ethnographies/2.1_Public_Domain_Books/Project_Gutenberg/train_dataset')
# eval_dataset = load_from_disk('/kaggle/working/quantumLeap/data/preprocess/Step_2_Classic_Texts_and_Ethnographies/2.1_Public_Domain_Books/Project_Gutenberg/eval_dataset')

Number of CPU cores: 256
Number of processes: 254
Successfully logged in to Hugging Face!
CUDA available
Number of available GPUs: 1
GPU 0: NVIDIA A100-SXM4-40GB
Using single device: cuda:0
Downloading model from HuggingFace: meta-llama/Llama-3.1-8B-Instruct
==((====))==  Unsloth 2024.10.7: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.381 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 8.0. CUDA Toolkit = 12.4.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

meta-llama/Llama-3.1-8B-Instruct does not have a padding token! Will use pad_token = <|finetune_right_pad_id|>.
Unsloth: We fixed a gradient accumulation bug, but it seems like you don't have the latest transformers version!
Please update transformers, TRL and unsloth via:
`pip install --upgrade --no-cache-dir unsloth git+https://github.com/huggingface/transformers.git git+https://github.com/huggingface/trl.git`


Unsloth: Offloading input_embeddings to disk to save VRAM


  offloaded_W = torch.load(filename, map_location = "cpu", mmap = True)


Unsloth: Offloading output_embeddings to disk to save VRAM


Unsloth 2024.10.7 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Unsloth: Casting embed_tokens to float32
Unsloth: Casting lm_head to float32
Model and tokenizer loaded successfully!
Model Device: cpu


FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/working/quantumLeap/data/input/Step_2_Classic_Texts_and_Ethnographies/2.1_Public_Domain_Books/Project_Gutenberg/psychology_of_unconscious.txt'

In [None]:
# ----------------------------- #
# Part 8: Configure Training Arguments
# ----------------------------- #

trainer = UnslothTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,

    args = UnslothTrainingArguments(
        per_device_train_batch_size = batchSize,
        gradient_accumulation_steps = ga,

        # Use warmup_ratio and num_train_epochs for longer runs!
        max_steps = maxSteps,
        warmup_steps = warmupSteps,
        # warmup_ratio = 0.1,
        num_train_epochs = numTrainEpochs,

        # Select a 2 to 10x smaller learning rate for the embedding matrices!
        learning_rate = lRate,
        embedding_learning_rate = embLRate,

        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = optim,
        weight_decay = 0.01,
        lr_scheduler_type = lrSchedule,
        seed = 3407,
        output_dir = "outputs",

        report_to=["tensorboard", "wandb"],
        logging_dir=f"./trel-fft-logs/{run_name}",
    ),
)

In [None]:
# ----------------------------- #
# Part 11: Start Training
# ----------------------------- #

#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

trainer_stats = trainer.train()

#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")


In [None]:
import time

# add current timestamp to model name
model.save_pretrained(f"qLeap_model_base_v0_{int(time.time())}") # Local saving
tokenizer.save_pretrained(f"qLeap_model_base_v0_{int(time.time())}")
model.push_to_hub(f"olabs-ai/qLeap_model_base_v0_base_pum_{int(time.time())}", token = "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") # Online saving
tokenizer.push_to_hub(f"olabs-ai/qLeap_model_base_v0_pum_{int(time.time())}", token = "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") # Online saving

# # Merge to 16bit
# if False: model.save_pretrained_merged("qLeap_model_v0_16bit_merged_{int(time.time())}", tokenizer, save_method = "merged_16bit",)
# if False: model.push_to_hub_merged("olabs-ai/qLeap_model_v0_16bit_merged_{int(time.time())}", tokenizer, save_method = "merged_16bit", token = "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")

# Merge to 4bit
if True: model.save_pretrained_merged("olabs-ai/qLeap_model_base_v0_pum_4bit_merged_{int(time.time())}", tokenizer, save_method = "merged_4bit",)
if True: model.push_to_hub_merged("olabs-ai/qLeap_model_base_v0_pum_4bit_merged_{int(time.time())}", tokenizer, save_method = "merged_4bit", token = "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")

# # Just LoRA adapters
# if False: model.save_pretrained_merged("qLeap_model_v0_LoRA_merged_{int(time.time())}", tokenizer, save_method = "lora",)
# if False: model.push_to_hub_merged("olabs-ai/qLeap_model_LoRA_merged_{int(time.time())}", tokenizer, save_method = "lora", token = "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")
    
    
# # Save to 8bit Q8_0
if True: model.save_pretrained_gguf("olabs-ai/qLeap_model_base_v0_pum_8bit_Q8_{int(time.time())}", tokenizer,)
if True: model.push_to_hub_gguf("olabs-ai/qLeap_model_base_v0_pum_8bit_Q8_{int(time.time())}", tokenizer, token = "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")

# # Save to 16bit GGUF
# if False: model.save_pretrained_gguf("qLeap_model_v0_16bit_GGUF_{int(time.time())}", tokenizer, quantization_method = "f16")
# if False: model.push_to_hub_gguf("olabs-ai/qLeap_model_v0_16bit_GGUF_{int(time.time())}", tokenizer, quantization_method = "f16", token = "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")

# # Save to q4_k_m GGUF
# if False: model.save_pretrained_gguf("qLeap_model_v0_q4_k_m_16bit_{int(time.time())}", tokenizer, quantization_method = "q4_k_m")
# if False: model.push_to_hub_gguf("olabs-ai/qLeap_model_v0_q4_k_m_16bit_{int(time.time())}", tokenizer, quantization_method = "q4_k_m", token = "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")
# if False: model.push_to_hub_gguf("olabs-ai/qLeap_model_v0_q5_k_m_16bit_{int(time.time())}", tokenizer, quantization_method = "q5_k_m", token = "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")

In [None]:
break

In [None]:
# import wandb
# from pprint import pprint

# def get_run_config(project_name, run_id):
#     try:
#         # Initialize the wandb API
#         api = wandb.Api()

#         # Access the specific run
#         run = api.run(f"{project_name}/{run_id}")

#         # Get the full configuration
#         config = run.config

#         # Filter for trainer-specific configuration
#         trainer_config = {k: v for k, v in config.items() if k.startswith(('train', 'learning', 'optim', 'fp16', 'bf16', 'gradient', 'weight_decay', 'warmup', 'max_steps', 'num_train_epochs', 'per_device'))}

#         return trainer_config

#     except wandb.errors.CommError:
#         print(f"Error: Unable to access run {run_id}. Make sure the run ID is correct and you have the necessary permissions.")
#         return None
#     except Exception as e:
#         print(f"An error occurred: {str(e)}")
#         return None

# # Usage
# project_name = "olabs-asia-olabs-pro/huggingface"
# run_id = "ppqtwwmy"

# trainer_config = get_run_config(project_name, run_id)

# if trainer_config:
#     print(f"Trainer configuration for run {run_id}:")
#     pprint(trainer_config)

# The Loss from earlier training was too high. We shall use training arguments from unsloth colab notebook "Llama-3.1 8b + Unsloth 2x faster finetuning.ipynb". URL below
### https://colab.research.google.com/drive/1Ys44kVvmeZtnICzWz0xgpRnrIOjZAuxp?usp=sharing#scrollTo=95_Nn-89DhsL

In [None]:
print(dataset[0])

In [None]:
from transformers import IntervalStrategy
import wandb

# Initialize wandb
wandb.init(project="huggingface", name="refined_training_run")

trainer = UnslothTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    eval_dataset = dataset.select(range(len(dataset) // 10)),  # Use 10% of data for evaluation
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,

    args = UnslothTrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 8,

        # Set both max_steps and num_train_epochs
        max_steps = 120,
        num_train_epochs = 3,

        # Use a single learning rate for all parameters
        learning_rate = 5e-5,

        # Warmup strategy from successful runs
        warmup_steps = 10,
        warmup_ratio = 0,

        # Explicitly set precision based on hardware support
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        
        logging_steps = 1,
        
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        
        seed = 3407,
        output_dir = "outputs",
        
        report_to = "wandb",  # Enable Weights & Biases logging
        
        # Set both save and evaluation strategies to 'steps'
        save_strategy = IntervalStrategy.STEPS,
        eval_strategy = IntervalStrategy.STEPS,
        save_steps = 1,  # Save checkpoint every 20 steps
        eval_steps = 1,  # Evaluate every 20 steps (matching save_steps)
        
        load_best_model_at_end = True,
        metric_for_best_model = "eval_loss",
    ),
    compute_metrics = compute_metrics,
)

# ... (rest of the code remains the same)

# Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

# Start training
trainer_stats = trainer.train()

# Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

# Generation code (unchanged)
instruction_prompt = """Below is an instruction that describes a concept in the field of psychology, sociology, anthropology, ethnography, or qualitative research or cultural studies. Write a response that appropriately completes the request.

### Instruction: Given the concept and its detailed explanation, provide an example scenario that illustrates the concept.
concept_name: {}
detailed_explanation: {}

### Response:
{}"""

FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    instruction_prompt.format(
        "Hero Archetype", # concept_name
        "The hero archetype is a common motif in literature and folklore, representing a protagonist who embodies bravery, resilience, and a quest for a greater purpose.", # detailed_explanation
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import StoppingCriteria, StoppingCriteriaList

class EndOfTextCriteria(StoppingCriteria):
    def __init__(self, eos_token_id):
        self.eos_token_id = eos_token_id

    def __call__(self, input_ids, scores, **kwargs):
        return input_ids[0][-1] == self.eos_token_id

stopping_criteria = StoppingCriteriaList([EndOfTextCriteria(tokenizer.eos_token_id)])

outputs = model.generate(**inputs, 
                         max_new_tokens=64, 
                         stopping_criteria=stopping_criteria,
                         use_cache=True)

print(tokenizer.batch_decode(outputs))

In [None]:
# # delete previous trainer
# del trainer

#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")


from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

trainer_stats = trainer.train()
#@title Show final memory and time stats

used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")



instruction_prompt = """Below is an instruction that describes a concept in the field of psychology, sociology, anthropology, ethnography, or qualitative research or cultural studies. Write a response that appropriately completes the request.

### Instruction: Given the concept and its detailed explanation, provide an example scenario that illustrates the concept.
concept_name: {}
detailed_explanation: {}

### Response:
{}"""

FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    instruction_prompt.format(
        "Hero Archetype", # concept_name
        "The hero archetype is a common motif in literature and folklore, representing a protagonist who embodies bravery, resilience, and a quest for a greater purpose.", # detailed_explanation
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

# Text Streaming

# from transformers import TextStreamer
# text_streamer = TextStreamer(tokenizer)
# _ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 4096)

# inputs = tokenizer(
# [
#     instruction_prompt.format(
#         "Hero Archetype", # concept_name
#         "The hero archetype is a common motif in literature and folklore, representing a protagonist who embodies bravery, resilience, and a quest for a greater purpose.", # detailed_explanation
#         "", # output - leave this blank for generation!
#     )
# ], return_tensors = "pt").to("cuda")



# from transformers import TextStreamer
# text_streamer = TextStreamer(tokenizer)
# _ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 256,
#                    repetition_penalty = 0.1)

In [None]:
import time

# add current timestamp to model name
model.save_pretrained(f"qLeap_model_base_v0_{int(time.time())}") # Local saving
tokenizer.save_pretrained(f"qLeap_model_base_v0_{int(time.time())}")
model.push_to_hub(f"olabs-ai/qLeap_model_base_v0_base_{int(time.time())}", token = "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") # Online saving
tokenizer.push_to_hub(f"olabs-ai/qLeap_model_base_v0_{int(time.time())}", token = "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") # Online saving

# # Merge to 16bit
# if False: model.save_pretrained_merged("qLeap_model_v0_16bit_merged_{int(time.time())}", tokenizer, save_method = "merged_16bit",)
# if False: model.push_to_hub_merged("olabs-ai/qLeap_model_v0_16bit_merged_{int(time.time())}", tokenizer, save_method = "merged_16bit", token = "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")

# # Merge to 4bit
# if False: model.save_pretrained_merged("qLeap_model_v0_4bit_merged_{int(time.time())}", tokenizer, save_method = "merged_4bit",)
# if False: model.push_to_hub_merged("olabs-ai/qLeap_model_v0_4bit_merged_{int(time.time())}", tokenizer, save_method = "merged_4bit", token = "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")

# # Just LoRA adapters
# if False: model.save_pretrained_merged("qLeap_model_v0_LoRA_merged_{int(time.time())}", tokenizer, save_method = "lora",)
# if False: model.push_to_hub_merged("olabs-ai/qLeap_model_LoRA_merged_{int(time.time())}", tokenizer, save_method = "lora", token = "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")
    
    
# # Save to 8bit Q8_0
# if False: model.save_pretrained_gguf("qLeap_model_v0_8bit_Q8_{int(time.time())}", tokenizer,)
# if False: model.push_to_hub_gguf("olabs-ai/qLeap_model_v0_8bit_Q8_{int(time.time())}", tokenizer, token = "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")

# # Save to 16bit GGUF
# if False: model.save_pretrained_gguf("qLeap_model_v0_16bit_GGUF_{int(time.time())}", tokenizer, quantization_method = "f16")
# if False: model.push_to_hub_gguf("olabs-ai/qLeap_model_v0_16bit_GGUF_{int(time.time())}", tokenizer, quantization_method = "f16", token = "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")

# # Save to q4_k_m GGUF
# if False: model.save_pretrained_gguf("qLeap_model_v0_q4_k_m_16bit_{int(time.time())}", tokenizer, quantization_method = "q4_k_m")
# if False: model.push_to_hub_gguf("olabs-ai/qLeap_model_v0_q4_k_m_16bit_{int(time.time())}", tokenizer, quantization_method = "q4_k_m", token = "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")
# if False: model.push_to_hub_gguf("olabs-ai/qLeap_model_v0_q5_k_m_16bit_{int(time.time())}", tokenizer, quantization_method = "q5_k_m", token = "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")

In [None]:

instruction_prompt = """Below is an instruction that describes a concept in the field of psychology, sociology, anthropology, ethnography, or qualitative research or cultural studies. Write a response that appropriately completes the request.

### Instruction: Given the concept and its detailed explanation, provide an example scenario that illustrates the concept.
concept_name: {}
detailed_explanation: {}

### Response:
{}"""

FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    instruction_prompt.format(
        "Hero Archetype", # concept_name
        "The hero archetype is a common motif in literature and folklore, representing a protagonist who embodies bravery, resilience, and a quest for a greater purpose.", # detailed_explanation
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

# Text Streaming

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 4096)

# inputs = tokenizer(
# [
#     instruction_prompt.format(
#         "Hero Archetype", # concept_name
#         "The hero archetype is a common motif in literature and folklore, representing a protagonist who embodies bravery, resilience, and a quest for a greater purpose.", # detailed_explanation
#         "", # output - leave this blank for generation!
#     )
# ], return_tensors = "pt").to("cuda")



# from transformers import TextStreamer
# text_streamer = TextStreamer(tokenizer)
# _ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 256,
#                    repetition_penalty = 0.1)

# Instruction  Tuning

In [None]:

# Instruction FineTune - Create an instruction_pompt based on the concept_examples.csv file

import json
import ast
import logging

import csv

with open('/root/quantumLeap/data/psychologoy-of-unconscious-mind/concept_examples.csv', 'r') as f:
    reader = csv.DictReader(f)
    data = list(reader)
    
type(data)


# Configure logging
logging.basicConfig(
    filename='transformation_errors.log',
    filemode='w',
    level=logging.ERROR,
    format='%(levelname)s:%(message)s'
)

# Sample original data
original_data = data

def transform_data(original_data):
    """
    Transforms the original data by expanding 'example_scenario' into separate dictionaries.

    Parameters:
        original_data (list): List of dictionaries with 'concept_name', 'detailed_explanation', and 'example_scenario'.

    Returns:
        new_data (list): Transformed list with one 'example_scenario' per dictionary.
    """
    new_data = []

    for idx, entry in enumerate(original_data, start=1):
        concept_name = entry.get('concept_name', '').strip()
        detailed_explanation = entry.get('detailed_explanation', '').strip()
        example_scenario_str = entry.get('example_scenario', '').strip()

        if not concept_name or not detailed_explanation or not example_scenario_str:
            logging.error(f"Entry {idx} is missing required fields. Skipping.")
            continue

        # Attempt to parse with json.loads
        try:
            example_scenarios = json.loads(example_scenario_str)
            if not isinstance(example_scenarios, list):
                raise ValueError("Parsed 'example_scenario' is not a list.")
        except json.JSONDecodeError:
            # Fallback to ast.literal_eval
            try:
                example_scenarios = ast.literal_eval(example_scenario_str)
                if not isinstance(example_scenarios, list):
                    raise ValueError("Parsed 'example_scenario' is not a list.")
            except (ValueError, SyntaxError) as e:
                logging.error(f"Entry {idx} ('{concept_name}') has invalid 'example_scenario': {e}")
                continue

        # Iterate through each scenario and create a new entry
        for scenario_idx, scenario in enumerate(example_scenarios, start=1):
            if not isinstance(scenario, str):
                logging.error(f"Entry {idx} ('{concept_name}') has non-string scenario at position {scenario_idx}. Skipping this scenario.")
                continue

            new_entry = {
                'concept_name': concept_name,
                'detailed_explanation': detailed_explanation,
                'example_scenario': scenario.strip()
            }
            new_data.append(new_entry)

    return new_data

# Transform the data
transformed_data = transform_data(original_data)

# Optional: Save the transformed data to a JSON file
with open('/root/quantumLeap/data/psychologoy-of-unconscious-mind/transformed_data.json', 'w', encoding='utf-8') as f:
    json.dump(transformed_data, f, ensure_ascii=False, indent=4)

print(f"Transformation complete. {len(transformed_data)} entries created.")
print("Check 'transformation_errors.log' for any errors encountered during transformation.")

print(len(transformed_data))

In [None]:

instruction_prompt = """Below is an instruction that describes a concept in the field of psychology, sociology, anthropology, ethnography, or qualitative research or cultural studies. Write a response that appropriately completes the request.

### Instruction: Given the concept and its detailed explanation, provide an example scenario that illustrates the concept.
concept_name: {}
detailed_explanation: {}

### Response:
{}"""


EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

def instruction_prompt_func(examples):
    concept_name = examples["concept_name"]
    detailed_explanation = examples["detailed_explanation"]
    example_scenario = examples["example_scenario"]
    return { "text" : instruction_prompt.format(concept_name, detailed_explanation, example_scenario), }
pass


# convert transformed_data to a huggingface dataset
instruction_dataset = Dataset.from_dict(transformed_data)
instruction_dataset = instruction_dataset.map(instruction_prompt_func, batched = True,)

from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments

trainer = UnslothTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = instruction_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 8,

    args = UnslothTrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 8,

        # Use num_train_epochs and warmup_ratio for longer runs!
        max_steps = 120,
        warmup_steps = 10,
        # warmup_ratio = 0.1,
        # num_train_epochs = 1,

        # Select a 2 to 10x smaller learning rate for the embedding matrices!
        learning_rate = 5e-5,
        embedding_learning_rate = 1e-5,

        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.00,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)
trainer_stats = trainer.train()
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

In [None]:

instruction_prompt = """Below is an instruction that describes a concept in the field of psychology, sociology, anthropology, ethnography, or qualitative research or cultural studies. Write a response that appropriately completes the request.

### Instruction: Given the concept and its detailed explanation, provide an example scenario that illustrates the concept.
concept_name: {}
detailed_explanation: {}

### Response:
{}"""

FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    instruction_prompt.format(
        "Give an example scenario that illustrates the concept of Hero archetype as described by Jungian psychology.", # instruction
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

# Text Streaming

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

inputs = tokenizer(
[
    instruction_prompt.format(
        "When trying to understand how nature plays a role in the development of a child's personality, which concept should be considered?",
        "", # output - leave this blank for generation!
    ),
], return_tensors = "pt").to("cuda")


from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128,
                   repetition_penalty = 0.1)

In [None]:
# add current timestamp to model name
model.save_pretrained(f"qLeap_model_v0_{int(time.time())}") # Local saving
tokenizer.save_pretrained(f"qLeap_model_instruct_v0_{int(time.time())}")
model.push_to_hub(f"olabs-ai/qLeap_model_instruct_v0_{int(time.time())}", token = "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") # Online saving
tokenizer.push_to_hub(f"olabs-ai/qLeap_model_instruct_v0_{int(time.time())}", token = "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") # Online saving

# # Merge to 16bit
# if False: model.save_pretrained_merged("qLeap_model_v0_16bit_merged_{int(time.time())}", tokenizer, save_method = "merged_16bit",)
# if False: model.push_to_hub_merged("olabs-ai/qLeap_model_v0_16bit_merged_{int(time.time())}", tokenizer, save_method = "merged_16bit", token = "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")

# # Merge to 4bit
# if False: model.save_pretrained_merged("qLeap_model_v0_4bit_merged_{int(time.time())}", tokenizer, save_method = "merged_4bit",)
# if False: model.push_to_hub_merged("olabs-ai/qLeap_model_v0_4bit_merged_{int(time.time())}", tokenizer, save_method = "merged_4bit", token = "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")

# # Just LoRA adapters
# if False: model.save_pretrained_merged("qLeap_model_v0_LoRA_merged_{int(time.time())}", tokenizer, save_method = "lora",)
# if False: model.push_to_hub_merged("olabs-ai/qLeap_model_LoRA_merged_{int(time.time())}", tokenizer, save_method = "lora", token = "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")
    
    
# # Save to 8bit Q8_0
# if False: model.save_pretrained_gguf("qLeap_model_v0_8bit_Q8_{int(time.time())}", tokenizer,)
# if False: model.push_to_hub_gguf("olabs-ai/qLeap_model_v0_8bit_Q8_{int(time.time())}", tokenizer, token = "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")

# # Save to 16bit GGUF
# if False: model.save_pretrained_gguf("qLeap_model_v0_16bit_GGUF_{int(time.time())}", tokenizer, quantization_method = "f16")
# if False: model.push_to_hub_gguf("olabs-ai/qLeap_model_v0_16bit_GGUF_{int(time.time())}", tokenizer, quantization_method = "f16", token = "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")

# # Save to q4_k_m GGUF
# if False: model.save_pretrained_gguf("qLeap_model_v0_q4_k_m_16bit_{int(time.time())}", tokenizer, quantization_method = "q4_k_m")
# if False: model.push_to_hub_gguf("olabs-ai/qLeap_model_v0_q4_k_m_16bit_{int(time.time())}", tokenizer, quantization_method = "q4_k_m", token = "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")
# if False: model.push_to_hub_gguf("olabs-ai/qLeap_model_v0_q5_k_m_16bit_{int(time.time())}", tokenizer, quantization_method = "q5_k_m", token = "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")

# Inference

In [None]:

instruction_prompt = """Below is an instruction that describes a concept in the field of psychology, sociology, anthropology, ethnography, or qualitative research or cultural studies. Write a response that appropriately completes the request.

### Instruction: Given the concept and its detailed explanation, provide an example scenario that illustrates the concept.
concept_name: {}
detailed_explanation: {}

### Response:
{}"""

FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    instruction_prompt.format(
        "Give an example scenario that illustrates the concept of Hero archetype as described by Jungian psychology.", # instruction
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

# Text Streaming

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

inputs = tokenizer(
[
    instruction_prompt.format(
        "When trying to understand how nature plays a role in the development of a child's personality, which concept should be considered?",
        "", # output - leave this blank for generation!
    ),
], return_tensors = "pt").to("cuda")


from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128,
                   repetition_penalty = 0.1)