In [1]:
import json
import ast
import logging
import csv
import os
import torch
from typing import List, Dict, Any
from datasets import Dataset
from transformers import TextStreamer
from unsloth import (
    FastLanguageModel,
    UnslothTrainer,
    UnslothTrainingArguments,
    is_bfloat16_supported
)

# Input CSV holding the raw concept rows, and the JSON file the expanded records are written to
INPUT_CSV_PATH = '/root/quantumLeap/data/psychologoy-of-unconscious-mind/concept_examples.csv'
OUTPUT_JSON_PATH = '/root/qLeap-fft/data/input/Instruction_Data/transformed_data.json'

# Route ERROR-and-above records to a log file that is overwritten (filemode 'w') on each run
_LOG_SETTINGS = {
    'filename': 'transformation_errors.log',
    'filemode': 'w',
    'level': logging.ERROR,
    'format': '%(asctime)s - %(levelname)s - %(message)s',
}
logging.basicConfig(**_LOG_SETTINGS)

In [2]:
def read_csv_data(input_csv_path: str) -> List[Dict[str, str]]:
    """Read the input CSV file into a list of row dictionaries.

    Args:
        input_csv_path: Path of a UTF-8 encoded CSV file whose first row
            holds the column names.

    Returns:
        One dict per data row, keyed by the header names, in file order.

    Raises:
        Exception: Any error raised while opening or parsing the file is
            logged and then re-raised unchanged.
    """
    try:
        # newline='' passes line endings to the csv module untouched, as its
        # documentation requires; without it, line breaks embedded in quoted
        # fields are rewritten by universal-newline translation.
        with open(input_csv_path, 'r', encoding='utf-8', newline='') as f:
            return list(csv.DictReader(f))
    except Exception as e:
        logging.error(f"Error reading CSV file: {e}")
        raise

def transform_data(original_data: List[Dict[str, str]]) -> List[Dict[str, str]]:
    """Expand each input row into one record per example scenario.

    Every row must provide non-empty 'concept_name', 'detailed_explanation'
    and 'example_scenario' values. 'example_scenario' is parsed as a list of
    strings, first as JSON and, if that fails, as a Python literal. Rows or
    scenarios that do not satisfy these rules are logged at ERROR level and
    skipped; they never abort the run.

    Args:
        original_data: Row dictionaries, e.g. as produced by csv.DictReader.

    Returns:
        A list of dicts with keys 'concept_name', 'detailed_explanation' and
        'example_scenario' (a single stripped scenario string each).
    """
    new_data = []

    for idx, entry in enumerate(original_data, start=1):
        # csv.DictReader stores None (not '') for columns missing from a short
        # row, so a .get() default alone does not guarantee a string here.
        concept_name = (entry.get('concept_name') or '').strip()
        detailed_explanation = (entry.get('detailed_explanation') or '').strip()
        example_scenario_str = (entry.get('example_scenario') or '').strip()

        if not all([concept_name, detailed_explanation, example_scenario_str]):
            logging.error(f"Entry {idx} is missing required fields. Skipping.")
            continue

        try:
            example_scenarios = json.loads(example_scenario_str)
        except (json.JSONDecodeError, RecursionError):
            try:
                # Fallback for Python-style literals such as single-quoted lists.
                example_scenarios = ast.literal_eval(example_scenario_str)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
                # literal_eval is documented to raise any of these on malformed
                # input; one bad cell must not stop the whole transformation.
                logging.error(f"Entry {idx} ('{concept_name}') has invalid example_scenario: {e}")
                continue

        if not isinstance(example_scenarios, list):
            logging.error(f"Entry {idx} ('{concept_name}'): example_scenario is not a list")
            continue

        for scenario_idx, scenario in enumerate(example_scenarios, start=1):
            if not isinstance(scenario, str):
                logging.error(f"Entry {idx} ('{concept_name}'): non-string scenario at position {scenario_idx}")
                continue

            new_data.append({
                'concept_name': concept_name,
                'detailed_explanation': detailed_explanation,
                'example_scenario': scenario.strip()
            })

    return new_data

# Load the raw CSV rows and expand them into one record per scenario
original_data = read_csv_data(INPUT_CSV_PATH)
transformed_data = transform_data(original_data)

# Make sure the destination folder exists, then write the records as indented JSON
output_dir = os.path.dirname(OUTPUT_JSON_PATH)
os.makedirs(output_dir, exist_ok=True)
with open(OUTPUT_JSON_PATH, 'w', encoding='utf-8') as json_file:
    json.dump(transformed_data, json_file, ensure_ascii=False, indent=4)

print(f"Processed {len(transformed_data)} examples")

In [3]:
# Define instruction template
# Layout of one training example: a system turn, a user turn and an assistant
# turn, each opened by <|start_header_id|>role<|end_header_id|> and closed by
# <|eot_id|>. The two {} placeholders are filled positionally with str.format:
# first the user message, then the assistant reply.
instruction_template = """<|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
<|eot_id|>
<|start_header_id|>user<|end_header_id|>

{}<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>

{}<|eot_id|>"""

def create_instruction_dataset(transformed_data: List[Dict[str, str]]) -> Dataset:
    """Build a Dataset whose rows carry a fully rendered prompt in a "text" column.

    Args:
        transformed_data: Records with 'concept_name', 'detailed_explanation'
            and 'example_scenario' string fields.

    Returns:
        The result of mapping the records (in batches) through
        `instruction_template`, which adds a "text" entry per record.
    """
    def _render_batch(examples):
        # `examples` is a batch: each key maps to a list of column values.
        rendered = []
        columns = zip(
            examples["concept_name"],
            examples["detailed_explanation"],
            examples["example_scenario"],
        )
        for name, explanation, scenario in columns:
            question = f"Explain the concept of {name} and provide an example."
            answer = f"{explanation}\n\nExample:\n{scenario}"
            rendered.append(instruction_template.format(question, answer))
        return {"text": rendered}

    records = Dataset.from_list(transformed_data)
    return records.map(_render_batch, batched=True)

# Render every transformed record into its prompt text
instruction_dataset = create_instruction_dataset(transformed_data)

# Show the first rendered prompt as a quick sanity check
print("\nSample processed example:")
first_example = instruction_dataset[0]
print(first_example["text"])

In [4]:
# Model initialization parameters
max_seq_length = 1024
model_name = "lora_model_pum"
load_in_4bit = True

# Initialize model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    load_in_4bit=load_in_4bit
)

# Warn about any template marker or role name that is missing from the
# tokenizer vocabulary.
special_tokens = [
    "<|start_header_id|>",
    "<|end_header_id|>",
    "<|eot_id|>",
    "system",
    "user",
    "assistant"
]

# Fetch the vocabulary once instead of on every loop iteration.
vocab = tokenizer.get_vocab()
for token in special_tokens:
    if token not in vocab:
        print(f"Warning: {token} not in vocabulary!")

# Configure model
# The next cell fine-tunes this model, so it has to be in training mode here;
# inference mode is switched on later, in the generation cell.
FastLanguageModel.for_training(model)
model.config.torch_dtype = torch.bfloat16

In [5]:
def setup_training(model, tokenizer, dataset,
                  batch_size=2, gradient_accumulation=8, max_steps=120,
                  learning_rate=5e-5, embedding_learning_rate=1e-5,
                  max_seq_length=512, base_model_slug="your_base_model_slug"):
    """Log in to Weights & Biases, start a run and build the trainer.

    Args:
        model: Model to fine-tune.
        tokenizer: Tokenizer passed to the trainer.
        dataset: Training dataset; its "text" field is used as the input text.
        batch_size: Per-device train batch size.
        gradient_accumulation: Gradient accumulation steps.
        max_steps: Total number of training steps.
        learning_rate: Learning rate passed to the training arguments.
        embedding_learning_rate: Embedding learning rate passed to the
            training arguments.
        max_seq_length: Maximum sequence length given to the trainer. The
            default (512) is kept from the original code; note that the
            notebook loads the model with 1024, so callers may want to pass
            that value explicitly.
        base_model_slug: Label embedded in the W&B run name.

    Returns:
        A configured UnslothTrainer (training is not started here).

    Side effects:
        Calls wandb.login() and wandb.init(); the API key is read from the
        WANDB_API_KEY environment variable.
    """
    import os
    from datetime import datetime
    import pytz
    import wandb

    optim = "adamw_8bit"
    lr_schedule = "linear"

    # Timestamp in Indian Standard Time, formatted as YYYYMMDD_HHMMSS
    ist = pytz.timezone('Asia/Kolkata')
    formatted_datetime = datetime.now(ist).strftime("%Y%m%d_%H%M%S")

    # The run name is built from the values actually used for training, so the
    # W&B label cannot drift from the real configuration.
    run_name = (
        f"Unsloth-CPT-Base-{formatted_datetime}-28Octv1-{base_model_slug}"
        f"-{max_seq_length}max_seq_length-{batch_size}batchSize"
        f"-{gradient_accumulation}ga-{max_steps}maxSteps"
        f"-{learning_rate}lRate-{embedding_learning_rate}embLRate"
        f"-{optim}optim-{lr_schedule}lrSchedule"
    )

    # Never hard-code the API key: it is taken from the environment
    # (export WANDB_API_KEY="..."). With key=None wandb falls back to its own
    # stored credentials. Any key that was previously committed in this
    # notebook should be revoked.
    wandb.login(key=os.environ.get("WANDB_API_KEY"))
    wandb.init(project="Unsloth-CPT", name=run_name)

    training_args = UnslothTrainingArguments(
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation,
        max_steps=max_steps,
        warmup_steps=10,
        learning_rate=learning_rate,
        embedding_learning_rate=embedding_learning_rate,
        # Use bf16 when the hardware supports it, otherwise fall back to fp16
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim=optim,
        weight_decay=0.01,
        lr_scheduler_type=lr_schedule,
        seed=3407,
        output_dir="outputs",
        report_to=["tensorboard", "wandb"],
        logging_dir="./tel-fft-logs"
    )

    return UnslothTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=dataset,
        dataset_text_field="text",
        max_seq_length=max_seq_length,
        dataset_num_proc=2,
        args=training_args
    )

# Build the trainer from the loaded model, its tokenizer and the rendered dataset
trainer = setup_training(model=model, tokenizer=tokenizer, dataset=instruction_dataset)

# Run the fine-tuning loop
trainer.train()

In [None]:
model.save_pretrained("lora_model_pum_instruct") # Local saving
tokenizer.save_pretrained("lora_model_pum_instruct")

!huggingface-cli login --token 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 --add-to-git-credential
if True:
    model.push_to_hub("olabs-ai/qLeap_v07_instruct", token = "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") # Online saving
    tokenizer.push_to_hub("olabs-ai/qLeap_v07_instruct", token = "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") # Online saving
    model.push_to_hub_gguf("olabs-ai/qLeap_v07_instruct", tokenizer, quantization_method = "q4_k_m", token = "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")


In [None]:

import torch
from unsloth import FastLanguageModel
from transformers import TextStreamer
import warnings
# Silence all Python warnings for the rest of the session
warnings.filterwarnings('ignore')

# Model initialization parameters
max_seq_length = 1024
dtype = None
load_in_4bit = True

# Disabled by default: change the condition to True to reload the saved
# adapter from disk instead of reusing the model already in memory
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="lora_model_pum",  # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
    FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

# Put the in-memory model into inference mode
FastLanguageModel.for_inference(model)

# Same layout as the fine-tuning template, but it stops right after the
# assistant header so the model writes the reply
instruction_prompt = """<|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
<|eot_id|>
<|start_header_id|>user<|end_header_id|>

Explain the concept of {} and provide an example.<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>

"""

# Set model dtype
model.config.torch_dtype = torch.bfloat16

# Concept to ask about
concept_name = "Semiotics"

# Render the prompt and tokenize it onto the GPU
prompt_text = instruction_prompt.format(concept_name)
inputs = tokenizer([prompt_text], return_tensors="pt").to("cuda")

# Streams tokens to stdout while they are generated
text_streamer = TextStreamer(tokenizer)

# Stop on <|eot_id|> when the vocabulary has it, otherwise on the regular EOS token
eot_token_id = tokenizer.get_vocab().get("<|eot_id|>", tokenizer.eos_token_id)

# Sampled generation
outputs = model.generate(
    **inputs,
    streamer=text_streamer,
    max_new_tokens=512,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
    repetition_penalty=1.1,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=eot_token_id,
    min_length=50,
    early_stopping=True
)

# Print the whole decoded sequence with special tokens removed
full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(full_text)

In [None]:
def generate_explanation(model, tokenizer, concept_name: str, max_new_tokens: int = 512) -> str:
    """Generate an explanation for a given concept.

    Args:
        model: Model used for generation (its `generate` method is called).
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        concept_name: Concept inserted into the user question.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The full output sequence (prompt included) decoded with special
        tokens skipped. Tokens are also streamed to stdout while generating.
    """
    end_of_turn = "<|eot_id|>"
    prompt = instruction_template.format(
        f"Explain the concept of {concept_name} and provide an example.",
        ""
    )
    # Formatting the training template with an empty reply leaves a closed,
    # empty assistant turn at the end of the prompt. Drop that final
    # end-of-turn marker so the prompt stops right after the assistant header
    # and the model writes the reply itself.
    if prompt.endswith(end_of_turn):
        prompt = prompt[:-len(end_of_turn)]

    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
    text_streamer = TextStreamer(tokenizer)

    outputs = model.generate(
        **inputs,
        streamer=text_streamer,
        max_new_tokens=max_new_tokens,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        repetition_penalty=1.1,
        pad_token_id=tokenizer.eos_token_id,
        # Stop on <|eot_id|> when the vocabulary has it, else on the EOS token
        eos_token_id=tokenizer.get_vocab().get(end_of_turn, tokenizer.eos_token_id),
        min_length=50,
        early_stopping=True
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Try the helper on a sample concept and show what came back
concept = "Defense Mechanisms"
response = generate_explanation(model, tokenizer, concept_name=concept)
print(f"\nGenerated explanation for {concept}:", response, sep="\n")