# Git clone qLeap-fft repo inside `/root/` directory
## Ensure to have the latest branch
## Switch to quantumLeap directory

In [1]:
import os

# Set these environment variables before importing torch-related modules
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

from pathlib import Path

def ensure_working_directory():
    """
    Check if we're in the correct working directory, if not switch to it.
    Creates the directory if it doesn't exist.
    """
    target_dir = '/home/ubuntu/quantumLeap'
    current_dir = os.getcwd()
    
    # Print current directory
    print(f"Current directory: {current_dir}")
    
    # Check if we need to switch directories
    if current_dir != target_dir:
        # Create directory if it doesn't exist
        Path(target_dir).mkdir(parents=True, exist_ok=True)
        
        try:
            # Change to target directory
            os.chdir(target_dir)
            print(f"Successfully switched to: {target_dir}")
        except Exception as e:
            print(f"Error switching to directory: {str(e)}")
            raise
    else:
        print("Already in correct directory")
    
    # Verify current directory
    print(f"Working directory: {os.getcwd()}")

# Call the function before your main code
ensure_working_directory()

Current directory: /home/ubuntu/quantumLeap
Already in correct directory
Working directory: /home/ubuntu/quantumLeap


In [None]:
# ----------------------------- #
# Part 1.1: Install and Setup Libraries - for Ola Krutrim Cloud Instance
# ----------------------------- #

# # if executing below in terminal with virtual env, do not need to add --system tag
# pip install uv #install this in the virtual environment where you want to execute the notebook.
# pip install torch==2.4.0 --index-url https://download.pytorch.org/whl/cu121 # as on 07Nov2024, xformers is compatible with torch=2.4.0 only; uv doesnt work for installing torch
# uv pip install packaging ninja
# uv pip install flash-attn --no-build-isolation
# uv pip install unsloth
# python -m xformers.info
# uv pip install wandb bitsandbytes ipywidgets nltk spacy huggingface_hub datasets tqdm Iprogress ipywidgets python-dotenv tensorboard -q

# # restart once you have installed all of the above
# !nvidia-smi
# !nvcc --version
# import torch
# print(torch.__version__)          # Should reflect 2.5.0+cu124
# print(torch.version.cuda)         # Should output 12.4
# print(torch.cuda.is_available())  # Should return True

# Restart again so that all the libraries are properly initialized

In [2]:
# ----------------------------- #
# Part 1.2: Import Necessary Libraries
# ----------------------------- #

# General Libraries
import os
import json
import sys
import subprocess
import argparse
import logging
import math
import random
from datetime import datetime
import re
import gc
import weakref
import multiprocessing

# Torch related
import torch
from torch import nn
import torch.distributed as dist

# Transformers related
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    Adafactor
)

# Huggingface TRL for full finetune
from trl import SFTTrainer, SFTConfig

# General huggingface libraries
import huggingface_hub
from datasets import load_dataset, Dataset
from accelerate import Accelerator


# Unsloth specificic libraries
import unsloth
from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments, FastLanguageModel

# Other Libraries
from peft import LoraConfig
import wandb
import nltk
import spacy
# from galore_torch import GaLoreAdamW, GaLoreAdafactor, GaLoreAdamW8bit

# Check and import NLTK and spacy modules
# Ensure NLTK's punkt tokenizer is available
import nltk
try:
    nltk.data.find('tokenizers/punkt')
    print('punkt was already available.')
except LookupError:
    nltk.download('punkt')
    print('punkt was not available. It has been downloaded')

# Initialize spaCy English model
try:
    nlp = spacy.load('en_core_web_sm')
    print('en_core_web_sm was already available.')
except OSError:
    print("SpaCy English model not found. Downloading...")
    os.system('python -m spacy download en_core_web_sm')
    nlp = spacy.load('en_core_web_sm')

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
punkt was already available.
en_core_web_sm was already available.


In [3]:
# ----------------------------- #
# Part 2: Load and Clean the Text Data
# ----------------------------- #

def load_and_clean_text(file_path):
    """
    Loads text from a file and removes Project Gutenberg's license and headers/footers.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    # # Remove Project Gutenberg's license text and headers/footers
    # start_pattern = r'\*\*\* START OF THIS PROJECT GUTENBERG EBOOK.*\*\*\*'
    # end_pattern = r'\*\*\* END OF THIS PROJECT GUTENBERG EBOOK.*\*\*\*'

    # text = re.sub(f'.*{start_pattern}', '', text, flags=re.DOTALL)
    # text = re.sub(f'{end_pattern}.*', '', text, flags=re.DOTALL)
    return text.strip()

# Replace 'psychology_of_unconscious.txt' with your actual file path
file_path = '/home/ubuntu/quantumLeap/data/input/Step_2_Classic_Texts_and_Ethnographies/2.1_Public_Domain_Books/Project_Gutenberg/psychology_of_unconscious.txt'
clean_text = load_and_clean_text(file_path)

In [None]:
%pip install t

In [1]:
import tiktoken
from openai import OpenAI
import json
import time
from typing import List, Dict
import numpy as np
import os
from datetime import datetime

class SemanticChunker:
    def __init__(self, model_name: str = "meta-llama/Llama-3.2-3B-Instruct"):
        """Initialize the semantic chunker with model configuration"""
        self.client = OpenAI(
            base_url="http://localhost:8000/v1",
            api_key="dummy"
        )
        self.model_name = model_name
        self.encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
        self.max_tokens = 3000
        
        # Set up logging directory with timestamp
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.log_dir = f"/home/ubuntu/quantumLeap/data/preprocess/Step_2_Classic_Texts_and_Ethnographies/2.1_Public_Domain_Books/Project_Gutenberg/Psychology_Of_Unconscious_Mind/chunks_{timestamp}"
        os.makedirs(self.log_dir, exist_ok=True)
        
        # Set up logging file for processing summary
        self.log_file = os.path.join(self.log_dir, "processing_log.txt")
        
    def log_message(self, message: str):
        """Write log message with timestamp"""
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        with open(self.log_file, 'a', encoding='utf-8') as f:
            f.write(f"[{timestamp}] {message}\n")
        print(message)
        
    def count_tokens(self, text: str) -> int:
        """Count tokens in text using tiktoken"""
        return len(self.encoding.encode(text))
    
    def create_initial_chunks(self, text: str) -> List[str]:
        """Create initial chunks of approximately max_tokens size"""
        paragraphs = text.split('\n\n')
        chunks = []
        current_chunk = []
        current_tokens = 0
        
        for paragraph in paragraphs:
            para_tokens = self.count_tokens(paragraph)
            
            if current_tokens + para_tokens > self.max_tokens:
                # Join current chunk and add to chunks
                chunks.append('\n\n'.join(current_chunk))
                current_chunk = [paragraph]
                current_tokens = para_tokens
            else:
                current_chunk.append(paragraph)
                current_tokens += para_tokens
        
        # Add the last chunk if it exists
        if current_chunk:
            chunks.append('\n\n'.join(current_chunk))
            
        self.log_message(f"Created {len(chunks)} initial chunks")
        return chunks
    
    def save_chunk_log(self, chunk_num: int, original_chunk: str, llm_response: str, parsed_sections: List[Dict]):
        """Save intermediate chunks and responses"""
        log_file = os.path.join(self.log_dir, f"chunk_{chunk_num:04d}.json")
        log_data = {
            "chunk_number": chunk_num,
            "original_text": original_chunk,
            "llm_raw_response": llm_response,
            "parsed_sections": parsed_sections,
            "token_count": self.count_tokens(original_chunk)
        }
        with open(log_file, 'w', encoding='utf-8') as f:
            json.dump(log_data, f, indent=2, ensure_ascii=False)
    
    def get_semantic_sections(self, chunk: str) -> tuple[List[Dict], str]:
        """Send chunk to LLM for semantic sectioning with structured JSON output"""

        # Define the JSON schema
        schema = {
            "type": "object",
            "properties": {
                "sections": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "topic": {"type": "string"},
                            "content": {"type": "string"},
                            "key_concepts": {
                                "type": "array",
                                "items": {"type": "string"}
                            }
                        },
                        "required": ["topic", "content", "key_concepts"],
                        "additionalProperties": False
                    }
                }
            },
            "required": ["sections"],
            "additionalProperties": False
        }

        try:
            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=[
                    {
                        "role": "system",
                        "content": "You are a text analysis expert. Break the given text into coherent sections by topic."
                    },
                    {
                        "role": "user",
                        "content": f"Analyze this text and break it into coherent sections:\n\n{chunk}"
                    }
                ],
                max_tokens=self.max_tokens,
                temperature=0.2,
                response_format={
                    "type": "json_schema",
                    "json_schema": {
                        "name": "text_sections",
                        "schema": schema,
                        "strict": True
                    }
                }
            )
            
            result = response.choices[0].message.content
            
            try:
                parsed = json.loads(result)
                self.log_message(f"Successfully parsed JSON with {len(parsed['sections'])} sections")
                return parsed['sections'], result
            except json.JSONDecodeError as e:
                self.log_message(f"JSON parsing error: {str(e)}")
                self.log_message(f"Raw response: {result}")
                return [], result
                
        except Exception as e:
            self.log_message(f"Error in LLM request: {str(e)}")
            return [], str(e)
    
    def process_text(self, text: str) -> List[Dict]:
        """Process entire text into semantic sections"""
        self.log_message("Starting text processing")
        
        # Create initial chunks
        initial_chunks = self.create_initial_chunks(text)
        
        # Process each chunk
        all_sections = []
        for i, chunk in enumerate(initial_chunks):
            self.log_message(f"Processing chunk {i+1}/{len(initial_chunks)}")
            sections, raw_response = self.get_semantic_sections(chunk)
            
            # Save intermediate results
            self.save_chunk_log(i+1, chunk, raw_response, sections)
            
            all_sections.extend(sections)
            time.sleep(1)  # Rate limiting
            
        self.log_message(f"Processing complete. Total sections created: {len(all_sections)}")
        return all_sections

    def save_sections(self, sections: List[Dict], output_file: str):
        """Save processed sections to JSON file"""
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump({'sections': sections}, f, indent=2, ensure_ascii=False)
        self.log_message(f"Saved sections to {output_file}")

def print_chunk_summary(log_dir: str):
    """Print summary of processed chunks"""
    print("\nChunk Processing Summary:")
    print("-" * 50)
    
    for file in sorted(os.listdir(log_dir)):
        if file.endswith(".json") and file != "processing_log.txt":
            with open(os.path.join(log_dir, file), 'r') as f:
                data = json.load(f)
                print(f"\nChunk {data['chunk_number']}:")
                print(f"Token count: {data['token_count']}")
                print(f"Sections created: {len(data['parsed_sections'])}")
                for section in data['parsed_sections']:
                    print(f"- {section['topic']}")

def main():
    # Initialize chunker
    chunker = SemanticChunker()
    
    # Read input file
    input_file = "/home/ubuntu/quantumLeap/data/input/Step_2_Classic_Texts_and_Ethnographies/2.1_Public_Domain_Books/Project_Gutenberg/psychology_of_unconscious.txt"
    with open(input_file, 'r', encoding='utf-8') as f:
        text = f.read()
    
    # Process text
    sections = chunker.process_text(text)
    
    # Save results
    output_file = os.path.join(chunker.log_dir, "semantic_sections.json")
    chunker.save_sections(sections, output_file)
    
    # Print chunk summary
    print_chunk_summary(chunker.log_dir)

if __name__ == "__main__":
    main()

Starting text processing
Created 71 initial chunks
Processing chunk 1/71
Error in LLM request: Connection error.
Processing chunk 2/71
Error in LLM request: Connection error.
Processing chunk 3/71
Error in LLM request: Connection error.


KeyboardInterrupt: 

In [2]:
# Import statements and data structures
import tiktoken
from openai import OpenAI
import json
import time
from typing import List, Dict, Tuple, Optional
import numpy as np
import os
from datetime import datetime
from pprint import pprint
import re
from dataclasses import dataclass
from enum import Enum

# Core data structures
class SectionType(Enum):
    HEADER = "header"
    CONTENT = "content"
    QUOTE = "quote"
    ATTRIBUTION = "attribution"
    LIST = "list"
    FRONT_MATTER = "front_matter"
    TABLE_OF_CONTENTS = "table_of_contents"
    
@dataclass
class Section:
    text: str
    type: SectionType
    level: int = 0
    metadata: Dict = None

# Main class initialization
class SemanticChunker:
    def __init__(self, model_name: str = "meta-llama/Llama-3.2-3B-Instruct"):
        """Initialize the semantic chunker with model configuration"""
        self.client = OpenAI(
            base_url="http://localhost:8000/v1",
            api_key="dummy"
        )
        self.model_name = model_name
        self.encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
        self.max_tokens = 3000
        
        # Set up logging directory with timestamp
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.log_dir = f"/home/ubuntu/quantumLeap/data/preprocess/Step_2_Classic_Texts_and_Ethnographies/2.1_Public_Domain_Books/Project_Gutenberg/Psychology_Of_Unconscious_Mind/chunks_{timestamp}"
        os.makedirs(self.log_dir, exist_ok=True)
        
        # Set up logging file for processing summary
        self.log_file = os.path.join(self.log_dir, "processing_log.txt")
        
        # Initialize state variables
        self.missed_text = ""  # Store text not included in LLM output

    def log_message(self, message: str):
        """Write log message with timestamp and print to console"""
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        log_entry = f"[{timestamp}] {message}"
        with open(self.log_file, 'a', encoding='utf-8') as f:
            f.write(log_entry + "\n")
        print(log_entry)
    
    def print_separator(self, message: str = ""):
        """Print a separator line with optional message"""
        print(f"\n{'='*100}")
        if message:
            print(f"{message}")
            print('='*100)
    
    def count_tokens(self, text: str) -> int:
        """Count tokens in text using tiktoken"""
        return len(self.encoding.encode(text))
    
    # Continuing the SemanticChunker class...

    def is_chapter_heading(self, text: str) -> Tuple[bool, int]:
        """
        Enhanced chapter heading detection with level identification.
        Returns (is_heading, level).
        """
        text = text.strip()
        if not text:
            return False, 0
            
        # Chapter patterns
        chapter_patterns = [
            (r'^CHAPTER\s+[IVXL]+', 1),  # Main chapter headers
            (r'^[IVX]+\.\s*—\s*', 2),    # Sub-chapter headers
            (r'^\d+\.\s*—\s*', 2),       # Numbered sections
            (r'^\s*[A-Z][A-Z\s]+$', 1),  # ALL CAPS lines
            (r'^\s*[IVX]+\.\s+[A-Z]', 2) # Roman numeral sections
        ]
        
        for pattern, level in chapter_patterns:
            if re.match(pattern, text, re.I):
                return True, level
        
        # Check for centered text formatting
        line_length = len(text)
        leading_spaces = len(text) - len(text.lstrip())
        trailing_spaces = len(text) - len(text.rstrip())
        
        # Improved centered text detection
        is_centered = (abs(leading_spaces - trailing_spaces) <= 2 and 
                      leading_spaces > 5 and
                      text.strip())  # Must have content
        is_caps = text.isupper()
        reasonable_length = 10 < len(text.strip()) < 100
        
        if is_centered:
            if is_caps and reasonable_length:
                return True, 1  # Main header
            elif reasonable_length:
                return True, 2  # Sub header
                
        return False, 0
    
    def analyze_text_structure(self, text: str) -> List[Section]:
        """
        Enhanced text structure analysis with improved section detection.
        """
        sections = []
        lines = text.split('\n')
        current_section = []
        current_type = None
        current_level = 0
        
        def flush_section():
            nonlocal current_section, current_type, current_level
            if current_section:
                # Skip empty sections but preserve intentional spacing
                content = '\n'.join(current_section).strip()
                if content or current_type in {SectionType.HEADER, SectionType.FRONT_MATTER}:
                    sections.append(Section(
                        text='\n'.join(current_section),
                        type=current_type or SectionType.CONTENT,
                        level=current_level
                    ))
                current_section = []
                current_type = None
                current_level = 0
        
        i = 0
        while i < len(lines):
            line = lines[i]
            next_line = lines[i + 1] if i + 1 < len(lines) else ""
            
            # Detect centered headers
            if line.strip() and line.strip().isupper():
                leading_spaces = len(line) - len(line.lstrip())
                if leading_spaces > 10:  # Likely centered
                    flush_section()
                    current_type = SectionType.HEADER
                    current_level = 1
                    current_section = [line]
                    # Include following blank lines
                    while i + 1 < len(lines) and not lines[i + 1].strip():
                        current_section.append(lines[i + 1])
                        i += 1
                    flush_section()
                    i += 1
                    continue
            
            # Detect Table of Contents
            if re.match(r'^\s*CONTENTS\s*$', line, re.I):
                flush_section()
                current_type = SectionType.TABLE_OF_CONTENTS
                current_section = [line]
                # Include following blank lines
                while i + 1 < len(lines) and not lines[i + 1].strip():
                    current_section.append(lines[i + 1])
                    i += 1
                i += 1
                continue
            
            # Detect Front Matter
            if re.match(r'^\s*(?:AUTHOR\'S\s+NOTE|PREFACE|INTRODUCTION)\s*$', line, re.I):
                flush_section()
                current_type = SectionType.FRONT_MATTER
                current_section = [line]
                # Include following blank lines
                while i + 1 < len(lines) and not lines[i + 1].strip():
                    current_section.append(lines[i + 1])
                    i += 1
                i += 1
                continue
            
            # Detect chapter headings
            is_heading, level = self.is_chapter_heading(line)
            if is_heading:
                flush_section()
                current_type = SectionType.HEADER
                current_level = level
                current_section = [line]
                # Include following blank lines
                while i + 1 < len(lines) and not lines[i + 1].strip():
                    current_section.append(lines[i + 1])
                    i += 1
                flush_section()
                i += 1
                continue
            
            # Detect quotes
            if ((line.startswith('"') and len(line) > 50) or 
                (line.startswith('_') and line.endswith('_'))):
                if current_type != SectionType.QUOTE:
                    flush_section()
                    current_type = SectionType.QUOTE
                
            # Detect attributions
            if re.match(r'^\s*(?:—|--)\s*[A-Z]', line):
                flush_section()
                current_type = SectionType.ATTRIBUTION
                current_section = [line]
                i += 1
                continue
                
            # Detect lists
            if re.match(r'^\s{4,}(?:[\w\-]+\.|\-|\*)\s', line):
                if current_type != SectionType.LIST:
                    flush_section()
                    current_type = SectionType.LIST
            
            current_section.append(line)
            i += 1
            
            # Handle section transitions
            if i < len(lines):
                next_line = lines[i]
                # Detect section breaks by multiple blank lines
                if (not line.strip() and not next_line.strip() and 
                    current_type not in {SectionType.HEADER, SectionType.FRONT_MATTER}):
                    flush_section()
        
        flush_section()  # Flush any remaining content
        
        # Filter and clean sections
        filtered_sections = []
        for section in sections:
            if section.text.strip() or section.type in {SectionType.HEADER, SectionType.FRONT_MATTER}:
                filtered_sections.append(section)
        
        return filtered_sections
    
    # Continuing the SemanticChunker class...

    def get_complete_paragraphs(self, text: str, max_tokens: int) -> Tuple[str, str]:
        """
        Get complete paragraphs up to max_tokens with improved error handling.
        """
        try:
            # First, analyze the structure
            sections = self.analyze_text_structure(text)
            if not sections:
                raise ValueError("No sections found in text")
                
            self.log_message(f"Found {len(sections)} sections")
            
            current_sections = []
            current_tokens = 0
            section_index = 0
            
            while section_index < len(sections):
                section = sections[section_index]
                section_tokens = max(1, self.count_tokens(section.text))  # Prevent zero tokens
                
                # Handle oversized sections
                if section_tokens > max_tokens:
                    if not current_sections:
                        # Split large section
                        split_text = section.text[:max_tokens*4].strip()  # Approximate chars
                        if split_text:
                            current_sections.append(
                                Section(split_text, section.type, section.level)
                            )
                        remaining_text = section.text[max_tokens*4:]
                        if remaining_text:
                            self.missed_text = remaining_text
                        break
                    else:
                        break
                        
                # Check if adding section would exceed limit
                if current_tokens + section_tokens > max_tokens:
                    break
                    
                # Add section
                current_sections.append(section)
                current_tokens += section_tokens
                section_index += 1
                
            # Combine sections with proper spacing
            processed_text = self.combine_sections(current_sections)
            remaining_text = self.combine_sections(sections[section_index:])
            
            return processed_text, remaining_text
            
        except Exception as e:
            self.log_message(f"Error in get_complete_paragraphs: {str(e)}")
            # Return empty chunk and original text on error
            return "", text

    
    def create_initial_chunks(self, text: str) -> List[str]:
        """
        Create initial chunks while preserving document structure.
        Returns empty list and logs error if no valid chunks can be created.
        """
        chunks = []
        remaining_text = text
        chunk_number = 0
        
        try:
            while remaining_text.strip():
                chunk_number += 1
                self.log_message(f"\nProcessing chunk {chunk_number}")
                
                # Handle missed text from previous chunk
                if self.missed_text:
                    self.log_message("Adding missed text from previous chunk")
                    remaining_text = self.missed_text + '\n\n' + remaining_text
                    self.missed_text = ""
                
                # Get complete paragraphs up to token limit
                chunk_text, remaining_text = self.get_complete_paragraphs(
                    remaining_text, 
                    self.max_tokens
                )
                
                if not chunk_text.strip():
                    self.log_message("Warning: Empty chunk produced")
                    if len(remaining_text.strip()) < self.max_tokens:
                        # Handle last small chunk
                        if remaining_text.strip():
                            chunks.append(remaining_text.strip())
                        break
                    else:
                        # Force split if stuck
                        chunk_text = remaining_text[:self.max_tokens*4]  # Use characters as approximation
                        remaining_text = remaining_text[self.max_tokens*4:]
                
                token_count = self.count_tokens(chunk_text)
                if token_count > 0:  # Prevent empty chunks
                    self.log_message(f"Created chunk {chunk_number} ({token_count} tokens)")
                    chunks.append(chunk_text)
                    
                # Safeguards
                if not remaining_text.strip():
                    break
                if len(chunks) >= 100:  # Limit total chunks
                    self.log_message("Warning: Maximum chunk limit reached")
                    break
                if chunk_number > 10 and not chunks:  # Detect processing failure
                    raise RuntimeError("Failed to create any valid chunks after multiple attempts")
                    
        except Exception as e:
            self.log_message(f"Error in chunk creation: {str(e)}")
            if not chunks:  # Ensure at least one chunk is created
                self.log_message("Attempting emergency chunk creation")
                try:
                    # Create a single chunk with first max_tokens characters
                    chunks = [text[:self.max_tokens*4].strip()]
                except:
                    self.log_message("Emergency chunk creation failed")
                    
        finally:
            # Always save whatever chunks were created
            if chunks:
                os.makedirs(self.log_dir, exist_ok=True)
                for i, chunk in enumerate(chunks):
                    chunk_file = os.path.join(self.log_dir, f"chunk_{i+1:04d}.txt")
                    with open(chunk_file, 'w', encoding='utf-8') as f:
                        f.write(chunk)
                self.log_message(f"Created {len(chunks)} initial chunks")
            else:
                self.log_message("Warning: No chunks were created")
                
            return chunks

    def get_semantic_sections(self, chunk: str) -> Tuple[List[str], Dict]:
        """
        Process chunks through LLM for semantic analysis with improved handling.
        """
        try:
            self.log_message(f"Sending request to LLM (input tokens: {self.count_tokens(chunk)})")
            
            # Enhanced prompt for better structure preservation
            system_prompt = """You are a text analysis expert. Your task is to:
            1. Maintain the original document structure exactly as provided
            2. Split the input into semantically coherent sections
            3. Preserve all formatting, spacing, and special characters
            4. Keep headers with their associated content
            5. Keep lists and quotes intact within their sections
            6. Mark sections using <START_SECTION> and <END_SECTION> tags
            7. Mark incomplete sections with <INCOMPLETE> tags
            8. Handle front matter, tables of contents, and chapter headings appropriately
            9. Preserve all original line breaks and paragraph spacing

            Critical Rules:
            - Do not modify any text content
            - Preserve all original formatting
            - Keep structural elements together (headers with content)
            - Maintain document hierarchy
            - Include ALL text - do not skip anything
            """
            
            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": f"Split this text into sections, preserving ALL content and structure:\n\n{chunk}"}
                ],
                max_tokens=self.max_tokens,
                temperature=0.2,
                timeout=60  # 1-minute timeout
            )
            
            result = response.choices[0].message.content
            
            # Extract sections with improved parsing
            sections = []
            section_pattern = r'<START_SECTION>(.*?)<END_SECTION>'
            for match in re.finditer(section_pattern, result, re.DOTALL):
                section_text = match.group(1).strip()
                if section_text:  # Keep even short sections if they're structural
                    if len(section_text) > 50 or any(marker in section_text.upper() 
                        for marker in ['CHAPTER', 'CONTENTS', 'NOTE', 'INTRODUCTION']):
                        sections.append(section_text)
            
            # Handle incomplete sections
            incomplete_pattern = r'<INCOMPLETE>(.*?)</INCOMPLETE>'
            incomplete_match = re.search(incomplete_pattern, result, re.DOTALL)
            if incomplete_match:
                incomplete_text = incomplete_match.group(1).strip()
                if incomplete_text:
                    self.missed_text = incomplete_text
                    self.log_message(f"Found incomplete section ({self.count_tokens(incomplete_text)} tokens)")
            
            # Verify content preservation
            if not sections:
                self.log_message("Warning: No sections created by LLM")
                self.missed_text = chunk
            elif not incomplete_match:
                missed_text = self.verify_output_completeness(chunk, sections)
                if missed_text:
                    self.missed_text = missed_text
                    self.log_message(f"Found missed text ({self.count_tokens(missed_text)} tokens)")
            
            # Collect metrics
            metrics = {
                "completion_tokens": response.usage.completion_tokens,
                "prompt_tokens": response.usage.prompt_tokens,
                "total_tokens": response.usage.total_tokens,
                "finish_reason": response.choices[0].finish_reason,
                "sections_created": len(sections),
                "has_missed_text": bool(self.missed_text),
                "avg_section_length": sum(len(s) for s in sections) / len(sections) if sections else 0,
                "timestamp": datetime.now().isoformat()
            }
            
            return sections, metrics
                
        except Exception as e:
            self.log_message(f"Error in LLM request: {str(e)}")
            return [], {}

    def verify_output_completeness(self, input_text: str, output_sections: List[str]) -> str:
        """
        Enhanced verification of content preservation with improved detection.
        """
        # Normalize texts for comparison
        input_normalized = ' '.join(input_text.split())
        output_normalized = ' '.join(' '.join(output_sections).split())
        
        # Quick full-text comparison
        if input_normalized == output_normalized:
            return ""
        
        # Find missing content using sliding window
        words = input_normalized.split()
        missing_sequences = set()  # Use set to avoid duplicates
        
        # Use multiple window sizes for better detection
        for window_size in [5, 10, 15]:  # Try different window sizes
            i = 0
            while i < len(words) - window_size:
                sequence = ' '.join(words[i:i+window_size])
                if sequence not in output_normalized:
                    # Find complete missing phrase
                    start = i
                    while start > 0 and ' '.join(words[start-1:i+window_size]) not in output_normalized:
                        start -= 1
                    end = i + window_size
                    while end < len(words) and ' '.join(words[i:end+1]) not in output_normalized:
                        end += 1
                    missing_sequences.add(' '.join(words[start:end]))
                    i = end
                else:
                    i += 1
        
        return '\n'.join(sorted(missing_sequences)) if missing_sequences else ""

    def validate_chunk(self, chunk: str, original_sections: List[Section]) -> bool:
        """
        Comprehensive chunk validation with detailed reporting.
        """
        # Normalize texts for comparison
        chunk_text = ' '.join(chunk.split())
        
        # Track missing content by section type
        missing_by_type = {}
        
        for section in original_sections:
            section_text = ' '.join(section.text.split())
            
            # For headers and front matter, require exact matches
            if section.type in [SectionType.HEADER, SectionType.FRONT_MATTER]:
                if section_text not in chunk_text:
                    missing_by_type.setdefault(section.type, []).append(section.text)
                continue
            
            # For other content, use sliding window detection
            words = section_text.split()
            window_size = 5
            missing_chunks = set()
            
            i = 0
            while i < len(words) - window_size:
                sequence = ' '.join(words[i:i+window_size])
                if sequence not in chunk_text:
                    # Find complete phrase
                    start = i
                    while start > 0 and ' '.join(words[start-1:i+window_size]) not in chunk_text:
                        start -= 1
                    end = i + window_size
                    while end < len(words) and ' '.join(words[i:end+1]) not in chunk_text:
                        end += 1
                    missing_chunks.add(' '.join(words[start:end]))
                    i = end
                else:
                    i += 1
            
            if missing_chunks:
                missing_by_type.setdefault(section.type, []).extend(missing_chunks)
        
        # Report missing content by type
        if missing_by_type:
            self.log_message("Missing content detected:")
            for section_type, missing_content in missing_by_type.items():
                self.log_message(f"\n{section_type.value}:")
                for content in missing_content:
                    self.log_message(f"  - {content[:100]}...")
            return False
        
        return True

    def combine_sections(self, sections: List[Section]) -> str:
        """Safely combine sections with proper spacing."""
        if not sections:
            return ""
            
        parts = []
        for section in sections:
            if parts:  # Add spacing between sections
                parts.append("")
                if section.type == SectionType.HEADER:
                    parts.append("")  # Extra space for headers
            parts.append(section.text.rstrip())
            
        return "\n".join(parts)

    def save_chunk_log(self, chunk_num: int, original_chunk: str, 
                    sections: List[str], metrics: Dict):
        """Save processing logs with proper error handling."""
        try:
            log_file = os.path.join(self.log_dir, f"chunk_{chunk_num:04d}.json")
            
            # Safely calculate averages
            num_sections = len(sections) if sections else 1
            total_chars = sum(len(s) for s in sections) if sections else 0
            
            log_data = {
                "chunk_number": chunk_num,
                "timestamp": datetime.now().isoformat(),
                "original_text": {
                    "content": original_chunk[:1000],  # Limit size
                    "length": len(original_chunk),
                    "tokens": self.count_tokens(original_chunk)
                },
                "sections": {
                    "count": len(sections),
                    "average_length": total_chars / num_sections,
                    "content": [s[:1000] for s in sections]  # Limit size
                },
                "metrics": metrics
            }
            
            with open(log_file, 'w', encoding='utf-8') as f:
                json.dump(log_data, f, indent=2, ensure_ascii=False)
                
        except Exception as e:
            self.log_message(f"Error saving chunk log: {str(e)}")
            
    def process_text(self, text: str, max_chunks: int = None) -> List[str]:
        """
        Main text processing pipeline with enhanced error handling and logging.
        """
        self.log_message("Starting text processing")
        
        try:
            # Create initial chunks
            initial_chunks = self.create_initial_chunks(text)
            
            if max_chunks:
                initial_chunks = initial_chunks[:max_chunks]
                self.log_message(f"Processing limited to first {max_chunks} chunks")
            
            # Process each chunk
            semantic_chunks = []
            for i, chunk in enumerate(initial_chunks):
                self.log_message(f"\nProcessing chunk {i+1}/{len(initial_chunks)}")
                
                # Detailed chunk analysis
                chunk_tokens = self.count_tokens(chunk)
                self.log_message(f"Chunk size: {len(chunk)} chars, {chunk_tokens} tokens")
                
                # Print input preview
                self.print_separator("INPUT CHUNK")
                print(f"Chunk {i+1}:")
                print("="*80)
                print(chunk[:1000] + "..." if len(chunk) > 1000 else chunk)
                print("="*80)
                
                # Get semantic sections
                sections, metrics = self.get_semantic_sections(chunk)
                
                # Process and validate sections
                self.print_separator("SEMANTIC SECTIONS")
                for j, section in enumerate(sections):
                    section_tokens = self.count_tokens(section)
                    print(f"\nSection {j+1} ({section_tokens} tokens):")
                    print("-"*40)
                    print(section[:500] + "..." if len(section) > 500 else section)
                    print("-"*40)
                
                # Print metrics
                self.print_separator("PROCESSING METRICS")
                pprint(metrics)
                
                if self.missed_text:
                    self.print_separator("MISSED CONTENT")
                    print(self.missed_text)
                
                semantic_chunks.extend(sections)
                
                # Save detailed processing log
                self.save_chunk_log(i+1, chunk, sections, metrics)
                
                time.sleep(1)  # Rate limiting
            
            self.log_message(f"Processing complete. Created {len(semantic_chunks)} semantic chunks")
            return semantic_chunks
            
        except Exception as e:
            self.log_message(f"Error in text processing: {str(e)}")
            raise

def main():
    """
    Main execution function with enhanced error handling and reporting.
    """
    try:
        # Initialize chunker
        chunker = SemanticChunker()
        
        # Read input file
        input_file = "/home/ubuntu/quantumLeap/data/input/Step_2_Classic_Texts_and_Ethnographies/2.1_Public_Domain_Books/Project_Gutenberg/psychology_of_unconscious.txt"
        
        if not os.path.exists(input_file):
            raise FileNotFoundError(f"Input file not found: {input_file}")
        
        with open(input_file, 'r', encoding='utf-8') as f:
            text = f.read()
        
        chunker.log_message(f"Starting processing of {input_file}")
        chunker.log_message(f"Input text: {len(text)} chars, {chunker.count_tokens(text)} tokens")
        
        # Process text with limit for testing
        semantic_chunks = chunker.process_text(text, max_chunks=3)
        
        # Save final chunks
        output_dir = os.path.join(chunker.log_dir, "semantic_chunks")
        os.makedirs(output_dir, exist_ok=True)
        
        for i, chunk in enumerate(semantic_chunks):
            output_file = os.path.join(output_dir, f"semantic_chunk_{i+1:04d}.txt")
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(chunk)
        
        chunker.log_message(f"Saved {len(semantic_chunks)} semantic chunks to {output_dir}")
        
        # Generate processing summary
        summary_file = os.path.join(chunker.log_dir, "processing_summary.json")
        summary = {
            "timestamp": datetime.now().isoformat(),
            "input_file": input_file,
            "input_stats": {
                "chars": len(text),
                "tokens": chunker.count_tokens(text)
            },
            "output_stats": {
                "total_chunks": len(semantic_chunks),
                "total_tokens": sum(chunker.count_tokens(c) for c in semantic_chunks),
                "avg_chunk_size": sum(len(c) for c in semantic_chunks) / len(semantic_chunks)
            },
            "output_directory": output_dir
        }
        
        with open(summary_file, 'w', encoding='utf-8') as f:
            json.dump(summary, f, indent=2)
        
    except Exception as e:
        print(f"Error in main execution: {str(e)}")
        raise

def test_structure_analysis():
    """
    Enhanced test function with detailed validation and reporting.
    """
    chunker = SemanticChunker()
    
    test_text = """
                             AUTHOR'S NOTE

My task in this work has been to investigate an individual phantasy
system, and in the doing of it problems of such magnitude have been
uncovered, that my endeavor to grasp them in their entirety has
necessarily meant only a superficial orientation toward those paths, the
opening and exploration of which may possibly crown the work of future
investigators with success.

                                CONTENTS

        INTRODUCTION                                                     3
        
        Relation of the Incest Phantasy to the Oedipus Legend—Moral
        revulsion over such a discovery

 I.—    CONCERNING THE TWO KINDS OF THINKING                             8
"""
    
    try:
        print("\nTesting structural analysis...")
        sections = chunker.analyze_text_structure(test_text)
        
        print("\nIdentified sections:")
        for i, section in enumerate(sections, 1):
            print(f"\nSection {i}:")
            print("="*80)
            print(f"Type: {section.type}")
            print(f"Level: {section.level}")
            print(f"Length: {len(section.text)} chars, {chunker.count_tokens(section.text)} tokens")
            print("-"*40)
            print(section.text)
            print("="*80)
        
        print("\nTesting chunking with structure preservation...")
        chunks = chunker.create_initial_chunks(test_text)
        
        print("\nResulting chunks:")
        for i, chunk in enumerate(chunks, 1):
            print(f"\nChunk {i}:")
            print("="*80)
            print(chunk)
            print("="*80)
            
        print("\nValidating chunk content...")
        for i, chunk in enumerate(chunks, 1):
            print(f"\nValidating chunk {i}:")
            is_valid = chunker.validate_chunk(chunk, sections)
            print(f"Chunk {i} validation: {'PASSED' if is_valid else 'FAILED'}")
            
        # Generate test summary
        test_summary = {
            "timestamp": datetime.now().isoformat(),
            "sections_identified": len(sections),
            "chunks_created": len(chunks),
            "section_types": {str(s.type): sum(1 for sec in sections if sec.type == s.type) for s in sections},
            "validation_results": [chunker.validate_chunk(c, sections) for c in chunks]
        }
        
        print("\nTest Summary:")
        pprint(test_summary)
            
    except Exception as e:
        print(f"Error during testing: {str(e)}")
        raise

if __name__ == "__main__":
    if os.environ.get("SEMANTIC_CHUNKER_TEST"):
        test_structure_analysis()
    else:
        main()

[2024-11-10 01:49:52] Starting processing of /home/ubuntu/quantumLeap/data/input/Step_2_Classic_Texts_and_Ethnographies/2.1_Public_Domain_Books/Project_Gutenberg/psychology_of_unconscious.txt
[2024-11-10 01:49:52] Input text: 845623 chars, 199664 tokens
[2024-11-10 01:49:52] Starting text processing
[2024-11-10 01:49:52] 
Processing chunk 1
[2024-11-10 01:49:52] Starting get_complete_paragraphs with 845623 chars of text
[2024-11-10 01:49:52] Found 2957 sections
[2024-11-10 01:49:52] Processing section 1: SectionType.HEADER, 9 tokens
[2024-11-10 01:49:52] Processing section 2: SectionType.HEADER, 20 tokens
[2024-11-10 01:49:52] Processing section 3: SectionType.HEADER, 12 tokens
[2024-11-10 01:49:52] Processing section 5: SectionType.HEADER, 12 tokens
[2024-11-10 01:49:52] Processing section 7: SectionType.HEADER, 12 tokens
[2024-11-10 01:49:52] Processing section 9: SectionType.HEADER, 14 tokens
[2024-11-10 01:49:52] Processing section 11: SectionType.HEADER, 11 tokens
[2024-11-10 01:4

ZeroDivisionError: division by zero

In [5]:
# # ----------------------------- #
# # Part 3: Parse Text into Discourse Units
# # ----------------------------- #

def parse_discourse_units(text, overwrite=False):
    """
    Parses text into discourse units using spaCy.
    Currently splits text into sentences.
    """
    paragraphs = text.split('\n\n')
    paragraphs = [para.strip() for para in paragraphs if para.strip()]

    discourse_units = []
    for para in paragraphs:
        doc = nlp(para)
        sentences = [sent.text for sent in doc.sents]
        discourse_units.extend(sentences)

    output_path = '/home/ubuntu/quantumLeap/data/preprocess/Step_2_Classic_Texts_and_Ethnographies/2.1_Public_Domain_Books/Project_Gutenberg/psychology_of_unconscious_discourse_units.json'
    if not os.path.exists(output_path) or overwrite:
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        with open(output_path, 'w') as f:
            json.dump(discourse_units, f, ensure_ascii=False, indent=4)
    else:
        with open(output_path, 'r') as f:
            discourse_units = json.load(f)

    print(f"Total Discourse Units: {len(discourse_units)}")
    return discourse_units

In [6]:
# ----------------------------- #
# Part 4: Create Chunks Using Hybrid Strategy
# ----------------------------- #

def create_chunks(discourse_units, tokenizer, max_length=4096, overlap_size=1, overwrite=False):
    """
    Creates chunks from discourse units using a sliding window with overlapping chunks.
    Optimized to work directly with token IDs and utilize efficient list operations.
    """
    chunks = []
    current_chunk_tokens = []
    current_length = 0

    for unit in discourse_units:
        unit_tokens = tokenizer.encode(unit, add_special_tokens=True)
        unit_length = len(unit_tokens)

        if current_length + unit_length <= max_length:
            current_chunk_tokens.extend(unit_tokens)
            current_length += unit_length
        else:
            # Decode and append the current chunk
            chunk_text = tokenizer.decode(
                current_chunk_tokens, skip_special_tokens=True)
            chunks.append(chunk_text)

            # Prepare overlap tokens
            overlap_tokens = current_chunk_tokens[-overlap_size:]
            current_chunk_tokens = overlap_tokens + unit_tokens
            current_length = len(current_chunk_tokens)

    # Append any remaining tokens as the last chunk
    if current_chunk_tokens:
        chunk_text = tokenizer.decode(
            current_chunk_tokens, skip_special_tokens=True)
        chunks.append(chunk_text)

    # Write or read chunks as before
    output_path = '/home/ubuntu/quantumLeap/data/preprocess/Step_2_Classic_Texts_and_Ethnographies/2.1_Public_Domain_Books/Project_Gutenberg/psychology_of_unconscious_chunks.json'
    if not os.path.exists(output_path) or overwrite:
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        with open(output_path, 'w') as f:
            json.dump(discourse_units, f, ensure_ascii=False, indent=4)
    else:
        with open(output_path, 'r') as f:
            discourse_units = json.load(f)

    print(f"Total Chunks Created: {len(chunks)}")
    return chunks

In [7]:
# ----------------------------- #
# Part 5: Create and Tokenize Dataset
# ----------------------------- #

# To Do - make book titles and prompt generic so
def create_tokenized_dataset(chunks, tokenizer, max_length=1024, num_proc=2):

    # Create a Dataset object from chunks

    book_title = 'Psychology of the Unconscious by C. G. Jung'
    wikipedia_prompt = """
    Psychology Book

    ### Title: {}

    ### Article: {}
    """

    EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN

    def formatting_prompts_func(examples):
        titles = book_title
        texts = examples["text"]
        outputs = []
        for title, text in zip([book_title]*len(chunks), texts):
            text = wikipedia_prompt.format(title, text) + EOS_TOKEN
            outputs.append(text)
        return {"text": outputs, }
    pass

    # convert chunks variable to huggingface dataset

    from datasets import Dataset

    dataset = Dataset.from_dict({"text": chunks})

    dataset = dataset.map(formatting_prompts_func,
                          batched=True, num_proc=num_proc)
    # Split the dataset into training and validation sets
    split = dataset.train_test_split(test_size=0.1, seed=42)
    train_dataset = split['train']
    eval_dataset = split['test']

    print(len(dataset))
    # Find the maximum length of the text field in the entire dataset
    max_length = max(len(text) for text in dataset['text'])
    print(f"The maximum length of the text field in the dataset is: {max_length} characters")
    print(f"Training Dataset Size: {len(train_dataset)}")
#     print(f"First 5 rows of training dataset: {train_dataset[:5]}")
    print(f"Validation Dataset Size: {len(eval_dataset)}")
#     print(f"First 5 rows of validation dataset: {eval_dataset[:5]}")
    return train_dataset, eval_dataset

In [8]:
# ----------------------------- #
# Part 6: Set up environment and other important utilities
# ----------------------------- #

def setup_environment():
    """
    Initializes the Accelerator for distributed training.
    """
    return Accelerator()


def get_custom_lr_scheduler(optimizer, num_warmup_steps, num_training_steps, initial_phase_steps):
    """
    Defines a custom learning rate scheduler with warmup, constant, and linear annealing phases.
    """
    def lr_lambda(current_step):
        if current_step < num_warmup_steps:
            return current_step / num_warmup_steps  # Linear warmup
        elif current_step < initial_phase_steps:
            return 1.0  # Constant learning rate for initial phase
        else:
            # Linear annealing for the remaining steps
            return 1.0 - ((current_step - initial_phase_steps) / (num_training_steps - initial_phase_steps))

    return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)


def setup_training_steps(initial_rows, annealing_rows, batch_size, gradient_accumulation_steps, num_epochs):
    """
    Calculates total and initial training steps based on dataset size and training parameters.
    """
    total_rows = initial_rows + annealing_rows
    total_steps = (total_rows * num_epochs) // (batch_size *
                                                gradient_accumulation_steps)
    initial_steps = (initial_rows * num_epochs) // (batch_size *
                                                    gradient_accumulation_steps)
    return max(1, total_steps), max(1, initial_steps)


def print_memory_usage(step_desc):
    """
    Prints the CUDA memory summary if CUDA is available.
    """
    if torch.cuda.is_available():
        print(f"Memory Usage at {step_desc}:")
        print(torch.cuda.memory_summary())
        print("\n")
    else:
        print(f"No CUDA available at {step_desc}.\n")


def inference(model, tokenizer):
    """
    Runs inference using the trained model.
    """
    # Define sample prompts
    prompts = [
        "Explain what is medical anthropology and its importance in elevating the quality of life?",
        "Explain what are the types of Jungian archetypes and how they manifest in the human psyche?"
    ]

    for prompt in prompts:
        inputs = tokenizer(prompt, return_tensors='pt').to('cuda')
        with torch.no_grad():
            outputs = model.generate(**inputs, max_length=256)
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(f"Prompt: {prompt}\nGenerated Text: {generated_text}\n")
        
def compute_metrics(eval_pred):
    """
    Computes perplexity based on model predictions and labels.
    """
    logits, labels = eval_pred
    # Convert to torch tensors
    logits = torch.tensor(logits)
    labels = torch.tensor(labels)
    
    # Ensure shapes match
    if logits.shape[:2] != labels.shape:
        raise ValueError(f"Logits shape {logits.shape} does not match labels shape {labels.shape}")
    
    # Shift logits and labels
    shift_logits = logits[:, :-1, :].contiguous()
    shift_labels = labels[:, 1:].contiguous()

    # Check label values
    if shift_labels.max() >= model.config.vocab_size:
        raise ValueError(f"Label value {shift_labels.max()} exceeds vocab size {model.config.vocab_size}")
    
    loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100, reduction='mean')
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
    perplexity = torch.exp(loss).item()
    return {"perplexity": perplexity}

#  Login to Huggingface
from huggingface_hub import login
import os
from dotenv import load_dotenv

load_dotenv()

def setup_huggingface_access():
    """Setup Hugging Face access with token from environment or manual input"""
    # First try to get token from environment variable
    token = os.getenv('HUGGINGFACE_TOKEN')
    
    if not token:
        # If not in environment, prompt for token
        token = input("Enter your Hugging Face token: ")
        
    if token:
        try:
            login(token, add_to_git_credential=True)
            print("Successfully logged in to Hugging Face!")
        except Exception as e:
            print(f"Failed to log in to Hugging Face: {str(e)}")
            return False
    else:
        print("No Hugging Face token provided")
        return False
    
    return True

In [9]:
def load_model_and_tokenizer(base_model_slug, max_seq_length=4096, dtype=None, load_in_4bit=True):
    """
    Load and configure the model and tokenizer with specified parameters on a single GPU.
    """
    import torch
    import os

    # Force CUDA if available
    if torch.cuda.is_available():
        print("CUDA is available.")
        print(f"Using GPU: {torch.cuda.get_device_properties(0).name}")
        device = torch.device("cuda:0")
        device_map = {"": 0}  # Force everything to GPU 0
    else:
        print("WARNING: CUDA is not available. Using CPU.")
        device = torch.device("cpu")
        device_map = "cpu"

    # Print initial GPU memory
    if torch.cuda.is_available():
        print(f"Initial GPU Memory: {torch.cuda.memory_allocated(0)/1024**3:.2f}GB")

    try:
        # Load base model and tokenizer
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name=base_model_slug,
            max_seq_length=max_seq_length,
            dtype=dtype,
            load_in_4bit=load_in_4bit,
            device_map=device_map,
            token=os.getenv('HUGGINGFACE_TOKEN'),
        )
        
        print(f"Model device after loading: {next(model.parameters()).device}")
        
        # Configure PEFT model
        model = FastLanguageModel.get_peft_model(
            model,
            r=128,
            target_modules=[
                "q_proj", "k_proj", "v_proj", "o_proj",
                "gate_proj", "up_proj", "down_proj",
                "embed_tokens", "lm_head",
            ],
            lora_alpha=32,
            lora_dropout=0,
            bias="none",
            use_gradient_checkpointing="unsloth",
            random_state=3407,
            use_rslora=True,
            loftq_config=None,
        )
        
        # Ensure model is on GPU after PEFT configuration
        if torch.cuda.is_available():
            model = model.to(device)
            
        # Verify final device placement
        print(f"Final model device: {next(model.parameters()).device}")
        
        # Print GPU memory usage
        if torch.cuda.is_available():
            print(f"\nGPU Memory After Complete Setup: {torch.cuda.memory_allocated(0)/1024**3:.2f}GB")
            
    except Exception as e:
        print(f"Error in model loading/configuration: {str(e)}")
        raise

    return model, tokenizer

In [None]:
import os

# Set the environment variable
os.environ["HUGGINGFACE_TOKEN"] = "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"
os.environ["WANDB_API_KEY"] = "1ca3c5e9222c2504acbc07cf7f88267006ae68c4"
# Verify it's set correctly
print(os.getenv("HUGGINGFACE_TOKEN"))
print(os.getenv("WANDB_API_KEY"))

In [None]:
# ----------------------------- #
# Part 2: Configure Environment Variables & Create Main Variables
# ----------------------------- #

# Unsloth modell initialization variables
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
max_length = max_seq_length
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
# device_map = "auto"
base_model_slug = "Qwen/Qwen2.5-7B-Instruct"
base_model_name = "lora_model_pum"
chunks_max_length = max_seq_length
overlap_size = 1
# Define your parameters
batchSize = 2
ga = 8
maxSteps = 10
warmupSteps = 10
numTrainEpochs = 1
lRate = 5e-5
embLRate = 1e-5
optim = "adamw_8bit"
lrSchedule = "linear"
dataset_slug = "psychology_of_unconscious"

from datetime import datetime
import pytz
import wandb
# Get the current date and time in Indian Standard Time (IST)
ist = pytz.timezone('Asia/Kolkata')
current_datetime = datetime.now(ist)

# Format the datetime string
# Example format: 20240428_153045 (YYYYMMDD_HHMMSS)
formatted_datetime = current_datetime.strftime("%Y%m%d_%H%M%S")

# Define Run Name
run_name = f"""Kaggle-quantumLeap-{formatted_datetime}-{base_model_slug}-{dataset_slug}-{max_seq_length}_maxSeqLength-{max_length}_maxLength-{batchSize}_batchSize-{ga}_ga-{maxSteps}_maxSteps-{numTrainEpochs}_numTrainEpochs-{lRate}_lRate-{embLRate}_embLRate-{optim}_optim-{lrSchedule}_lrSchedule"""

# Initialize Weights & Biases
# It's recommended to set your W&B API key as an environment variable for security.
wandb.login(key=os.getenv("WANDB_API_KEY"))
wandb.init(project="OLA-quantumLeap", name=run_name)

In [None]:

# ----------------------------- #
# Part 9: Data Processing
# ----------------------------- #

# # Perform Inference Before Training
# inference(model, tokenizer)

# Set number of processes to use for data loading
num_cpus = multiprocessing.cpu_count()
num_proc = max(num_cpus-2, 2)  # Adjust based on prior recommendations
print(f"Number of CPU cores: {num_cpus}")
print(f"Number of processes: {num_proc}")

# Login to Hugging Face
if not setup_huggingface_access():
    raise Exception("Failed to setup Hugging Face access. Please check your token.")

# Load Model and Tokenizer
model, tokenizer = load_model_and_tokenizer(base_model_slug)
print(f"Model Device: {model.device}")

# Load and Clean Text Data
file_path = "/home/ubuntu/quantumLeap/data/input/Step_2_Classic_Texts_and_Ethnographies/2.1_Public_Domain_Books/Project_Gutenberg/psychology_of_unconscious.txt"
clean_text = load_and_clean_text(file_path)

# Parse Discourse Units
discourse_units = parse_discourse_units(clean_text, overwrite=True)

# Create Chunks
chunks = create_chunks(
    discourse_units,
    tokenizer,
    max_length=max_length,
    overlap_size=overlap_size,
    overwrite=True,
)

# Create Tokenized Dataset
train_dataset, eval_dataset = create_tokenized_dataset(
    chunks, tokenizer, max_length)

# Save datasets as Hugging Face `datasets`
train_dataset.save_to_disk('/home/ubuntu/quantumLeap/data/preprocess/Step_2_Classic_Texts_and_Ethnographies/2.1_Public_Domain_Books/Project_Gutenberg/train_dataset')
eval_dataset.save_to_disk('/home/ubuntu/quantumLeap/data/preprocess/Step_2_Classic_Texts_and_Ethnographies/2.1_Public_Domain_Books/Project_Gutenberg/eval_dataset')

### To Do - Make the below as dynamic and as a functio
# # Uncomment following if you want to just load the data from temp directory
# from datasets import load_from_disk

# train_dataset = load_from_disk('/home/ubuntu/quantumLeap/data/preprocess/Step_2_Classic_Texts_and_Ethnographies/2.1_Public_Domain_Books/Project_Gutenberg/train_dataset')
# eval_dataset = load_from_disk('/home/ubuntu/quantumLeap/data/preprocess/Step_2_Classic_Texts_and_Ethnographies/2.1_Public_Domain_Books/Project_Gutenberg/eval_dataset')

In [None]:
break

In [None]:
from transformers import IntervalStrategy
from transformers.integrations import TensorBoardCallback

import wandb

trainer = UnslothTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,  # Use 10% of data for evaluation
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,

    args = UnslothTrainingArguments(
        per_device_train_batch_size = batchSize,
        gradient_accumulation_steps = ga,

        # Set both max_steps and num_train_epochs
        max_steps = maxSteps,
        num_train_epochs = numTrainEpochs,

        # Use a single learning rate for all parameters
        learning_rate = lRate,

        # Warmup strategy from successful runs
        warmup_steps = warmupSteps,
        # warmup_ratio = 0,

        # Explicitly set precision based on hardware support
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        
        logging_steps = 1,
        
        optim = optim,
        weight_decay = 0.01,
        lr_scheduler_type = lrSchedule,
        
        seed = 3407,
        output_dir = "outputs",
        
        report_to=["tensorboard", "wandb"],
        logging_dir=f"./trel-fft-logs/{run_name}",
        
        # Set both save and evaluation strategies to 'steps'
        # save_strategy = IntervalStrategy.STEPS,
        # eval_strategy = IntervalStrategy.STEPS,
        # save_steps = 1,  # Save checkpoint every 20 steps
        # eval_steps = 1,  # Evaluate every 20 steps (matching save_steps)
        
        # load_best_model_at_end = True,
        # metric_for_best_model = "eval_loss",
    ),
    # compute_metrics = compute_metrics,
)

In [None]:
# %%time

# instruction_prompt = """Below is an instruction that describes a concept in the field of psychology, sociology, anthropology, ethnography, or qualitative research or cultural studies. Write a response that appropriately completes the request.

# ### Instruction: Given the concept and its detailed explanation, provide an example scenario that illustrates the concept.
# concept_name: {}
# detailed_explanation: {}

# ### Response:
# {}"""

# FastLanguageModel.for_inference(model) # Enable native 2x faster inference
# inputs = tokenizer(
# [
#     instruction_prompt.format(
#         "Hero Archetype", # concept_name
#         "The hero archetype is a common motif in literature and folklore, representing a protagonist who embodies bravery, resilience, and a quest for a greater purpose.", # detailed_explanation
#         "", # output - leave this blank for generation!
#     )
# ], return_tensors = "pt").to("cuda")

# outputs = model.generate(**inputs, max_new_tokens = 1024, use_cache = True)
# tokenizer.batch_decode(outputs)


# %%time
# # Text Streaming goes into a loop and doesnt adher to EOS

# from transformers import TextStreamer
# text_streamer = TextStreamer(tokenizer)
# _ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 1024)


# inputs = tokenizer(
# [
#     instruction_prompt.format(
#         "Hero Archetype", # concept_name
#         "The hero archetype is a common motif in literature and folklore, representing a protagonist who embodies bravery, resilience, and a quest for a greater purpose.", # detailed_explanation
#         "", # output - leave this blank for generation!
#     )
# ], return_tensors = "pt").to("cuda")



# from transformers import TextStreamer
# text_streamer = TextStreamer(tokenizer)
# _ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 256,
#                    repetition_penalty = 0.1)

In [None]:
# ----------------------------- #
# Part 11: Start Training
# ----------------------------- #

#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

trainer_stats = trainer.train()

#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")



In [None]:
import wandb
from pprint import pprint

def get_run_config(project_name, run_id):
    try:
        # Initialize the wandb API
        api = wandb.Api()

        # Access the specific run
        run = api.run(f"{project_name}/{run_id}")

        # Get the full configuration
        config = run.config

        # Filter for trainer-specific configuration
        trainer_config = {k: v for k, v in config.items() if k.startswith(('train', 'learning', 'optim', 'fp16', 'bf16', 'gradient', 'weight_decay', 'warmup', 'max_steps', 'num_train_epochs', 'per_device'))}

        return trainer_config

    except wandb.errors.CommError:
        print(f"Error: Unable to access run {run_id}. Make sure the run ID is correct and you have the necessary permissions.")
        return None
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

# Usage
project_name = "olabs-asia-olabs-pro/OLA-quantumLeap"
run_id = "we4axhd1"

trainer_config = get_run_config(project_name, run_id)

if trainer_config:
    print(f"Trainer configuration for run {run_id}:")
    pprint(trainer_config)

In [None]:
# %%time

# instruction_prompt = """Below is an instruction that describes a concept in the field of psychology, sociology, anthropology, ethnography, or qualitative research or cultural studies. Write a response that appropriately completes the request.

# ### Instruction: Given the concept and its detailed explanation, provide an example scenario that illustrates the concept.
# concept_name: {}
# detailed_explanation: {}

# ### Response:
# {}"""

# FastLanguageModel.for_inference(model) # Enable native 2x faster inference
# inputs = tokenizer(
# [
#     instruction_prompt.format(
#         "Hero Archetype", # concept_name
#         "The hero archetype is a common motif in literature and folklore, representing a protagonist who embodies bravery, resilience, and a quest for a greater purpose.", # detailed_explanation
#         "", # output - leave this blank for generation!
#     )
# ], return_tensors = "pt").to("cuda")

# outputs = model.generate(**inputs, max_new_tokens = 1024, use_cache = True)
# tokenizer.batch_decode(outputs)


# %%time
# # Text Streaming goes into a loop and doesnt adher to EOS

# from transformers import TextStreamer
# text_streamer = TextStreamer(tokenizer)
# _ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 1024)


# inputs = tokenizer(
# [
#     instruction_prompt.format(
#         "Hero Archetype", # concept_name
#         "The hero archetype is a common motif in literature and folklore, representing a protagonist who embodies bravery, resilience, and a quest for a greater purpose.", # detailed_explanation
#         "", # output - leave this blank for generation!
#     )
# ], return_tensors = "pt").to("cuda")



# from transformers import TextStreamer
# text_streamer = TextStreamer(tokenizer)
# _ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 256,
#                    repetition_penalty = 0.1)

In [None]:
import time
import os

# Create timestamp
timestamp = int(time.time())

# Create directory if it doesn't exist
save_dir = f"/root/quantumLeap/models/qLeap_model_v0_{timestamp}"
os.makedirs(save_dir, exist_ok=True)

# Save functions with explicit paths
def save_model_versions(model, tokenizer, timestamp, token):
    """
    Save model in different formats with proper error handling
    """
    try:
        # Save base model locally
        print("Saving base model locally...")
        # model.save_pretrained(f"{save_dir}/base")
        # tokenizer.save_pretrained(f"{save_dir}/base")
        
        # Save 8-bit Q8_0 version
        print("Saving 8-bit Q8_0 version...")
        try:
            model.push_to_hub_gguf(
                f"olabs-ai/qLeap_model_v0_8bit_Q8_{timestamp}",
                tokenizer,
                token=token,
                quantization_method="q8_0"
            )
            print("Successfully saved 8-bit model")
        except Exception as e:
            print(f"Error saving 8-bit model: {str(e)}")
            
        # Optional: Save 16-bit version
        print("Saving 16-bit version...")
        try:
            model.push_to_hub_gguf(
                f"olabs-ai/qLeap_model_v0_16bit_GGUF_{timestamp}",
                tokenizer,
                quantization_method="f16",
                token=token
            )
            print("Successfully saved 16-bit model")
        except Exception as e:
            print(f"Error saving 16-bit model: {str(e)}")
            
    except Exception as e:
        print(f"Error in save process: {str(e)}")
        raise

# Call the save function
huggingface_token = "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"
save_model_versions(model, tokenizer, timestamp, huggingface_token)

### if the loss from earlier training is too high try training arguments from unsloth colab notebook "Llama-3.1 8b + Unsloth 2x faster finetuning.ipynb". URL below
### https://colab.research.google.com/drive/1Ys44kVvmeZtnICzWz0xgpRnrIOjZAuxp?usp=sharing#scrollTo=95_Nn-89DhsL

In [None]:
break

# Dataset creation based on the book itself using AugmenToolkit

# Instruction  Tuning

In [None]:

# Instruction FineTune - Create an instruction_pompt based on the concept_examples.csv file

import json
import ast
import logging

import csv

with open('/root/quantumLeap/data/psychologoy-of-unconscious-mind/concept_examples.csv', 'r') as f:
    reader = csv.DictReader(f)
    data = list(reader)
    
type(data)


# Configure logging
logging.basicConfig(
    filename='transformation_errors.log',
    filemode='w',
    level=logging.ERROR,
    format='%(levelname)s:%(message)s'
)

# Sample original data
original_data = data

def transform_data(original_data):
    """
    Transforms the original data by expanding 'example_scenario' into separate dictionaries.

    Parameters:
        original_data (list): List of dictionaries with 'concept_name', 'detailed_explanation', and 'example_scenario'.

    Returns:
        new_data (list): Transformed list with one 'example_scenario' per dictionary.
    """
    new_data = []

    for idx, entry in enumerate(original_data, start=1):
        concept_name = entry.get('concept_name', '').strip()
        detailed_explanation = entry.get('detailed_explanation', '').strip()
        example_scenario_str = entry.get('example_scenario', '').strip()

        if not concept_name or not detailed_explanation or not example_scenario_str:
            logging.error(f"Entry {idx} is missing required fields. Skipping.")
            continue

        # Attempt to parse with json.loads
        try:
            example_scenarios = json.loads(example_scenario_str)
            if not isinstance(example_scenarios, list):
                raise ValueError("Parsed 'example_scenario' is not a list.")
        except json.JSONDecodeError:
            # Fallback to ast.literal_eval
            try:
                example_scenarios = ast.literal_eval(example_scenario_str)
                if not isinstance(example_scenarios, list):
                    raise ValueError("Parsed 'example_scenario' is not a list.")
            except (ValueError, SyntaxError) as e:
                logging.error(f"Entry {idx} ('{concept_name}') has invalid 'example_scenario': {e}")
                continue

        # Iterate through each scenario and create a new entry
        for scenario_idx, scenario in enumerate(example_scenarios, start=1):
            if not isinstance(scenario, str):
                logging.error(f"Entry {idx} ('{concept_name}') has non-string scenario at position {scenario_idx}. Skipping this scenario.")
                continue

            new_entry = {
                'concept_name': concept_name,
                'detailed_explanation': detailed_explanation,
                'example_scenario': scenario.strip()
            }
            new_data.append(new_entry)

    return new_data

# Transform the data
transformed_data = transform_data(original_data)

# Optional: Save the transformed data to a JSON file
with open('/root/quantumLeap/data/psychologoy-of-unconscious-mind/transformed_data.json', 'w', encoding='utf-8') as f:
    json.dump(transformed_data, f, ensure_ascii=False, indent=4)

print(f"Transformation complete. {len(transformed_data)} entries created.")
print("Check 'transformation_errors.log' for any errors encountered during transformation.")

print(len(transformed_data))

In [None]:

instruction_prompt = """Below is an instruction that describes a concept in the field of psychology, sociology, anthropology, ethnography, or qualitative research or cultural studies. Write a response that appropriately completes the request.

### Instruction: Given the concept and its detailed explanation, provide an example scenario that illustrates the concept.
concept_name: {}
detailed_explanation: {}

### Response:
{}"""


EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

def instruction_prompt_func(examples):
    concept_name = examples["concept_name"]
    detailed_explanation = examples["detailed_explanation"]
    example_scenario = examples["example_scenario"]
    return { "text" : instruction_prompt.format(concept_name, detailed_explanation, example_scenario), }
pass


# convert transformed_data to a huggingface dataset
instruction_dataset = Dataset.from_dict(transformed_data)
instruction_dataset = instruction_dataset.map(instruction_prompt_func, batched = True,)

from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments

trainer = UnslothTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = instruction_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 8,

    args = UnslothTrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 8,

        # Use num_train_epochs and warmup_ratio for longer runs!
        max_steps = 120,
        warmup_steps = 10,
        # warmup_ratio = 0.1,
        # num_train_epochs = 1,

        # Select a 2 to 10x smaller learning rate for the embedding matrices!
        learning_rate = 5e-5,
        embedding_learning_rate = 1e-5,

        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.00,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)
trainer_stats = trainer.train()
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

In [None]:

instruction_prompt = """Below is an instruction that describes a concept in the field of psychology, sociology, anthropology, ethnography, or qualitative research or cultural studies. Write a response that appropriately completes the request.

### Instruction: Given the concept and its detailed explanation, provide an example scenario that illustrates the concept.
concept_name: {}
detailed_explanation: {}

### Response:
{}"""

FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    instruction_prompt.format(
        "Give an example scenario that illustrates the concept of Hero archetype as described by Jungian psychology.", # instruction
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

# Text Streaming

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

inputs = tokenizer(
[
    instruction_prompt.format(
        "When trying to understand how nature plays a role in the development of a child's personality, which concept should be considered?",
        "", # output - leave this blank for generation!
    ),
], return_tensors = "pt").to("cuda")


from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128,
                   repetition_penalty = 0.1)

In [None]:
# add current timestamp to model name
model.save_pretrained(f"qLeap_model_base_v0_{int(time.time())}") # Local saving
tokenizer.save_pretrained(f"qLeap_model_instruct_v0_{int(time.time())}")
model.push_to_hub(f"olabs-ai/qLeap_model_instruct_v0_{int(time.time())}", token = "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") # Online saving
tokenizer.push_to_hub(f"olabs-ai/qLeap_model_instruct_v0_{int(time.time())}", token = "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") # Online saving
        
# Save to 8bit GGUF
if False: model.save_pretrained_gguf("qLeap_model_v0_8bit_Q8_{int(time.time())}", tokenizer,)
if False: model.push_to_hub_gguf("olabs-ai/qLeap_model_v0_8bit_GGUF_{int(time.time())}", tokenizer,quantization_method = "q8_0", token = "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")

# Save to 16bit GGUF
if False: model.save_pretrained_gguf("qLeap_model_v0_16bit_GGUF_{int(time.time())}", tokenizer, quantization_method = "f16")
if False: model.push_to_hub_gguf("olabs-ai/qLeap_model_v0_16bit_GGUF_{int(time.time())}", tokenizer, quantization_method = "f16", token = "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")

# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf("qLeap_model_v0_q4_k_m_16bit_{int(time.time())}", tokenizer, quantization_method = "q4_k_m")
if False: model.push_to_hub_gguf("olabs-ai/qLeap_model_v0_q4_k_m_16bit_{int(time.time())}", tokenizer, quantization_method = "q4_k_m", token = "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")
if False: model.push_to_hub_gguf("olabs-ai/qLeap_model_v0_q5_k_m_16bit_{int(time.time())}", tokenizer, quantization_method = "q5_k_m", token = "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")



# # Merge to 16bit
# if False: model.save_pretrained_merged("qLeap_model_v0_16bit_merged_{int(time.time())}", tokenizer, save_method = "merged_16bit",)
# if False: model.push_to_hub_merged("olabs-ai/qLeap_model_v0_16bit_merged_{int(time.time())}", tokenizer, save_method = "merged_16bit", token = "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")

# # Merge to 4bit
# if False: model.save_pretrained_merged("qLeap_model_v0_4bit_merged_{int(time.time())}", tokenizer, save_method = "merged_4bit",)
# if False: model.push_to_hub_merged("olabs-ai/qLeap_model_v0_4bit_merged_{int(time.time())}", tokenizer, save_method = "merged_4bit", token = "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")

# # Just LoRA adapters
# if False: model.save_pretrained_merged("qLeap_model_v0_LoRA_merged_{int(time.time())}", tokenizer, save_method = "lora",)
# if False: model.push_to_hub_merged("olabs-ai/qLeap_model_LoRA_merged_{int(time.time())}", tokenizer, save_method = "lora", token = "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")



# Inference

In [None]:

instruction_prompt = """Below is an instruction that describes a concept in the field of psychology, sociology, anthropology, ethnography, or qualitative research or cultural studies. Write a response that appropriately completes the request.

### Instruction: Given the concept and its detailed explanation, provide an example scenario that illustrates the concept.
concept_name: {}
detailed_explanation: {}

### Response:
{}"""

FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    instruction_prompt.format(
        "Give an example scenario that illustrates the concept of Hero archetype as described by Jungian psychology.", # instruction
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

# Text Streaming

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

inputs = tokenizer(
[
    instruction_prompt.format(
        "When trying to understand how nature plays a role in the development of a child's personality, which concept should be considered?",
        "", # output - leave this blank for generation!
    ),
], return_tensors = "pt").to("cuda")


from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128,
                   repetition_penalty = 0.1)