# Git clone qLeap-fft repo inside `/root/` directory
## Ensure you have the latest branch checked out
## Switch to quantumLeap directory

In [None]:
import os

# Configure the CUDA runtime through environment variables. This must run
# before any torch-related module is imported.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "0",
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
    }
)

from pathlib import Path

def ensure_working_directory(target_dir='/home/ubuntu/quantumLeap'):
    """
    Make ``target_dir`` the current working directory, creating it if needed.

    Progress is reported on stdout: the directory we started in, whether a
    switch was necessary, and the directory we ended up in.

    Args:
        target_dir: Directory to switch into. Defaults to the project
            directory used by this notebook, so existing no-argument calls
            behave as before.

    Raises:
        OSError: If the directory cannot be created or entered. The error
            is printed before being re-raised.
    """
    current_dir = os.getcwd()
    print(f"Current directory: {current_dir}")

    # Compare resolved paths so a trailing slash or a symlinked spelling of
    # the same directory is still recognised as "already there".
    if os.path.realpath(current_dir) != os.path.realpath(target_dir):
        try:
            # Creation and the switch share one handler so that a failure in
            # either step is reported the same way.
            Path(target_dir).mkdir(parents=True, exist_ok=True)
            os.chdir(target_dir)
        except OSError as e:
            print(f"Error switching to directory: {str(e)}")
            raise
        print(f"Successfully switched to: {target_dir}")
    else:
        print("Already in correct directory")

    # Verify current directory
    print(f"Working directory: {os.getcwd()}")

# Call the function before your main code so the working directory is set
# (and created, if missing) before anything else in the notebook runs.
ensure_working_directory()

In [None]:
# ----------------------------- #
# Part 1.1: Install and Setup Libraries - for Ola Krutrim Cloud Instance
# ----------------------------- #

# # if executing below in terminal with virtual env, do not need to add --system tag
# pip install uv #install this in the virtual environment where you want to execute the notebook.
# pip install torch==2.4.0 --index-url https://download.pytorch.org/whl/cu121 # as on 07Nov2024, xformers is compatible with torch=2.4.0 only; uv doesnt work for installing torch
# uv pip install packaging ninja
# uv pip install flash-attn --no-build-isolation
# uv pip install unsloth
# python -m xformers.info
# uv pip install wandb bitsandbytes ipywidgets nltk spacy huggingface_hub datasets tqdm Iprogress ipywidgets python-dotenv tensorboard -q

# # restart once you have installed all of the above
# !nvidia-smi
# !nvcc --version
# import torch
# print(torch.__version__)          # Should reflect 2.4.0+cu121 (the build installed above)
# print(torch.version.cuda)         # Should output 12.1
# print(torch.cuda.is_available())  # Should return True

# Restart again so that all the libraries are properly initialized

In [None]:
# ----------------------------- #
# Part 1.2: Import Necessary Libraries
# ----------------------------- #

# General Libraries
import os
import json
import sys
import subprocess
import argparse
import logging
import math
import random
from datetime import datetime
import re
import gc
import weakref
import multiprocessing

# Torch related
import torch
from torch import nn
import torch.distributed as dist

# Transformers related
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    Adafactor
)

# Huggingface TRL for full finetune
from trl import SFTTrainer, SFTConfig

# General huggingface libraries
import huggingface_hub
from datasets import load_dataset, Dataset
from accelerate import Accelerator


# Unsloth-specific libraries
import unsloth
from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments, FastLanguageModel

# Other Libraries
from peft import LoraConfig
import wandb
import nltk
import spacy
# from galore_torch import GaLoreAdamW, GaLoreAdafactor, GaLoreAdamW8bit

# Check and import NLTK and spaCy resources
# Ensure NLTK's punkt tokenizer is available
import nltk
try:
    nltk.data.find('tokenizers/punkt')
    print('punkt was already available.')
except LookupError:
    nltk.download('punkt')
    print('punkt was not available. It has been downloaded')

# Initialize spaCy English model
try:
    nlp = spacy.load('en_core_web_sm')
    print('en_core_web_sm was already available.')
except OSError:
    print("SpaCy English model not found. Downloading...")
    # Run the download with the interpreter that is executing this cell so
    # the model is installed into the same environment spaCy was imported
    # from; a bare `python` on PATH may point somewhere else.
    # check=True turns a failed download into an immediate, explicit error
    # instead of a second, less informative OSError from spacy.load below.
    subprocess.run(
        [sys.executable, '-m', 'spacy', 'download', 'en_core_web_sm'],
        check=True,
    )
    nlp = spacy.load('en_core_web_sm')

In [3]:
# ----------------------------- #
# Part 2: Load and Clean the Text Data
# ----------------------------- #

def load_and_clean_text(file_path):
    """
    Read a UTF-8 text file and return its contents without leading or
    trailing whitespace.

    Note: removal of Project Gutenberg's license and headers/footers is
    currently switched off (see the commented-out patterns below), so that
    material is returned unchanged as part of the text.
    """
    with open(file_path, mode='r', encoding='utf-8') as source:
        raw_text = source.read()

    # Disabled Project Gutenberg stripping, kept for reference:
    #   start_pattern = r'\*\*\* START OF THIS PROJECT GUTENBERG EBOOK.*\*\*\*'
    #   end_pattern = r'\*\*\* END OF THIS PROJECT GUTENBERG EBOOK.*\*\*\*'
    #   text = re.sub(f'.*{start_pattern}', '', text, flags=re.DOTALL)
    #   text = re.sub(f'{end_pattern}.*', '', text, flags=re.DOTALL)

    trimmed = raw_text.strip()
    return trimmed

# Path of the raw input book; replace it with your actual file path.
file_path = '/home/ubuntu/quantumLeap/data/input/Step_2_Classic_Texts_and_Ethnographies/2.1_Public_Domain_Books/Project_Gutenberg/psychology_of_unconscious.txt'
# Read the whole file into memory as one whitespace-trimmed string.
clean_text = load_and_clean_text(file_path)

In [None]:
%pip install t

In [None]:
import tiktoken
from openai import OpenAI
import json
import time
from typing import List, Dict, Tuple, Optional
import numpy as np
import os
from datetime import datetime
from pprint import pprint
import re
from dataclasses import dataclass
from enum import Enum

# Add these new data structures after imports
class SectionType(Enum):
    """Labels for the kinds of structural section a document is split into."""
    HEADER = "header"  # heading line (chapter / sub-chapter / centred title)
    CONTENT = "content"  # body text; also the fallback when no type was set
    QUOTE = "quote"  # presumably a quoted passage -- confirm against usage
    ATTRIBUTION = "attribution"  # presumably a "— Author" style line -- confirm against usage
    LIST = "list"  # presumably an indented list item block -- confirm against usage
    FRONT_MATTER = "front_matter"  # text introduced by an "AUTHOR'S NOTE" line
    TABLE_OF_CONTENTS = "table_of_contents"  # text introduced by a "CONTENTS" line
    
@dataclass
class Section:
    """A contiguous span of document text together with its structural label."""
    text: str  # the section's text (source lines joined with '\n')
    type: SectionType  # structural category of the section
    level: int = 0  # heading level; is_chapter_heading reports 1 (main) or 2 (sub), 0 is the default
    metadata: Optional[Dict] = None  # optional extra data; None when not supplied
    

class SemanticChunker:
    def __init__(self, model_name: str = "meta-llama/Llama-3.2-3B-Instruct"):
        """
        Set up the API client, the tokenizer and a fresh per-run log directory.

        Args:
            model_name: Name of the model to request from the endpoint.
        """
        # OpenAI-compatible client pointed at an endpoint on localhost.
        self.client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
        self.model_name = model_name

        # NOTE(review): token counts come from the gpt-3.5-turbo encoding even
        # though the default model is a Llama model, so they are presumably
        # approximations of that model's real token counts -- confirm.
        self.encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
        self.max_tokens = 3000

        # Every run writes into its own directory, named after the start time.
        run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.log_dir = f"/home/ubuntu/quantumLeap/data/preprocess/Step_2_Classic_Texts_and_Ethnographies/2.1_Public_Domain_Books/Project_Gutenberg/Psychology_Of_Unconscious_Mind/chunks/chunks_{run_stamp}"
        os.makedirs(self.log_dir, exist_ok=True)

        # Processing summary log inside the run directory.
        self.log_file = os.path.join(self.log_dir, "processing_log.txt")

        # Text that was not included in the LLM output; empty until set.
        self.missed_text = ""
        
    def log_message(self, message: str):
        """Write log message with timestamp and print to console"""
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        log_entry = f"[{timestamp}] {message}"
        with open(self.log_file, 'a', encoding='utf-8') as f:
            f.write(log_entry + "\n")
        print(log_entry)
    
    def print_separator(self, message: str = ""):
        """Print a separator line with optional message"""
        print(f"\n{'='*100}")
        if message:
            print(f"{message}")
            print('='*100)
    
    def count_tokens(self, text: str) -> int:
        """Count tokens in text using tiktoken"""
        return len(self.encoding.encode(text))
    
    def find_chapter_breaks(self, text: str) -> List[int]:
        """Find indices where chapters begin (centered headings)"""
        lines = text.split('\n')
        chapter_breaks = []
        
        for i, line in enumerate(lines):
            if self.is_chapter_heading(line):
                chapter_breaks.append(i)
        
        return chapter_breaks
    
    def is_chapter_heading(self, text: str) -> Tuple[bool, int]:
        """
        Enhanced chapter heading detection with level identification.
        Returns (is_heading, level).
        """
        text = text.strip()
        if not text:
            return False, 0
            
        # Chapter patterns
        chapter_patterns = [
            (r'^CHAPTER\s+[IVXL]+', 1),  # Main chapter headers
            (r'^[IVX]+\.\s*—\s*', 2),    # Sub-chapter headers
            (r'^\d+\.\s*—\s*', 2),       # Numbered sections
        ]
        
        for pattern, level in chapter_patterns:
            if re.match(pattern, text, re.I):
                return True, level
        
        # Check for centered text formatting
        line_length = len(text)
        leading_spaces = len(text) - len(text.lstrip())
        trailing_spaces = len(text) - len(text.rstrip())
        
        is_centered = abs(leading_spaces - trailing_spaces) <= 2 and leading_spaces > 5
        is_caps = text.isupper()
        reasonable_length = 10 < len(text.strip()) < 100
        
        if is_centered:
            if is_caps and reasonable_length:
                return True, 1  # Main header
            elif reasonable_length:
                return True, 2  # Sub header
                
        return False, 0
    
    def analyze_text_structure(self, text: str) -> List[Section]:
        """
        Split ``text`` into an ordered list of typed sections.

        The text is scanned line by line. These lines start a new section:

        * an all-upper-case line indented by more than 10 spaces
          -> HEADER, level 1;
        * a line consisting of "CONTENTS" -> TABLE_OF_CONTENTS;
        * a line consisting of "AUTHOR'S NOTE" -> FRONT_MATTER;
        * a line accepted by ``is_chapter_heading`` -> HEADER with the
          level it reports.

        A blank line directly after such a line is kept with it. Headers are
        emitted as sections of their own. Table-of-contents and front-matter
        sections run until two consecutive blank lines are seen (the first of
        the two is dropped). Everything else is collected as CONTENT.
        Sections that contain only whitespace are discarded.

        Args:
            text: The document (or remaining part of it) to analyse.

        Returns:
            Sections in document order. ``level`` on a non-header section is
            the level of the most recent header seen before it (0 if none).
        """
        sections: List[Section] = []
        lines = text.split('\n')
        current_section: List[str] = []
        current_type = None
        current_level = 0

        def flush_section():
            """Emit the lines collected so far as one Section and reset the buffer."""
            nonlocal current_section, current_type
            if not current_section:
                return
            joined = '\n'.join(current_section)
            # Whitespace-only buffers are dropped, but the state is still reset.
            if joined.strip():
                sections.append(Section(
                    text=joined,
                    type=current_type or SectionType.CONTENT,
                    level=current_level
                ))
            current_section = []
            current_type = None

        def begin_section(section_type, first_line, following_line):
            """Close the open section, start a new one, and return the lines consumed (1 or 2)."""
            nonlocal current_section, current_type
            flush_section()
            current_type = section_type
            current_section = [first_line]
            if not following_line.strip():  # Include following blank line
                current_section.append(following_line)
                return 2
            return 1

        in_toc = False
        in_front_matter = False

        i = 0
        while i < len(lines):
            line = lines[i]
            next_line = lines[i + 1] if i + 1 < len(lines) else ""
            stripped = line.strip()

            # Centred, all-caps line: a level-1 header emitted on its own.
            # NOTE: this runs before the CONTENTS check, so a "CONTENTS" line
            # indented by more than 10 spaces is classified as a HEADER.
            if stripped and stripped.isupper() and len(line) - len(line.lstrip()) > 10:
                i += begin_section(SectionType.HEADER, line, next_line)
                # The level is set only after the previous section has been
                # flushed, so that section keeps the level it was built under.
                current_level = 1
                flush_section()
                continue

            # Table of contents marker
            if re.match(r'^\s*CONTENTS\s*$', line, re.I):
                i += begin_section(SectionType.TABLE_OF_CONTENTS, line, next_line)
                in_toc = True
                continue

            # Author's note marker
            if re.match(r'^\s*AUTHOR\'S\s+NOTE\s*$', line, re.I):
                i += begin_section(SectionType.FRONT_MATTER, line, next_line)
                in_front_matter = True
                continue

            # Chapter / sub-chapter heading, emitted on its own
            is_heading, level = self.is_chapter_heading(line)
            if is_heading:
                i += begin_section(SectionType.HEADER, line, next_line)
                current_level = level
                flush_section()
                continue

            # Ordinary line. Inside a table of contents or front matter, two
            # consecutive blank lines close the block; the table of contents
            # takes precedence if both flags happen to be set.
            ends_block = not stripped and not next_line.strip()
            if in_toc and ends_block:
                in_toc = False
                flush_section()
            elif not in_toc and in_front_matter and ends_block:
                in_front_matter = False
                flush_section()
            else:
                current_section.append(line)

            i += 1

        flush_section()  # Flush any remaining content

        # flush_section never stores whitespace-only sections, so the list
        # needs no further filtering.
        return sections

    def get_complete_paragraphs(self, text: str, max_tokens: int) -> Tuple[str, str]:
        """
        Take as many whole sections from the start of ``text`` as fit into
        ``max_tokens`` and return them together with the rest.

        Sections come from ``analyze_text_structure`` and are never split.
        The first section is always taken, even if it alone exceeds the
        budget, so a non-empty input always makes progress. A CONTENT section
        directly after a header is taken along with the header when it still
        fits.

        Args:
            text: Text to take the next chunk from.
            max_tokens: Token budget for the chunk, measured with
                ``count_tokens``.

        Returns:
            ``(processed_text, remaining_text)``. Sections are joined with a
            blank line between them and trailing whitespace removed; in
            ``processed_text`` a header is followed by an extra blank line.
            If an error occurs part-way, the sections selected so far are
            returned with the remaining text starting after them; if nothing
            was selected yet, ``("", text)`` is returned.
        """
        self.log_message(f"Starting get_complete_paragraphs with {len(text)} chars of text")

        sections = self.analyze_text_structure(text)
        self.log_message(f"Found {len(sections)} sections")

        def join_sections(parts, pad_headers=False):
            """Join section texts with a blank line between consecutive sections."""
            pieces = []
            for idx, part in enumerate(parts):
                if idx > 0:
                    pieces.append("")
                pieces.append(part.text.rstrip())
                if pad_headers and part.type == SectionType.HEADER:
                    pieces.append("")
            return "\n".join(pieces)

        current_sections = []
        current_tokens = 0
        # Invariant: current_sections == sections[:section_index].
        section_index = 0

        try:
            while section_index < len(sections):
                section = sections[section_index]
                section_tokens = self.count_tokens(section.text)

                self.log_message(f"Processing section {section_index + 1}: {section.type}, {section_tokens} tokens")

                # Stop when the budget would be exceeded -- but only if we
                # already have content, so an oversized first section is
                # still returned rather than looping forever on it.
                if current_sections and current_tokens + section_tokens > max_tokens:
                    break

                current_sections.append(section)
                current_tokens += section_tokens
                section_index += 1

                # Keep a header together with the content that follows it,
                # provided that content still fits.
                if section.type == SectionType.HEADER and section_index < len(sections):
                    follower = sections[section_index]
                    if follower.type == SectionType.CONTENT:
                        follower_tokens = self.count_tokens(follower.text)
                        if current_tokens + follower_tokens <= max_tokens:
                            current_sections.append(follower)
                            current_tokens += follower_tokens
                            section_index += 1

                self.log_message(f"After processing: current_tokens={current_tokens}, max_tokens={max_tokens}, sections_processed={len(current_sections)}")

            processed_text = join_sections(current_sections, pad_headers=True)
            remaining_text = join_sections(sections[section_index:])

            self.log_message(f"Completed processing: {len(current_sections)} sections included, {len(sections) - section_index} remaining")
            self.log_message(f"Processed text preview: {processed_text[:200]}...")

            return processed_text, remaining_text

        except Exception as e:
            # Best effort: report the problem and hand back what we have.
            self.log_message(f"Error in get_complete_paragraphs: {str(e)}")
            if not current_sections:
                return "", text
            # The remainder must start after the sections already selected;
            # returning the full input here would make the caller process
            # the same sections again.
            consumed = len(current_sections)
            return (
                join_sections(current_sections, pad_headers=True),
                join_sections(sections[consumed:]),
            )

    def process_text(self, text: str, max_chunks: int = None) -> List[str]:
        """
        Run the full pipeline: split ``text`` into initial chunks, turn each
        chunk into semantic sections, and report progress along the way.

        Args:
            text: The document to process.
            max_chunks: If truthy, only the first ``max_chunks`` initial
                chunks are processed.

        Returns:
            All semantic sections from all processed chunks, in order.
        """
        self.log_message("Starting text processing")

        # Create initial chunks, optionally capped
        initial_chunks = self.create_initial_chunks(text)
        if max_chunks:
            initial_chunks = initial_chunks[:max_chunks]
            self.log_message(f"Processing limited to first {max_chunks} chunks")

        def preview(body, limit):
            """Return ``body`` cut to ``limit`` characters plus "..." when it is longer."""
            return body[:limit] + "..." if len(body) > limit else body

        total_chunks = len(initial_chunks)
        semantic_chunks = []

        for chunk_no, chunk in enumerate(initial_chunks, start=1):
            self.log_message(f"Processing chunk {chunk_no}/{total_chunks}")

            # Semantic sections for this chunk, plus the metrics reported with them
            sections, metrics = self.get_semantic_sections(chunk)

            # Console report: the input chunk ...
            self.print_separator("INPUT CHUNK")
            print(f"Chunk {chunk_no} (Tokens: {self.count_tokens(chunk)})")
            print("Content preview:")
            print(preview(chunk, 1000))

            # ... the sections derived from it ...
            self.print_separator("SEMANTIC SECTIONS")
            for section_no, section in enumerate(sections, start=1):
                print(f"\nSection {section_no} (Tokens: {self.count_tokens(section)})")
                print("Content preview:")
                print(preview(section, 500))

            # ... the metrics ...
            self.print_separator("METRICS")
            pprint(metrics)

            # ... and any text recorded as missed.
            if self.missed_text:
                self.print_separator("MISSED TEXT")
                print(self.missed_text)

            semantic_chunks.extend(sections)

            # Save intermediate results
            self.save_chunk_log(chunk_no, chunk, sections, metrics)

            time.sleep(1)  # Rate limiting

        self.log_message(f"Processing complete. Total semantic chunks created: {len(semantic_chunks)}")
        return semantic_chunks
    
    def verify_output_completeness(self, input_text: str, output_sections: List[str]) -> str:
        """Return the parts of ``input_text`` that do not appear in ``output_sections``.

        Both texts are whitespace-normalized. Every window of up to five
        consecutive input words is looked up (as a substring) in the joined
        output; a word counts as present when at least one window containing
        it is found. Consecutive words that no matching window covers are
        reported together as one missing sequence.

        Words closer than one window length to a genuine gap, or to the start
        or end of a very short match, may be reported as missing as well; the
        check errs on the side of over-reporting.

        Args:
            input_text: Original text that was sent for sectioning.
            output_sections: Sections produced from that text.

        Returns:
            The missing sequences joined by newlines, or ``""`` when nothing
            is missing.
        """
        input_words = input_text.split()
        if not input_words:
            return ""

        output_normalized = ' '.join(' '.join(output_sections).split())

        # Inputs shorter than the nominal window are checked as a single window
        # instead of being skipped.
        window_size = min(5, len(input_words))

        # covered[k] is True once word k belongs to a window found in the output.
        covered = [False] * len(input_words)
        # "+ 1" so the final window is examined as well.
        for start in range(len(input_words) - window_size + 1):
            window = ' '.join(input_words[start:start + window_size])
            if window in output_normalized:
                covered[start:start + window_size] = [True] * window_size

        # Group consecutive uncovered words into missing sequences.
        missing_sequences = []
        current_run = []
        for word, is_covered in zip(input_words, covered):
            if is_covered:
                if current_run:
                    missing_sequences.append(' '.join(current_run))
                    current_run = []
            else:
                current_run.append(word)
        if current_run:
            missing_sequences.append(' '.join(current_run))

        return '\n'.join(missing_sequences)
    
    def create_initial_chunks(self, text: str) -> List[str]:
        """
        Create initial chunks with enhanced logging.
        """
        chunks = []
        remaining_text = text
        chunk_number = 0
        
        while remaining_text.strip():
            chunk_number += 1
            self.log_message(f"\nProcessing chunk {chunk_number}")
            
            # Add any missed text from previous chunk
            if self.missed_text:
                self.log_message("Adding missed text from previous chunk")
                remaining_text = self.missed_text + '\n\n' + remaining_text
                self.missed_text = ""
            
            # Get complete paragraphs up to token limit
            chunk_text, remaining_text = self.get_complete_paragraphs(remaining_text, self.max_tokens)
            
            if chunk_text.strip():
                self.log_message(f"Created chunk {chunk_number} with {self.count_tokens(chunk_text)} tokens")
                chunks.append(chunk_text)
                
                # Debug output
                preview = chunk_text[:200] + "..." if len(chunk_text) > 200 else chunk_text
                self.log_message(f"Chunk {chunk_number} preview:\n{preview}")
            else:
                self.log_message("Warning: Empty chunk produced")
                if not remaining_text.strip():
                    break
            
            if len(chunks) >= 100:  # Safety limit
                self.log_message("Warning: Maximum chunk limit reached")
                break
        
        self.log_message(f"Created {len(chunks)} initial chunks")
        
        # Save the chunks
        os.makedirs(self.log_dir, exist_ok=True)
        for i, chunk in enumerate(chunks):
            with open(os.path.join(self.log_dir, f"chunk_{i+1:04d}.txt"), 'w', encoding='utf-8') as f:
                f.write(chunk)
                
        return chunks
    
    def get_semantic_sections(self, chunk: str) -> Tuple[List[str], Dict]:
        """Update the system prompt for better structural preservation."""
        try:
            self.log_message(f"Sending request to LLM (input tokens: {self.count_tokens(chunk)})")
            
            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=[
                    {
                        "role": "system",
                        "content": """You are a text analysis expert. Your task is to:
                        1. Maintain the original document structure (headers, lists, quotes)
                        2. Split the input text into coherent semantic sections
                        3. Each section must respect structural boundaries
                        4. Use <START_SECTION> and <END_SECTION> to mark sections
                        5. Include ALL text from the input - do not skip any content
                        6. Preserve ALL formatting, indentation, and special characters
                        7. If there's a header, keep it with its content
                        8. Keep lists and quotes intact within their sections
                        9. If a section would be incomplete, mark it with <INCOMPLETE> tags
                        
                        Rules:
                        - Preserve ALL text exactly as provided
                        - Maintain original formatting and spacing
                        - Don't add any commentary
                        - Don't modify the text
                        - Keep structural elements together
                        - Respect document hierarchy"""
                    },
                    {
                        "role": "user",
                        "content": f"Split this text into coherent sections, preserving ALL content and structure:\n\n{chunk}"
                    }
                ],
                max_tokens=self.max_tokens,
                temperature=0.2
            )
            
            result = response.choices[0].message.content
            
            # Extract sections
            sections = []
            section_pattern = r'<START_SECTION>(.*?)<END_SECTION>'
            for match in re.finditer(section_pattern, result, re.DOTALL):
                section_text = match.group(1).strip()
                if section_text and len(section_text) > 50:  # Ignore empty or very short sections
                    sections.append(section_text)
            
            # Check for incomplete section
            incomplete_pattern = r'<INCOMPLETE>(.*?)</INCOMPLETE>'
            incomplete_match = re.search(incomplete_pattern, result, re.DOTALL)
            if incomplete_match:
                incomplete_text = incomplete_match.group(1).strip()
                if incomplete_text:
                    self.missed_text = incomplete_text
                    self.log_message(f"Found incomplete section ({self.count_tokens(incomplete_text)} tokens)")
            
            # Verify all content is included
            if not incomplete_match:  # Only check if no explicit incomplete section
                missed_text = self.verify_output_completeness(chunk, sections)
                if missed_text:
                    self.missed_text = missed_text
                    self.log_message(f"Found missed text ({self.count_tokens(missed_text)} tokens)")
            
            metrics = {
                "completion_tokens": response.usage.completion_tokens,
                "prompt_tokens": response.usage.prompt_tokens,
                "total_tokens": response.usage.total_tokens,
                "finish_reason": response.choices[0].finish_reason,
                "sections_created": len(sections),
                "has_missed_text": bool(self.missed_text)
            }
            
            return sections, metrics
                
        except Exception as e:
            self.log_message(f"Error in LLM request: {str(e)}")
            return [], {}
    
    def process_text(self, text: str, max_chunks: int = None) -> List[str]:
        """Process entire text into semantic sections"""
        self.log_message("Starting text processing")
        
        # Create initial chunks
        initial_chunks = self.create_initial_chunks(text)
        
        if max_chunks:
            initial_chunks = initial_chunks[:max_chunks]
            self.log_message(f"Processing limited to first {max_chunks} chunks")
        
        # Process each chunk
        semantic_chunks = []
        for i, chunk in enumerate(initial_chunks):
            self.log_message(f"Processing chunk {i+1}/{len(initial_chunks)}")
            
            # Get semantic sections
            sections, metrics = self.get_semantic_sections(chunk)
            
            # Print processing details
            self.print_separator("INPUT CHUNK")
            print(f"Chunk {i+1} (Tokens: {self.count_tokens(chunk)})")
            print(chunk[:1000] + "..." if len(chunk) > 1000 else chunk)
            
            self.print_separator("SEMANTIC SECTIONS")
            for j, section in enumerate(sections):
                print(f"\nSection {j+1} (Tokens: {self.count_tokens(section)})")
                print(section[:500] + "..." if len(section) > 500 else section)
            
            self.print_separator("METRICS")
            pprint(metrics)
            
            if self.missed_text:
                self.print_separator("MISSED TEXT")
                print(self.missed_text)
            
            semantic_chunks.extend(sections)
            
            # Save intermediate results
            self.save_chunk_log(i+1, chunk, sections, metrics)
            
            time.sleep(1)  # Rate limiting
            
        self.log_message(f"Processing complete. Total semantic chunks created: {len(semantic_chunks)}")
        return semantic_chunks

    def save_chunk_log(self, chunk_num: int, original_chunk: str, sections: List[str], metrics: Dict):
        """Save intermediate processing results"""
        log_file = os.path.join(self.log_dir, f"chunk_{chunk_num:04d}.json")
        log_data = {
            "chunk_number": chunk_num,
            "original_text": original_chunk,
            "semantic_sections": sections,
            "missed_text": self.missed_text,
            "metrics": metrics,
            "token_counts": {
                "input": self.count_tokens(original_chunk),
                "sections": [self.count_tokens(s) for s in sections],
                "missed": self.count_tokens(self.missed_text) if self.missed_text else 0
            }
        }
        
        with open(log_file, 'w', encoding='utf-8') as f:
            json.dump(log_data, f, indent=2, ensure_ascii=False)

    def validate_chunk(self, chunk: str, original_sections: List[Section]) -> bool:
        """Validate that chunk contains all expected content"""
        # Normalize texts for comparison
        chunk_text = ' '.join(chunk.split())
        original_text = ' '.join(' '.join(s.text for s in original_sections).split())
        
        # Check if all content is present
        missing_content = []
        words = original_text.split()
        window_size = 5
        
        i = 0
        while i < len(words) - window_size:
            sequence = ' '.join(words[i:i+window_size])
            if sequence not in chunk_text:
                missing_content.append(sequence)
            i += 1
        
        if missing_content:
            self.log_message("Missing content detected:")
            for mc in missing_content:
                self.log_message(f"  - {mc}")
            return False
        
        return True
    
    
def main():
    """Chunk the Jung source text semantically and write every chunk to its own file.

    The chunks are saved as ``semantic_chunk_NNNN.txt`` in a ``semantic_chunks``
    folder below the chunker's log directory.
    """
    source_path = "/home/ubuntu/quantumLeap/data/input/Step_2_Classic_Texts_and_Ethnographies/2.1_Public_Domain_Books/Project_Gutenberg/psychology_of_unconscious.txt"

    chunker = SemanticChunker()

    with open(source_path, 'r', encoding='utf-8') as source:
        text = source.read()

    # No max_chunks is passed, so the complete text is processed.
    semantic_chunks = chunker.process_text(text)

    output_dir = os.path.join(chunker.log_dir, "semantic_chunks")
    os.makedirs(output_dir, exist_ok=True)

    for number, chunk in enumerate(semantic_chunks, start=1):
        target = os.path.join(output_dir, f"semantic_chunk_{number:04d}.txt")
        with open(target, 'w', encoding='utf-8') as sink:
            sink.write(chunk)

    chunker.log_message(f"Saved {len(semantic_chunks)} semantic chunks to {output_dir}")

def test_structure_analysis():
    """Smoke-test structure analysis, chunking and chunk validation on a short sample.

    Prints the sections found by ``analyze_text_structure``, the chunks built
    by ``create_initial_chunks`` and the ``validate_chunk`` verdict for each
    chunk. If analysis or chunking raises, the error is printed and validation
    is skipped, because there is then nothing to validate.
    """
    chunker = SemanticChunker()
    
    test_text = """
                             AUTHOR'S NOTE

My task in this work has been to investigate an individual phantasy
system, and in the doing of it problems of such magnitude have been
uncovered, that my endeavor to grasp them in their entirety has
necessarily meant only a superficial orientation toward those paths, the
opening and exploration of which may possibly crown the work of future
investigators with success.

                                CONTENTS

        INTRODUCTION                                                     3
        
        Relation of the Incest Phantasy to the Oedipus Legend—Moral
        revulsion over such a discovery

 I.—    CONCERNING THE TWO KINDS OF THINKING                             8
"""
    
    try:
        print("Testing structural analysis...")
        sections = chunker.analyze_text_structure(test_text)
        
        print("\nIdentified sections:")
        for i, section in enumerate(sections, 1):
            print(f"\nSection {i}:")
            print(f"Type: {section.type}")
            print(f"Level: {section.level}")
            print(f"Content preview: {section.text[:100]}...")
        
        print("\nTesting chunking with structure preservation...")
        chunks = chunker.create_initial_chunks(test_text)
        
        print("\nResulting chunks:")
        for i, chunk in enumerate(chunks, 1):
            print(f"\nChunk {i}:")
            print(chunk[:200])
            print("...")
            
    except Exception as e:
        print(f"Error during testing: {str(e)}")
        # `sections` and/or `chunks` may be unbound here; continuing would
        # raise UnboundLocalError in the validation loop below.
        return
    
    # Add validation
    print("\nValidating chunk content...")
    for i, chunk in enumerate(chunks, 1):
        print(f"\nValidating chunk {i}:")
        is_valid = chunker.validate_chunk(chunk, sections)
        print(f"Chunk {i} validation: {'PASSED' if is_valid else 'FAILED'}")
        
if __name__ == "__main__":
    # Script entry point: runs the full chunking pipeline via main().
    # To run the structure-analysis smoke test instead, uncomment the next
    # line and comment out the main() call.
    # test_structure_analysis()
    main()  # Comment out for testing

In [5]:
# # ----------------------------- #
# # Part 3: Parse Text into Discourse Units
# # ----------------------------- #

def parse_discourse_units(text, overwrite=False, output_path=None):
    """
    Parses text into discourse units (currently: sentences) using spaCy.

    The text is split into paragraphs on blank lines and each paragraph is
    segmented into sentences with the module-level ``nlp`` pipeline. The
    result is cached as JSON at ``output_path``.

    If the cache file already exists and ``overwrite`` is False, the cached
    units are returned and ``text`` is not parsed at all. The cache is keyed
    by path only, so pass ``overwrite=True`` whenever the input text changed.

    Args:
        text: Cleaned document text.
        overwrite: Re-parse and rewrite the cache even if it exists.
        output_path: Cache file location; defaults to the project path for
            "Psychology of the Unconscious".

    Returns:
        List of discourse unit strings.
    """
    if output_path is None:
        output_path = '/home/ubuntu/quantumLeap/data/preprocess/Step_2_Classic_Texts_and_Ethnographies/2.1_Public_Domain_Books/Project_Gutenberg/psychology_of_unconscious_discourse_units.json'

    if os.path.exists(output_path) and not overwrite:
        # Cache hit: skip the (expensive) spaCy pass entirely.
        with open(output_path, 'r', encoding='utf-8') as f:
            discourse_units = json.load(f)
    else:
        paragraphs = [para.strip() for para in text.split('\n\n') if para.strip()]

        discourse_units = []
        for para in paragraphs:
            discourse_units.extend(sent.text for sent in nlp(para).sents)

        parent_dir = os.path.dirname(output_path)
        if parent_dir:  # a bare file name has no directory to create
            os.makedirs(parent_dir, exist_ok=True)
        # ensure_ascii=False writes raw non-ASCII characters, so the file
        # encoding must be pinned instead of depending on the locale.
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(discourse_units, f, ensure_ascii=False, indent=4)

    print(f"Total Discourse Units: {len(discourse_units)}")
    return discourse_units

In [6]:
# ----------------------------- #
# Part 4: Create Chunks Using Hybrid Strategy
# ----------------------------- #

def create_chunks(discourse_units, tokenizer, max_length=4096, overlap_size=1, overwrite=False, output_path=None):
    """
    Packs discourse units into chunks of at most ``max_length`` tokens.

    Units are tokenized and appended to the current chunk until the next unit
    would exceed ``max_length``. The chunk is then decoded and stored, and the
    next chunk starts with the last ``overlap_size`` tokens of the previous
    one followed by the unit that did not fit.

    Notes:
        * A single unit longer than ``max_length`` is kept whole, so its chunk
          exceeds the limit.
        * The overlap tokens are added on top of the next unit and are not
          re-checked against ``max_length``.
        * The chunks are always computed from ``discourse_units``; the JSON
          file is only written when it is missing or ``overwrite`` is True.

    Args:
        discourse_units: Iterable of text units (e.g. sentences).
        tokenizer: Object providing ``encode(text, add_special_tokens=...)``
            and ``decode(ids, skip_special_tokens=...)``.
        max_length: Token budget per chunk.
        overlap_size: Number of trailing tokens repeated at the start of the
            next chunk; 0 disables the overlap.
        overwrite: Rewrite the chunks file even if it already exists.
        output_path: Chunks file location; defaults to the project path for
            "Psychology of the Unconscious".

    Returns:
        List of decoded chunk strings.
    """
    if output_path is None:
        output_path = '/home/ubuntu/quantumLeap/data/preprocess/Step_2_Classic_Texts_and_Ethnographies/2.1_Public_Domain_Books/Project_Gutenberg/psychology_of_unconscious_chunks.json'

    chunks = []
    current_chunk_tokens = []
    current_length = 0

    for unit in discourse_units:
        unit_tokens = tokenizer.encode(unit, add_special_tokens=True)
        unit_length = len(unit_tokens)

        if current_length + unit_length <= max_length:
            current_chunk_tokens.extend(unit_tokens)
            current_length += unit_length
            continue

        overlap_tokens = []
        if current_chunk_tokens:  # never emit an empty chunk
            chunks.append(tokenizer.decode(
                current_chunk_tokens, skip_special_tokens=True))
            # list[-0:] is the whole list, so a zero overlap needs its own case.
            if overlap_size > 0:
                overlap_tokens = current_chunk_tokens[-overlap_size:]

        current_chunk_tokens = overlap_tokens + unit_tokens
        current_length = len(current_chunk_tokens)

    # Append any remaining tokens as the last chunk
    if current_chunk_tokens:
        chunks.append(tokenizer.decode(
            current_chunk_tokens, skip_special_tokens=True))

    # Persist the chunks themselves (not the discourse units they were built from).
    if not os.path.exists(output_path) or overwrite:
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(chunks, f, ensure_ascii=False, indent=4)

    print(f"Total Chunks Created: {len(chunks)}")
    return chunks

In [7]:
# ----------------------------- #
# Part 5: Create and Tokenize Dataset
# ----------------------------- #

# TODO: make the book title and prompt template generic so this can be reused for other texts.
def create_tokenized_dataset(chunks, tokenizer, max_length=1024, num_proc=2,
                             book_title='Psychology of the Unconscious by C. G. Jung'):
    """
    Wraps text chunks in the training prompt and returns train/eval datasets.

    Every chunk is formatted with the prompt template below, the tokenizer's
    EOS token is appended, and the resulting dataset is split 90/10 with a
    fixed seed. Despite the name, no tokenization happens here; the datasets
    contain a single ``"text"`` column.

    Args:
        chunks: List of chunk strings.
        tokenizer: Only ``tokenizer.eos_token`` is used.
        max_length: Unused; kept so existing callers keep working.
        num_proc: Number of processes for ``Dataset.map``.
        book_title: Title inserted into the prompt of every chunk.

    Returns:
        ``(train_dataset, eval_dataset)``.
    """
    from datasets import Dataset

    # NOTE: the indentation inside this template is part of the training text.
    wikipedia_prompt = """
    Psychology Book

    ### Title: {}

    ### Article: {}
    """

    EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN

    def formatting_prompts_func(examples):
        # Batched map: `examples["text"]` holds the texts of one batch.
        return {
            "text": [
                wikipedia_prompt.format(book_title, text) + EOS_TOKEN
                for text in examples["text"]
            ],
        }

    dataset = Dataset.from_dict({"text": chunks})
    dataset = dataset.map(formatting_prompts_func,
                          batched=True, num_proc=num_proc)

    # Split the dataset into training and validation sets
    split = dataset.train_test_split(test_size=0.1, seed=42)
    train_dataset = split['train']
    eval_dataset = split['test']

    print(len(dataset))
    # Longest formatted example, in characters (not tokens)
    longest_text_chars = max(len(text) for text in dataset['text'])
    print(f"The maximum length of the text field in the dataset is: {longest_text_chars} characters")
    print(f"Training Dataset Size: {len(train_dataset)}")
    print(f"Validation Dataset Size: {len(eval_dataset)}")
    return train_dataset, eval_dataset

In [8]:
# ----------------------------- #
# Part 6: Set up environment and other important utilities
# ----------------------------- #

def setup_environment():
    """
    Creates an ``Accelerator`` with default arguments and returns it.
    """
    accelerator = Accelerator()
    return accelerator


def get_custom_lr_scheduler(optimizer, num_warmup_steps, num_training_steps, initial_phase_steps):
    """
    Builds a ``LambdaLR`` scheduler with warmup, constant and linear annealing phases.

    The learning-rate multiplier is
      * ``step / num_warmup_steps`` while ``step < num_warmup_steps``,
      * ``1.0`` while ``step < initial_phase_steps``,
      * afterwards a linear decay from 1.0 at ``initial_phase_steps`` to 0.0
        at ``num_training_steps``.

    The multiplier never drops below 0.0, so stepping past
    ``num_training_steps`` cannot produce a negative learning rate. If there
    is no annealing span (``num_training_steps <= initial_phase_steps``) the
    multiplier is 0.0 once the initial phase is over.

    Args:
        optimizer: Optimizer whose learning rate is scheduled.
        num_warmup_steps: Length of the linear warmup.
        num_training_steps: Step at which the multiplier reaches 0.0.
        initial_phase_steps: Step at which annealing starts.

    Returns:
        ``torch.optim.lr_scheduler.LambdaLR`` bound to ``optimizer``.
    """
    annealing_span = num_training_steps - initial_phase_steps

    def lr_lambda(current_step):
        if current_step < num_warmup_steps:
            return current_step / num_warmup_steps  # Linear warmup
        if current_step < initial_phase_steps:
            return 1.0  # Constant learning rate for initial phase
        if annealing_span <= 0:
            # Nothing to anneal over; also avoids dividing by zero.
            return 0.0
        # Linear annealing, clamped so the rate cannot turn negative.
        return max(0.0, 1.0 - (current_step - initial_phase_steps) / annealing_span)

    return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)


def setup_training_steps(initial_rows, annealing_rows, batch_size, gradient_accumulation_steps, num_epochs):
    """
    Derives the optimizer step counts for the whole run and for its initial phase.

    One optimizer step consumes ``batch_size * gradient_accumulation_steps``
    rows. Both counts use integer division and are at least 1.

    Returns:
        ``(total_steps, initial_steps)``.
    """
    rows_per_step = batch_size * gradient_accumulation_steps

    def steps_for(row_count):
        # Floor division, but never fewer than one step.
        return max(1, (row_count * num_epochs) // rows_per_step)

    return steps_for(initial_rows + annealing_rows), steps_for(initial_rows)


def print_memory_usage(step_desc):
    """
    Prints ``torch.cuda.memory_summary()`` labelled with ``step_desc``.

    When CUDA is not available, a one-line notice is printed instead.
    """
    if not torch.cuda.is_available():
        print(f"No CUDA available at {step_desc}.\n")
        return

    print(f"Memory Usage at {step_desc}:")
    print(torch.cuda.memory_summary())
    print("\n")


def inference(model, tokenizer):
    """
    Generates and prints a completion for two fixed sample prompts.

    The tokenized inputs are moved to ``model.device`` rather than a
    hard-coded ``'cuda'``, so this also works when the model was loaded on
    the CPU.

    Args:
        model: Model exposing ``device`` and ``generate(**inputs, max_length=...)``.
        tokenizer: Callable tokenizer that returns tensors and provides ``decode``.
    """
    # Define sample prompts
    prompts = [
        "Explain what is medical anthropology and its importance in elevating the quality of life?",
        "Explain what are the types of Jungian archetypes and how they manifest in the human psyche?"
    ]

    for prompt in prompts:
        # Inputs must live on the same device as the model weights.
        inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
        with torch.no_grad():
            outputs = model.generate(**inputs, max_length=256)
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(f"Prompt: {prompt}\nGenerated Text: {generated_text}\n")
        
def compute_metrics(eval_pred):
    """
    Computes perplexity from next-token logits and labels.

    Logits at position ``t`` are scored against the label at ``t + 1``;
    labels equal to -100 are ignored. The vocabulary size is taken from the
    last logits dimension, so the function does not rely on a global ``model``.

    Args:
        eval_pred: ``(logits, labels)`` with logits indexed as
            ``[batch, position, vocab]`` and labels as ``[batch, position]``.

    Returns:
        ``{"perplexity": float}``.

    Raises:
        ValueError: If the shapes disagree or a label is outside the vocabulary.
    """
    logits, labels = eval_pred
    # as_tensor accepts arrays, lists and tensors without copying a tensor input
    logits = torch.as_tensor(logits)
    labels = torch.as_tensor(labels)
    
    # Ensure shapes match
    if logits.shape[:2] != labels.shape:
        raise ValueError(f"Logits shape {logits.shape} does not match labels shape {labels.shape}")
    
    # Shift logits and labels
    shift_logits = logits[:, :-1, :].contiguous()
    shift_labels = labels[:, 1:].contiguous()

    # Check label values against the size the loss will actually index into
    vocab_size = shift_logits.size(-1)
    if shift_labels.numel() and shift_labels.max() >= vocab_size:
        raise ValueError(f"Label value {shift_labels.max()} exceeds vocab size {vocab_size}")
    
    loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100, reduction='mean')
    loss = loss_fct(shift_logits.view(-1, vocab_size).float(), shift_labels.view(-1).long())
    perplexity = torch.exp(loss).item()
    return {"perplexity": perplexity}

#  Login to Huggingface
from huggingface_hub import login
import os
from dotenv import load_dotenv

load_dotenv()

def setup_huggingface_access():
    """Log in to Hugging Face and report whether it worked.

    The token is read from the ``HUGGINGFACE_TOKEN`` environment variable; if
    that is unset or empty, the user is prompted for it.

    Returns:
        ``True`` after a successful login, ``False`` when no token was given
        or the login raised.
    """
    # Environment first, interactive prompt as fallback
    token = os.getenv('HUGGINGFACE_TOKEN') or input("Enter your Hugging Face token: ")

    if not token:
        print("No Hugging Face token provided")
        return False

    try:
        login(token, add_to_git_credential=True)
        print("Successfully logged in to Hugging Face!")
    except Exception as e:
        print(f"Failed to log in to Hugging Face: {str(e)}")
        return False

    return True

In [9]:
def load_model_and_tokenizer(base_model_slug, max_seq_length=4096, dtype=None, load_in_4bit=True):
    """
    Loads the base model with its tokenizer and wraps the model with LoRA adapters.

    Everything is placed on GPU 0 when CUDA is available, otherwise on the
    CPU. Device placement and GPU memory usage are printed along the way.
    Errors during loading or adapter setup are printed and re-raised.

    Args:
        base_model_slug: Model identifier passed to ``FastLanguageModel.from_pretrained``.
        max_seq_length: Maximum sequence length passed to the loader.
        dtype: Data type passed to the loader.
        load_in_4bit: Whether to load the weights 4-bit quantized.

    Returns:
        ``(model, tokenizer)``.
    """
    import os
    import torch

    cuda_ready = torch.cuda.is_available()

    # Pick the target device
    if cuda_ready:
        print("CUDA is available.")
        print(f"Using GPU: {torch.cuda.get_device_properties(0).name}")
        device, device_map = torch.device("cuda:0"), {"": 0}  # everything on GPU 0
        print(f"Initial GPU Memory: {torch.cuda.memory_allocated(0)/1024**3:.2f}GB")
    else:
        print("WARNING: CUDA is not available. Using CPU.")
        device, device_map = torch.device("cpu"), "cpu"

    # LoRA adapter configuration
    lora_settings = dict(
        r=128,
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
            "embed_tokens", "lm_head",
        ],
        lora_alpha=32,
        lora_dropout=0,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=3407,
        use_rslora=True,
        loftq_config=None,
    )

    try:
        # Base model and tokenizer
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name=base_model_slug,
            max_seq_length=max_seq_length,
            dtype=dtype,
            load_in_4bit=load_in_4bit,
            device_map=device_map,
            token=os.getenv('HUGGINGFACE_TOKEN'),
        )
        print(f"Model device after loading: {next(model.parameters()).device}")

        # Attach the adapters
        model = FastLanguageModel.get_peft_model(model, **lora_settings)

        # Move to the GPU again after the PEFT wrapping
        if cuda_ready:
            model = model.to(device)

        print(f"Final model device: {next(model.parameters()).device}")

        if cuda_ready:
            print(f"\nGPU Memory After Complete Setup: {torch.cuda.memory_allocated(0)/1024**3:.2f}GB")

    except Exception as e:
        print(f"Error in model loading/configuration: {str(e)}")
        raise

    return model, tokenizer

In [None]:
import os

# SECURITY: API tokens must never be hard-coded in the notebook or echoed to
# the cell output. The Hugging Face and W&B tokens that were previously
# written in this cell are exposed and should be revoked and regenerated.
# Provide HUGGINGFACE_TOKEN and WANDB_API_KEY through the shell environment or
# an untracked .env file (picked up by load_dotenv()) instead.
for _secret_name in ("HUGGINGFACE_TOKEN", "WANDB_API_KEY"):
    # Report presence only, never the value itself.
    if os.getenv(_secret_name):
        print(f"{_secret_name} is set")
    else:
        print(f"WARNING: {_secret_name} is not set; export it or add it to your .env file")

In [None]:
# ----------------------------- #
# Part 2: Configure Environment Variables & Create Main Variables
# ----------------------------- #

# Unsloth model initialization variables
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
max_length = max_seq_length
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
# device_map = "auto"
base_model_slug = "Qwen/Qwen2.5-7B-Instruct"
base_model_name = "lora_model_pum"
# Chunking settings: token budget per chunk and number of overlapping tokens
chunks_max_length = max_seq_length
overlap_size = 1
# Define your parameters
batchSize = 2  # per-device train batch size
ga = 8  # gradient accumulation steps
# NOTE(review): warmupSteps equals maxSteps, so the whole run would be warmup — confirm this is intended.
maxSteps = 10
warmupSteps = 10
numTrainEpochs = 1
lRate = 5e-5
embLRate = 1e-5
optim = "adamw_8bit"
lrSchedule = "linear"
dataset_slug = "psychology_of_unconscious"

from datetime import datetime
import pytz
import wandb
# Get the current date and time in Indian Standard Time (IST)
ist = pytz.timezone('Asia/Kolkata')
current_datetime = datetime.now(ist)

# Format the datetime string
# Example format: 20240428_153045 (YYYYMMDD_HHMMSS)
formatted_datetime = current_datetime.strftime("%Y%m%d_%H%M%S")

# Define Run Name
# The name encodes the timestamp and the hyperparameters above.
# NOTE: base_model_slug contains a "/", so run_name contains one as well.
run_name = f"""Kaggle-quantumLeap-{formatted_datetime}-{base_model_slug}-{dataset_slug}-{max_seq_length}_maxSeqLength-{max_length}_maxLength-{batchSize}_batchSize-{ga}_ga-{maxSteps}_maxSteps-{numTrainEpochs}_numTrainEpochs-{lRate}_lRate-{embLRate}_embLRate-{optim}_optim-{lrSchedule}_lrSchedule"""

# Initialize Weights & Biases
# It's recommended to set your W&B API key as an environment variable for security.
wandb.login(key=os.getenv("WANDB_API_KEY"))
wandb.init(project="OLA-quantumLeap", name=run_name)

In [None]:

# ----------------------------- #
# Part 9: Data Processing
# ----------------------------- #
# Pipeline of this cell: log in to Hugging Face, load model and tokenizer,
# clean the source text, split it into discourse units, pack those into
# chunks, build the train/eval datasets and save them to disk.

# # Perform Inference Before Training
# inference(model, tokenizer)

# Set number of processes to use for data loading
num_cpus = multiprocessing.cpu_count()
num_proc = max(num_cpus-2, 2)  # Adjust based on prior recommendations
print(f"Number of CPU cores: {num_cpus}")
print(f"Number of processes: {num_proc}")

# Login to Hugging Face
if not setup_huggingface_access():
    raise Exception("Failed to setup Hugging Face access. Please check your token.")

# Load Model and Tokenizer
# Only the slug is passed, so the function's own defaults apply for the other arguments.
model, tokenizer = load_model_and_tokenizer(base_model_slug)
print(f"Model Device: {model.device}")

# Load and Clean Text Data
file_path = "/home/ubuntu/quantumLeap/data/input/Step_2_Classic_Texts_and_Ethnographies/2.1_Public_Domain_Books/Project_Gutenberg/psychology_of_unconscious.txt"
clean_text = load_and_clean_text(file_path)

# Parse Discourse Units
discourse_units = parse_discourse_units(clean_text, overwrite=True)

# Create Chunks
chunks = create_chunks(
    discourse_units,
    tokenizer,
    max_length=max_length,
    overlap_size=overlap_size,
    overwrite=True,
)

# Create Tokenized Dataset
# NOTE(review): num_proc computed above is not passed here, so the function's
# default num_proc is used — confirm whether that is intended.
train_dataset, eval_dataset = create_tokenized_dataset(
    chunks, tokenizer, max_length)

# Save datasets as Hugging Face `datasets`
train_dataset.save_to_disk('/home/ubuntu/quantumLeap/data/preprocess/Step_2_Classic_Texts_and_Ethnographies/2.1_Public_Domain_Books/Project_Gutenberg/train_dataset')
eval_dataset.save_to_disk('/home/ubuntu/quantumLeap/data/preprocess/Step_2_Classic_Texts_and_Ethnographies/2.1_Public_Domain_Books/Project_Gutenberg/eval_dataset')

### TODO: make the paths below dynamic and wrap the loading in a function
# # Uncomment the following if you want to just load the saved datasets from disk
# from datasets import load_from_disk

# train_dataset = load_from_disk('/home/ubuntu/quantumLeap/data/preprocess/Step_2_Classic_Texts_and_Ethnographies/2.1_Public_Domain_Books/Project_Gutenberg/train_dataset')
# eval_dataset = load_from_disk('/home/ubuntu/quantumLeap/data/preprocess/Step_2_Classic_Texts_and_Ethnographies/2.1_Public_Domain_Books/Project_Gutenberg/eval_dataset')

In [None]:
# NOTE(review): a bare `break` outside a loop is a SyntaxError. Presumably it is
# placed here on purpose to stop "Run All" before the training cells — confirm.
break

In [None]:
from transformers import IntervalStrategy
from transformers.integrations import TensorBoardCallback

import wandb

# Build the trainer for the continued-pretraining run. Every hyperparameter
# (batchSize, ga, maxSteps, numTrainEpochs, lRate, warmupSteps, optim,
# lrSchedule, run_name, max_seq_length) is a notebook global set in an earlier cell.
trainer = UnslothTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,  # Held-out split returned by create_tokenized_dataset
    dataset_text_field = "text",
    # NOTE(review): chunking and tokenization used `max_length`; confirm it
    # agrees with `max_seq_length` used here.
    max_seq_length = max_seq_length,
    # NOTE(review): hard-coded to 2; the `num_proc` value computed during
    # preprocessing is not used here.
    dataset_num_proc = 2,

    args = UnslothTrainingArguments(
        per_device_train_batch_size = batchSize,
        gradient_accumulation_steps = ga,

        # Set both max_steps and num_train_epochs
        # NOTE(review): per the Hugging Face TrainingArguments documentation a
        # positive max_steps overrides num_train_epochs.
        max_steps = maxSteps,
        num_train_epochs = numTrainEpochs,

        # Use a single learning rate for all parameters
        learning_rate = lRate,

        # Warmup strategy from successful runs
        warmup_steps = warmupSteps,
        # warmup_ratio = 0,

        # Explicitly set precision based on hardware support
        # Exactly one of fp16/bf16 is enabled: bf16 when supported, fp16 otherwise.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),

        # Log metrics at every optimizer step.
        logging_steps = 1,

        optim = optim,
        weight_decay = 0.01,
        lr_scheduler_type = lrSchedule,

        seed = 3407,
        output_dir = "outputs",

        # Metrics go to both TensorBoard and Weights & Biases; TensorBoard
        # event files are grouped per run name.
        report_to=["tensorboard", "wandb"],
        logging_dir=f"./trel-fft-logs/{run_name}",

        # Set both save and evaluation strategies to 'steps'
        # (currently disabled: no step-based checkpointing or evaluation is configured)
        # save_strategy = IntervalStrategy.STEPS,
        # eval_strategy = IntervalStrategy.STEPS,
        # save_steps = 1,  # Save checkpoint every step
        # eval_steps = 1,  # Evaluate every step (matching save_steps)

        # load_best_model_at_end = True,
        # metric_for_best_model = "eval_loss",
    ),
    # compute_metrics = compute_metrics,
)

In [None]:
# %%time

# instruction_prompt = """Below is an instruction that describes a concept in the field of psychology, sociology, anthropology, ethnography, or qualitative research or cultural studies. Write a response that appropriately completes the request.

# ### Instruction: Given the concept and its detailed explanation, provide an example scenario that illustrates the concept.
# concept_name: {}
# detailed_explanation: {}

# ### Response:
# {}"""

# FastLanguageModel.for_inference(model) # Enable native 2x faster inference
# inputs = tokenizer(
# [
#     instruction_prompt.format(
#         "Hero Archetype", # concept_name
#         "The hero archetype is a common motif in literature and folklore, representing a protagonist who embodies bravery, resilience, and a quest for a greater purpose.", # detailed_explanation
#         "", # output - leave this blank for generation!
#     )
# ], return_tensors = "pt").to("cuda")

# outputs = model.generate(**inputs, max_new_tokens = 1024, use_cache = True)
# tokenizer.batch_decode(outputs)


# %%time
# # Text streaming goes into a loop and doesn't adhere to EOS

# from transformers import TextStreamer
# text_streamer = TextStreamer(tokenizer)
# _ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 1024)


# inputs = tokenizer(
# [
#     instruction_prompt.format(
#         "Hero Archetype", # concept_name
#         "The hero archetype is a common motif in literature and folklore, representing a protagonist who embodies bravery, resilience, and a quest for a greater purpose.", # detailed_explanation
#         "", # output - leave this blank for generation!
#     )
# ], return_tensors = "pt").to("cuda")



# from transformers import TextStreamer
# text_streamer = TextStreamer(tokenizer)
# _ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 256,
#                    repetition_penalty = 0.1)

In [None]:
# ----------------------------- #
# Part 11: Start Training
# ----------------------------- #

#@title Show current memory stats
# Snapshot GPU 0 before training. `start_gpu_memory` is the baseline that is
# subtracted after training to estimate what the run itself added.
# All sizes are converted from bytes to GiB (1024^3) and rounded to 3 decimals.
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

# Run the training loop; the returned object exposes a `metrics` dict.
trainer_stats = trainer.train()

#@title Show final memory and time stats
# `used_memory` is the peak reserved memory. The `*_for_lora` names hold the
# increase over the pre-training baseline (the name is kept even though the
# printed label says "for training").
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")



In [None]:
import wandb
from pprint import pprint

def get_run_config(project_name, run_id):
    """
    Fetch the trainer-related settings recorded for a Weights & Biases run.

    Parameters:
        project_name (str): Project path; joined with `run_id` as
            "<project_name>/<run_id>" to address the run.
        run_id (str): Identifier of the run to look up.

    Returns:
        dict or None: The run's config entries whose keys start with one of the
        trainer-related prefixes, or None when the lookup fails (the failure is
        printed, not raised).
    """
    # Key prefixes that identify trainer hyperparameters in the run config.
    trainer_key_prefixes = (
        'train', 'learning', 'optim', 'fp16', 'bf16', 'gradient',
        'weight_decay', 'warmup', 'max_steps', 'num_train_epochs', 'per_device',
    )

    try:
        # Resolve the run through the public W&B API and keep only the
        # trainer-related part of its configuration.
        run = wandb.Api().run(f"{project_name}/{run_id}")
        return {
            name: value
            for name, value in run.config.items()
            if name.startswith(trainer_key_prefixes)
        }
    except wandb.errors.CommError:
        print(f"Error: Unable to access run {run_id}. Make sure the run ID is correct and you have the necessary permissions.")
        return None
    except Exception as e:
        # Best-effort helper: report anything else and signal failure with None.
        print(f"An error occurred: {str(e)}")
        return None

# Usage
# Project path and run id of the W&B run whose trainer settings are inspected.
project_name = "olabs-asia-olabs-pro/OLA-quantumLeap"
run_id = "we4axhd1"

trainer_config = get_run_config(project_name, run_id)

# get_run_config returns None on failure; an empty dict (no matching keys) is
# also falsy, so nothing is printed in either case.
if trainer_config:
    print(f"Trainer configuration for run {run_id}:")
    pprint(trainer_config)

In [None]:
# %%time

# instruction_prompt = """Below is an instruction that describes a concept in the field of psychology, sociology, anthropology, ethnography, or qualitative research or cultural studies. Write a response that appropriately completes the request.

# ### Instruction: Given the concept and its detailed explanation, provide an example scenario that illustrates the concept.
# concept_name: {}
# detailed_explanation: {}

# ### Response:
# {}"""

# FastLanguageModel.for_inference(model) # Enable native 2x faster inference
# inputs = tokenizer(
# [
#     instruction_prompt.format(
#         "Hero Archetype", # concept_name
#         "The hero archetype is a common motif in literature and folklore, representing a protagonist who embodies bravery, resilience, and a quest for a greater purpose.", # detailed_explanation
#         "", # output - leave this blank for generation!
#     )
# ], return_tensors = "pt").to("cuda")

# outputs = model.generate(**inputs, max_new_tokens = 1024, use_cache = True)
# tokenizer.batch_decode(outputs)


# %%time
# # Text streaming goes into a loop and doesn't adhere to EOS

# from transformers import TextStreamer
# text_streamer = TextStreamer(tokenizer)
# _ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 1024)


# inputs = tokenizer(
# [
#     instruction_prompt.format(
#         "Hero Archetype", # concept_name
#         "The hero archetype is a common motif in literature and folklore, representing a protagonist who embodies bravery, resilience, and a quest for a greater purpose.", # detailed_explanation
#         "", # output - leave this blank for generation!
#     )
# ], return_tensors = "pt").to("cuda")



# from transformers import TextStreamer
# text_streamer = TextStreamer(tokenizer)
# _ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 256,
#                    repetition_penalty = 0.1)

In [None]:
import time
import os

# Create timestamp
# Whole-second Unix time; also passed to save_model_versions to suffix the Hub repo names.
timestamp = int(time.time())

# Create directory if it doesn't exist
# NOTE(review): save_dir is only referenced by the commented-out local-save
# lines inside save_model_versions, so the directory is created but nothing in
# this cell writes into it — confirm this is intended.
save_dir = f"/root/quantumLeap/models/qLeap_model_v0_{timestamp}"
os.makedirs(save_dir, exist_ok=True)

# Save functions with explicit paths
def save_model_versions(model, tokenizer, timestamp, token):
    """
    Upload GGUF exports of the model to the Hugging Face Hub.

    Two exports are attempted in order, 8-bit (q8_0) and then 16-bit (f16).
    A failure of one upload is printed and does not prevent the next one.

    Parameters:
        model: Object exposing `push_to_hub_gguf(repo_id, tokenizer, token=..., quantization_method=...)`.
        tokenizer: Tokenizer forwarded to every upload.
        timestamp: Value appended to the repository names.
        token: Hugging Face access token forwarded to every upload.

    Returns:
        None
    """
    # One row per export:
    # (start message, success message, failure prefix, repo template, quantization)
    gguf_exports = (
        (
            "Saving 8-bit Q8_0 version...",
            "Successfully saved 8-bit model",
            "Error saving 8-bit model: ",
            "olabs-ai/qLeap_model_v0_8bit_Q8_{}",
            "q8_0",
        ),
        (
            # Optional: 16-bit version
            "Saving 16-bit version...",
            "Successfully saved 16-bit model",
            "Error saving 16-bit model: ",
            "olabs-ai/qLeap_model_v0_16bit_GGUF_{}",
            "f16",
        ),
    )

    try:
        # Save base model locally (the actual save calls are currently disabled)
        print("Saving base model locally...")
        # model.save_pretrained(f"{save_dir}/base")
        # tokenizer.save_pretrained(f"{save_dir}/base")

        for start_msg, done_msg, fail_prefix, repo_template, quantization in gguf_exports:
            print(start_msg)
            try:
                model.push_to_hub_gguf(
                    repo_template.format(timestamp),
                    tokenizer,
                    token=token,
                    quantization_method=quantization,
                )
                print(done_msg)
            except Exception as upload_error:
                # Keep going: the remaining export may still succeed.
                print(fail_prefix + str(upload_error))

    except Exception as e:
        print(f"Error in save process: {str(e)}")
        raise

# Call the save function
# The access token is read from the HF_TOKEN environment variable instead of
# being written into the notebook: a token committed to source is leaked to
# everyone who can read the file. Any token that was previously pasted here
# must be revoked in the Hugging Face account settings.
huggingface_token = os.environ.get("HF_TOKEN")
if not huggingface_token:
    raise RuntimeError(
        "HF_TOKEN is not set. Export a Hugging Face write token in the "
        "environment before uploading the model."
    )
save_model_versions(model, tokenizer, timestamp, huggingface_token)

### if the loss from earlier training is too high try training arguments from unsloth colab notebook "Llama-3.1 8b + Unsloth 2x faster finetuning.ipynb". URL below
### https://colab.research.google.com/drive/1Ys44kVvmeZtnICzWz0xgpRnrIOjZAuxp?usp=sharing#scrollTo=95_Nn-89DhsL

In [None]:
# NOTE(review): a bare `break` outside a loop is a SyntaxError. It is presumably
# left here on purpose so that "Run All" halts before the instruction-tuning
# cells — confirm.
break

# Dataset creation based on the book itself using AugmenToolkit

# Instruction  Tuning

In [None]:

# Instruction fine-tune: create an instruction_prompt based on the concept_examples.csv file

import json
import ast
import logging

import csv

# Load the concept rows; each CSV row becomes a dict keyed by the header line.
# newline='' is what the csv module requires of the file object so that line
# breaks inside quoted fields are preserved, and the encoding is pinned to
# UTF-8 (the same encoding used when the transformed data is written out)
# rather than depending on the machine's locale.
with open('/root/quantumLeap/data/psychologoy-of-unconscious-mind/concept_examples.csv', 'r', newline='', encoding='utf-8') as f:
    data = list(csv.DictReader(f))


# Configure logging
# Only ERROR and above are written; the log file is truncated on every run.
# NOTE(review): logging.basicConfig does nothing when the root logger already
# has handlers (e.g. configured by an earlier cell) — confirm the log file is
# actually produced in this session.
logging.basicConfig(
    filename='transformation_errors.log',
    filemode='w',
    level=logging.ERROR,
    format='%(levelname)s:%(message)s'
)

# Sample original data
original_data = data

def transform_data(original_data):
    """
    Transforms the original data by expanding 'example_scenario' into separate dictionaries.

    Each input entry carries a string that encodes a list of scenarios, either
    as JSON or as a Python literal. One output dictionary is produced per
    string scenario. Entries or scenarios that cannot be used are logged via
    `logging.error` and skipped; they never abort the transformation.

    Parameters:
        original_data (list): List of dictionaries with 'concept_name', 'detailed_explanation', and 'example_scenario'.

    Returns:
        new_data (list): Transformed list with one 'example_scenario' per dictionary.
    """
    new_data = []

    for idx, entry in enumerate(original_data, start=1):
        # A field can be None (csv.DictReader uses None for missing trailing
        # columns), so coerce to '' before stripping instead of crashing.
        concept_name = (entry.get('concept_name') or '').strip()
        detailed_explanation = (entry.get('detailed_explanation') or '').strip()
        example_scenario_str = (entry.get('example_scenario') or '').strip()

        if not (concept_name and detailed_explanation and example_scenario_str):
            logging.error("Entry %s is missing required fields. Skipping.", idx)
            continue

        try:
            try:
                example_scenarios = json.loads(example_scenario_str)
            except json.JSONDecodeError:
                # Fallback for Python-literal lists, e.g. single-quoted strings.
                example_scenarios = ast.literal_eval(example_scenario_str)
            # Checked once, after either parser: valid JSON that is not a list
            # (a dict, a bare string, a number) must be skipped like any other
            # invalid entry rather than raising out of the loop.
            if not isinstance(example_scenarios, list):
                raise ValueError("Parsed 'example_scenario' is not a list.")
        except (ValueError, SyntaxError, TypeError, RecursionError) as e:
            logging.error(
                "Entry %s ('%s') has invalid 'example_scenario': %s",
                idx, concept_name, e,
            )
            continue

        # Iterate through each scenario and create a new entry
        for scenario_idx, scenario in enumerate(example_scenarios, start=1):
            if not isinstance(scenario, str):
                logging.error(
                    "Entry %s ('%s') has non-string scenario at position %s. Skipping this scenario.",
                    idx, concept_name, scenario_idx,
                )
                continue

            new_data.append({
                'concept_name': concept_name,
                'detailed_explanation': detailed_explanation,
                'example_scenario': scenario.strip(),
            })

    return new_data

# Transform the data
transformed_data = transform_data(original_data)

# Optional: Save the transformed data to a JSON file
# ensure_ascii=False keeps non-ASCII characters readable in the UTF-8 output file.
with open('/root/quantumLeap/data/psychologoy-of-unconscious-mind/transformed_data.json', 'w', encoding='utf-8') as f:
    json.dump(transformed_data, f, ensure_ascii=False, indent=4)

print(f"Transformation complete. {len(transformed_data)} entries created.")
print("Check 'transformation_errors.log' for any errors encountered during transformation.")

# Prints the same entry count again, on its own line.
print(len(transformed_data))

In [None]:

# Prompt template for instruction tuning. It has three positional `{}` slots,
# filled in this order: concept name, detailed explanation, response text.
instruction_prompt = """Below is an instruction that describes a concept in the field of psychology, sociology, anthropology, ethnography, or qualitative research or cultural studies. Write a response that appropriately completes the request.

### Instruction: Given the concept and its detailed explanation, provide an example scenario that illustrates the concept.
concept_name: {}
detailed_explanation: {}

### Response:
{}"""


# End-of-sequence marker taken from the tokenizer loaded earlier in the notebook.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

def instruction_prompt_func(examples):
    """
    Render the instruction prompt for one example or for a batch of examples.

    The function is mapped over the dataset with `batched=True`, in which case
    every column in `examples` is a list and one prompt per row must be
    returned. A single (non-batched) example, where every column is a string,
    is also accepted.

    EOS_TOKEN is appended to every rendered prompt so the model learns where a
    response ends; without it generation tends to run on.

    Parameters:
        examples (dict): Mapping with 'concept_name', 'detailed_explanation'
            and 'example_scenario'; each value is a string or a list of strings.

    Returns:
        dict: {"text": str} for a single example, {"text": list[str]} for a batch.
    """
    concept_name = examples["concept_name"]
    detailed_explanation = examples["detailed_explanation"]
    example_scenario = examples["example_scenario"]

    # Tolerate a tokenizer without an EOS token instead of failing on `str + None`.
    eos = EOS_TOKEN or ""

    # Non-batched call: every column is a plain string.
    if isinstance(concept_name, str):
        return { "text" : instruction_prompt.format(concept_name, detailed_explanation, example_scenario) + eos, }

    # Batched call: build one prompt per row by walking the columns in lockstep.
    texts = [
        instruction_prompt.format(name, explanation, scenario) + eos
        for name, explanation, scenario in zip(concept_name, detailed_explanation, example_scenario)
    ]
    return { "text" : texts, }


# convert transformed_data to a huggingface dataset
# transformed_data is a list of row dictionaries, which is what Dataset.from_list
# takes; Dataset.from_dict expects a mapping of column name -> list of values.
instruction_dataset = Dataset.from_list(transformed_data)
instruction_dataset = instruction_dataset.map(instruction_prompt_func, batched = True,)

from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments

# Trainer for the instruction-tuning stage. It rebinds the notebook-level
# `trainer` name and trains on the rendered "text" column of instruction_dataset.
trainer = UnslothTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = instruction_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 8,

    args = UnslothTrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 8,

        # Use num_train_epochs and warmup_ratio for longer runs!
        # This run is bounded by a fixed number of optimizer steps instead.
        max_steps = 120,
        warmup_steps = 10,
        # warmup_ratio = 0.1,
        # num_train_epochs = 1,

        # Select a 2 to 10x smaller learning rate for the embedding matrices!
        # Here the embedding rate is 5x smaller than the main learning rate.
        learning_rate = 5e-5,
        embedding_learning_rate = 1e-5,

        # Exactly one of fp16/bf16 is enabled: bf16 when supported, fp16 otherwise.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.00,
        lr_scheduler_type = "linear",
        seed = 3407,
        # NOTE(review): same output_dir as the earlier trainer in this notebook;
        # confirm that sharing the directory is intended.
        output_dir = "outputs",
    ),
)
trainer_stats = trainer.train()
#@title Show final memory and time stats
# `start_gpu_memory` and `max_memory` are not set in this cell; they come from
# the memory snapshot taken in the earlier "Start Training" cell, which must
# have been run in the same session.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

In [None]:

# Prompt template used for generation. It has three positional `{}` slots:
# concept name, detailed explanation, response. All three must be passed to
# str.format (fewer raises IndexError); the response slot is left empty so the
# model writes it.
instruction_prompt = """Below is an instruction that describes a concept in the field of psychology, sociology, anthropology, ethnography, or qualitative research or cultural studies. Write a response that appropriately completes the request.

### Instruction: Given the concept and its detailed explanation, provide an example scenario that illustrates the concept.
concept_name: {}
detailed_explanation: {}

### Response:
{}"""

FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    instruction_prompt.format(
        "Hero Archetype", # concept_name
        "The hero archetype is a common motif in literature and folklore, representing a protagonist who embodies bravery, resilience, and a quest for a greater purpose.", # detailed_explanation
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
# Print explicitly: a bare expression in the middle of a cell is discarded.
print(tokenizer.batch_decode(outputs))

# Text Streaming

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

# NOTE(review): this free-form question does not fit the concept/explanation
# template; it is placed in the concept slot with an empty explanation so the
# prompt can be rendered. Replace it with a real concept/explanation pair.
inputs = tokenizer(
[
    instruction_prompt.format(
        "When trying to understand how nature plays a role in the development of a child's personality, which concept should be considered?", # concept_name
        "", # detailed_explanation
        "", # output - leave this blank for generation!
    ),
], return_tensors = "pt").to("cuda")


text_streamer = TextStreamer(tokenizer)
# repetition_penalty: 1.0 means no penalty and values above 1.0 discourage
# repeated tokens. The previous value of 0.1 did the opposite and rewarded
# repetition, which makes the output loop.
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128,
                   repetition_penalty = 1.1)

In [None]:
# add current timestamp to model name
# The timestamp is resolved once. Re-evaluating int(time.time()) per call gave
# the model and the tokenizer different suffixes (an upload can take minutes),
# so they ended up in different directories / Hub repositories.
run_stamp = int(time.time())
local_dir = f"qLeap_model_instruct_v0_{run_stamp}"
hub_repo = f"olabs-ai/qLeap_model_instruct_v0_{run_stamp}"

# Model and tokenizer go to the same directory so it can be reloaded as a unit.
model.save_pretrained(local_dir) # Local saving
tokenizer.save_pretrained(local_dir)

# The access token is read from the HF_TOKEN environment variable instead of
# being written into the notebook. Any token that was previously pasted here
# must be revoked in the Hugging Face account settings.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise RuntimeError(
        "HF_TOKEN is not set. Export a Hugging Face write token in the "
        "environment before pushing to the Hub."
    )

model.push_to_hub(hub_repo, token = hf_token) # Online saving
tokenizer.push_to_hub(hub_repo, token = hf_token) # Online saving

# The optional exports below are switched off with `if False:`; flip a line to
# `if True:` to run it. Names are f-strings so the timestamp is interpolated.

# Save to 8bit GGUF
if False: model.save_pretrained_gguf(f"qLeap_model_v0_8bit_Q8_{run_stamp}", tokenizer,)
if False: model.push_to_hub_gguf(f"olabs-ai/qLeap_model_v0_8bit_GGUF_{run_stamp}", tokenizer,quantization_method = "q8_0", token = hf_token)

# Save to 16bit GGUF
if False: model.save_pretrained_gguf(f"qLeap_model_v0_16bit_GGUF_{run_stamp}", tokenizer, quantization_method = "f16")
if False: model.push_to_hub_gguf(f"olabs-ai/qLeap_model_v0_16bit_GGUF_{run_stamp}", tokenizer, quantization_method = "f16", token = hf_token)

# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf(f"qLeap_model_v0_q4_k_m_16bit_{run_stamp}", tokenizer, quantization_method = "q4_k_m")
if False: model.push_to_hub_gguf(f"olabs-ai/qLeap_model_v0_q4_k_m_16bit_{run_stamp}", tokenizer, quantization_method = "q4_k_m", token = hf_token)
if False: model.push_to_hub_gguf(f"olabs-ai/qLeap_model_v0_q5_k_m_16bit_{run_stamp}", tokenizer, quantization_method = "q5_k_m", token = hf_token)



# # Merge to 16bit
# if False: model.save_pretrained_merged(f"qLeap_model_v0_16bit_merged_{run_stamp}", tokenizer, save_method = "merged_16bit",)
# if False: model.push_to_hub_merged(f"olabs-ai/qLeap_model_v0_16bit_merged_{run_stamp}", tokenizer, save_method = "merged_16bit", token = hf_token)

# # Merge to 4bit
# if False: model.save_pretrained_merged(f"qLeap_model_v0_4bit_merged_{run_stamp}", tokenizer, save_method = "merged_4bit",)
# if False: model.push_to_hub_merged(f"olabs-ai/qLeap_model_v0_4bit_merged_{run_stamp}", tokenizer, save_method = "merged_4bit", token = hf_token)

# # Just LoRA adapters
# if False: model.save_pretrained_merged(f"qLeap_model_v0_LoRA_merged_{run_stamp}", tokenizer, save_method = "lora",)
# if False: model.push_to_hub_merged(f"olabs-ai/qLeap_model_LoRA_merged_{run_stamp}", tokenizer, save_method = "lora", token = hf_token)



# Inference

In [None]:

# Prompt template used for generation. It has three positional `{}` slots:
# concept name, detailed explanation, response. All three must be passed to
# str.format (fewer raises IndexError); the response slot is left empty so the
# model writes it.
instruction_prompt = """Below is an instruction that describes a concept in the field of psychology, sociology, anthropology, ethnography, or qualitative research or cultural studies. Write a response that appropriately completes the request.

### Instruction: Given the concept and its detailed explanation, provide an example scenario that illustrates the concept.
concept_name: {}
detailed_explanation: {}

### Response:
{}"""

FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    instruction_prompt.format(
        "Hero Archetype", # concept_name
        "The hero archetype is a common motif in literature and folklore, representing a protagonist who embodies bravery, resilience, and a quest for a greater purpose.", # detailed_explanation
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
# Print explicitly: a bare expression in the middle of a cell is discarded.
print(tokenizer.batch_decode(outputs))

# Text Streaming

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

# NOTE(review): this free-form question does not fit the concept/explanation
# template; it is placed in the concept slot with an empty explanation so the
# prompt can be rendered. Replace it with a real concept/explanation pair.
inputs = tokenizer(
[
    instruction_prompt.format(
        "When trying to understand how nature plays a role in the development of a child's personality, which concept should be considered?", # concept_name
        "", # detailed_explanation
        "", # output - leave this blank for generation!
    ),
], return_tensors = "pt").to("cuda")


text_streamer = TextStreamer(tokenizer)
# repetition_penalty: 1.0 means no penalty and values above 1.0 discourage
# repeated tokens. The previous value of 0.1 did the opposite and rewarded
# repetition, which makes the output loop.
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128,
                   repetition_penalty = 1.1)