In [5]:
import subprocess
subprocess.run(["pip", "install", "unsloth"])
subprocess.run(["pip", "install", "-q", "-U", "xformers", "torch--no-cache-dir"])

Collecting unsloth
  Downloading unsloth-2024.9-py3-none-any.whl.metadata (54 kB)
Collecting torch>=2.4.0 (from unsloth)
  Downloading torch-2.4.1-cp312-cp312-manylinux1_x86_64.whl.metadata (26 kB)
Collecting xformers==0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.27.post2-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting triton>=3.0.0 (from unsloth)
  Downloading triton-3.0.0-1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.3 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.8.10-py3-none-any.whl.metadata (8.4 kB)
Collecting transformers>=4.43.2 (from unsloth)
  Downloading transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
Collecting datasets>=2.16.0 (from unsloth)
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting sentencepiece>=0.2.0 (from unsloth)
  Downloading sentencepiece-0.

[0m[31mERROR: Could not find a version that satisfies the requirement torch--no-cache-dir (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for torch--no-cache-dir[0m[31m
[0m

CompletedProcess(args=['pip', 'install', '-q', '-U', 'xformers', 'torch--no-cache-dir'], returncode=1)

In [10]:
# install nltk spacy and wandb
subprocess.run(["pip", "install", "-q", "-U", "nltk", "spacy", "wandb", "datasets", "huggingface_hub", "ipywidgets", "nbformat", "--no-cache-dir"])

[0m

CompletedProcess(args=['pip', 'install', '-q', '-U', 'nltk', 'spacy', 'wandb', 'datasets', 'huggingface_hub', '--no-cache-dir'], returncode=0)

In [7]:
import os
import re
import torch
import nltk
import spacy
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from datasets import Dataset
import logging
import argparse
import wandb  # Weights & Biases integration


  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# ----------------------------- #
# Part 1: Install and Setup Libraries
# ----------------------------- #

# Ensure NLTK's punkt tokenizer is available
nltk.download('punkt')

# Initialize spaCy English model
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    print("SpaCy English model not found. Downloading...")
    os.system('python -m spacy download en_core_web_sm')
    nlp = spacy.load('en_core_web_sm')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
# ----------------------------- #
# Part 2: Load and Clean the Text Data
# ----------------------------- #

def load_and_clean_text(file_path):
    """
    Loads text from a file and removes Project Gutenberg's license and headers/footers.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    # # Remove Project Gutenberg's license text and headers/footers
    # start_pattern = r'\*\*\* START OF THIS PROJECT GUTENBERG EBOOK.*\*\*\*'
    # end_pattern = r'\*\*\* END OF THIS PROJECT GUTENBERG EBOOK.*\*\*\*'

    # text = re.sub(f'.*{start_pattern}', '', text, flags=re.DOTALL)
    # text = re.sub(f'{end_pattern}.*', '', text, flags=re.DOTALL)
    return text.strip()

# Replace 'psychology_of_unconscious.txt' with your actual file path
file_path = '/root/quantumLeap/data/psychologoy-of-unconscious-mind/psychology_of_unconscious.txt'
clean_text = load_and_clean_text(file_path)

In [13]:
# ----------------------------- #
# Part 3: Parse Text into Discourse Units
# ----------------------------- #

def parse_discourse_units(text):
    """
    Parses text into discourse units using spaCy.
    Currently splits text into sentences.
    """
    paragraphs = text.split('\n\n')
    paragraphs = [para.strip() for para in paragraphs if para.strip()]
    
    discourse_units = []
    for para in paragraphs:
        doc = nlp(para)
        sentences = [sent.text for sent in doc.sents]
        discourse_units.extend(sentences)
    return discourse_units

discourse_units = parse_discourse_units(clean_text)

# Save discourse_units to a file (Optional)
with open('/root/quantumLeap/data/psychologoy-of-unconscious-mind/discourse_units_final.txt', 'w') as f:
    for unit in discourse_units:
        f.write(unit + '\n')

# If you need to reload from file (Optional)
# with open('/root/quantumLeap/data/psychologoy-of-unconscious-mind/discourse_units_final.txt', 'r') as f:
#     discourse_units = f.read().splitlines()


In [15]:
len(discourse_units)

6175

In [16]:
# ----------------------------- #
# Part 4: Create Chunks Using Hybrid Strategy
# ----------------------------- #

def create_chunks(discourse_units, tokenizer, max_length=2048, overlap_size=100):
    """
    Creates chunks from discourse units using a sliding window with overlapping chunks.
    """
    chunks = []
    current_chunk = []
    current_length = 0

    for unit in discourse_units:
        unit_tokens = tokenizer.encode(unit, add_special_tokens=False)
        unit_length = len(unit_tokens)

        if current_length + unit_length <= max_length:
            current_chunk.append(unit)
            current_length += unit_length
        else:
            # Append the current chunk
            chunks.append(' '.join(current_chunk))
            # Create overlap
            overlap_text = ' '.join(current_chunk)[-overlap_size:]
            overlap_tokens = tokenizer.encode(overlap_text, add_special_tokens=False)
            overlap_text = tokenizer.decode(overlap_tokens, skip_special_tokens=True)
            # Start new chunk with overlap and current unit
            current_chunk = [overlap_text, unit]
            current_length = len(tokenizer.encode(overlap_text, add_special_tokens=False)) + unit_length

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

In [18]:
# ----------------------------- #
# Part 5: Load the Tokenizer and Model
# ----------------------------- #

from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# if the model is already downloaded, then don't download it again; otherwise download it
import os

model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"
models_dir = os.path.join(os.path.dirname(os.getcwd()), "models")
model_path = os.path.join(models_dir, model_name)

if not os.path.exists(models_dir):
    os.makedirs(models_dir)

if os.path.exists(model_path):
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_path,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
else:
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
        token="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",
    )
    model.save_pretrained(model_path)
    tokenizer.save_pretrained(model_path)

model = FastLanguageModel.get_peft_model(
    model,
    r = 128, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",

                      "embed_tokens", "lm_head",], # Add for continual pretraining
    lora_alpha = 32,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = True,   # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)


# Save chunks to a file (Optional)
with open('/root/quantumLeap/data/psychologoy-of-unconscious-mind/chunks_final.txt', 'w') as f:
    for unit in discourse_units:
        f.write(unit + '\n')

# If you need to reload from file (Optional)
# with open('/root/quantumLeap/data/psychologoy-of-unconscious-mind/chunks_final.txt', 'r') as f:
#     discourse_units = f.read().splitlines()

len(chunks)

==((====))==  Unsloth 2024.9: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA H100 80GB HBM3. Max memory: 79.209 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 9.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
