## Wednesday, August 7, 2024

[Creating and Uploading a Dataset with Unsloth: An Adventure in Wonderland](https://huggingface.co/blog/dimentox/unsloth-mistral-training)

Right now I have no idea if I can even load this model locally, but gonna try just cuz it looks interesting, and I have considered doing something like this anyways, cuz Github, right?! ... 

mamba activate unsloth_env2 (unsloth 2024.8)

Original code was from the blog article, summarized into a single cell of python code. I am breaking that code down into cells inside this notebook. 

In [1]:
# only target the 4090 ...
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [2]:
# We need these next two statements, otherwise we get ...
# NotImplementedError: Using RTX 4000 series doesn't support faster communication broadband via P2P or IB. Please set `NCCL_P2P_DISABLE=\"1\"` and `NCCL_IB_DISABLE=\"1\" or use `accelerate launch` which will do this automatically."
# ... when we try to initialize SentenceTransformerTrainingArguments further on down ... 
os.environ["NCCL_P2P_DISABLE"]="1"
os.environ["NCCL_IB_DISABLE"]="1"

In [3]:
# What is in the huggingface cache??
!ls /home/rob/.cache/huggingface/hub

models--BAAI--bge-base-en
models--BAAI--bge-base-en-v1.5
models--bert-base-cased-finetuned-mrpc
models--bert-base-uncased
models--bert-large-uncased
models--cognitivecomputations--dolphin-2.9-llama3-8b
models--colbert-ir--colbertv2.0
models--distilbert-base-uncased-finetuned-sst-2-english
models--FacebookAI--xlm-roberta-base
models--facebook--opt-350m
models--google--gemma-7b
models--google--gemma-7b-it
models--google-t5--t5-base
models--gpt2
models--meta-llama--Meta-Llama-3.1-8B-Instruct
models--meta-llama--Meta-Llama-3-8B
models--meta-llama--Meta-Llama-3-8B-Instruct
models--microsoft--mpnet-base
models--microsoft--table-transformer-structure-recognition
models--mistralai--Mistral-7B-Instruct-v0.2
models--mistralai--Mistral-7B-Instruct-v0.3
models--mistralai--Mistral-7B-v0.1
models--mixedbread-ai--mxbai-embed-large-v1
models--nomic-ai--nomic-embed-text-v1
models--NousResearch--Hermes-2-Pro-Llama-3-8B
models--nvidia--dragon-multiturn-context-encoder
models--nvidia--dragon-multiturn-que

In [None]:

# Step 1: Setting Up the Environment
# !pip install beautifulsoup4 gitpython huggingface_hub datasets requests

In [4]:
# Step 2: Cloning and Pulling the Repository
import os
import json
import requests
from bs4 import BeautifulSoup
from git import Repo
from huggingface_hub import HfApi
from datasets import Dataset, DatasetDict


In [5]:
def verbose_print(message):
    print(f"[INFO] {message}")

def clone_or_pull_repo(repo_url, repo_name):
    if os.path.exists(repo_name):
        verbose_print(f"Repository {repo_name} already exists. Pulling latest changes.")
        repo = Repo(repo_name)
        repo.remotes.origin.pull()
    else:
        verbose_print(f"Cloning repository from {repo_url}")
        Repo.clone_from(repo_url, repo_name)

def extract_markdown_files(repo_path):
    verbose_print(f"Extracting Markdown files from {repo_path}")
    markdown_files = []
    for root, dirs, files in os.walk(repo_path):
        for file in files:
            if file.endswith(".md"):
                markdown_files.append(os.path.join(root, file))
    return markdown_files

# Step 3: Parsing and Scraping Content
def parse_markdown(file_path):
    verbose_print(f"Parsing Markdown file {file_path}")
    with open(file_path, 'r') as file:
        content = file.read()
    sections = content.split('\n## ')
    parsed_sections = [section.replace('\n', ' ') for section in sections]
    return parsed_sections

def get_page_links(base_url, link_selector):
    verbose_print(f"Getting page links from {base_url}")
    response = requests.get(base_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    page_links = []
    for link in soup.select(link_selector):
        href = link['href']
        if not href.startswith('http') and href != '#':
            href = base_url.rstrip('/') + '/' + href.lstrip('/')
            page_links.append(href)
    return page_links

def scrape_page(url, content_selector):
    verbose_print(f"Scraping content from {url}")
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    page_content = []
    main_content = soup.select_one(content_selector)
    if main_content:
        sections = main_content.find_all(['h1', 'h2', 'h3', 'p', 'pre'])
        for section in sections:
            page_content.append(section.text)
    return page_content

# Step 4: Creating and Saving the Dataset
def create_dataset(repo_url, doc_urls):
    dataset = []

    # Scrape GitHub repository
    repo_name = repo_url.split('/')[-1].replace('.git', '')
    clone_or_pull_repo(repo_url, repo_name)
    markdown_files = extract_markdown_files(repo_name)
    for md_file in markdown_files:
        sections = parse_markdown(md_file)
        for section in sections:
            dataset.append({
                'source': 'GitHub',
                'repository': repo_name,
                'file': md_file,
                'label': 'autogen',
                'content': section
            })

    # Scrape documentation site
    for doc_url, link_selector, content_selector in doc_urls:
        page_links = get_page_links(doc_url, link_selector)
        for page_url in page_links:
            page_content = scrape_page(page_url, content_selector)
            for section in page_content:
                dataset.append({
                    'source': 'Documentation',
                    'url': page_url,
                    'label': 'autogen',
                    'content': section
                })

    return dataset

def load_dataset_locally(file_path):
    if os.path.exists(file_path):
        verbose_print(f"Loading existing dataset from {file_path}")
        with open(file_path, 'r') as file:
            return json.load(file)
    verbose_print(f"No existing dataset found at {file_path}")
    return []

def save_dataset_locally(dataset, output_file):
    verbose_print(f"Saving dataset to {output_file}")
    with open(output_file, 'w') as file:
        json.dump(dataset, file, indent=4)
    verbose_print("Dataset saved successfully")

# Step 5: Uploading to Hugging Face
def upload_to_huggingface(dataset, repo_id):
    token = os.getenv("HF_TOKEN")
    verbose_print(f"Uploading dataset to Hugging Face with repository ID {repo_id}")
    hf_api = HfApi()
    hf_api.create_repo(repo_id, token=token, repo_type="dataset", private=False)

    # Create a DatasetDict and push to hub
    dataset_dict = DatasetDict({"train": Dataset.from_list(dataset)})
    dataset_dict.push_to_hub(repo_id, token=token)
    verbose_print(f"Dataset uploaded to Hugging Face with repository ID {repo_id}")

In [6]:
# Example Usage
repo_url = 'https://github.com/microsoft/autogen.git'
doc_urls = [
    ('https://microsoft.github.io/autogen/docs/', 'a[href]', 'div.md-content'),
    ('https://microsoft.github.io/autogen/docs/Examples', 'a[href]', 'div.md-content'),
    ('https://microsoft.github.io/autogen/docs/notebooks', 'a[href]', 'div.md-content'),
    ('https://microsoft.github.io/autogen/blog', 'a[href]', 'div.blog-content')
]
output_file = 'autogen_python_dataset.json'
repo_id = 'dimentox/autogen-python'

In [7]:
# running this will always append to any existing autogen_python_dataset.json ... 
verbose_print("Starting dataset creation process")
existing_dataset = load_dataset_locally(output_file)
new_dataset = create_dataset(repo_url, doc_urls)
combined_dataset = existing_dataset + new_dataset
save_dataset_locally(combined_dataset, output_file)
# upload_to_huggingface(combined_dataset, repo_id) # Nope for now!
verbose_print("Dataset creation and upload process completed")


# 48.4s ... re-run ... 
# 27m 56.7s ... run the first time ...

[INFO] Starting dataset creation process
[INFO] No existing dataset found at autogen_python_dataset.json
[INFO] Repository autogen already exists. Pulling latest changes.
[INFO] Extracting Markdown files from autogen
[INFO] Parsing Markdown file autogen/README.md
[INFO] Parsing Markdown file autogen/SECURITY.md
[INFO] Parsing Markdown file autogen/TRANSPARENCY_FAQS.md
[INFO] Parsing Markdown file autogen/CODE_OF_CONDUCT.md
[INFO] Parsing Markdown file autogen/samples/apps/auto-anny/README.md
[INFO] Parsing Markdown file autogen/samples/apps/cap/TODO.md
[INFO] Parsing Markdown file autogen/samples/apps/cap/README.md
[INFO] Parsing Markdown file autogen/samples/apps/cap/py/README.md
[INFO] Parsing Markdown file autogen/samples/apps/cap/c++/Readme.md
[INFO] Parsing Markdown file autogen/samples/apps/cap/node/Readme.md
[INFO] Parsing Markdown file autogen/samples/apps/cap/c#/Readme.md
[INFO] Parsing Markdown file autogen/samples/apps/autogen-studio/README.md
[INFO] Parsing Markdown file au

In [8]:
# Step 6: Fine-Tuning the Model
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported
from datasets import load_dataset
import torch

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [9]:
class AdaptiveTrainer(SFTTrainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.prev_eval_loss = float('inf')

    def evaluation_step(self, *args, **kwargs):
        output = super().evaluation_step(*args, **kwargs)
        current_eval_loss = output['eval_loss']

        # Adaptive Learning Rate Adjustment
        if current_eval_loss > self.prev_eval_loss:
            self.args.learning_rate *= 0.9  # Reduce learning rate if loss increased
            print(f"Decreased learning rate to: {self.args.learning_rate}")
        else:
            self.args.learning_rate *= 1.05  # Slightly increase if loss decreased
            print(f"Increased learning rate to: {self.args.learning_rate}")

        self.prev_eval_loss = current_eval_loss
        return output

    def training_step(self, *args, **kwargs):
        # Adjust gradient clipping based on gradient norms
        if self.state.global_step > 0 and self.state.global_step % self.args.eval_steps == 0:
            current_grad_norm = torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args.max_grad_norm)
            print(f"Adjusted gradient clipping to: {current_grad_norm}")

        return super().training_step(*args, **kwargs)

def print_memory_stats(stage):
    gpu_stats = torch.cuda.get_device_properties(0)
    used_memory = round(torch.cuda.memory_reserved() / 1024 / 1024 / 1024, 3)
    max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
    print(f"[{stage}] GPU: {gpu_stats.name}, Memory Reserved: {used_memory} GB / {max_memory} GB")

In [10]:
max_seq_length = 2048
dtype = None
load_in_4bit = True

print("Loading model")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    token="token"
)

# 14.1s ... re-load to the 4090 ... 
# 119m 51.7s ... using the compton connection ... 
#   3m 50.2s ... using the BELL connection ... Damn! What a difference!

Loading model
==((====))==  Unsloth 2024.8: Fast Mistral patching. Transformers = 4.43.3.
   \\   /|    GPU: NVIDIA GeForce RTX 4090. Max memory: 23.65 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.26.post1. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors.index.json:   0%|          | 0.00/165k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.31G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/181k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.26M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

In [11]:
!nvidia-smi

Wed Aug  7 19:15:13 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.29.06              Driver Version: 545.29.06    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 2070 ...    Off | 00000000:01:00.0  On |                  N/A |
| 25%   37C    P8              12W / 215W |    414MiB /  8192MiB |      7%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 4090        Off | 00000000:02:00.0 Off |  

In [12]:
print("Loading Laura")
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

Loading Laura


Unsloth 2024.8 patched 40 layers with 40 QKV layers, 40 O layers and 40 MLP layers.


In [13]:
print("Loading dataset")
dataset_path = "autogen_python_dataset.json"
dataset = load_dataset("json", data_files=dataset_path, split="train")

custom_prompt = """Source: {}
Repository: {}
File: {}
Label: {}
Content: {}
"""

EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    sources = examples["source"]
    repositories = examples["repository"]
    files = examples["file"]
    labels = examples["label"]
    contents = examples["content"]
    texts = []
    for source, repository, file, label, content in zip(sources, repositories, files, labels, contents):
        text = custom_prompt.format(source, repository, file, label, content) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}

dataset = dataset.map(formatting_prompts_func, batched=True)

Loading dataset


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/385 [00:00<?, ? examples/s]

In [14]:
trainer = AdaptiveTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        num_train_epochs=1,
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        save_strategy="steps",
        save_steps=50,
        eval_steps=1,
    ),
)

Map (num_proc=2):   0%|          | 0/385 [00:00<?, ? examples/s]

In [15]:
print_memory_stats("Before Training")
trainer_stats = trainer.train(resume_from_checkpoint=True)
print_memory_stats("After Training")

[Before Training] GPU: NVIDIA GeForce RTX 4090, Memory Reserved: 8.588 GB / 23.65 GB


ValueError: No valid checkpoint found in output directory (outputs)

In [None]:
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")

In [None]:
# Step 7: Using the Fine-Tuned Model
from unsloth import FastLanguageModel
from transformers import TextStreamer
import torch

max_seq_length = 2048
dtype = None
load_in_4bit = True

print("Loading fine-tuned model")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="lora_model",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    token="TOKEN"
)

In [None]:
FastLanguageModel.for_inference(model)
inputs = tokenizer(
    [
        """
        <s>
        Q: What is the capital of France?
        A:
        """
    ],
    return_tensors="pt"
).to("cuda")

text_streamer = TextStreamer(tokenizer)
outputs = model.generate(**inputs, streamer=text_streamer, max_new_tokens=64)
print(tokenizer.batch_decode(outputs))
