# Set Up

In [1]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])?  yes


In [2]:
!ls -a

.  ..  .virtual_documents


In [4]:
import os
from datetime import datetime
from prettytable import PrettyTable

def find_notebooks(start_path='.'):
    notebooks = []
    for root, dirs, files in os.walk(start_path):
        for file in files:
            if file.endswith('.ipynb'):
                full_path = os.path.join(root, file)
                mod_time = os.path.getmtime(full_path)
                mod_date = datetime.fromtimestamp(mod_time).strftime('%Y-%m-%d %H:%M:%S')
                notebooks.append((file, full_path, mod_date))
    return notebooks

def create_notebook_table(notebooks):
    table = PrettyTable()
    table.field_names = ["Name", "Location", "Last Modified"]
    table.align["Name"] = "l"
    table.align["Location"] = "l"
    table.align["Last Modified"] = "l"
    for notebook in notebooks:
        table.add_row(notebook)
    return table

# Find all notebooks
notebooks = find_notebooks('/')

# Create and print the table
table = create_notebook_table(notebooks)
print(table)

# Print total count
print(f"\nTotal notebooks found: {len(notebooks)}")

KeyboardInterrupt: 

In [5]:
%cd "/opt/conda/pkgs/conda-4.12.0-py310h06a4308_0/info/test/tests/conda_env/support/"
!pwd

/opt/conda/pkgs/conda-4.12.0-py310h06a4308_0/info/test/tests/conda_env/support
/opt/conda/pkgs/conda-4.12.0-py310h06a4308_0/info/test/tests/conda_env/support


In [6]:
!python --version

Python 3.10.13


In [7]:
!git config --global user.email "mdrafat.siddiqui@outlook.com"

In [8]:
# Backup the current list of files
!ls > current_files.txt

# Update .gitignore to only ignore existing files except the notebooks we want to keep
!echo "" > .gitignore
!echo "# Ignore all existing files" >> .gitignore
!for file in $(cat current_files.txt); do if [[ "$file" != "e2e-llm.ipynb" && "$file" != "notebook.ipynb" && "$file" != "notebook_with_env.ipynb" && "$file" != ".gitignore" ]]; then echo "/$file" >> .gitignore; fi; done

# Add exceptions for the notebooks we want to keep
!echo "# Keep these notebooks" >> .gitignore
!echo "!e2e-llm.ipynb" >> .gitignore
!echo "!notebook.ipynb" >> .gitignore
!echo "!notebook_with_env.ipynb" >> .gitignore

# Display the contents of .gitignore
print("Contents of .gitignore:")
!cat .gitignore

# Remove the temporary file
!rm current_files.txt

# Add .gitignore to the repository
!git add .gitignore
!git commit -m "Update .gitignore to ignore existing files but allow new ones"

# Push changes
!git push origin main

print("\nUpdated .gitignore to ignore existing files but allow new ones.")

Contents of .gitignore:

# Ignore all existing files
/add-pip.yml
/advanced-pip
/current_files.txt
/empty_env.yml
/env_with_dependencies.yml
/example
/example-yaml
/foo
/invalid_keys.yml
/pip_argh.yml
/requirements.txt
/saved-env
/simple.yml
/valid_keys.yml
/with-pip.yml
# Keep these notebooks
!e2e-llm.ipynb
!notebook.ipynb
!notebook_with_env.ipynb
fatal: not a git repository (or any of the parent directories): .git
fatal: not a git repository (or any of the parent directories): .git
fatal: not a git repository (or any of the parent directories): .git

Updated .gitignore to ignore existing files but allow new ones.


## Magic Functions for Github

In [9]:
from IPython.core.magic import register_cell_magic
import subprocess
import re
from datetime import datetime
import pytz
import threading
import time

def delete_branch(branch_name):
    try:
        # Delete the branch locally
        subprocess.run(["git", "branch", "-D", branch_name], check=True)
        # Delete the branch remotely
        subprocess.run(["git", "push", "origin", "--delete", branch_name], check=True)
        print(f"Deleted branch '{branch_name}'")
    except subprocess.CalledProcessError as e:
        print(f"Error deleting branch: {e}")

def save_and_push(is_auto=False):
    # Get current Indian Standard Time
    ist = datetime.now(pytz.timezone('Asia/Kolkata'))
    branch_name = ist.strftime("%d-%b-%Y-%H%M-IST")
    if is_auto:
        branch_name += "-autosave"
    
    try:
        # If it's an auto-save, delete the previous auto-save branch
        if is_auto:
            subprocess.run(["git", "fetch", "--all"], check=True)
            result = subprocess.run(["git", "branch", "-r"], capture_output=True, text=True, check=True)
            for branch in result.stdout.split('\n'):
                if "-autosave" in branch:
                    delete_branch(branch.strip().split('/')[-1])
        
        subprocess.run(["git", "checkout", "-b", branch_name], check=True)
        subprocess.run(["git", "add", "."], check=True)
        commit_message = f"Auto-save at {branch_name}" if is_auto else f"Manual save at {branch_name}"
        subprocess.run(["git", "commit", "-m", commit_message], check=True)
        subprocess.run(["git", "push", "origin", branch_name], check=True)
        print(f"Changes pushed to new branch '{branch_name}' successfully.")
    except subprocess.CalledProcessError as e:
        print(f"An error occurred: {e}")
    finally:
        subprocess.run(["git", "checkout", "main"], check=True)

def auto_save_thread():
    while True:
        time.sleep(120)  # Wait for 2 minutes
        save_and_push(is_auto=True)

# Start the auto-save thread
threading.Thread(target=auto_save_thread, daemon=True).start()

@register_cell_magic
def ap1(line, cell):
    exec(cell, globals())
    save_and_push()

@register_cell_magic
def ap2(line, cell):
    exec(cell, globals())
    save_and_push()

ip = get_ipython()
ip.register_magic_function(ap1, 'cell')
ip.register_magic_function(ap2, 'cell')

print("Auto-save and push system set up. It will run every 2 minutes.")

Auto-save and push system set up. It will run every 2 minutes.


## Magic Function for Visual Feedback

In [10]:
from IPython.display import display, HTML
from IPython import get_ipython

def display_feedback(success):
    color = "green" if success else "red"
    if success:
        display(HTML(f"<div style='width:20px;height:20px;background-color:{color};border-radius:50%;'></div>"))
    else:
        display(HTML(f"<div style='width:40;height:40px;background-color:{color};border-radius:0%;'></div>"))

def visual_feedback_after_execution(result):
    success = not (result.error_before_exec or result.error_in_exec)
    display_feedback(success)

# Get the current IPython instance
ip = get_ipython()

# Remove the audio hook
for hook in ip.events.callbacks['post_run_cell']:
    if 'play_audio_after_execution' in str(hook):
        ip.events.callbacks['post_run_cell'].remove(hook)

# Register the visual feedback hook
ip.events.register('post_run_cell', visual_feedback_after_execution)

print("Visual feedback system set up.")

Visual feedback system set up.


In [11]:
print(

SyntaxError: incomplete input (149104261.py, line 1)

## Magic Function for sound

In [12]:
# !pip install pydub --no-cache-dir

import requests

# Google Drive file ID
file_id = "12v2RwutER9ayuPe4d4H0GQAxA9iUgdXx"
url = f"https://drive.google.com/uc?export=download&id={file_id}"

# Send a GET request to the URL
response = requests.get(url)

# Save the content of the response as an MP3 file
with open("error.mp3", "wb") as file:
    file.write(response.content)

print("File downloaded and saved as error.mp3")

import requests

# Google Drive file ID
file_id = "1-kfEx5SgDxB0ph71q9BhKAEB8R4qShFx"
url = f"https://drive.google.com/uc?export=download&id={file_id}"

# Send a GET request to the URL
response = requests.get(url)

# Save the content of the response as an MP3 file
with open("success.mp3", "wb") as file:
    file.write(response.content)

print("File downloaded and saved as success.mp3")

from IPython.display import display, Audio
from IPython import get_ipython
from pydub import AudioSegment

# Define the post-execution hook
def play_audio_after_execution(result):
    if result.error_before_exec or result.error_in_exec:
        # Load and adjust volume of the error sound
        audio = AudioSegment.from_mp3("error.mp3")
    else:
        # Load and adjust volume of the success sound
        audio = AudioSegment.from_mp3("success.mp3")

    # Set volume to 50%
    audio = audio - 25 # Reduce volume by 10dB (approximately 50%)

    # Export to a temporary file and play
    audio.export("temp_output.mp3", format="mp3")
    display(Audio("temp_output.mp3", autoplay=True))

# Get the current IPython instance
ip = get_ipython()

# Register the post-execution hook
ip.events.register('post_run_cell', play_audio_after_execution)

File downloaded and saved as error.mp3
File downloaded and saved as success.mp3


## Magic Function for Updating Conda Libraries

In [None]:
# import subprocess
# import time
# from tqdm import tqdm

# print("Updating all packages in the current Conda environment...")

# # Start the conda update process
# process = subprocess.Popen(["conda", "update", "--all", "-y"], 
#                            stdout=subprocess.PIPE, 
#                            stderr=subprocess.STDOUT,
#                            universal_newlines=True)

# # Create a progress bar
# with tqdm(total=100, desc="Updating", bar_format="{l_bar}{bar}", ncols=50) as pbar:
#     # Read the output line by line
#     for line in iter(process.stdout.readline, ''):
#         if line:
#             # Update progress bar
#             pbar.update(1)
#             time.sleep(0.1)  # Add a small delay to make the progress visible
        
#         # Check if process has finished
#         if process.poll() is not None:
#             break

# # Ensure the progress bar reaches 100%
# pbar.update(100 - pbar.n)

# # Check the return code
# if process.returncode != 0:
#     print(f"Error occurred. Return code: {process.returncode}")
# else:
#     print("Update complete!")

# Main Workflow

## Install Libraries from SCRATCH

In [13]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes

In [14]:
%%capture

# Install necessary libraries without dependencies
!pip install -U mlflow datasets transformers evaluate autoawq seaborn vllm xformers triton --no-cache-dir

# Check CUDA version and NVIDIA driver status
!nvcc --version
!nvidia-smi

# Install PyTorch and related packages
!pip install -U torch torchvision torchaudio --no-cache-dir

# Set environment variables for optimized performance
!export OMP_NUM_THREADS=1
!export MKL_NUM_THREADS=1

# Verify CUDA availability in PyTorch
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

fatal: not a git repository (or any of the parent directories): .git
fatal: not a git repository (or any of the parent directories): .git


## Install libraries from CACHE

In [None]:
# %%capture

# # Install Unsloth, Xformers, and other dependencies
# !pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# 
# # Install necessary libraries without dependencies
# !pip install --no-deps mlflow datasets transformers peft accelerate trl evaluate autoawq seaborn triton

# # Check CUDA version and NVIDIA driver status
# !nvcc --version
# !nvidia-smi

# # Install PyTorch and related packages
# !pip install torch torchvision torchaudio

# !pip install triton

# # Set environment variables for optimized performance
# !export OMP_NUM_THREADS=1
# !export MKL_NUM_THREADS=1

# # Verify CUDA availability in PyTorch
# import torch

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# print(device)

In [None]:
import os

os.environ['hf_token'] = "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"

from dotenv import load_dotenv

load_dotenv()  # take environment variables from .env.

hf_token = os.getenv('hf_token')

hf_token

### Import Libraries

In [None]:
# Import required libraries
import unsloth
import bitsandbytes
import mlflow
from unsloth import FastLanguageModel
import torch
from datasets import load_dataset
from transformers import TrainingArguments, AutoTokenizer
from unsloth import is_bfloat16_supported
from trl import SFTTrainer
import evaluate
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import os
import shutil

In [None]:
# !export HUGGINGFACE_HUB_TOKEN= hf_token
# !export TOKENIZERS_PARALLELISM=false

### Set up MLFlow

In [None]:
# Clear cache directory
cache_dir = os.path.join(os.path.expanduser("~"), ".cache")
if os.path.exists(cache_dir):
    shutil.rmtree(cache_dir)
os.makedirs(cache_dir)

# Check if CUDA is available and set device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize MLflow
mlflow.set_tracking_uri("file:///mnt/data/mlruns")
mlflow.set_experiment("unsloth-train")

## Vanilla Unsloth Code

### Import unsloth model

In [None]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/mistral-7b-v0.3-bnb-4bit",      # New Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",           # Llama-3 15 trillion tokens model 2x faster!
    "unsloth/llama-3-8b-Instruct-bnb-4bit",
    "unsloth/llama-3-70b-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct",        # Phi-3 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",             # Gemma 2.2x faster!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

### Configure with PEFT

In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

### Frame the prompt

In [None]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add aEOS_TOKEN
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

from datasets import load_dataset
dataset = load_dataset("yahma/alpaca-cleaned", split = "train")
dataset = dataset.map(formatting_prompts_func, batched = True,)

### Instantiate Supervised FineTuning Trainer and train

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        num_train_epochs = 5,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 5,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

In [None]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
trainer_stats = trainer.train()

In [None]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

### Inference form the model -Direct Output

In [None]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Continue the fibonnaci sequence.", # instruction
        "1, 1, 2, 3, 5, 8", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

### Inference from the model - Text Streeaming

In [None]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Continue the fibonnaci sequence till infinity.", # instruction
        "1, 1, 2, 3, 5, 8", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

In [None]:
print('Done')

In [None]:
%%ap2

print()

## FineTune on SST5

### Review the dataset and prompt for alpaca

#### Importing the model from scratch

In [15]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/mistral-7b-v0.3-bnb-4bit",      # New Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",           # Llama-3 15 trillion tokens model 2x faster!
    "unsloth/llama-3-8b-Instruct-bnb-4bit",
    "unsloth/llama-3-70b-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct",        # Phi-3 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",             # Gemma 2.2x faster!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2024-06-24 01:23:46.249933: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-24 01:23:46.250038: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-24 01:23:46.395296: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Llama patching release 2024.6
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/464 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Unsloth 2024.6 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [16]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add aEOS_TOKEN
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

from datasets import load_dataset
dataset = load_dataset("yahma/alpaca-cleaned", split = "train")
dataset = dataset.map(formatting_prompts_func, batched = True,)

Downloading readme:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/44.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51760 [00:00<?, ? examples/s]

Map:   0%|          | 0/51760 [00:00<?, ? examples/s]

In [17]:
from datasets import load_dataset

# Load the Stanford Sentiment Treebank (SST) dataset
sst_dataset = load_dataset("SetFit/sst5", trust_remote_code=True)

def group_sentiments(example):
    if example["label"] == 0 or example["label"] == 1:
        return "negative"
    elif example["label"] == 2:
        return "neutral"
    else:
        return "positive"

# Map the sentiments to the three categories
sst_dataset = sst_dataset.map(lambda x: {"sentiment": group_sentiments(x)})

sst_dataset

Downloading readme:   0%|          | 0.00/421 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading data:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/171k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/343k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8544 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1101 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2210 [00:00<?, ? examples/s]

Map:   0%|          | 0/8544 [00:00<?, ? examples/s]

Map:   0%|          | 0/1101 [00:00<?, ? examples/s]

Map:   0%|          | 0/2210 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text', 'sentiment'],
        num_rows: 8544
    })
    validation: Dataset({
        features: ['text', 'label', 'label_text', 'sentiment'],
        num_rows: 1101
    })
    test: Dataset({
        features: ['text', 'label', 'label_text', 'sentiment'],
        num_rows: 2210
    })
})

In [18]:
alpaca_prompt = """
Below is an instruction that describes a task, paired with an input that provides further context. 
Write a response that appropriately completes the task.

### Instruction:
{}

### Input:
{}

### Response:
{}
"""

In [19]:
import pandas as pd
from datasets import Dataset, DatasetDict

def process_dataset(dataset, instruction_text):
    def process_split(split):
        df = pd.DataFrame(sst_dataset[split])
        df['instruction'] = instruction_text
        df['output'] = df['sentiment']
        df['input'] = df['text']
        df.drop(labels=['text', 'label', 'label_text', 'sentiment'], axis=1, inplace=True)
        return df

    processed_dict = DatasetDict({
        split: Dataset.from_pandas(process_split(split))
        for split in ['train', 'validation', 'test']
    })

    return processed_dict

# Usage
instruction_text = """Read and analyze the sentiment of the provided input.
First, consider if the text is straight forward or sarcastic or containing double meaning or is expressed inversely.
Then give a response labelling sentiment as either positive or neutral or negative or unclear.
"""
processed_dataset = process_dataset(sst_dataset, instruction_text)
processed_dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'output', 'input'],
        num_rows: 8544
    })
    validation: Dataset({
        features: ['instruction', 'output', 'input'],
        num_rows: 1101
    })
    test: Dataset({
        features: ['instruction', 'output', 'input'],
        num_rows: 2210
    })
})

In [20]:
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

from datasets import load_dataset
# dataset = load_dataset("yahma/alpaca-cleaned", split = "train")
processed_dataset_alpaca = processed_dataset.map(formatting_prompts_func, batched = True,)

Map:   0%|          | 0/8544 [00:00<?, ? examples/s]

Map:   0%|          | 0/1101 [00:00<?, ? examples/s]

Map:   0%|          | 0/2210 [00:00<?, ? examples/s]

In [21]:
processed_dataset_alpaca

DatasetDict({
    train: Dataset({
        features: ['instruction', 'output', 'input', 'text'],
        num_rows: 8544
    })
    validation: Dataset({
        features: ['instruction', 'output', 'input', 'text'],
        num_rows: 1101
    })
    test: Dataset({
        features: ['instruction', 'output', 'input', 'text'],
        num_rows: 2210
    })
})

In [22]:
processed_dataset_alpaca['train']['text'][0]

'\nBelow is an instruction that describes a task, paired with an input that provides further context. \nWrite a response that appropriately completes the task.\n\n### Instruction:\nRead and analyze the sentiment of the provided input.\nFirst, consider if the text is straight forward or sarcastic or containing double meaning or is expressed inversely.\nThen give a response labelling sentiment as either positive or neutral or negative or unclear.\n\n\n### Input:\na stirring , funny and finally transporting re-imagining of beauty and the beast and 1930s horror films\n\n### Response:\npositive\n<|end_of_text|>'

In [23]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
5.594 GB of memory reserved.


In [29]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
import mlflow

# Define training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    max_steps=10,
    learning_rate=2e-4,
    fp16=torch.cuda.is_available() and not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
)

# Initialize trainer
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=processed_dataset_alpaca['train'],
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False, # Can make training 5x faster for short sequences.
    args=training_args,
)

trainer_stats = trainer.train()

# Log parameters and metrics
mlflow.log_params({
    "per_device_train_batch_size": training_args.per_device_train_batch_size,
    "gradient_accumulation_steps": training_args.gradient_accumulation_steps,
    "warmup_steps": training_args.warmup_steps,
    "max_steps": training_args.max_steps,
    "learning_rate": training_args.learning_rate,
    "fp16": training_args.fp16,
    "bf16": training_args.bf16,
    "logging_steps": training_args.logging_steps,
    "weight_decay": training_args.weight_decay,
    "lr_scheduler_type": training_args.lr_scheduler_type,
    "seed": training_args.seed,
    "output_dir": training_args.output_dir,
})

mlflow.log_metric("final_loss", trainer.state.log_history[-1]['train_loss'])

  self.pid = os.fork()


Map (num_proc=2):   0%|          | 0/8544 [00:00<?, ? examples/s]

  self.pid = os.fork()
max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 8,544 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 10
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,0.6379
2,0.4392
3,0.6569
4,0.6611
5,0.6765
6,0.4948
7,0.5113
8,0.5551
9,0.6671
10,0.6287


In [30]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

54.785 seconds used for training.
0.91 minutes used for training.
Peak reserved memory = 6.35 GB.
Peak reserved memory for training = 0.756 GB.
Peak reserved memory % of max memory = 43.057 %.
Peak reserved memory for training % of max memory = 5.126 %.


In [31]:
%%ap2
print()


An error occurred: Command '['git', 'checkout', '-b', '24-Jun-2024-0705-IST']' returned non-zero exit status 128.


fatal: not a git repository (or any of the parent directories): .git
fatal: not a git repository (or any of the parent directories): .git


CalledProcessError: Command '['git', 'checkout', 'main']' returned non-zero exit status 128.

In [None]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Identify the sentiment", # instruction
        "I’d agree with you, but then we’d both be wrong.", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

In [None]:
model.save_pretrained("Meta-Llama-3-8B-SST-FineTune") # Local saving
tokenizer.save_pretrained("Meta-Llama-3-8B-SST-FineTune-Tokenizer")
model.push_to_hub("rafatsiddiqui/Meta-Llama-3-8B-SST-FineTune", token = "...") # Online saving
tokenizer.push_to_hub("rafatsiddiqui/Meta-Llama-3-8B-SST-FineTune-Tokenizer", token = "...") # Online saving

In [None]:
# Merge to 16bit
model.save_pretrained_merged("Meta-Llama-3-8B-SST-FineTune-16bit", tokenizer, save_method = "merged_16bit",)
model.push_to_hub_merged("rafatsiddiqui/Meta-Llama-3-8B-SST-FineTune-16bit", tokenizer, save_method = "merged_16bit", token = hf_token)

# Merge to 4bit
model.push_to_hub_merged("rafatsiddiqui/Meta-Llama-3-8B-SST-FineTune-4bit", tokenizer, save_method = "merged_4bit", token = hf_token)

# Just LoRA adapters
model.push_to_hub_merged("rafatsiddiqui/Meta-Llama-3-8B-SST-FineTune-LoRA", tokenizer, save_method = "lora", token = hf_token)

In [None]:
!touch .gitignore
!echo "Meta-Llama-3-8B-SST-FineTune-16bit" >> .gitignore
!echo "add-pip.yml" >> .gitignore
!echo "advanced-pip" >> .gitignore
!echo "empty_env.yml" >> .gitignore
!echo "env_with_dependencies.yml" >> .gitignore
!echo "foo" >> .gitignore
!echo "invalid_keys.yml" >> .gitignore
!echo "pip_argh.yml" >> .gitignore
!echo "requirements.txt" >> .gitignore
!echo "saved-env" >> .gitignore
!echo "simple.yml" >> .gitignore
!echo "valid_keys.yml" >> .gitignore
!echo "with-pip.yml" >> .gitignore

In [None]:
%%ap2

print()

### Using accelerate to see if any gains in training time

In [None]:
from accelerate import Accelerator
accelerator = Accelerator()
model, train_dataset, eval_dataset = accelerator.prepare(model, train_dataset, test_dataset)

In [None]:
import accelerate
from datasets import load_dataset
from transformers import AutoTokenizer
import torch
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Initialize the accelerator
accelerator = accelerate.Accelerator()

# Function to get predictions from vllm using batch processing
def get_vllm_predictions(model, tokenizer, dataset, batch_size=32):
    model.eval()
    predictions, labels = [], []
    for i in range(0, len(dataset), batch_size):
        batch = dataset[i:i+batch_size]
        inputs = tokenizer(batch["text"], return_tensors="pt", max_length=max_seq_length, truncation=True, padding=True).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=-1).cpu().numpy()
        predictions.extend(preds)
        labels.extend(batch["label"])
    return predictions, labels

# Load tokenizer and model (assuming they are defined earlier in the script)
# tokenizer = AutoTokenizer.from_pretrained("model_name")
# model = SomeModelClass.from_pretrained("model_name")

# Load validation and test sets and group into 3 labels
val_set = load_dataset("SetFit/sst5", split="validation").map(lambda example: {"label": 2 if example["label"] > 3 else (0 if example["label"] < 2 else 1)})
test_set = load_dataset("SetFit/sst5", split="test").map(lambda example: {"label": 2 if example["label"] > 3 else (0 if example["label"] < 2 else 1)})

# Ensure labels are integers
val_set = val_set.map(lambda example: {"label": int(example["label"])})
test_set = test_set.map(lambda example: {"label": int(example["label"])})

# Move model to the correct device
model = accelerator.prepare(model)

# Get predictions
val_predictions, val_labels = get_vllm_predictions(model, tokenizer, val_set)
test_predictions, test_labels = get_vllm_predictions(model, tokenizer, test_set)

# Ensure predictions and labels are numpy arrays of integer type
val_predictions = np.array(val_predictions, dtype=int)
val_labels = np.array(val_labels, dtype=int)
test_predictions = np.array(test_predictions, dtype=int)
test_labels = np.array(test_labels, dtype=int)

# Check for consistency in the data types and lengths
print("Validation Labels Type:", val_labels.dtype, "Length:", len(val_labels))
print("Validation Predictions Type:", val_predictions.dtype, "Length:", len(val_predictions))
print("Test Labels Type:", test_labels.dtype, "Length:", len(test_labels))
print("Test Predictions Type:", test_predictions.dtype, "Length:", len(test_predictions))

# Calculate confusion matrix and classification report
val_conf_matrix = confusion_matrix(val_labels, val_predictions)
test_conf_matrix = confusion_matrix(test_labels, test_predictions)

# Plot confusion matrix for validation set
plt.figure(figsize=(10, 7))
sns.heatmap(val_conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=["Negative", "Neutral", "Positive"], yticklabels=["Negative", "Neutral", "Positive"])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Validation Set Confusion Matrix')
plt.show()

# Plot confusion matrix for test set
plt.figure(figsize=(10, 7))
sns.heatmap(test_conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=["Negative", "Neutral", "Positive"], yticklabels=["Negative", "Neutral", "Positive"])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Test Set Confusion Matrix')
plt.show()

# Print classification reports
print("Validation Set Classification Report:")
print(classification_report(val_labels, val_predictions, target_names=["Negative", "Neutral", "Positive"]))

print("Test Set Classification Report:")
print(classification_report(test_labels, test_predictions, target_names=["Negative", "Neutral", "Positive"]))

In [None]:
test2 =

# End

In [None]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = False # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/mistral-7b-v0.3-bnb-4bit",      # New Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",           # Llama-3 15 trillion tokens model 2x faster!
    "unsloth/llama-3-8b-Instruct-bnb-4bit",
    "unsloth/llama-3-70b-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct",        # Phi-3 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",             # Gemma 2.2x faster!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "meta-llama/Meta-Llama-3-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token = hf_token, # use one if using gated models like meta-llama/Llama-2-7b-hf
)

In [None]:
from datasets import load_dataset

# Load the Stanford Sentiment Treebank (SST) dataset
sst_dataset = load_dataset("sst", trust_remote_code=True)

def group_sentiments(example):
    if example["label"] == 0 or example["label"] == 1:
        return "negative"
    elif example["label"] == 2:
        return "neutral"
    else:
        return "positive"

# Map the sentiments to the three categories
sst_dataset = sst_dataset.map(lambda x: {"sentiment": group_sentiments(x)}, remove_columns=["label"])

alpaca_prompt = """
Below is an instruction that describes a task, paired with an input that provides further context. 
Write a response that appropriately completes the task.

### Instruction:
{}

### Input:
{}

### Response:
{}
"""

instruction_text = """Read and analyze the sentiment of the provided input.
First, consider if the sentiment is straightforward or sarcastic or containing double meaning or is expressed inversely.
Then give a response in a pandas dictionary format with three keys i.e. sentiment, confidence_score, explanation. 
These are the only instructions important and stick to them at all times.
Think step by step and ensure your response is precise, coherent, accurate, and complete.
If you are unsure or do not understand the input, provide a response with sentiment as "Uncertain", confidence score of 0 and an explanation of your uncertainty.
"""

def format_example(examples):
    input_texts = examples["sentence"]
    sentiments = examples["sentiment"]
    input_ids_list = []
    labels_list = []
    
    for input_text, sentiment in zip(input_texts, sentiments):
        response_text = f"{{'sentiment': '{sentiment}', 'confidence_score': 1.0, 'explanation': 'Automated response'}}"
        text = alpaca_prompt.format(instruction_text, input_text, response_text)
        input_ids = tokenizer.encode(text, truncation=True, padding="max_length", max_length=512)
        labels = tokenizer.encode(response_text, truncation=True, padding="max_length", max_length=512)
        input_ids_list.append(input_ids)
        labels_list.append(labels)
    
    return {"input_ids": input_ids_list, "labels": labels_list}

# Apply formatting to the dataset
train_dataset = sst_dataset["train"].map(format_example, batched=True)
validation_dataset = sst_dataset["validation"].map(format_example, batched=True)
test_dataset = sst_dataset["test"].map(format_example, batched=True)

In [None]:
from accelerate import Accelerator
accelerator = Accelerator()
model, train_dataset, eval_dataset = accelerator.prepare(model, train_dataset, test_dataset)

In [None]:

# Define training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    max_steps=2,
    learning_rate=2e-4,
    fp16=torch.cuda.is_available() and not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
)

# Initialize trainer
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=sst5,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False, # Can make training 5x faster for short sequences.
    args=training_args,
)

trainer.train()

# Log parameters and metrics
mlflow.log_params({
    "per_device_train_batch_size": training_args.per_device_train_batch_size,
    "gradient_accumulation_steps": training_args.gradient_accumulation_steps,
    "warmup_steps": training_args.warmup_steps,
    "max_steps": training_args.max_steps,
    "learning_rate": training_args.learning_rate,
    "fp16": training_args.fp16,
    "bf16": training_args.bf16,
    "logging_steps": training_args.logging_steps,
    "weight_decay": training_args.weight_decay,
    "lr_scheduler_type": training_args.lr_scheduler_type,
    "seed": training_args.seed,
    "output_dir": training_args.output_dir,
})

mlflow.log_metric("final_loss", trainer.state.log_history[-1]['train_loss'])

In [None]:
from transformers import TrainingArguments, AutoTokenizer
from trl import SFTTrainer
import mlflow
import torch
from unsloth import FastLanguageModel, is_bfloat16_supported
from accelerate import Accelerator

# Initialize the Accelerator
accelerator = Accelerator()

# Model configuration
max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = False  # Use 4bit quantization to reduce memory usage. Can be False.

# Load model and tokenizer
model_name = "meta-llama/Meta-Llama-3-8B"
hf_token = "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"  # Ensure you have your Hugging Face token

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    token=hf_token,  # use one if using gated models like meta-llama/Llama-2-7b-hf
)

# Define training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    max_steps=2,
    learning_rate=2e-4,
    fp16=torch.cuda.is_available() and not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
)

# Use Accelerator for device placement
model, train_dataset = accelerator.prepare(model, train_dataset)

# Initialize trainer
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,  # Ensure this is correctly defined
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Can make training 5x faster for short sequences.
    args=training_args,
)

# Start MLflow logging
mlflow.start_run()

# Log parameters
mlflow.log_params({
    "per_device_train_batch_size": training_args.per_device_train_batch_size,
    "gradient_accumulation_steps": training_args.gradient_accumulation_steps,
    "warmup_steps": training_args.warmup_steps,
    "max_steps": training_args.max_steps,
    "learning_rate": training_args.learning_rate,
    "fp16": training_args.fp16,
    "bf16": training_args.bf16,
    "logging_steps": training_args.logging_steps,
    "weight_decay": training_args.weight_decay,
    "lr_scheduler_type": training_args.lr_scheduler_type,
    "seed": training_args.seed,
    "output_dir": training_args.output_dir,
})

# Train the model
trainer.train()

# Log final training loss
final_loss = trainer.state.log_history[-1]['loss']  # Ensure correct key
mlflow.log_metric("final_loss", final_loss)

# End MLflow run
mlflow.end_run()

In [None]:
trainer.model.config.save_pretrained("/kaggle/working/")

In [None]:
trainer.model.save_pretrained("/kaggle/working/")

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import os
import shutil

# Hugging Face cache directory
hf_cache_dir = os.path.expanduser("~/.cache/huggingface")

# Clear Hugging Face cache
if os.path.exists(hf_cache_dir):
    shutil.rmtree(hf_cache_dir)
    print(f"Hugging Face cache cleared at {hf_cache_dir}")
else:
    print(f"No Hugging Face cache found at {hf_cache_dir}")

# PyTorch cache directory
torch_cache_dir = os.path.expanduser("~/.cache/torch")

# Clear PyTorch cache
if os.path.exists(torch_cache_dir):
    shutil.rmtree(torch_cache_dir)
    print(f"PyTorch cache cleared at {torch_cache_dir}")
else:
    print(f"No PyTorch cache found at {torch_cache_dir}")

# Additional step to clear CUDA cache if necessary
torch.cuda.empty_cache()
print("CUDA cache cleared")

In [None]:
import os
import torch
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer, AutoConfig

def check_model_files(model_path):
    required_files = ["config.json", "pytorch_model.bin", "tokenizer_config.json"]
    missing_files = [file for file in required_files if not os.path.isfile(os.path.join(model_path, file))]
    if missing_files:
        raise FileNotFoundError(f"The following required files are missing from {model_path}: {', '.join(missing_files)}")
    else:
        print("All required model files are present.")

# Define the model path
model_path = "/kaggle/working"

# Check for required files
check_model_files(model_path)

# Quantization configuration
quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4}

# Load model
try:
    model = AutoAWQForCausalLM.from_pretrained(model_path, device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    print("Model and tokenizer loaded successfully.")
except Exception as e:
    print(f"Error loading the model: {e}")

In [None]:
# import accelerate
# from datasets import load_dataset
# from transformers import AutoTokenizer
# import torch
# import numpy as np
# from sklearn.metrics import confusion_matrix, classification_report
# import matplotlib.pyplot as plt
# import seaborn as sns

# # Initialize the accelerator
# accelerator = accelerate.Accelerator()

# # Function to get predictions from vllm using batch processing
# def get_vllm_predictions(model, tokenizer, dataset, batch_size=32):
#     model.eval()
#     predictions, labels = [], []
#     for i in range(0, len(dataset), batch_size):
#         batch = dataset[i:i+batch_size]
#         inputs = tokenizer(batch["text"], return_tensors="pt", max_length=max_seq_length, truncation=True, padding=True).to(device)
#         with torch.no_grad():
#             outputs = model(**inputs)
#         logits = outputs.logits
#         preds = torch.argmax(logits, dim=-1).cpu().numpy()
#         predictions.extend(preds)
#         labels.extend(batch["label"])
#     return predictions, labels

# # Load tokenizer and model (assuming they are defined earlier in the script)
# # tokenizer = AutoTokenizer.from_pretrained("model_name")
# # model = SomeModelClass.from_pretrained("model_name")

# # Load validation and test sets and group into 3 labels
# val_set = load_dataset("SetFit/sst5", split="validation").map(lambda example: {"label": 2 if example["label"] > 3 else (0 if example["label"] < 2 else 1)})
# test_set = load_dataset("SetFit/sst5", split="test").map(lambda example: {"label": 2 if example["label"] > 3 else (0 if example["label"] < 2 else 1)})

# # Ensure labels are integers
# val_set = val_set.map(lambda example: {"label": int(example["label"])})
# test_set = test_set.map(lambda example: {"label": int(example["label"])})

# # Move model to the correct device
# model = accelerator.prepare(model)

# # Get predictions
# val_predictions, val_labels = get_vllm_predictions(model, tokenizer, val_set)
# test_predictions, test_labels = get_vllm_predictions(model, tokenizer, test_set)

# # Ensure predictions and labels are numpy arrays of integer type
# val_predictions = np.array(val_predictions, dtype=int)
# val_labels = np.array(val_labels, dtype=int)
# test_predictions = np.array(test_predictions, dtype=int)
# test_labels = np.array(test_labels, dtype=int)

# # Check for consistency in the data types and lengths
# print("Validation Labels Type:", val_labels.dtype, "Length:", len(val_labels))
# print("Validation Predictions Type:", val_predictions.dtype, "Length:", len(val_predictions))
# print("Test Labels Type:", test_labels.dtype, "Length:", len(test_labels))
# print("Test Predictions Type:", test_predictions.dtype, "Length:", len(test_predictions))

# # Calculate confusion matrix and classification report
# val_conf_matrix = confusion_matrix(val_labels, val_predictions)
# test_conf_matrix = confusion_matrix(test_labels, test_predictions)

# # Plot confusion matrix for validation set
# plt.figure(figsize=(10, 7))
# sns.heatmap(val_conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=["Negative", "Neutral", "Positive"], yticklabels=["Negative", "Neutral", "Positive"])
# plt.xlabel('Predicted')
# plt.ylabel('Actual')
# plt.title('Validation Set Confusion Matrix')
# plt.show()

# # Plot confusion matrix for test set
# plt.figure(figsize=(10, 7))
# sns.heatmap(test_conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=["Negative", "Neutral", "Positive"], yticklabels=["Negative", "Neutral", "Positive"])
# plt.xlabel('Predicted')
# plt.ylabel('Actual')
# plt.title('Test Set Confusion Matrix')
# plt.show()

# # Print classification reports
# print("Validation Set Classification Report:")
# print(classification_report(val_labels, val_predictions, target_names=["Negative", "Neutral", "Positive"]))

# print("Test Set Classification Report:")
# print(classification_report(test_labels, test_predictions, target_names=["Negative", "Neutral", "Positive"]))

In [None]:
import os

os.environ['hf_token'] = "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"

from dotenv import load_dotenv

load_dotenv()  # take environment variables from .env.

hf_token = os.getenv('hf_token')

hf_token

In [None]:
!ls -1S

In [None]:
# import os
# from awq import AutoAWQForCausalLM
# from transformers import AutoTokenizer

# # Load the token from the environment variable
# hf_token = os.getenv(hf_token)

# # Define the model name or path
# model_path = ""  # Replace with your model name

# quant_name = 'quantized_llama3-8b-awq'

# quant_path = "/kaggle/working/" + quant_name

# quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4}
                

# # Load model
# model = AutoAWQForCausalLM.from_pretrained(model_path, device_map = torch.device("cuda"))
# tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code = True)

# # Quantize
# model.quantize(tokenizer, quant_config = quant_config)

# # Save quantized model
# model.save_quantized(quant_name, safetensors = True)
# tokenizer.save_pretrained(quant_name)

# import os
# from huggingface_hub import HfApi, create_repo, upload_folder

# # Set environment variable for Hugging Face token
# hf_token = hf_token

# # Define the paths to your model and tokenizer
# model_dir = "/kaggle/working/quantized_llama3-8b-awq"
# repo_name = "rafatsiddiqui/Meta-Llama-3-8B-AWQ v2"

# ## Create a repository on Hugging Face Hub
# create_repo(repo_id=repo_name, token=hf_token, exist_ok=True)

# # Upload the model directory to the repository
# upload_folder(
#     folder_path=model_dir,
#     path_in_repo=".",
#     repo_id=repo_name,
#     token=hf_token
# )

# model.to("cpu")
# tokenizer.save_pretrained("/kaggle/working//Meta-Llama-3-8B-Instruct-AWQ v2")
# model.save_quantized("/kaggle/working//Meta-Llama-3-8B-Instruct-AWQ v2")

In [None]:
# from awq import AutoAWQForCausalLM
# from transformers import AutoTokenizer

# quant_path = "outputs/saved_model"
# model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=True)
# tokenizer = AutoTokenizer.from_pretrained(quant_path, trust_remote_code=True)

# # Test the quantized model
# test_input = tokenizer("Test input for the model", return_tensors="pt").to('cuda')
# output = model.generate(test_input.input_ids)
# print(tokenizer.decode(output[0], skip_special_tokens=True))

# Check GPU Usage

In [None]:
!nvidia-smi

## Clear cache

In [None]:
del trainer
del sst5
torch.cuda.empty_cache()

!nvidia-smi

# Restart and delete gpu memory for using vllm for inference

In [None]:
# from transformers import AutoTokenizer, AutoModelForCausalLM
# from peft import PeftModelForCausalLM
# import os

# # Ensure the token is available (replace 'your_token' with your actual Hugging Face token)
# token = os.getenv('HF_TOKEN', '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')

# # Load tokenizer with authentication
# tokenizer = AutoTokenizer.from_pretrained("outputs", use_auth_token=token)

# # Base model ID (should match the model you originally trained)
# base_model_id = "meta-llama/Meta-Llama-3-8B"

# # Load the base model first with authentication
# base_model = AutoModelForCausalLM.from_pretrained(base_model_id, use_auth_token=token)

# # Load the LoRA weights and apply to the base model
# model = PeftModelForCausalLM.from_pretrained(base_model, "outputs")

# # Print confirmation
# print("Model and tokenizer loaded successfully.")

In [None]:
%%autopush

print('done')