In [1]:
%pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [

In [2]:
%pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.44.1


In [3]:
%pip install peft



In [4]:
import os
import sys
import json
import evaluate
import math
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import BitsAndBytesConfig

from datasets import load_dataset
import transformers

# Assuming MISTRAL has equivalent classes, replace LlamaForCausalLM and LlamaTokenizer
from transformers import AutoTokenizer

from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
)

from kaggle_secrets import UserSecretsClient
import wandb



ModuleNotFoundError: No module named 'kaggle_secrets'

In [None]:
# wandb login
user_secrets = UserSecretsClient()
wandb_key = user_secrets.get_secret('wandb-key')
wandb.login(key=wandb_key)

In [None]:
# Set random seed for reproducibility
RANDOM_SEED = 1234
transformers.set_seed(RANDOM_SEED)

# Training configuration
MICRO_BATCH_SIZE = 4
BATCH_SIZE = 128
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
EPOCHS = 1  # Adjust based on your computational limits and model size
LEARNING_RATE = 2e-5  # Standard fine-tuning learning rate
CUTOFF_LEN = 256  # Adjust if MISTRAL processes data differently
LORA_R = 8  # LoRA parameters, only adjust if using LoRA with MISTRAL
LORA_ALPHA = 16
LORA_DROPOUT = 0.05
VAL_SET_SIZE = 0  # Set to a positive number if validation is needed
TARGET_MODULES = [
    'q_proj',
    'v_prol',
]  # Specific to model internals, check if applicable for MISTRAL
OUTPUT_DIR = '/kaggle/working/mistral_model_tuned'  # Update path for MISTRAL outputs

# DDP (Distributed Data Parallel) settings
device_map = 'auto'
world_size = int(os.environ.get('WORLD_SIZE', 1))
ddp = (world_size != 1)  # Enable DDP if more than one GPU is used
if ddp:
    device_map = {'': int(os.environ.get('LOCAL_RANK') or 0)}
    GRADIENT_ACCUMULATION_STEPS = GRADIENT_ACCUMULATION_STEPS // world_size


In [None]:
from huggingface_hub import login
login(token="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")


In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model  # Assuming these are imported correctly

# Define the model checkpoint
model_checkpoint = "mistralai/Mistral-7B-Instruct-v0.3"

# Optional: Configure quantization to save VRAM
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,  # This assumes quantization config is applicable
    # Uncomment and adjust if the model supports advanced quantization features:
    # bnb_4bit_compute_dtype=torch.bfloat16,
    # bnb_4bit_use_double_quant=True,
)

# Load the model from the checkpoint with quantization settings if supported
model = AutoModelForCausalLM.from_pretrained(
    model_checkpoint,
    quantization_config=quantization_config,  # Apply quantization config if supported
    device_map='auto'  # Automatically map the model to the available GPU(s)
)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# LoRA configuration - adjust parameters as needed
lora_config = LoraConfig(
    r=8,  # Rank of the LoRA layers
    lora_alpha=16,  # Scaling factor for LoRA
    target_modules=['q_proj', 'v_proj'],  # Typically targets Q, K, V projections in transformers
    lora_dropout=0.05,  # Dropout rate for LoRA layers
    bias='none',  # No bias term in the LoRA layers
    task_type='CAUSAL_LM',  # Specify the task type, ensure this aligns with your model type
)

# Apply the LoRA configuration to the model
model = get_peft_model(model, lora_config)

# Example usage of the model to generate text (you can customize further as needed)
from transformers import pipeline
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
prompt = "Discuss the future implications of AI in healthcare."
outputs = generator(prompt, max_length=100, num_return_sequences=1)

# Print the generated text
for output in outputs:
    print(output['generated_text'])


In [None]:
from datasets import load_dataset, DatasetDict

# Load your dataset from the Hugging Face Hub
data = load_dataset('kunchum/capstone_1')

# Shuffle the dataset using a predefined seed for reproducibility
RANDOM_SEED = 1234
data = data.shuffle(seed=RANDOM_SEED)

# Select a sample of 20,000 records for fine-tuning
sample_size = 20000
data_sample = data['train'].select(range(sample_size))

# Create a DatasetDict with the sampled data to maintain a structured format
sampled_data_dict = DatasetDict({
    'train': data_sample  # Optionally, add more splits like 'validation' or 'test' if necessary
})


In [None]:
def generate_prompt(data_point):
    """
    Generate input text based on a prompt, task instruction, context information (if available),
    and a response. The function creates a formatted string that includes the task instruction,
    optional context, and the expected response.

    :param data_point: Dictionary containing 'instruction', optional 'context_cleaned', and 'response_cleaned'
    :return: Formatted string to be used as input text
    """
    # Define the base prompt structure with mandatory instruction and response sections
    prompt = "Below is an instruction that describes a task."
    prompt += "\n\n### Instruction:\n" + data_point["instruction"]

    # Add context information if available
    if data_point.get('context_cleaned'):
        prompt += "\n\n### Input:\n" + data_point["context_cleaned"]

    # Append the response part of the prompt
    prompt += "\n\n### Response:\n" + data_point["response_cleaned"]

    return prompt


In [None]:
def tokenize(prompt, tokenizer, max_length=512, padding_type='max_length'):
    """
    Tokenize the input text using specified tokenizer settings. This function prepares the text for
    processing by NLP models by converting the text into a sequence of IDs, considering maximum length
    and padding.

    :param prompt: str, Input text to be tokenized
    :param tokenizer: Tokenizer object, the tokenizer to use for processing the text
    :param max_length: int, maximum sequence length for tokenization
    :param padding_type: str, type of padding to apply ('max_length' or 'longest')
    :return: dict, containing 'input_ids' and 'attention_mask', both truncated and padded as necessary
    """
    # Ensure tokenizer has a padding token
    if tokenizer.pad_token is None:
        if tokenizer.eos_token is not None:
            tokenizer.pad_token = tokenizer.eos_token
        else:
            tokenizer.add_special_tokens({'pad_token': '[PAD]'})
            # You need to resize model embeddings if new tokens are added
            model.resize_token_embeddings(len(tokenizer))

    try:
        result = tokenizer(prompt, truncation=True, max_length=max_length + 1, padding=padding_type)
        return {
            'input_ids': result['input_ids'][:-1],
            'attention_mask': result['attention_mask'][:-1]
        }
    except Exception as e:
        print(f"An error occurred during tokenization: {e}")
        return {}


In [None]:
# Example usage
prompt = "Example prompt text to be tokenized."
tokenized_output = tokenize(prompt, tokenizer, max_length=512, padding_type='max_length')


In [None]:
def generate_and_tokenize_prompt(data_point, tokenizer, max_length=512):
    """Generate and tokenize a prompt with a masked response for training.

    Args:
        data_point (dict): Contains 'instruction', optional 'context_cleaned', and 'response_cleaned'.
        tokenizer (Tokenizer): Tokenizer to use for tokenization.
        max_length (int): Maximum length of the tokenized input.

    Returns:
        dict: Contains 'input_ids', 'labels' for loss calculation, and 'attention_mask'.
    """
    # Generate the initial part of the prompt
    if data_point['context_cleaned']:
        user_prompt = (
            f"Below is an instruction that describes a task, paired with an input that provides further context. "
            f"Write a response that appropriately completes the request.\n\n"
            f"### Instruction:\n{data_point['instruction']}\n\n"
            f"### Input:\n{data_point['context_cleaned']}\n\n"
            f"### Response:\n"
        )
    else:
        user_prompt = (
            f"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n"
            f"### Instruction:\n{data_point['instruction']}\n\n"
            f"### Response:\n"
        )

    # Tokenize the user prompt to determine the number of tokens
    prompt_tokens = tokenizer(user_prompt, add_special_tokens=False)

    # Tokenize the full prompt including the response
    full_prompt = user_prompt + data_point['response_cleaned']
    full_tokens = tokenizer(full_prompt, max_length=max_length, truncation=True, padding='max_length')

    # Calculate lengths and create masks
    len_user_prompt_tokens = len(prompt_tokens['input_ids'])
    labels = [-100] * len_user_prompt_tokens + full_tokens['input_ids'][len_user_prompt_tokens:]

    return {
        'input_ids': full_tokens['input_ids'],
        'labels': labels,
        'attention_mask': full_tokens['attention_mask']
    }


In [None]:
from datasets import DatasetDict

def prepare_data(sampled_data_dict, val_set_size=0.1, generate_and_tokenize_func=None, random_seed=42):
    """
    Prepare training and validation datasets by applying a tokenization function to each.

    Args:
        sampled_data_dict (DatasetDict): The dataset dictionary containing the training data.
        val_set_size (float): The proportion of the dataset to be used as the validation set.
        generate_and_tokenize_func (callable): The function to apply to each data point for tokenization.
        random_seed (int): Seed for reproducibility of the dataset split.

    Returns:
        tuple: A tuple containing the tokenized training and validation datasets. If no validation
               set is required (val_set_size <= 0), the second element in the tuple will be None.
    """
    if val_set_size > 0:
        # Split the dataset into training and validation sets according to the specified proportion
        train_val_split = sampled_data_dict['train'].train_test_split(
            test_size=val_set_size,
            shuffle=True,  # Ensure the data is shuffled to prevent ordering biases affecting learning
            seed=random_seed  # Use the seed for reproducibility
        )
        # Apply the provided tokenization function to both the training and validation datasets
        train_data = train_val_split['train'].map(generate_and_tokenize_func, batched=True)
        val_data = train_val_split['test'].map(generate_and_tokenize_func, batched=True)
    else:
        # If no validation set is specified, apply the function to the entire training dataset
        train_data = sampled_data_dict['train'].map(generate_and_tokenize_func, batched=True)
        val_data = None  # Set the validation dataset to None

    return train_data, val_data


In [None]:
from transformers import TrainerCallback
import math

class PerplexityCallback(TrainerCallback):
    """
    Custom callback to log and print perplexity at each logging step during training and validation.
    """
    def __init__(self, loss_threshold=100):
        """
        Initializes the callback with an optional loss threshold to handle large loss values gracefully.

        Args:
            loss_threshold (float): Threshold above which loss is considered too large for stable exponentiation.
        """
        self.loss_threshold = loss_threshold

    def on_log(self, args, state, control, logs=None, **kwargs):
        """
        Event called at each logging step.

        Args:
            args: Training arguments.
            state: TrainerState; provides information on training state.
            control: TrainerControl; provides various control flags.
            logs (dict): Dictionary of logs containing at least loss.
        """
        if logs is not None and "loss" in logs:
            # Calculate perplexity from the training loss
            perplexity = math.exp(logs["loss"]) if logs["loss"] < self.loss_threshold else float("inf")
            logs["perplexity"] = perplexity
            # Optionally, you could add validation perplexity calculations here as well
            if "eval_loss" in logs:
                eval_perplexity = math.exp(logs["eval_loss"]) if logs["eval_loss"] < self.loss_threshold else float("inf")
                logs["eval_perplexity"] = eval_perplexity

            # Log training and validation metrics
            print(f"Step {state.global_step} - Training Loss: {logs['loss']:.4f} - Training Perplexity: {perplexity:.4f}")
            if "eval_loss" in logs:
                print(f"Step {state.global_step} - Validation Loss: {logs['eval_loss']:.4f} - Validation Perplexity: {eval_perplexity:.4f}")



In [None]:
from datasets import load_dataset

# Load the dataset from Hugging Face's dataset repository
dataset = load_dataset('kunchum/capstone_1')

# Print information about the dataset to confirm it's loaded correctly
print(dataset)


In [None]:
import torch
import sys
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Load the tokenizer and model for Mistral AI
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")

# Function to modify the model's state dictionary to incorporate PEFT
def apply_peft_to_model(model):
    old_state_dict = model.state_dict  # Store the original state_dict method
    model.state_dict = (
        lambda self, *args, **kwargs: get_peft_model_state_dict(self, old_state_dict(*args, **kwargs))
    ).__get__(model, type(model))  # Update the state_dict to use the PEFT version
    return model

# Optionally compile the model with PyTorch 2.0 if applicable and not on Windows
if torch.__version__ >= '2' and sys.platform != 'win32':
    model = torch.compile(model)

# Apply PEFT modifications
model = apply_peft_to_model(model)

# Load and prepare your dataset
dataset = load_dataset('kunchum/capstone_1')
dataset = dataset.shuffle(seed=42)

# Define a function for data preparation using the correct tokenizer
def tokenize_and_prepare(data_point):
    encoded = tokenizer(data_point['text'], padding="max_length", truncation=True, max_length=512)
    return {'input_ids': encoded['input_ids'], 'attention_mask': encoded['attention_mask'], 'labels': encoded['input_ids']}

# Prepare data
sampled_data_dict = DatasetDict({
    'train': dataset['train'].select(range(20000))
})
train_val_split = sampled_data_dict['train'].train_test_split(test_size=0.1)
train_data = train_val_split['train'].map(tokenize_and_prepare, batched=True)
val_data = train_val_split['test'].map(tokenize_and_prepare, batched=True)

# Training arguments setup
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    load_best_model_at_end=True,
    report_to="wandb"
)

# Setup the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
)




In [None]:
# Start training
trainer.train()

# Close the W&B run
wandb.finish()