In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
! pip install transformers datasets torch bitsandbytes peft accelerate evaluate wandb

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (f

In [3]:
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    TrainerCallback
)
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
    TaskType
)
import bitsandbytes as bnb
from typing import Dict, Any
import evaluate
import numpy as np
from datetime import datetime
import wandb  # for logging metrics (optional)
import pandas as pd
from torch.utils.data import DataLoader
from accelerate import Accelerator
import math
import gc

# Clear GPU memory
torch.cuda.empty_cache()
gc.collect()

334

In [4]:
# Initialize wandb (optional)
wandb.init(project="phi2-finetuning-refined")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmonimoyd[0m ([33mmonimoyd-others[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
# Load the Phi-2 model and tokenizer
model_name = "microsoft/phi-2"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token


# Clear GPU memory
torch.cuda.empty_cache()
gc.collect()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

89

In [6]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [7]:
# Load model in 8-bit precision
# Load model with optimizations
model = AutoModelForCausalLM.from_pretrained(model_name, load_in_4bit=True, device_map="auto")

# Apply LoRA for memory-efficient fine-tuning
lora_config = LoraConfig(
    r=8, lora_alpha=16, target_modules=["q_proj", "v_proj"], lora_dropout=0.1, bias="none"
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
# Clear memory again
torch.cuda.empty_cache()
gc.collect()

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

trainable params: 2,621,440 || all params: 2,782,305,280 || trainable%: 0.0942


49

In [8]:
def prepare_dataset(max_samples=None):
    """
    Prepare dataset with option to limit samples for faster training
    """
    dataset = load_dataset("OpenAssistant/oasst1")
    df = pd.DataFrame(dataset['train'])

    # Optional: Limit the dataset size for faster training
    if max_samples:
        df = df.head(max_samples)

    messages = {}
    qa_pairs = []

    # Optimize message storage
    for _, row in df.iterrows():
        messages[row['message_id']] = {
            'text': row['text'],
            'parent_id': row['parent_id'],
            'role': row['role']
        }

    # Create QA pairs more efficiently
    for message_id, message in messages.items():
        if message['role'] == 'assistant' and message['parent_id'] in messages:
            parent = messages[message['parent_id']]
            if parent['role'] == 'prompter':
                qa_pairs.append({
                    'text': f"Human: {parent['text'].strip()}\nAssistant: {message['text'].strip()}\n"
                })

    from datasets import Dataset
    new_dataset = Dataset.from_pandas(pd.DataFrame(qa_pairs))

    # Split with smaller validation set
    train_val_split = new_dataset.train_test_split(test_size=0.05, seed=42)

    def tokenize_and_prepare(examples):
        tokenized = tokenizer(
            examples['text'],
            truncation=True,
            max_length=256,
            padding="max_length",
            return_tensors="pt"
        )
        tokenized['labels'] = tokenized['input_ids'].clone()
        return tokenized

    # Process with larger batch size
    processed_train = train_val_split['train'].map(
        tokenize_and_prepare,
        batched=True,
        remove_columns=train_val_split['train'].column_names
    )

    processed_val = train_val_split['test'].map(
        tokenize_and_prepare,
        batched=True,
        remove_columns=train_val_split['test'].column_names
    )

    return processed_train, processed_val


# Prepare the datasets
#Limit dataset size for faster training
max_samples = 50000  # Adjust this number based on your needs
processed_train, processed_val = prepare_dataset(max_samples=max_samples)

# Verify dataset sizes
print(f"\nFinal dataset sizes:")
print(f"Training samples: {len(processed_train)}")
print(f"Validation samples: {len(processed_val)}")


README.md:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

(…)-00000-of-00001-b42a775f407cee45.parquet:   0%|          | 0.00/39.5M [00:00<?, ?B/s]

(…)-00000-of-00001-134b8fd0c89408b6.parquet:   0%|          | 0.00/2.08M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/84437 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4401 [00:00<?, ? examples/s]

Map:   0%|          | 0/29987 [00:00<?, ? examples/s]

Map:   0%|          | 0/1579 [00:00<?, ? examples/s]


Final dataset sizes:
Training samples: 29987
Validation samples: 1579


In [10]:
# Initialize metrics
perplexity = evaluate.load("perplexity")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Calculate perplexity
    try:
        perp = perplexity.compute(
            predictions=predictions,
            references=labels,
            model_id=model_name
        )
    except:
        perp = {'mean_perplexity': float('inf')}

    # Calculate loss
    loss = torch.nn.functional.cross_entropy(
        torch.tensor(logits.reshape(-1, logits.shape[-1])),
        torch.tensor(labels.reshape(-1)),
        ignore_index=-100
    ).item()

    return {
        'perplexity': perp['mean_perplexity'],
        'loss': loss
    }


In [11]:
# Custom callback for logging
class LoggingCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        _ = logs.pop("total_flos", None)
        if state.is_local_process_zero:
            print(f"\nStep {state.global_step}:")
            for k, v in logs.items():
                print(f"{k}: {v:.4f}")


class MyCallback(TrainerCallback):
    def on_train_batch_end(self, args, state, optimizer, model, batch, outputs, **kwargs):
        torch.cuda.empty_cache()
        gc.collect()


In [16]:
# Data collator
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Clear GPU memory
torch.cuda.empty_cache()
gc.collect()
torch.cuda.reset_peak_memory_stats()  # Resets memory tracking

print(torch.cuda.memory_allocated() / 1e9, "GB allocated")
print(torch.cuda.memory_reserved() / 1e9, "GB reserved")

# Training Arguments
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/phi2-openassistant-lora",
    #per_device_train_batch_size=2,
    #per_device_eval_batch_size=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    optim="paged_adamw_8bit",
    evaluation_strategy="steps",
    eval_steps=10000,
    save_strategy="steps",
    save_steps=1000,
    logging_dir="./logs",
    logging_steps=500,
    learning_rate=2e-4,
    warmup_steps=100,
    max_steps=5000,
    fp16=True,
    push_to_hub=False,
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_train,
    eval_dataset=processed_val,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    callbacks=[LoggingCallback(), MyCallback()]
)



# Debug: Print sample batch
print("\nChecking sample batch format:")
sample_batch = data_collator([processed_train[i] for i in range(2)])
for k, v in sample_batch.items():
    print(f"{k}: {v.shape}")

# Optional: Calculate estimated training time
total_steps = math.ceil(len(processed_train) / (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps))
print(f"Total training steps: {total_steps}")

# Train the model
print("\nStarting training...")
trainer.train()


# Save the trained model
model.save_pretrained("/content/drive/MyDrive/phi2-openassistant-lora-final")

1.986776064 GB allocated
2.174746624 GB reserved

Checking sample batch format:
input_ids: torch.Size([2, 256])
attention_mask: torch.Size([2, 256])
labels: torch.Size([2, 256])
Total training steps: 1875

Starting training...




Step,Training Loss,Validation Loss



Step 500:
loss: 1.7706
grad_norm: 0.3980
learning_rate: 0.0002
epoch: 0.2668

Step 1000:
loss: 1.7892
grad_norm: 0.3363
learning_rate: 0.0002
epoch: 0.5335

Step 1500:
loss: 1.7665
grad_norm: 0.3826
learning_rate: 0.0001
epoch: 0.8003

Step 2000:
loss: 1.7686
grad_norm: 0.4142
learning_rate: 0.0001
epoch: 1.0667

Step 2500:
loss: 1.7430
grad_norm: 0.3755
learning_rate: 0.0001
epoch: 1.3335

Step 3000:
loss: 1.7579
grad_norm: 0.4045
learning_rate: 0.0001
epoch: 1.6002

Step 3500:
loss: 1.7327
grad_norm: 0.4195
learning_rate: 0.0001
epoch: 1.8670

Step 4000:
loss: 1.7414
grad_norm: 0.3816
learning_rate: 0.0000
epoch: 2.1334

Step 4500:
loss: 1.7383
grad_norm: 0.4859
learning_rate: 0.0000
epoch: 2.4002

Step 5000:
loss: 1.7155
grad_norm: 0.4838
learning_rate: 0.0000
epoch: 2.6669

Step 5000:
train_runtime: 4466.2598
train_samples_per_second: 17.9120
train_steps_per_second: 1.1200
train_loss: 1.7524
epoch: 2.6669


In [17]:
# Clear GPU memory
torch.cuda.empty_cache()
gc.collect()

0

In [21]:
# Inference code
def generate_response(prompt, model, tokenizer, max_length=200):
    inputs = tokenizer(f"Human: {prompt}\nAssistant:", return_tensors="pt").to(model.device)

    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        temperature=0.7,
        top_p=0.9,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response.split("Assistant:")[-1].strip()



In [19]:
# Load the fine-tuned model for inference
inference_model = AutoModelForCausalLM.from_pretrained(
    "/content/drive/MyDrive/phi2-openassistant-lora-final",
    device_map="auto",
    trust_remote_code=True
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [25]:
# Test the model
test_prompt = "What is machine learning?"
response = generate_response(test_prompt, inference_model, tokenizer)
print(f"Human: {test_prompt}")
print(f"Assistant: {response}")



Human: What is machine learning?
Assistant: Machine learning is a branch of artificial intelligence that focuses on the development of algorithms and statistical models that enable computers to learn from and make predictions or decisions based on data, without being explicitly programmed.

In machine learning, computers are trained on large datasets to identify patterns and relationships between different variables. This training process involves feeding the computer with input data and corresponding output labels, and then adjusting the computer's internal parameters to minimize the difference between the predicted output and the actual output.

Once the computer has been trained, it can use the learned patterns and relationships to make predictions or decisions on new, unseen data. This is known as inference, and it is the core of machine learning.

There are several different types of machine learning algorithms, including supervised learning, unsupervised learning, and reinforceme

In [26]:
# Interactive testing
while True:
    test_prompt = input("\nEnter your prompt (or 'quit' to exit): ")
    if test_prompt.lower() == 'quit':
        break

    response = generate_response(test_prompt, inference_model, tokenizer)
    print(f"\nAssistant: {response}")


Enter your prompt (or 'quit' to exit): Summarize the theory of relativity

Assistant: The theory of relativity, developed by Albert Einstein in the early 20th century, is a fundamental theory in physics that describes the relationship between space, time, and gravity. It consists of two parts: the special theory of relativity and the general theory of relativity.

The special theory of relativity, published in 1905, deals with the behavior of objects moving at constant speeds relative to each other. It states that the laws of physics are the same for all observers in uniform motion, and that the speed of light is constant in a vacuum. This theory has been confirmed by numerous experiments and observations, and it has had a profound impact on our understanding of the universe.

The general theory of relativity, published in 1915, extends the special theory of relativity to include gravity. It describes gravity as the curvature of spacetime caused by the presence of mass and energy. Acc