In [2]:
# Install required libraries
!pip install -q transformers peft bitsandbytes accelerate datasets trl
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from datasets import load_dataset
from trl import SFTTrainer

# Load the Phi-2 model and tokenizer
model_name = "microsoft/phi-2"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

# Configure quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False
)

# Load the quantized model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

# Prepare the model for k-bit training
model = prepare_model_for_kbit_training(model)

# Configure LoRA
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Get the PEFT model
model = get_peft_model(model, peft_config)

# Print trainable parameters
model.print_trainable_parameters()

# Load a smaller subset of the OASST1 dataset
dataset = load_dataset("OpenAssistant/oasst1", split="train[:25000]")  # Reduced to 25000 examples

# Function to format the data
def format_data(example):
    instruction = example['text']
    response = example.get('response', '')
    prompt = f"Instruction: {instruction}\nResponse:"
    example['text'] = f"{prompt} {response}"
    return example

# Apply the formatting function to the dataset
formatted_dataset = dataset.map(format_data)

# Define training arguments
training_arguments = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,  # Reduced from 3 to 2
    per_device_train_batch_size=8,  # Increased from 4 to 8
    gradient_accumulation_steps=2,  # Reduced from 4 to 2
    optim="paged_adamw_32bit",
    save_steps=1000,
    logging_steps=200,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
)

# Define SFT trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=formatted_dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=384,  # Reduced from 512 to 384
    tokenizer=tokenizer,
    args=training_arguments,
)

# Train the model
trainer.train()
print("Training complete!")

# Save the model
trainer.model.save_pretrained("./phi2_finetuned_6hour")

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 9,175,040 || all params: 2,788,858,880 || trainable%: 0.3290


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
200,2.1274
400,2.0118
600,1.9977
800,1.9912
1000,1.9324
1200,1.935
1400,1.9366
1600,1.9186
1800,1.8395
2000,1.8642


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Training complete!


In [13]:
# Save the model
output_dir = "./phi-2-fine-tuned_kaggle"
trainer.model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model saved to {output_dir}")

Model saved to ./phi-2-fine-tuned_kaggle


In [14]:
pwd

'/kaggle/working'

In [16]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel, PeftConfig

# Path to your saved fine-tuned model
MODEL_PATH = "./phi-2-fine-tuned_kaggle"

# Load the saved tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Configure quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False
)

# Load the base model
base_model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2",
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

# Load the PEFT configuration
peft_config = PeftConfig.from_pretrained(MODEL_PATH)

# Load the PEFT model
model = PeftModel.from_pretrained(base_model, MODEL_PATH)

# Set the model to evaluation mode
model.eval()

def generate_response(instruction, max_length=512):
    # Format the input
    prompt = f"Instruction: {instruction}\nResponse:"
    
    # Tokenize the input
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    
    # Generate the response
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            num_return_sequences=1,
            temperature=0.7,
            top_p=0.9,
            do_sample=True
        )
    
    # Decode and return the response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response.split("Response:")[1].strip()

# Example usage
if __name__ == "__main__":
    while True:
        user_input = input("Enter your instruction (or 'quit' to exit): ")
        if user_input.lower() == 'quit':
            break
        response = generate_response(user_input)
        print("Model Response:", response)
        print()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Enter your instruction (or 'quit' to exit):  Explain the concept of machine learning.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model Response: The concept of machine learning is a branch of artificial intelligence that involves the development of algorithms and statistical models that allow computers to learn from data, without being explicitly programmed.

Machine learning algorithms can automatically improve their performance on a specific task by analyzing and interpreting data. They can identify patterns and relationships within the data, and use this information to make predictions or decisions.

Machine learning is used in a wide range of applications, including natural language processing, image and speech recognition, fraud detection, and recommendation systems.

The process of machine learning typically involves the following steps:

1. Data collection: The first step is to collect a large amount of data related to the problem to be solved.

2. Data preprocessing: The data is then cleaned, normalized, and transformed into a format that can be used by the machine learning algorithm.

3. Model training:

KeyboardInterrupt: Interrupted by user

In [17]:
# First, make sure you have the latest huggingface_hub library installed
!pip install --upgrade huggingface_hub

# Import necessary libraries
from huggingface_hub import HfApi
from getpass import getpass
import os

# Set your Hugging Face credentials
# It's better to input your token this way instead of hardcoding it
hf_token = getpass("Enter your Hugging Face token: ")

# Set the path to your saved model
local_model_path = "./phi-2-fine-tuned_kaggle"  # Replace with your actual path if different

# Your specific repository ID
repo_id = "sagar007/phi2_25k"

# Initialize the Hugging Face API
api = HfApi()

# Create the repository if it doesn't exist
api.create_repo(repo_id=repo_id, token=hf_token, exist_ok=True)

# Upload the model files
for root, _, files in os.walk(local_model_path):
    for file in files:
        file_path = os.path.join(root, file)
        api.upload_file(
            path_or_fileobj=file_path,
            path_in_repo=file_path.replace(local_model_path, "").lstrip("/"),
            repo_id=repo_id,
            token=hf_token
        )

print(f"Model successfully pushed to {repo_id}")

# You can now use this model in your Gradio app or elsewhere by referencing:
# MODEL_PATH = "sagar007/phi2_finetune"

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




Enter your Hugging Face token:  ·····································


adapter_model.safetensors:   0%|          | 0.00/36.7M [00:00<?, ?B/s]

Model successfully pushed to sagar007/phi2_25k
