In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from datasets import load_dataset
from trl import SFTTrainer

# Load the Phi-3.5-mini-instruct model and tokenizer
model_name = "microsoft/Phi-3.5-mini-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

# Configure quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False
)

# Load the quantized model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

# Prepare the model for k-bit training
model = prepare_model_for_kbit_training(model)

# Configure LoRA with correct target modules
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "qkv_proj",
        "o_proj",
        "gate_up_proj",
        "down_proj"
    ]
)

# Get the PEFT model
model = get_peft_model(model, peft_config)

# Print trainable parameters
print("\nTrainable parameters:")
model.print_trainable_parameters()

# Load a smaller subset of the OASST1 dataset
dataset = load_dataset("OpenAssistant/oasst1", split="train[:25000]")  # Reduced to 25000 examples

# Function to format the data
def format_data(example):
    instruction = example['text']
    response = example.get('response', '')
    prompt = f"Instruction: {instruction}\nResponse:"
    example['text'] = f"{prompt} {response}"
    return example

# Apply the formatting function to the dataset
formatted_dataset = dataset.map(format_data)

# Define training arguments
training_arguments = TrainingArguments(
    output_dir="./results_phi_3_5_mini",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    save_steps=1000,
    logging_steps=200,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
)

# Define SFT trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=formatted_dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=384,
    tokenizer=tokenizer,
    args=training_arguments,
)

# Train the model
print("\nStarting training...")
trainer.train()
print("Training complete!")

# Save the model
print("\nSaving the model...")
trainer.model.save_pretrained("./phi_3_5_mini_finetuned")
print("Model saved successfully!")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]


Trainable parameters:
trainable params: 12,582,912 || all params: 3,833,662,464 || trainable%: 0.3282


Downloading readme:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/39.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.08M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/84437 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4401 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)



Starting training...


[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
200,1.7943
400,1.6814
600,1.6631
800,1.6752
1000,1.6343
1200,1.6186
1400,1.6147
1600,1.6037
1800,1.4901
2000,1.5054


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Training complete!

Saving the model...
Model saved successfully!


In [8]:
# Save the model
output_dir = "./phi-3.5-fine-tuned_kaggle"
trainer.model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model saved to {output_dir}")

Model saved to ./phi-3.5-fine-tuned_kaggle


In [9]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel, PeftConfig

# Path to your saved fine-tuned model
MODEL_PATH = "./phi-3.5-fine-tuned_kaggle"

# Load the saved tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Configure quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False
)

# Load the base model
base_model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3.5-mini-instruct",
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

# Load the PEFT configuration
peft_config = PeftConfig.from_pretrained(MODEL_PATH)

# Load the PEFT model
model = PeftModel.from_pretrained(base_model, MODEL_PATH)

# Set the model to evaluation mode
model.eval()

def generate_response(instruction, max_length=512):
    # Format the input
    prompt = f"Instruction: {instruction}\nResponse:"
    
    # Tokenize the input
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    
    # Generate the response
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            num_return_sequences=1,
            temperature=0.7,
            top_p=0.9,
            do_sample=True
        )
    
    # Decode and return the response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response.split("Response:")[1].strip()

# Example usage
if __name__ == "__main__":
    while True:
        user_input = input("Enter your instruction (or 'quit' to exit): ")
        if user_input.lower() == 'quit':
            break
        response = generate_response(user_input)
        print("Model Response:", response)
        print()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Enter your instruction (or 'quit' to exit):  What are some effective ways to reduce stress?


The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.


Model Response: -  Engage in regular physical activity
  -  Practice relaxation techniques such as deep breathing, meditation, or yoga
  -  Get enough sleep
  -  Eat a healthy diet
  -  Connect with others and build a support network
  -  Prioritize self-care and take time for yourself
  -  Set realistic goals and prioritize tasks
  -  Learn to manage your time effectively
  -  Seek professional help if needed



Enter your instruction (or 'quit' to exit):  quit


In [10]:
# First, make sure you have the latest huggingface_hub library installed
!pip install --upgrade huggingface_hub

# Import necessary libraries
from huggingface_hub import HfApi
from getpass import getpass
import os

# Set your Hugging Face credentials
# It's better to input your token this way instead of hardcoding it
hf_token = getpass("Enter your Hugging Face token: ")

# Set the path to your saved model
local_model_path = "./phi-3.5-fine-tuned_kaggle"  # Replace with your actual path if different

# Your specific repository ID
repo_id = "sagar007/phi3.5_finetune"

# Initialize the Hugging Face API
api = HfApi()

# Create the repository if it doesn't exist
api.create_repo(repo_id=repo_id, token=hf_token, exist_ok=True)

# Upload the model files
for root, _, files in os.walk(local_model_path):
    for file in files:
        file_path = os.path.join(root, file)
        api.upload_file(
            path_or_fileobj=file_path,
            path_in_repo=file_path.replace(local_model_path, "").lstrip("/"),
            repo_id=repo_id,
            token=hf_token
        )

print(f"Model successfully pushed to {repo_id}")

# You can now use this model in your Gradio app or elsewhere by referencing:
# MODEL_PATH = "sagar007/phi2_finetune"

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




Enter your Hugging Face token:  ·····································


tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/50.4M [00:00<?, ?B/s]

Model successfully pushed to sagar007/phi3.5_finetune
