### Mount Google Drive

In [1]:
from google.colab import drive

# Attach the user's Google Drive to the Colab filesystem so later cells can
# read and write files under /content/drive.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


### Install Dependencies

In [None]:
!pip install --no-deps git+https://github.com/unslothai/unsloth-zoo.git
!pip install --no-deps git+https://github.com/unslothai/unsloth.git
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes
!pip install transformers datasets

### Setup and Imports

In [None]:
from unsloth import FastLanguageModel, is_bfloat16_supported
import torch
import json
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TextStreamer, TrainingArguments


# --- Shared loader settings (forwarded to FastLanguageModel.from_pretrained) ---
load_in_4bit = True      # request 4-bit weights when loading
dtype = None             # passed through as-is; presumably lets the loader choose - confirm
max_seq_length = 2048    # maximum sequence length handed to the loader

# Checkpoint ids kept for reference; the load cells below hard-code
# "unsloth/llama-3-8b-bnb-4bit" rather than indexing into this list.
fourbit_models = [
    # Mistral
    "unsloth/mistral-7b-v0.3-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    # Llama 3
    "unsloth/llama-3-8b-bnb-4bit",
    "unsloth/llama-3-8b-Instruct-bnb-4bit",
    "unsloth/llama-3-70b-bnb-4bit",
    # Phi-3
    "unsloth/Phi-3-mini-4k-instruct",
    "unsloth/Phi-3-medium-4k-instruct",
    # Older Mistral / Gemma
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",
]

### Load Model and Tokenizer

In [None]:
# Gather the loader arguments in one place, then load the checkpoint and its
# tokenizer using the shared settings from the configuration cell.
_model_load_kwargs = dict(
    model_name="unsloth/llama-3-8b-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**_model_load_kwargs)

### Load Base Model

In [None]:
# Load the same checkpoint again under separate names (model_base /
# tokenizer_base) so it can be queried independently of `model`.
model_base, tokenizer_base = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    device_map = "cuda", # Explicitly set device_map to cuda
)

# Alpaca-style prompt template with three positional `{}` slots, filled via
# str.format in this order: instruction, input, response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Configure the base model for inference before any generate() call.
FastLanguageModel.for_inference(model_base)

### Run Inference with Base Model

In [9]:
# Re-runnable cell: edit the item below and execute again to query the base
# model with a different input.
item_to_describe_base = "The Book of Revelation"

# Fill the Alpaca template; the response slot is left empty so the model
# generates the text that follows "### Response:".
prompt_base = alpaca_prompt.format(
    "Generate a description for the following item",  # instruction
    item_to_describe_base,                            # input
    "",                                               # response
)

# Tokenize as a single-element batch and move the tensors to the GPU.
inputs_base = tokenizer_base([prompt_base], return_tensors="pt").to("cuda")

# Stream decoded tokens to the cell output as they are produced; the return
# value of generate() is discarded.
text_streamer_base = TextStreamer(tokenizer_base)
_ = model_base.generate(
    **inputs_base,
    streamer=text_streamer_base,
    max_new_tokens=128,
)

<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Generate a description for the following item

### Input:
The Book of Revelation

### Response:
The Book of Revelation is a book of the Bible. It is the last book in the New Testament. It is a book of prophecy. It describes the end of the world and the second coming of Jesus Christ.

### Response 2:
The Book of Revelation is a book of the Bible. It is the last book in the New Testament. It is a book of prophecy. It describes the end of the world and the second coming of Jesus Christ.

### Response 3:
The Book of Revelation is a book of the Bible. It is the last book in the New Testament. It is a book of prophecy. It describes the end
