In [1]:
%%capture
import torch
major_version, minor_version = torch.cuda.get_device_capability()
# Must install separately since Colab has torch 2.2.1, which breaks packages
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
if major_version >= 8:
    # Use this for new GPUs like Ampere, Hopper GPUs (RTX 30xx, RTX 40xx, A100, H100, L40)
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    # Use this for older GPUs (V100, Tesla T4, RTX 20xx)
    !pip install --no-deps xformers trl peft accelerate bitsandbytes
pass

In [2]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",
    "unsloth/gemma-7b-it-bnb-4bit", # Instruct version of Gemma 7b
    "unsloth/gemma-2b-bnb-4bit",
    "unsloth/gemma-2b-it-bnb-4bit", # Instruct version of Gemma 2b
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/gemma-7b-bnb-4bit", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Gemma patching release 2024.3
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.25. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors:   0%|          | 0.00/5.57G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [5]:
from transformers import TextStreamer
prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

###  German:
{}



### English:
{}
"""


FastLanguageModel.for_inference(model)
inputs = tokenizer(
[
    prompt.format(
        '''''Lieber Thomas!
Jetzt bist du weit weg! Ich bin sehr unglücklich! Wie geht es dir in Hamburg? Hast du nette Kollegen in der Bank? Ist der Chef nett? Hast du schon eine Wohnung? Ist die Wohnung teuer? Ich arbeite schon zwei Wochen im Goethe-Gymnasium in München.

Die Kollegen und Kolleginnen sind sehr freundlich. Die Schülerinnen und Schüler sind auch sehr nett. München ist schön! Das Wetter ist gut. Aber meine Katze "Mimi" ist krank! Das ist schrecklich. Heute Abend gehe ich ins Theater.

Herzliche Grüße''',
        "",
    )
], return_tensors = "pt").to("cuda")
text_streamer = TextStreamer(tokenizer)
generated_ids= model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)
decoded = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

<bos>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

###  German:
''Lieber Thomas!
Jetzt bist du weit weg! Ich bin sehr unglücklich! Wie geht es dir in Hamburg? Hast du nette Kollegen in der Bank? Ist der Chef nett? Hast du schon eine Wohnung? Ist die Wohnung teuer? Ich arbeite schon zwei Wochen im Goethe-Gymnasium in München.

Die Kollegen und Kolleginnen sind sehr freundlich. Die Schülerinnen und Schüler sind auch sehr nett. München ist schön! Das Wetter ist gut. Aber meine Katze "Mimi" ist krank! Das ist schrecklich. Heute Abend gehe ich ins Theater.

Herzliche Grüße



### English:

''Dear Thomas!
Now you are far away! I am very unhappy! How are you in Hamburg? Do you have nice colleagues at the bank? Is the boss nice? Do you already have an apartment? Is the apartment expensive? I've been working at Goethe-Gymnasium in Munich for two weeks now.

The colleagues and colleag

In [23]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048
dtype = None
load_in_4bit = True

merged_model, merged_tokenizer = FastLanguageModel.from_pretrained(
    model_name = "Samvardhan777/gemma-7b-unsloth-german-to-English",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

adapter_config.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Gemma patching release 2024.3
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.25. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


adapter_model.safetensors:   0%|          | 0.00/800M [00:00<?, ?B/s]

Unsloth 2024.3 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [30]:
prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

###  German:
{}



### English:
{}
"""


FastLanguageModel.for_inference(merged_model)
inputs = merged_tokenizer(
[
    prompt.format(
          '''Er ist’s” by Eduard Mörike
  Frühling läßt sein blaues Band
  wieder flattern durch die Lüfte;
  süße, wohlbekannte Düfte
  streifen ahnungsvoll das Land.
  Veilchen träumen schon,
  wollen balde kommen.
  Horch, von fern ein leiser Harfenton!
  Frühling, ja du bist’s!
  Dich hab’ ich vernommen!''',
        "",
    )
], return_tensors = "pt").to("cuda")
text_streamer = TextStreamer(merged_tokenizer)
generated_ids= merged_model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)
decoded = merged_tokenizer.decode(generated_ids[0], skip_special_tokens=True)

<bos>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

###  German:
Er ist’s” by Eduard Mörike
  Frühling läßt sein blaues Band
  wieder flattern durch die Lüfte;
  süße, wohlbekannte Düfte
  streifen ahnungsvoll das Land. 
  Veilchen träumen schon, 
  wollen balde kommen.
  Horch, von fern ein leiser Harfenton! 
  Frühling, ja du bist’s! 
  Dich hab’ ich vernommen!



### English:

Translation ## Translation ## Translation ## Translation ## Translation ## Translation ## Translation ## Translation ## Translation ## Translation ## Translation ## Translation ## Translation ## Translation ## Translation ## Translation ## Translation ## Translation ## Translation ## Translation ## Translation ## Translation ## Translation ## Translation ## Translation ## Translation ## Translation ## Translation ## Translation ## Translation ## Translation ## Translation ## Translation ## Translatio

In [6]:
import bitsandbytes as bnb
def find_all_linear_names(model):
  cls = bnb.nn.Linear4bit #if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
  lora_module_names = set()
  for name, module in model.named_modules():
    if isinstance(module, cls):
      names = name.split('.')
      lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names: # needed for 16-bit
      lora_module_names.remove('lm_head')
  return list(lora_module_names)

In [7]:
modules = find_all_linear_names(model)
print(modules)

['down_proj', 'gate_proj', 'v_proj', 'k_proj', 'o_proj', 'q_proj', 'up_proj']


In [8]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 64, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = modules,
    lora_alpha = 32,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = True,
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.3 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


# Dataset Creation

In [9]:
from huggingface_hub import notebook_login, HfApi
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [10]:
api = HfApi()

In [None]:
from datasets import load_dataset
dataset = load_dataset("kde4", lang1="de", lang2="en")
df = dataset['train'].to_pandas()

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [None]:
df.head(10)

Unnamed: 0,id,translation
0,0,"{'de': 'Lauri Watts', 'en': 'Lauri Watts'}"
1,1,"{'de': '& Lauri.Watts.mail;', 'en': '& Lauri. ..."
2,2,{'de': 'Frank Schütte F.Schuette@t-online.de Ü...
3,3,"{'de': '2006-02-26 3.5.1', 'en': '2006-02-26 3..."
4,4,{'de': 'Mit Hilfe des & konqueror;-Modul Babel...
5,5,"{'de': 'KDE', 'en': 'KDE'}"
6,6,"{'de': 'kdeaddons', 'en': 'kdeaddons'}"
7,7,"{'de': 'konqueror', 'en': 'konqueror'}"
8,8,"{'de': 'Module', 'en': 'plugins'}"
9,9,"{'de': 'Babelfish', 'en': 'babelfish'}"


In [None]:
from datasets import Dataset, load_dataset
formatted_texts = [
    f'german : {example["de"]}\n<## Translation ## >\nenglish : {example["en"]}'
    for example in df['translation']
]

# Create a new DataFrame with the formatted text
kde_formatted_df = pd.DataFrame({'formatted_text': formatted_texts})

# Convert the new DataFrame into a Hugging Face Dataset
kde_formatted_dataset = Dataset.from_pandas(kde_formatted_df)

In [None]:
kde_formatted_df.head(10)

Unnamed: 0,formatted_text
0,german : Lauri Watts\n<## Translation ## >\nen...
1,german : & Lauri.Watts.mail;\n<## Translation ...
2,german : Frank Schütte F.Schuette@t-online.de ...
3,german : 2006-02-26 3.5.1\n<## Translation ## ...
4,german : Mit Hilfe des & konqueror;-Modul Babe...
5,german : KDE\n<## Translation ## >\nenglish : KDE
6,german : kdeaddons\n<## Translation ## >\nengl...
7,german : konqueror\n<## Translation ## >\nengl...
8,german : Module\n<## Translation ## >\nenglish...
9,german : Babelfish\n<## Translation ## >\nengl...


In [None]:
dataset_name = 'kde4-German-to-English'
api.create_repo(repo_id=dataset_name, repo_type='dataset', private=False)

In [None]:
kde_formatted_dataset.push_to_hub(dataset_name)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/225 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Samvardhan777/kde4-German-to-English/commit/d1024a55cc4628be6d6a8bf0a567a8eb05957b2e', commit_message='Upload dataset', commit_description='', oid='d1024a55cc4628be6d6a8bf0a567a8eb05957b2e', pr_url=None, pr_revision=None, pr_num=None)

In [11]:
from datasets import load_dataset
dataset = load_dataset("Samvardhan777/kde4-German-to-English", split = "train")
dataset

Downloading readme:   0%|          | 0.00/289 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/13.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/224035 [00:00<?, ? examples/s]

Dataset({
    features: ['formatted_text'],
    num_rows: 224035
})

In [12]:
df = dataset.to_pandas()

In [13]:
df.head(10)

Unnamed: 0,formatted_text
0,german : Lauri Watts\n<## Translation ## >\nen...
1,german : & Lauri.Watts.mail;\n<## Translation ...
2,german : Frank Schütte F.Schuette@t-online.de ...
3,german : 2006-02-26 3.5.1\n<## Translation ## ...
4,german : Mit Hilfe des & konqueror;-Modul Babe...
5,german : KDE\n<## Translation ## >\nenglish : KDE
6,german : kdeaddons\n<## Translation ## >\nengl...
7,german : konqueror\n<## Translation ## >\nengl...
8,german : Module\n<## Translation ## >\nenglish...
9,german : Babelfish\n<## Translation ## >\nengl...


<a name="Train"></a>
### Train the model
Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We do 60 steps to speed things up, but you can set `num_train_epochs=1` for a full run, and turn off `max_steps=None`. We also support TRL's `DPOTrainer`!

In [16]:
from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "formatted_text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 100,
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

Map (num_proc=2):   0%|          | 0/224035 [00:00<?, ? examples/s]

In [None]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
5.938 GB of memory reserved.


In [17]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 224,035 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 100
 "-____-"     Number of trainable parameters = 200,015,872


Step,Training Loss
1,5.515
2,3.9253
3,4.4057
4,3.0312
5,2.7063
6,2.5134
7,2.6034
8,2.8019
9,2.669
10,2.8327


 You can also use a `TextStreamer` for continuous inference - so you can see the generation token by token, instead of waiting the whole time!

In [18]:
model.save_pretrained("gemma-7b-unsloth-german-to-English_v1")

In [19]:
model.save_pretrained_merged("outputs", tokenizer, save_method = "merged_16bit",)

Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which will take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 5.86 out of 12.67 RAM for saving.


 32%|███▏      | 9/28 [00:00<00:01, 16.95it/s]We will save to Disk and not RAM now.
100%|██████████| 28/28 [00:53<00:00,  1.90s/it]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Unsloth: Saving outputs/pytorch_model-00001-of-00004.bin...
Unsloth: Saving outputs/pytorch_model-00002-of-00004.bin...
Unsloth: Saving outputs/pytorch_model-00003-of-00004.bin...
Unsloth: Saving outputs/pytorch_model-00004-of-00004.bin...
Done.


In [20]:
model.push_to_hub_merged("gemma-7b-unsloth-german-to-English-v1-merged", tokenizer, save_method = "merged_16bit")

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 5.45 out of 12.67 RAM for saving.


100%|██████████| 28/28 [01:07<00:00,  2.40s/it]


Unsloth: Saving tokenizer...

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

 Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Unsloth: Saving gemma-7b-unsloth-german-to-English-v1-merged/pytorch_model-00001-of-00004.bin...
Unsloth: Saving gemma-7b-unsloth-german-to-English-v1-merged/pytorch_model-00002-of-00004.bin...
Unsloth: Saving gemma-7b-unsloth-german-to-English-v1-merged/pytorch_model-00003-of-00004.bin...
Unsloth: Saving gemma-7b-unsloth-german-to-English-v1-merged/pytorch_model-00004-of-00004.bin...


README.md:   0%|          | 0.00/576 [00:00<?, ?B/s]

pytorch_model-00001-of-00004.bin:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

pytorch_model-00004-of-00004.bin:   0%|          | 0.00/2.11G [00:00<?, ?B/s]

pytorch_model-00002-of-00004.bin:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

pytorch_model-00003-of-00004.bin:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Done.
Saved merged model to https://huggingface.co/None/gemma-7b-unsloth-german-to-English-v1-merged


In [21]:
model.push_to_hub("gemma-7b-unsloth-german-to-English-v1", tokenizer, save_method = "lora")

README.md:   0%|          | 0.00/576 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/800M [00:00<?, ?B/s]

Saved model to https://huggingface.co/gemma-7b-unsloth-german-to-English-v1


In [2]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.



merged_model, merged_tokenizer = FastLanguageModel.from_pretrained(
    model_name = "Samvardhan777/gemma-7b-unsloth-german-to-English-v1-merged",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Gemma patching release 2024.3
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.25. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


pytorch_model.bin.index.json:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

pytorch_model-00001-of-00004.bin:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

pytorch_model-00002-of-00004.bin:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

pytorch_model-00003-of-00004.bin:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

pytorch_model-00004-of-00004.bin:   0%|          | 0.00/2.11G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

In [6]:
from transformers import TextStreamer
prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

###  German:
{}



### English:
{}
"""


FastLanguageModel.for_inference(merged_model)
inputs = merged_tokenizer(
[
    prompt.format(
          '''Er ist’s” by Eduard Mörike Frühling läßt sein blaues Band
  wieder flattern durch die Lüfte;
  süße, wohlbekannte Düfte
  streifen ahnungsvoll das Land.
  Veilchen träumen schon,
  wollen balde kommen.
  Horch, von fern ein leiser Harfenton!
  Frühling, ja du bist’s!
  Dich hab’ ich vernommen!''',
        "",
    )
], return_tensors = "pt").to("cuda")
text_streamer = TextStreamer(merged_tokenizer)
generated_ids= merged_model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)
decoded = merged_tokenizer.decode(generated_ids[0], skip_special_tokens=True)

<bos>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

###  German:
Er ist’s” by Eduard Mörike Frühling läßt sein blaues Band
  wieder flattern durch die Lüfte;
  süße, wohlbekannte Düfte
  streifen ahnungsvoll das Land. 
  Veilchen träumen schon, 
  wollen balde kommen.
  Horch, von fern ein leiser Harfenton! 
  Frühling, ja du bist’s! 
  Dich hab’ ich vernommen!



### English:

"It’s Spring" by Eduard Mörike
Spring lets its blue band
  flutter<bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><bos><

Now if you want to load the LoRA adapters we just saved for inference, set `False` to `True`: