#### Step 0: Install libraries.

In [1]:
# Install or upgrade the packages used for fine-tuning. No versions are pinned,
# so each run picks up whatever release is current (-U); -q silences most of
# pip's output.
!pip3 install -q -U bitsandbytes
!pip3 install -q -U peft
!pip3 install -q -U trl
!pip3 install -q -U accelerate
!pip3 install -q -U datasets
!pip3 install -q -U transformers
# NOTE(review): tyro is not imported anywhere in this notebook; presumably it is
# needed as a dependency of one of the packages above - confirm.
!pip3 install -U tyro

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.7/320.7 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.9/310.9 kB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m333.2/333.2 kB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m48.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m


#### Import libraries.

In [2]:
import os
import transformers
import torch
from datasets import load_dataset, Dataset, DatasetDict
from trl import SFTTrainer
from peft import LoraConfig, PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig, GemmaTokenizer

In [27]:
# Log in to the Hugging Face account.
# notebook_login() renders an interactive widget (the VBox shown in the output);
# the later whoami()/create_repo()/upload_folder() cells rely on this login.
from huggingface_hub import notebook_login
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

#### Step 1: Download Gemma-2b-it base model

In [12]:
# Hub identifier of the checkpoint that gets fine-tuned.
model_id = "google/gemma-2b-it"

# Quantization settings passed to from_pretrained(): load the weights in 4-bit
# using the "nf4" quantization type, with bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

In [18]:
# Tokenizer and quantized base model both come from the same Hub checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# device_map {"": 0} puts the entire model on device index 0.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map={"": 0},
    quantization_config=bnb_config,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [19]:
# Sanity check of the base model before any fine-tuning.
device = "cuda:0"
text = "Complete the dialogue: I'll make him an offer, he "

# Tokenize to PyTorch tensors and move them to the GPU.
inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = model.generate(max_new_tokens=100, **inputs)

# Special tokens are kept in the decoded text, so <bos>/<eos> show up in the output.
print(
    tokenizer.decode(
        outputs[0],
        skip_special_tokens=False,
    )
)

<bos>Complete the dialogue: I'll make him an offer, he ___________________.

The correct answer is "will accept".

The dialogue could be continued as follows:

I'll make him an offer, he **will accept** it.<eos>


#### Step 2: Configure LoRA settings for modules to be trained

In [45]:
# Keep Weights & Biases logging out of the training run.
# The original cell set "WANDB_DISABLE" to "false": the name is misspelt and the
# value is inverted, so it had no effect. "WANDB_DISABLED" is the variable the
# Transformers integration checks; WANDB_MODE=disabled additionally turns the
# wandb client itself into a no-op.
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "disabled"

In [21]:
# LoRA adapter settings for a causal language model: rank-8 adapters attached to
# every module whose name appears in target_modules.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    target_modules=[
        "q_proj",
        "o_proj",
        "k_proj",
        "v_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

#### Step 3: Load the dataset

In [34]:
# Fetch the dataset by its Hub name. The bare `data1` on the last line displays
# the resulting DatasetDict (a single `train` split with `question`/`answer`
# columns, per the output below).
data1 = load_dataset("Seikaijyu/Beautiful-Chinese")
data1

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 810284
    })
})

In [36]:
# For a fast demo only the first MAX_TRAIN_SAMPLES rows are used.
# To train on everything, set MAX_TRAIN_SAMPLES = None; do not delete this cell,
# because the following cells read `data2`.
MAX_TRAIN_SAMPLES = 100000

train_split = data1['train']
if MAX_TRAIN_SAMPLES is None:
    data2 = train_split
else:
    # Clamp to the split length so a shorter dataset does not make select()
    # fail on out-of-range indices.
    data2 = train_split.select(range(min(MAX_TRAIN_SAMPLES, len(train_split))))

In [40]:
# Tokenize row by row (batched=False): each row's `question` and `answer` are
# passed to the tokenizer as its first and second text arguments.
# NOTE(review): no truncation/max_length is passed here - confirm that long rows
# are acceptable for training.
data3 = data2.map(lambda samples: tokenizer(samples["question"], samples["answer"]), batched=False)

In [47]:
# Passed to SFTTrainer as `formatting_func`. The original author notes that it
# is not expected to do anything because the data has already been tokenized;
# it is still written so that it is correct if the trainer does call it.
def formatting_func(example):
    """Render dataset rows as "Answer: ..." training strings.

    Args:
        example: Mapping with an ``answer`` column (``Answer`` is accepted as a
            fallback for datasets that capitalise the column name). The value
            may be one string (a single row) or a list of strings (a batch).

    Returns:
        A list containing one formatted string per row.

    Raises:
        KeyError: If neither ``answer`` nor ``Answer`` is present.
    """
    column = "answer" if "answer" in example else "Answer"
    answers = example[column]
    if isinstance(answers, str):
        # Single row: wrap it so the return value is always a list. Indexing
        # the string with [0] would only keep its first character.
        answers = [answers]
    return [f"Answer: {answer}" for answer in answers]

#### Step 4: Configure supervised fine-tuning parameters.

In [48]:
# Supervised fine-tuning setup: the LoRA settings in `lora_config` are applied
# to `model`, and training reads the tokenized dataset `data3`.
trainer = SFTTrainer(
    model=model,
    peft_config=lora_config,
    train_dataset=data3,
    formatting_func=formatting_func,
    args=transformers.TrainingArguments(
        output_dir="outputs",
        optim="paged_adamw_8bit",
        learning_rate=2e-4,
        warmup_steps=2,
        max_steps=10000,  # just for fast demo, remove this line when you want to train all data
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        fp16=True,
        logging_steps=10,
    ),
)


max_steps is given, it will override any value given in num_train_epochs


#### Step 5: Start model fine-tuning.

In [None]:
# Run the fine-tuning loop configured above; the training loss is reported
# every `logging_steps` optimizer steps.
trainer.train()

Step,Training Loss
1,7.3391
2,7.8619
3,7.3774
4,6.5163
5,5.9925
6,5.5017
7,4.9522
8,4.7898
9,4.9919
10,4.711


#### Step 6: Inference on the fine-tuned model.

In [None]:
# Ask the fine-tuned model a question, using a "<question>\nAnswer:" prompt.
device = "cuda:0"
text = "What is Hemoglobin?"
prompt = text + "\nAnswer:"

inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generation stops at the tokenizer's EOS token or after 100 new tokens.
outputs = model.generate(
    max_new_tokens=100,
    eos_token_id=tokenizer.eos_token_id,
    **inputs,
)

# Special tokens are stripped from the decoded text.
answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(answer)

What is Hemoglobin?
Answer: Hemoglobin is a protein found in redbeds and vaguely in vertebrates. It is responsible for carrying oxygen from the lungs to the tissues and removing impra impra impractically from the tissues to the lungs.


In [None]:
# Second spot check of the fine-tuned model, with the same prompt layout.
device = "cuda:0"
text = "what is formula for perimeter of square? "
prompt = text + "\nAnswer:"

inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generation stops at the tokenizer's EOS token or after 100 new tokens.
outputs = model.generate(
    max_new_tokens=100,
    eos_token_id=tokenizer.eos_token_id,
    **inputs,
)

# Special tokens are stripped from the decoded text.
answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(answer)

what is formula for perimeter of square? 
Answer: The perimeter of a square is given by the formula:

$$P = 4s$$

where:

* P is the perimeter in meters
* s is the side length in meters


#### Step 7: Save the fine-tuned model.

In [None]:
# Save the trained model held by the trainer into a local "unmerged" folder.
# NOTE(review): since the trainer was built with a peft_config, this presumably
# writes only the LoRA adapter weights - confirm.
fine_tuned_model = "fine_tuned_science_gemma2b-it_unmerged"
trainer.model.save_pretrained(fine_tuned_model)

# Reload the base checkpoint in float16 on device 0 (no quantization config is
# passed here) so the adapter can be merged into it. Nothing is pushed to the
# Hub in this cell.
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage = True,
    return_dict = True,
    torch_dtype = torch.float16,
    device_map = {"": 0}
)

# Attach the saved LoRA adapter to the base Gemma 2b-it model, then merge it
# into the base weights and drop the adapter wrapper.
fine_tuned_merged_model = PeftModel.from_pretrained(base_model, fine_tuned_model)
fine_tuned_merged_model = fine_tuned_merged_model.merge_and_unload()

# Save the merged model (as safetensors) and a freshly loaded tokenizer into the
# same folder; later cells read "fine_tuned_science_gemma2b-it".
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code = True)
fine_tuned_merged_model.save_pretrained("fine_tuned_science_gemma2b-it", safe_serialization = True)
tokenizer.save_pretrained("fine_tuned_science_gemma2b-it")
# NOTE(review): padding_side is set after save_pretrained(), so it only changes
# the in-memory tokenizer object, not the files written above - confirm intent.
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

#### Step 8: Convert model to MediaPipe format for on-device deployment
***Note: You might need to restart the runtime before running the cell below.***

In [None]:
# Install MediaPipe (provides the genai converter imported below) and make sure
# torch is present. Versions are not pinned.
!pip install mediapipe
!pip install torch

Collecting mediapipe
  Downloading mediapipe-0.10.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (35.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.6/35.6 MB[0m [31m39.9 MB/s[0m eta [36m0:00:00[0m
Collecting sounddevice>=0.4.4 (from mediapipe)
  Downloading sounddevice-0.4.6-py3-none-any.whl (31 kB)
Installing collected packages: sounddevice, mediapipe
Successfully installed mediapipe-0.10.11 sounddevice-0.4.6


In [None]:
from mediapipe.tasks.python.genai import converter

In [None]:
# Describe the conversion: read the merged safetensors checkpoint and write a
# GPU-backend MediaPipe model file next to it.
config = converter.ConversionConfig(
    model_type="GEMMA_2B",
    backend='gpu',
    ckpt_format="safetensors",
    input_ckpt='/content/fine_tuned_science_gemma2b-it/',
    vocab_model_file="/content/fine_tuned_science_gemma2b-it/",
    output_dir='/content/intermediate/fine_tuned_science_gemma2b-it/',
    output_tflite_file='/content/fine_tuned_science_gemma2b-it/scigemma.bin',
    combine_file_only=False,
)

# Run the conversion, then report completion.
converter.convert_checkpoint(config)
print("Model converted successfully.")



Model converted successfully.


#### Step 9: Push the fine-tuned model to Hugging Face.

In [None]:
from huggingface_hub import whoami

# Local folder holding the merged model; its name is reused as the repo name.
output_dir = "fine_tuned_science_gemma2b-it"

# whoami() authenticates with the token stored by notebook_login(). The `token`
# argument expects a token string (or bool), not a filesystem path, so the
# cache-directory Path previously passed here is dropped.
username = whoami()["name"]
repo_id = f"{username}/{output_dir}"

In [None]:
from huggingface_hub import create_repo, upload_folder

# Create the repo if needed (exist_ok=True makes this safe to re-run) and keep
# the repo id reported back by the Hub.
repo_id = create_repo(repo_id, exist_ok=True).repo_id

# Upload the whole output folder, skipping step_*/epoch_* entries.
upload_folder(
    folder_path=output_dir,
    repo_id=repo_id,
    ignore_patterns=["step_*", "epoch_*"],
    commit_message="Fine-tuned model pushed.",
)

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

Upload 5 LFS files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

scigemma.bin:   0%|          | 0.00/2.52G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/NSTiwari/fine_tuned_science_gemma2b-it/commit/2abb637a1df5fd1b0cc6b21f840f4e08e668b1d8', commit_message='Fine-tuned model pushed.', commit_description='', oid='2abb637a1df5fd1b0cc6b21f840f4e08e668b1d8', pr_url=None, pr_revision=None, pr_num=None)

#### (Optional) Convert model to MLC format for on-device deployment
***Note: You might need to restart the runtime.***


Install MLC LLM and TVM Unity Compiler

In [None]:
# Install the nightly (pre-release) MLC wheels for CUDA 12.2 from the MLC wheel
# index; --force-reinstall reinstalls them even if already present.
!pip install --pre --force-reinstall mlc-ai-nightly-cu122 mlc-llm-nightly-cu122 -f https://mlc.ai/wheels

Looking in links: https://mlc.ai/wheels
Collecting mlc-ai-nightly-cu122
  Downloading https://github.com/mlc-ai/package/releases/download/v0.9.dev0/mlc_ai_nightly_cu122-0.15.dev223-cp310-cp310-manylinux_2_28_x86_64.whl (1018.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 GB[0m [31m858.7 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting mlc-llm-nightly-cu122
  Downloading https://github.com/mlc-ai/package/releases/download/v0.9.dev0/mlc_llm_nightly_cu122-0.1.dev1058-cp310-cp310-manylinux_2_28_x86_64.whl (145.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m145.6/145.6 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting attrs (from mlc-ai-nightly-cu122)
  Downloading attrs-23.2.0-py3-none-any.whl (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.8/60.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cloudpickle (from mlc-ai-nightly-cu122)
  Downloading cloudpickle-3.0.0-py3-none-any.whl 

In [None]:
# Pin numpy to 1.23.5; per the output below this replaces the numpy 2.0.0rc1
# that was installed at this point. pip reports dependency conflicts for other
# preinstalled packages as a result.
!pip install numpy==1.23.5

Collecting numpy==1.23.5
  Downloading numpy-1.23.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m70.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.0rc1
    Uninstalling numpy-2.0.0rc1:
      Successfully uninstalled numpy-2.0.0rc1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
chex 0.1.86 requires numpy>=1.24.1, but you have numpy 1.23.5 which is incompatible.
datasets 2.17.0 requires fsspec[http]<=2023.10.0,>=2023.1.0, but you have fsspec 2024.3.1 which is incompatible.
moviepy 1.0.3 requires decorator<5.0,>=4.0.2, but you have decorator 5.1.1 which is incompatible.
tensorflow 2.15.0 requires ml-dtypes~=0.2.0, but you have ml-dtypes 0.4.0b1 which 

In [None]:
import mlc_llm
import torch

In [None]:
# Log in to the Hugging Face account again (this section may run after a
# runtime restart, see the note above). The login is used by the upload cells
# further down.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Settings shared by the MLC shell commands below, which reference them as
# $FINE_TUNED_MODEL_NAME and $QUANTIZATION.
QUANTIZATION = "q4f16_1"  # MLC quantization mode
FINE_TUNED_MODEL_NAME = "fine_tuned_science_gemma2b-it"  # folder name under /content

In [None]:
# Show the GPU, driver and CUDA version visible to this runtime.
!nvidia-smi

# Check if CUDA is available to PyTorch (displayed as the cell's result).
torch.cuda.is_available()

Sun Mar 31 12:19:02 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P8              11W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

True

In [None]:
# Use the first CUDA GPU when PyTorch can see one, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Display the chosen device.
device

device(type='cuda', index=0)

#### Convert fine-tuned model weights.

*Note:* Go to **/usr/local/lib/python3.10/dist-packages/tvm/relax/frontend/nn/exporter.py** and comment out line 139:

 `assert rx.analysis.well_formed(mod)`

*Note:* Go to **/usr/local/lib/python3.10/dist-packages/mlc_llm/loader/huggingface_loader.py** and add the following line before line 115:

 `preshared_funcs = {}`

In [None]:
# Convert and quantize the merged checkpoint into MLC weight files.
# $FINE_TUNED_MODEL_NAME and $QUANTIZATION are filled in from the Python
# variables defined earlier; output goes to /content/<model>-<quantization>-MLC/.
!python -m mlc_llm convert_weight /content/$FINE_TUNED_MODEL_NAME/ --quantization $QUANTIZATION -o /content/$FINE_TUNED_MODEL_NAME-$QUANTIZATION-MLC/

[2024-03-31 12:19:45] INFO auto_config.py:115: [92mFound[0m model configuration: /content/fine_tuned_science_gemma2b-it/config.json
[2024-03-31 12:19:48] INFO auto_device.py:76: [92mFound[0m device: cuda:0
[2024-03-31 12:19:50] INFO auto_device.py:85: [91mNot found[0m device: rocm:0
[2024-03-31 12:19:53] INFO auto_device.py:85: [91mNot found[0m device: metal:0
[2024-03-31 12:19:55] INFO auto_device.py:85: [91mNot found[0m device: vulkan:0
[2024-03-31 12:19:58] INFO auto_device.py:85: [91mNot found[0m device: opencl:0
[2024-03-31 12:19:58] INFO auto_device.py:33: Using device: [1mcuda:0[0m
[2024-03-31 12:19:58] INFO auto_weight.py:70: Finding weights in: /content/fine_tuned_science_gemma2b-it
[2024-03-31 12:19:58] INFO auto_weight.py:136: [91mNot found[0m Huggingface PyTorch
[2024-03-31 12:19:58] INFO auto_weight.py:143: [92mFound[0m source weight format: huggingface-safetensor. Source configuration: /content/fine_tuned_science_gemma2b-it/model.safetensors.index.json
[

#### Generate token config files.

In [None]:
# Generate the MLC chat config into the same output folder, using the
# gemma_instruction conversation template and a 768-token context window
# (the log below shows this overriding the model's 8192 default).
!mlc_llm gen_config /content/$FINE_TUNED_MODEL_NAME/ --quantization $QUANTIZATION \
  --conv-template gemma_instruction --context-window-size 768 -o /content/$FINE_TUNED_MODEL_NAME-$QUANTIZATION-MLC/

[2024-03-31 12:20:57] INFO auto_config.py:115: [92mFound[0m model configuration: /content/fine_tuned_science_gemma2b-it/config.json
[2024-03-31 12:20:57] INFO auto_config.py:153: [92mFound[0m model type: [1mgemma[0m. Use `--model-type` to override.
[2024-03-31 12:20:57] INFO gemma_model.py:55: [1mcontext_window_size[0m not found in config.json. Falling back to [1mmax_position_embeddings[0m (8192)
[2024-03-31 12:20:57] INFO gemma_model.py:70: [1mprefill_chunk_size[0m defaults to [1mcontext_window_size[0m (8192)
[2024-03-31 12:20:57] INFO config.py:106: Overriding [1mcontext_window_size[0m from 8192 to 768
[2024-03-31 12:20:57] INFO config.py:106: Overriding [1mmax_batch_size[0m from 1 to 80
[2024-03-31 12:20:57] INFO gemma_model.py:78: Overriding [1mprefill_chunk_size[0m from 8192 to 768 ([1mcontext_window_size[0m)
[2024-03-31 12:20:57] INFO gen_config.py:133: [generation_config.json] Setting [1mbos_token_id[0m: 2
[2024-03-31 12:20:57] INFO gen_config.py:133: [ge

#### Compile the model to Android format.

In [None]:
# Compile the model described by mlc-chat-config.json for the Android target;
# the resulting .tar is written into the same MLC output folder.
!mlc_llm compile /content/$FINE_TUNED_MODEL_NAME-$QUANTIZATION-MLC/mlc-chat-config.json \
    --device android -o /content/$FINE_TUNED_MODEL_NAME-$QUANTIZATION-MLC/$FINE_TUNED_MODEL_NAME-$QUANTIZATION-android.tar

[2024-03-31 12:21:07] INFO auto_config.py:69: [92mFound[0m model configuration: /content/fine_tuned_science_gemma2b-it-q4f16_1-MLC/mlc-chat-config.json
[2024-03-31 12:21:07] INFO auto_config.py:153: [92mFound[0m model type: [1mgemma[0m. Use `--model-type` to override.
[1mCompiling with arguments:[0m
  [1m--config[0m          GemmaConfig(hidden_size=2048, hidden_act='gelu', intermediate_size=16384, attention_bias=False, num_attention_heads=8, num_key_value_heads=1, head_dim=256, num_hidden_layers=18, rms_norm_eps=1e-06, vocab_size=256000, position_embedding_base=10000.0, context_window_size=768, prefill_chunk_size=768, tensor_parallel_shards=1, max_batch_size=80, kwargs={})
  [1m--quantization[0m    GroupQuantize(name='q4f16_1', kind='group-quant', group_size=32, quantize_dtype='int4', storage_dtype='uint32', model_dtype='float16', linear_weight_layout='NK', quantize_embedding=True, quantize_final_fc=True, num_elem_per_storage=8, num_storage_per_group=4, max_int_value=7)
  

#### Push the quantized model to Hugging Face.

In [None]:
from huggingface_hub import whoami

# Local folder with the MLC artifacts, and the Hub repo name to publish under.
output_dir = "fine_tuned_science_gemma2b-it-q4f16_1-MLC"
repo_name = "scigemma_fine_tuned_quantized_MLC"

# whoami() authenticates with the token stored by notebook_login(). The `token`
# argument expects a token string (or bool), not a filesystem path, so the
# cache-directory Path previously passed here is dropped.
username = whoami()["name"]
repo_id = f"{username}/{repo_name}"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
from huggingface_hub import create_repo, upload_folder

# Create the repo if needed (exist_ok=True makes this safe to re-run) and keep
# the repo id reported back by the Hub.
repo_id = create_repo(repo_id, exist_ok=True).repo_id

# Echo the folder that is about to be uploaded.
print(output_dir)

# Upload the whole folder, skipping step_*/epoch_* entries.
upload_folder(
    folder_path=output_dir,
    repo_id=repo_id,
    ignore_patterns=["step_*", "epoch_*"],
    commit_message="Fine-tuned quantized Gemma 2b-it model.",
)

fine_tuned_science_gemma2b-it-q4f16_1-MLC


fine_tuned_science_gemma2b-it-q4f16_1-android.tar:   0%|          | 0.00/325k [00:00<?, ?B/s]

Upload 41 LFS files:   0%|          | 0/41 [00:00<?, ?it/s]

params_shard_0.bin:   0%|          | 0.00/262M [00:00<?, ?B/s]

params_shard_1.bin:   0%|          | 0.00/16.8M [00:00<?, ?B/s]

params_shard_11.bin:   0%|          | 0.00/16.8M [00:00<?, ?B/s]

params_shard_10.bin:   0%|          | 0.00/33.6M [00:00<?, ?B/s]

params_shard_12.bin:   0%|          | 0.00/33.6M [00:00<?, ?B/s]

params_shard_13.bin:   0%|          | 0.00/30.7M [00:00<?, ?B/s]

params_shard_14.bin:   0%|          | 0.00/33.6M [00:00<?, ?B/s]

params_shard_15.bin:   0%|          | 0.00/32.6M [00:00<?, ?B/s]

params_shard_16.bin:   0%|          | 0.00/33.6M [00:00<?, ?B/s]

params_shard_17.bin:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

params_shard_18.bin:   0%|          | 0.00/33.6M [00:00<?, ?B/s]

params_shard_19.bin:   0%|          | 0.00/33.6M [00:00<?, ?B/s]

params_shard_2.bin:   0%|          | 0.00/32.8M [00:00<?, ?B/s]

params_shard_20.bin:   0%|          | 0.00/32.8M [00:00<?, ?B/s]

params_shard_21.bin:   0%|          | 0.00/33.6M [00:00<?, ?B/s]

params_shard_22.bin:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

params_shard_23.bin:   0%|          | 0.00/33.6M [00:00<?, ?B/s]

params_shard_24.bin:   0%|          | 0.00/16.8M [00:00<?, ?B/s]

params_shard_25.bin:   0%|          | 0.00/33.6M [00:00<?, ?B/s]

params_shard_26.bin:   0%|          | 0.00/30.7M [00:00<?, ?B/s]

params_shard_27.bin:   0%|          | 0.00/33.6M [00:00<?, ?B/s]

params_shard_28.bin:   0%|          | 0.00/32.6M [00:00<?, ?B/s]

params_shard_29.bin:   0%|          | 0.00/33.6M [00:00<?, ?B/s]

params_shard_3.bin:   0%|          | 0.00/33.6M [00:00<?, ?B/s]

params_shard_30.bin:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

params_shard_31.bin:   0%|          | 0.00/33.6M [00:00<?, ?B/s]

params_shard_32.bin:   0%|          | 0.00/16.8M [00:00<?, ?B/s]

params_shard_33.bin:   0%|          | 0.00/33.6M [00:00<?, ?B/s]

params_shard_34.bin:   0%|          | 0.00/30.7M [00:00<?, ?B/s]

params_shard_35.bin:   0%|          | 0.00/33.6M [00:00<?, ?B/s]

params_shard_36.bin:   0%|          | 0.00/32.6M [00:00<?, ?B/s]

params_shard_37.bin:   0%|          | 0.00/24.2M [00:00<?, ?B/s]

params_shard_4.bin:   0%|          | 0.00/33.6M [00:00<?, ?B/s]

params_shard_5.bin:   0%|          | 0.00/30.5M [00:00<?, ?B/s]

params_shard_6.bin:   0%|          | 0.00/33.6M [00:00<?, ?B/s]

params_shard_7.bin:   0%|          | 0.00/32.6M [00:00<?, ?B/s]

params_shard_8.bin:   0%|          | 0.00/33.6M [00:00<?, ?B/s]

params_shard_9.bin:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/NSTiwari/scigemma_fine_tuned_quantized_MLC/commit/f6b97b9b3ede79b23f8f500f83294379ff3785cf', commit_message='Fine-tuned quantized Gemma 2b-it model.', commit_description='', oid='f6b97b9b3ede79b23f8f500f83294379ff3785cf', pr_url=None, pr_revision=None, pr_num=None)

#### Step 10: Download the fine-tuned compiled model.

In [None]:
# Recursively zip the MLC output folder so it can be downloaded as one file.
# $FINE_TUNED_MODEL_NAME and $QUANTIZATION are filled in from the Python
# variables defined earlier.
!zip -r /content/gemma_2b_it_fine_tuned.zip /content/$FINE_TUNED_MODEL_NAME-$QUANTIZATION-MLC

In [None]:
# Colab-only: hand the zip archive to the browser for download. This import is
# only available inside a Google Colab runtime.
from google.colab import files
files.download("/content/gemma_2b_it_fine_tuned.zip")

In [None]:
from huggingface_hub import whoami

# Local folder to upload, and the Hub repo name to publish under.
# NOTE(review): no visible cell creates "scigemma_fine_tuned_v0"; confirm the
# folder exists before running the upload.
output_dir = "scigemma_fine_tuned_v0"
repo_name = "scigemma_fine_tuned_v0"

# whoami() authenticates with the token stored by notebook_login(). The `token`
# argument expects a token string (or bool), not a filesystem path, so the
# cache-directory Path previously passed here is dropped.
username = whoami()["name"]
repo_id = f"{username}/{repo_name}"

In [None]:
from huggingface_hub import create_repo, upload_folder

# Create the repo if needed (exist_ok=True makes this safe to re-run) and keep
# the repo id reported back by the Hub.
repo_id = create_repo(repo_id, exist_ok=True).repo_id

# Echo the folder that is about to be uploaded.
print(output_dir)

# Upload the whole folder, skipping step_*/epoch_* entries.
upload_folder(
    folder_path=output_dir,
    repo_id=repo_id,
    ignore_patterns=["step_*", "epoch_*"],
    commit_message="Fine-tuned quantized Gemma 2b-it model.",
)

scigemma_fine_tuned_v0


CommitInfo(commit_url='https://huggingface.co/NSTiwari/scigemma_fine_tuned_v0/commit/0f06e2fea34decd766317c0faa84806dba4ed6ed', commit_message='Fine-tuned quantized Gemma 2b-it model.', commit_description='', oid='0f06e2fea34decd766317c0faa84806dba4ed6ed', pr_url=None, pr_revision=None, pr_num=None)