In [None]:
# Mount Google Drive at /content/drive so files stored in Drive can be read
# by path from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m480.6/480.6 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚î

# Primary Model - BLOOMZ

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
from datasets import Dataset
import json
import pandas as pd
import torch


# Checkpoint to fine-tune and the fixed token length used for padding/truncation
model_name = "bigscience/bloomz-1b1"
max_seq_length = 256

model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Fixed-length padding needs a pad token; fall back to EOS when none is defined
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model.gradient_checkpointing_enable()

# LoRA adapter settings; the adapters attach to the "query_key_value" modules
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["query_key_value"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    use_rslora=False,
)

# Wrap the base model with the adapters and report how much of it is trainable
model = get_peft_model(model, lora_config)
print("Trainable Parameters:")
model.print_trainable_parameters()

# Load JSON data
json_file_path = "/content/drive/My Drive/Haitian_Creole.json"
# Decode explicitly as UTF-8 so any non-ASCII text is read the same way on
# every platform instead of depending on the locale's default encoding.
with open(json_file_path, "r", encoding="utf-8") as file:
    data = json.load(file)

# Keep only the three fields used for training, as parallel columns
dataset = Dataset.from_dict({
    "instruction": [item["instruction"] for item in data],
    "input": [item["input"] for item in data],
    "output": [item["output"] for item in data],
})

# Take 20% of the dataset (shuffled with a fixed seed so the subset is repeatable)
subset_size = int(len(dataset) * 0.2)
subset_dataset = dataset.shuffle(seed=42).select(range(subset_size))

# Split the subset into train and eval (80% / 20%)
splits = subset_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = splits["train"]
eval_dataset = splits["test"]

# Format prompts
def formatting_prompts_func(examples):
    """Build one training string per example: prompt, answer marker, completion, EOS.

    Args:
        examples: Batch mapping with parallel "instruction", "input" and
            "output" sequences (the batched form passed by ``Dataset.map``).

    Returns:
        dict with two equal lists, "text" and "labels". Each entry is the
        instruction and input separated by blank lines, the "Repons:" marker,
        a space, the output, and finally the tokenizer's EOS token.
    """
    # End every completion with EOS so fine-tuning gives the model a stop
    # signal to learn. Reads the notebook-level `tokenizer`.
    # NOTE(review): assumes the tokenizer does not append EOS on its own --
    # confirm for the checkpoint in use.
    eos = tokenizer.eos_token or ""

    texts = [
        # "Repons:" marks where the answer starts; the tokenization step
        # searches for this exact string to mask the prompt.
        f"{instruction}\n\n{inp}\n\nRepons: {out}{eos}"
        for instruction, inp, out in zip(
            examples["instruction"], examples["input"], examples["output"]
        )
    ]
    # "labels" carries the same strings; it is returned as a separate list object.
    return {"text": texts, "labels": list(texts)}


# Apply the prompt formatter to both splits in batches, dropping the raw
# instruction/input/output columns so only the formatter's output remains.
train_dataset = train_dataset.map(formatting_prompts_func, batched=True, remove_columns=["instruction", "input", "output"])
eval_dataset = eval_dataset.map(formatting_prompts_func, batched=True, remove_columns=["instruction", "input", "output"])

# Tokenize dataset
def tokenize_function(examples):
    """Tokenize a batch and build labels that supervise only the completion.

    Inputs come from ``examples["text"]``; labels are derived from
    ``examples["labels"]``. Both are truncated and padded to the notebook-level
    ``max_seq_length`` with the notebook-level ``tokenizer``. Every label
    position that belongs to the prompt (everything up to and including the
    first "Repons:" marker) or to padding is set to -100.

    The completion offset is measured from the first non-padding position, so
    the mask is correct whether the tokenizer pads on the right or on the left.
    NOTE(review): BLOOM tokenizers are typically configured with left padding
    -- check ``tokenizer.padding_side``.

    Args:
        examples: Batch mapping with parallel "text" and "labels" string lists.

    Returns:
        The tokenizer output for ``examples["text"]`` with an added "labels"
        entry: one list of token ids / -100 per example. A row whose text has
        no "Repons:" marker gets all -100.
    """
    model_inputs = tokenizer(
        examples["text"],
        max_length=max_seq_length,
        truncation=True,
        padding="max_length"
    )

    # Encode the label strings once for the whole batch, exactly like the inputs.
    label_encodings = tokenizer(
        examples["labels"],
        max_length=max_seq_length,
        truncation=True,
        padding="max_length"
    )
    attention_masks = label_encodings.get("attention_mask")

    assistant_str = "Repons:"
    labels = []
    for row, text in enumerate(examples["labels"]):
        token_ids = label_encodings["input_ids"][row]

        # Prefer the attention mask to spot padding: comparing against
        # pad_token_id would also hide a genuine EOS when pad and EOS share an id.
        if attention_masks is not None:
            is_real = [bool(flag) for flag in attention_masks[row]]
        else:
            is_real = [tk != tokenizer.pad_token_id for tk in token_ids]

        idx = text.find(assistant_str)
        if idx == -1:
            # No answer marker: nothing in this row is supervised.
            labels.append([-100] * len(token_ids))
            continue

        # Number of tokens taken by the prompt (through the marker), unpadded.
        prompt_ids = tokenizer(
            text[:idx + len(assistant_str)],
            max_length=max_seq_length,
            truncation=True
        )["input_ids"]

        # Leading padding (left-padded rows) shifts where the prompt begins.
        first_real = is_real.index(True) if True in is_real else len(token_ids)
        completion_start = first_real + len(prompt_ids)

        labels.append([
            tk if (pos >= completion_start and real) else -100
            for pos, (tk, real) in enumerate(zip(token_ids, is_real))
        ])

    model_inputs["labels"] = labels
    return model_inputs

# Encode both splits; the string "text"/"labels" columns are dropped once tokenized
train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["text", "labels"])
eval_dataset = eval_dataset.map(tokenize_function, batched=True, remove_columns=["text", "labels"])

# Training configuration
training_args = TrainingArguments(
    output_dir="bloomz_fine_tuned_model",
    seed=42,
    # Optimisation: 2 sequences per device x 8 accumulation steps per update
    num_train_epochs=2,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=0.1,
    optim="adamw_hf",
    fp16=True,
    # Evaluation, logging and checkpoint cadence (in optimizer steps)
    evaluation_strategy="steps",
    eval_steps=200,
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,
)

# Build the Trainer around the LoRA-wrapped model and the tokenized splits
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)

# Run fine-tuning
print("Is model in training mode?", model.training)
trainer.train()

# Export the Trainer's logged training/evaluation records to CSV
log_history = trainer.state.log_history
df = pd.DataFrame(log_history)
df.to_csv("Bloomz_train_eval_log.csv", index=False)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/715 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.13G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Trainable Parameters:
trainable params: 1,179,648 || all params: 1,066,493,952 || trainable%: 0.1106


Map:   0%|          | 0/10616 [00:00<?, ? examples/s]

Map:   0%|          | 0/2655 [00:00<?, ? examples/s]

Map:   0%|          | 0/10616 [00:00<?, ? examples/s]

Map:   0%|          | 0/2655 [00:00<?, ? examples/s]

  trainer = Trainer(


Is model in training mode? True


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss
200,5.5493,5.339971
400,4.5919,4.574101
600,4.1961,4.189748
800,3.8926,4.044976
1000,4.0024,3.974928
1200,3.9029,3.972071


In [None]:
# Authenticate with the Hugging Face Hub before uploading.
!huggingface-cli login

# Same directory name as the Trainer's output_dir used for training above.
save_directory = "bloomz_fine_tuned_model"

# Save the fine-tuned model
model.save_pretrained(save_directory)

# Save the tokenizer
tokenizer.save_pretrained(save_directory)

print(f"Model and tokenizer saved to {save_directory}")

# Reload both artifacts from disk. This rebinds `model` and `tokenizer`, so the
# objects pushed below are the reloaded ones, not the in-memory training objects.
model = AutoModelForCausalLM.from_pretrained(save_directory)
tokenizer = AutoTokenizer.from_pretrained(save_directory)

# Push the model to the Hugging Face Hub repository
model.push_to_hub("sprab4/bloomz_fine_tuned_model")

# Push the tokenizer to the same Hugging Face Hub repository
tokenizer.push_to_hub("sprab4/bloomz_fine_tuned_model")

# Secondary Model - mT5-small

In [None]:
from transformers import MT5ForConditionalGeneration, MT5Tokenizer
from peft import LoraConfig, get_peft_model
from datasets import Dataset, DatasetDict
import torch
from transformers import TrainingArguments
from trl import SFTTrainer
import json
import pandas as pd

# Base checkpoint and the fixed token length used for padding/truncation
model_name = "google/mt5-small"
max_seq_length = 256

model = MT5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = MT5Tokenizer.from_pretrained(model_name)

# The prompt formatter below appends the EOS string, so make sure one is defined
if tokenizer.eos_token is None:
    tokenizer.eos_token = "</s>"

# LoRA adapter settings; targets the q, k, v and o modules named under SelfAttention
lora_config = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    target_modules=["SelfAttention.q", "SelfAttention.k", "SelfAttention.v", "SelfAttention.o"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    use_rslora=False,
)

# Wrap the base model with the adapters and report how much of it is trainable
model = get_peft_model(model, lora_config)
print("Trainable Parameters:")
model.print_trainable_parameters()

# Load JSON data
json_file_path = "/content/drive/My Drive/Haitian_Creole.json"
# Decode explicitly as UTF-8 so any non-ASCII text is read the same way on
# every platform instead of depending on the locale's default encoding.
with open(json_file_path, "r", encoding="utf-8") as file:
    data = json.load(file)

# Create dataset: keep only the three fields used for training, as parallel columns
dataset = Dataset.from_dict({
    "instruction": [item["instruction"] for item in data],
    "input": [item["input"] for item in data],
    "output": [item["output"] for item in data],
})

# Take 20% of the dataset (shuffled with a fixed seed so the subset is repeatable)
subset_size = int(len(dataset) * 0.2)
subset_dataset = dataset.shuffle(seed=42).select(range(subset_size))

# Split the subset into train and eval (80% / 20%)
splits = subset_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = splits["train"]
eval_dataset = splits["test"]

# Encoder-input template: the instruction and the input separated by a blank line
alpaca_prompt = "{instruction}\n\n{input}"
# EOS string appended to every formatted input by formatting_prompts_func
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    """Format a batch into encoder input strings plus their target strings.

    Args:
        examples: Batch mapping with parallel "instruction", "input" and
            "output" sequences.

    Returns:
        dict with "text" (the notebook-level ``alpaca_prompt`` template filled
        in and suffixed with ``EOS_TOKEN``, one per example) and "labels" (the
        batch's "output" values, passed through unchanged).
    """
    # NOTE(review): the tokenizer may already append EOS to encoded inputs --
    # confirm whether the explicit suffix is needed.
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    texts = [
        alpaca_prompt.format(instruction=instruction, input=input_text) + EOS_TOKEN
        for instruction, input_text, _ in rows
    ]
    return {"text": texts, "labels": examples["output"]}

# Apply the prompt formatter to both splits in batches. The original columns are
# kept here; they are dropped later, when the splits are tokenized.
train_dataset = train_dataset.map(formatting_prompts_func, batched=True)
eval_dataset = eval_dataset.map(formatting_prompts_func, batched=True)


def tokenize_function(examples):
    """Tokenize inputs and targets for seq2seq training.

    Both sides are truncated and padded to the notebook-level
    ``max_seq_length`` using the notebook-level ``tokenizer``.

    Args:
        examples: Batch mapping with "text" (input strings) and "labels"
            (target strings).

    Returns:
        The tokenizer output for the inputs, plus a "labels" entry holding the
        target token ids with every pad-token id replaced by -100.
    """
    shared_kwargs = dict(max_length=max_seq_length, truncation=True, padding="max_length")
    model_inputs = tokenizer(examples["text"], **shared_kwargs)
    target_encodings = tokenizer(text_target=examples["labels"], **shared_kwargs)

    # Swap padding ids in the targets for -100
    masked_targets = []
    for sequence in target_encodings["input_ids"]:
        masked_targets.append(
            [-100 if token == tokenizer.pad_token_id else token for token in sequence]
        )

    model_inputs["labels"] = masked_targets
    return model_inputs

# Encode both splits and drop every original/string column once tokenized
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["instruction", "input", "output", "labels", "text"])
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True, remove_columns=["instruction", "input", "output", "labels", "text"])

# Training configuration
training_args = TrainingArguments(
    output_dir="mt5_fine_tuned_model",
    seed=42,
    # Optimisation: 2 sequences per device x 4 accumulation steps per update
    num_train_epochs=2,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=1e-4,
    warmup_steps=100,
    weight_decay=0.05,
    optim="adamw_hf",
    fp16=False,
    # Evaluation, logging and checkpoint cadence (in optimizer steps)
    evaluation_strategy="steps",
    eval_steps=200,
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,
)

from transformers import Trainer

# Build the Trainer around the LoRA-wrapped model and the tokenized splits
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    eval_dataset=tokenized_eval_dataset,
    train_dataset=tokenized_train_dataset,
)

# Run fine-tuning
print("Is model in training mode?", model.training)
trainer.train()

# Export the Trainer's logged training/evaluation records to CSV
log_history = trainer.state.log_history
df = pd.DataFrame(log_history)
df.to_csv("training_eval_logs.csv", index=False)

Trainable Parameters:
trainable params: 458,752 || all params: 300,635,520 || trainable%: 0.1526


Map:   0%|          | 0/10616 [00:00<?, ? examples/s]

Map:   0%|          | 0/2655 [00:00<?, ? examples/s]

Map:   0%|          | 0/10616 [00:00<?, ? examples/s]



Map:   0%|          | 0/2655 [00:00<?, ? examples/s]

  trainer = Trainer(


Is model in training mode? True




Step,Training Loss,Validation Loss
200,19.3323,11.844785
400,11.7588,7.804188
600,9.26,6.838586
800,8.0741,6.492834
1000,7.317,5.696875
1200,6.8987,5.148508
1400,6.7159,4.901384
1600,6.4656,4.740219
1800,6.4164,4.686241
2000,6.3448,4.65056


In [None]:
# Authenticate with the Hugging Face Hub; the CLI prompts for an access token.
!huggingface-cli login

# Upload the in-memory `model` to the Hub repository named below. The recorded
# output of this cell shows an adapter_model.safetensors file being uploaded.
model.push_to_hub("sprab4/mt5_fine_tuned_model")


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineG

adapter_model.safetensors:   0%|          | 0.00/1.85M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/sprab4/mt5_fine_tuned_model/commit/218aa22c01501781756db36f43398c3f4757c4ed', commit_message='Upload model', commit_description='', oid='218aa22c01501781756db36f43398c3f4757c4ed', pr_url=None, repo_url=RepoUrl('https://huggingface.co/sprab4/mt5_fine_tuned_model', endpoint='https://huggingface.co', repo_type='model', repo_id='sprab4/mt5_fine_tuned_model'), pr_revision=None, pr_num=None)

In [None]:
# Push the tokenizer to the same Hugging Face Hub repository as the model
tokenizer.push_to_hub("sprab4/mt5_fine_tuned_model")

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/sprab4/mt5_fine_tuned_model/commit/05279f806cede765bd9f340b17bf3dabf6e3a807', commit_message='Upload tokenizer', commit_description='', oid='05279f806cede765bd9f340b17bf3dabf6e3a807', pr_url=None, repo_url=RepoUrl('https://huggingface.co/sprab4/mt5_fine_tuned_model', endpoint='https://huggingface.co', repo_type='model', repo_id='sprab4/mt5_fine_tuned_model'), pr_revision=None, pr_num=None)