In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, name) for name in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Dataset creation

In [2]:
from datasets import load_dataset, Dataset
import random
from tqdm.auto import tqdm
import os

In [None]:
# Source corpus of clean text: only the first 1% of the train split is requested.
dataset = load_dataset("styalai/SlimPajama-1M-rows", split="train[:1%]")

In [None]:
# Build a synthetic "fix the typos" dataset: every example is a Gemma-style chat
# turn whose user message is a corrupted copy of a text excerpt and whose model
# reply is the clean excerpt.

probs = 0.2  # per-character corruption probability (spaces are never replaced)
a = list("azertyuiopmlkjhgfdsqwxcvbn,?:.1234567890AZERTYUIOPMLKJHGFDSQWXCVBN")  # replacement alphabet

N_DOCS = 1000     # number of source documents to turn into examples
MAX_CHARS = 1000  # only the last MAX_CHARS characters of each document are used
SEED = 42         # fixed seed so the corrupted dataset can be regenerated identically

random.seed(SEED)


def _corrupt(clean, prob, alphabet):
    """Return `clean` with random character substitutions.

    Each character that is not a space is replaced, with probability `prob`,
    by a symbol drawn uniformly from `alphabet` (the draw may coincide with the
    original character). Spaces are always kept.
    """
    return ''.join(
        random.choice(alphabet) if random.random() < prob and ch != ' ' else ch
        for ch in clean
    )


records = []
for t in tqdm(dataset["text"][:N_DOCS]):
    clean = t[-MAX_CHARS:]
    noisy = _corrupt(clean, probs, a)
    records.append(
        "<bos><start_of_turn>user\n"
        + noisy
        + "<end_of_turn>\n<start_of_turn>model\n"
        + clean
        + "<end_of_turn>"
    )

data = Dataset.from_pandas(pd.DataFrame({"text": records}))

# Never hard-code the Hub token in the notebook: read it from the environment.
# NOTE(review): when HF_TOKEN is unset this passes token=None, which is expected
# to fall back to a cached `huggingface-cli login` credential -- confirm.
data.push_to_hub("styalai/mistake-v2", token=os.environ.get("HF_TOKEN"))

## Load dataset

In [3]:
# Re-load the published corrupted/clean pairs from the Hub. No `split=` is given,
# so the result is indexed by split name (see `dataset['train']` below).
dataset = load_dataset("styalai/mistake-v2")

Downloading readme:   0%|          | 0.00/274 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1001 [00:00<?, ? examples/s]

In [4]:
# Display the train split (its feature names and row count).
dataset['train']

Dataset({
    features: ['text'],
    num_rows: 1001
})

In [5]:
# Export the train split as CSV into ./data, the folder later handed to
# AutoTrain through `--data-path data/`.
# The directory and the file are addressed through the same relative path so they
# cannot disagree when the working directory is not /kaggle/working, and
# `exist_ok=True` keeps the cell re-runnable (os.mkdir raised FileExistsError on
# the second run).
data_dir = "data"
os.makedirs(data_dir, exist_ok=True)

data = pd.DataFrame(dataset['train'])
data.to_csv(os.path.join(data_dir, "dataset.csv"), index=False, escapechar='\\')

## Fine-tune

In [None]:
import os
# Use the %pip magic instead of `!pip` so the package is installed into the
# environment of the running kernel rather than whatever `pip` is first on PATH.
# NOTE(review): the version is not pinned; pin `autotrain-advanced==<version>`
# once a known-good release is identified, for reproducible runs.
%pip install -U autotrain-advanced
!autotrain setup

In [7]:
#@markdown ---
#@markdown #### Project Config
#@markdown Note: if you are using a restricted/private model, you need to provide your Hugging Face token (see below).
project_name = 'gemmamistakes-v3' # @param {type:"string"}
model_name = "styalai/gemmamistakes-v2" # model to fine tune

#@markdown ---
#@markdown #### Push to Hub?
#@markdown Use these only if you want to push your trained model to a private repo in your Hugging Face Account
#@markdown If you don't use these, the model is only saved in the notebook's working directory and you have to download it manually.
#@markdown The trained model will be saved to your Hugging Face account, which requires a write token.
#@markdown You can find your token here: https://huggingface.co/settings/tokens
push_to_hub = True # @param ["False", "True"] {type:"raw"}
# Do not paste the token into the notebook. A token already exported as HF_TOKEN
# (e.g. from the platform's secret store) is used as-is; the literal below is only
# a placeholder kept for backward compatibility and is not a valid credential.
_HF_TOKEN_PLACEHOLDER = "hf_token"
hf_token = os.environ.get("HF_TOKEN", _HF_TOKEN_PLACEHOLDER)
if push_to_hub and hf_token == _HF_TOKEN_PLACEHOLDER:
    import warnings
    warnings.warn(
        "HF_TOKEN is not set to a real token; pushing the trained model to the Hub will fail. "
        "Export HF_TOKEN before running this cell."
    )
#repo_id = "styalai/gemmamistakes-v1" #@param {type:"string"}

#@markdown ---
#@markdown #### Hyperparameters
learning_rate = 3e-4 # @param {type:"number"}
num_epochs = 2 #@param {type:"number"}
batch_size = 1 # @param {type:"slider", min:1, max:32, step:1}
block_size = 1024 # @param {type:"number"}
trainer = "sft" # @param ["default", "sft"] {type:"raw"}
warmup_ratio = 0.1 # @param {type:"number"}
weight_decay = 0.01 # @param {type:"number"}
gradient_accumulation = 4 # @param {type:"number"}
mixed_precision = "fp16" # @param ["fp16", "bf16", "none"] {type:"raw"}
peft = True # @param ["False", "True"] {type:"raw"}
quantization = "int4" # @param ["int4", "int8", "none"] {type:"raw"}
lora_r = 16 #@param {type:"number"}
lora_alpha = 32 #@param {type:"number"}
lora_dropout = 0.05 #@param {type:"number"}

# Export every setting as a string environment variable so the `!autotrain llm`
# shell command can read it through ${NAME} expansion.
# (`trainer` is defined above but, as before, is not exported.)
autotrain_env = {
    "PROJECT_NAME": project_name,
    "MODEL_NAME": model_name,
    "PUSH_TO_HUB": push_to_hub,
    "HF_TOKEN": hf_token,
    # "REPO_ID": repo_id,
    "LEARNING_RATE": learning_rate,
    "NUM_EPOCHS": num_epochs,
    "BATCH_SIZE": batch_size,
    "BLOCK_SIZE": block_size,
    "WARMUP_RATIO": warmup_ratio,
    "WEIGHT_DECAY": weight_decay,
    "GRADIENT_ACCUMULATION": gradient_accumulation,
    "MIXED_PRECISION": mixed_precision,
    "PEFT": peft,
    "QUANTIZATION": quantization,
    "LORA_R": lora_r,
    "LORA_ALPHA": lora_alpha,
    "LORA_DROPOUT": lora_dropout,
}
os.environ.update({name: str(value) for name, value in autotrain_env.items()})

In [None]:
# Launch AutoTrain LLM fine-tuning as a shell command. Every ${NAME} below is an
# environment variable exported by the configuration cell; the training data is
# read from the local `data/` folder using its `text` column.
# The two trailing $( ... ) substitutions append `--peft` and
# `--push-to-hub --token ...` only when PEFT / PUSH_TO_HUB equal the string "True".
# NOTE(review): `[[ ... ]]` is a bash construct -- confirm the shell used by `!`
# supports it on this platform.
# NOTE(review): the token is passed on the command line, where it can show up in
# process listings and logs -- confirm this is acceptable.
!autotrain llm \
--train \
--username "styalai" \
--merge-adapter \
--model ${MODEL_NAME} \
--project-name ${PROJECT_NAME} \
--data-path data/ \
--text-column text \
--lr ${LEARNING_RATE} \
--batch-size ${BATCH_SIZE} \
--epochs ${NUM_EPOCHS} \
--block-size ${BLOCK_SIZE} \
--warmup-ratio ${WARMUP_RATIO} \
--lora-r ${LORA_R} \
--lora-alpha ${LORA_ALPHA} \
--lora-dropout ${LORA_DROPOUT} \
--weight-decay ${WEIGHT_DECAY} \
--gradient-accumulation ${GRADIENT_ACCUMULATION} \
--quantization ${QUANTIZATION} \
--mixed-precision ${MIXED_PRECISION} \
$( [[ "$PEFT" == "True" ]] && echo "--peft" ) \
$( [[ "$PUSH_TO_HUB" == "True" ]] && echo "--push-to-hub --token ${HF_TOKEN}" )

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from transformers import TextStreamer
import torch
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

2024-07-21 07:36:51.454506: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-21 07:36:51.454617: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-21 07:36:51.564949: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
# Fetch the tokenizer and the causal-LM weights for the same checkpoint from the
# Hub, then move the model onto `device`.
# NOTE(review): this loads "gemmamistakes-v2", while the fine-tuning project
# above is named "gemmamistakes-v3" -- confirm which checkpoint is intended.
model_id = "styalai/gemmamistakes-v2"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)

tokenizer_config.json:   0%|          | 0.00/40.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/712 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

In [13]:
# A single-turn conversation whose user message is a deliberately corrupted sentence.
noisy_sentence = "u?d meZtion diet t1 Fe, but it did lqttle to swry ,e. Whyr Bad examyles. nrap Pitt in Ocean's Eleven."
chat = [{"role": "user", "content": noisy_sentence}]

# Render the conversation with the tokenizer's chat template (as text, ending with
# the generation prompt), then tokenize that text into tensors on `device`.
# NOTE(review): if the chat template already emits a BOS token, tokenizing the
# rendered text with default settings may add a second one -- confirm this matches
# how the training text was tokenized before changing it.
prompt_text = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
question = tokenizer(prompt_text, return_tensors="pt").to(device)

In [14]:
# Stream the model's correction to the cell output as it is generated
# (skip_prompt=True hides the echoed prompt).
streamer = TextStreamer(tokenizer, skip_prompt=True)

# Stop as soon as the model closes its turn. BOS is kept as a second stop token
# because it was the only stop condition before: the model emits it right after
# "<end_of_turn>", which cost one extra generated token.
stop_token_ids = [
    tokenizer.convert_tokens_to_ids("<end_of_turn>"),
    tokenizer.bos_token_id,
]

# Greedy decoding, stated explicitly. The previous `temperature=0` / `top_p=0.8`
# were passed without `do_sample=True`, so they did not influence decoding and
# only triggered configuration warnings; they are dropped.
_ = model.generate(
    **question,
    streamer=streamer,
    eos_token_id=stop_token_ids,
    max_length=2048,  # upper bound on total length
    do_sample=False,
    repetition_penalty=1.25,
)

You would mention diet to be a way to lose weight, but it did little to save me. Why? Because bad examples. In the movie Ocean's Eleven, Brad Pitt in Ocean's Eleven.<end_of_turn><bos>
