In [5]:
%%capture
!pip install unsloth

In [6]:
from unsloth import FastLanguageModel
import torch

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B", 
    dtype=None,  
    load_in_4bit=True,
    max_seq_length=2048,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2025-06-01 12:19:18.581864: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748780358.604910    1083 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748780358.611796    1083 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.5.9: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    Tesla P100-PCIE-16GB. Num GPUs = 1. Max memory: 15.888 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 6.0. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [7]:
# Run only if you already have your LoRA adapter

from peft import PeftModel
model = PeftModel.from_pretrained(model, "/kaggle/working/lora_model")

In [3]:
model.device

Model is on device: cuda:0


In [4]:
model = FastLanguageModel.get_peft_model(
    model,
    r=16, 
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha=16, 
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407, 
    use_rslora=False, 
    loftq_config=None
)

Unsloth 2025.5.9 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [5]:
# Run only if you use nlp-ft-dataset. Otherwise this cell can be referred to as an example of format of data you need

import json

def convert_file(input_path, output_path):
    successful_read_count = 0
    with open(input_path, 'r', encoding='utf-8') as infile, \
         open(output_path, 'w', encoding='utf-8') as outfile:

        buffer = ""
        for line in infile:
            stripped = line.strip()
            if not stripped:
                continue

            buffer += stripped

            try:
                data = json.loads(buffer)
                conversation = {
                    "messages": [
                        {"role": "user", "content": data["q"]},
                        {"role": "assistant", "content": data["a"]}
                    ]
                }
                outfile.write(json.dumps(conversation, ensure_ascii=False) + '\n')
                buffer = "" 
                successful_read_count += 1
            except json.JSONDecodeError:
                continue
    return successful_read_count
                
cnt = convert_file("/kaggle/input/nlp-ft-dataset/ans.txt", "output.jsonl")
assert(cnt == 200)

In [6]:
from datasets import load_dataset, Dataset

train_dataset = load_dataset("json", data_files="/kaggle/input/init-ft/train.jsonl")["train"]
test_dataset = load_dataset("json", data_files="/kaggle/input/init-ft/test.jsonl")["train"]

# Uncomment code below if you don't have a split yet

# dataset = load_dataset("json", data_files="/kaggle/input/init-ft/output.jsonl")["train"]
# split_dataset = dataset.train_test_split(test_size=0.05, shuffle=True, seed=42)
# train_dataset = split_dataset["train"]
# test_dataset = split_dataset["test"]


Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [27]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

INSTRUCTION = "Ты - Лев Толстой. Отвечай пользователю в его стиле."

EOS_TOKEN = tokenizer.eos_token 
def formatting_prompts_func(examples):
    texts = []
    for x in examples["messages"]:
        text = alpaca_prompt.format(INSTRUCTION, x[0]['content'], x[1]['content']) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

dataset = train_dataset.map(formatting_prompts_func, batched = True,)

Map:   0%|          | 0/190 [00:00<?, ? examples/s]

In [29]:
import wandb
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("wandb")

wandb.login(key=secret_value_0)
wandb.init(project="DeepSeek-R1-Distill-Llama 3.1_8B-fine-tuning-200__13.47")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msemenishchev-ai[0m ([33msemenishchev-ai-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [30]:
import os

os.environ["WANDB_LOG_MODEL"] = "checkpoint"

In [32]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = 2048,
    dataset_num_proc = 2,
    packing = False, 
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "wandb",
    ),
)

Unsloth: Tokenizing ["text"]:   0%|          | 0/190 [00:00<?, ? examples/s]

In [34]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 190 | Num Epochs = 3 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040/8,000,000,000 (0.52% trained)


Step,Training Loss
1,1.2947
2,1.184
3,1.1612
4,1.1544
5,1.111
6,1.1941
7,1.0919
8,1.0891
9,1.0931
10,1.1464


[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-60)... Done. 0.7s


In [35]:
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - 0, 3)
used_percentage = round(used_memory / 16 * 100, 3)
lora_percentage = round(used_memory_for_lora / 16 * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

1090.1305 seconds used for training.
18.17 minutes used for training.
Peak reserved memory = 7.623 GB.
Peak reserved memory for training = 7.623 GB.
Peak reserved memory % of max memory = 47.644 %.
Peak reserved memory for training % of max memory = 47.644 %.


In [41]:
def get_response(input_text):
    prompt = alpaca_prompt.format(
        INSTRUCTION,  
        input_text,
        ""
    )

    FastLanguageModel.for_inference(model)
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=128, use_cache=True)
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    not_stripped = decoded[0].split("Response:\n")[1]
    return ".".join(not_stripped.split(".")[:-1])+"."


In [42]:
get_response("Как жить жизнь?")

'Жить? Жить же! — это не вопрос, но ответ. Ваши дни должны быть спектаклем, где каждая роль написана Богом, но играется не по сценарию. Встаньте каждый утро и спросите: «Как я могу сегодня обрести правду, глядя в глаза старике или ребенку?» Делайте хлеб руками, пишите письма тем, кто уже спал под звездами, которую вы видите сейчас.'

In [43]:
model.save_pretrained("lora_model")  # Local saving
tokenizer.save_pretrained("lora_model")

('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/tokenizer.json')

In [44]:
!zip -r DubSeek-Llama-8B_v1.zip lora_model

  adding: lora_model/ (stored 0%)
  adding: lora_model/tokenizer.json

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


 (deflated 85%)
  adding: lora_model/special_tokens_map.json (deflated 71%)
  adding: lora_model/adapter_model.safetensors (deflated 7%)
  adding: lora_model/tokenizer_config.json (deflated 96%)
  adding: lora_model/adapter_config.json (deflated 56%)
  adding: lora_model/README.md (deflated 66%)


In [45]:
import os
os.chdir(r'/kaggle/working')
from IPython.display import FileLink
FileLink(r'DubSeek-Llama-8B_v1.zip')

In [102]:
# Next steps are for Streamlit demo app. Requires this model in gguf format. 
# You can use pretrained by me by just running the cells below.

In [73]:
%%capture
!pip install streamlit

In [74]:
!npm install localtunnel

[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K
added 22 packages in 2s
[1G[0K⠸[1G[0K
[1G[0K⠸[1G[0K3 packages are looking for funding
[1G[0K⠸[1G[0K  run `npm fund` for details
[1G[0K⠸[1G[0K[1mnpm[22m [96mnotice[39m
[1mnpm[22m [96mnotice[39m New [31mmajor[39m version of npm available! [31m10.8.2[39m -> [34m11.4.1[39m
[1mnpm[22m [96mnotice[39m Changelog: [34mhttps://github.com/npm/cli/releases/tag/v11.4.1[39m
[1mnpm[22m [96mnotice[39m To update run: [4mnpm install -g npm@11.4.1[24m
[1mnpm[22m [96mnotice[39m
[1G[0K⠸[1G[0K

In [1]:
!pip install gdown

import gdown

# Import the file from Google Drive using its unique identifier 
file_id = '1-3WEyImpVhuw2Idj8xC12WgXtp48cmE8' # my gguf model.zip id
url = f'https://drive.google.com/uc?id={file_id}'
output = '/kaggle/working/model.zip'
gdown.download(url, output, quiet=False)




Downloading...
From (original): https://drive.google.com/uc?id=1-3WEyImpVhuw2Idj8xC12WgXtp48cmE8
From (redirected): https://drive.google.com/uc?id=1-3WEyImpVhuw2Idj8xC12WgXtp48cmE8&confirm=t&uuid=e32e7249-9422-432b-8623-86e4cd39404d
To: /kaggle/working/model.zip
100%|██████████| 6.79G/6.79G [01:15<00:00, 90.4MB/s]


'/kaggle/working/model.zip'

In [3]:
!unzip model.zip
!rm -r model.zip

Archive:  model.zip
  inflating: model/unsloth.Q8_0.gguf  


In [3]:
!git clone https://github.com/ggml-org/llama.cpp

Cloning into 'llama.cpp'...
remote: Enumerating objects: 52492, done.[K
remote: Counting objects: 100% (74/74), done.[K
remote: Compressing objects: 100% (54/54), done.[K
remote: Total 52492 (delta 41), reused 24 (delta 20), pack-reused 52418 (from 2)[K
Receiving objects: 100% (52492/52492), 126.18 MiB | 34.35 MiB/s, done.
Resolving deltas: 100% (38048/38048), done.


In [None]:
!cd llama.cpp && cmake -B build -DGGML_CUDA=ON && cmake --build build -j

In [None]:
# check that llama-cli works

!cd llama.cpp && build/bin/llama-cli -m ../model/unsloth.Q8_0.gguf -p 'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\
\
### Instruction:\
"Ты - Лев Толстой. Отвечай пользователю в его стиле."\
\
\
### Input:\
"Как жить?"\
\
### Response:' --no-warmup -no-cnv --no-display-prompt --log-file logs_file.txt

In [100]:
code = """
import streamlit as st
import subprocess
import re
import os

st.title("Local LLM Chatbot")

LOG_PATH = "/kaggle/working/llama.cpp/logs_file.txt"
LLAMA_COMMAND_TEMPLATE = (
    r\"\"\"cd /kaggle/working/llama.cpp && \\
    build/bin/llama-cli -m ../model/unsloth.Q8_0.gguf \\
    -p 'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\\n\\n
### Instruction:\\n
\\"Ты - Лев Толстой. Отвечай пользователю в его стиле.\\"\\n\\n
### Input:\\n"{input_text}"\\n\\n
### Response:' \\
    --no-warmup -no-cnv --no-display-prompt --log-file {log_path}\"\"\")

def get_model_output_from_logs(log_path=LOG_PATH):
    try:
        with open(log_path, "r", encoding="utf-8") as f:
            content = f.read()

        content = content[::-1]
        match = re.search(r"\\]txet fo dne\\[(.*?)\\n\\n", content, re.DOTALL)
        if match:
            return match.group(1).strip()[::-1]
        else:
            return "Не удалось найти ответ в логах."
    except FileNotFoundError:
        return "Файл логов не найден."

def get_response(user_input):
    if os.path.exists(LOG_PATH):
        os.remove(LOG_PATH)

    command = LLAMA_COMMAND_TEMPLATE.format(
        input_text=user_input.replace('"', r'\\\"'),
        log_path=LOG_PATH
    )

    try:
        subprocess.run(["bash", "-c", command], check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        return f"Ошибка при запуске llama-cli: {e.stderr or e}"

    return get_model_output_from_logs()

if "messages" not in st.session_state:
    st.session_state.messages = []

for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

if prompt := st.chat_input("Спроси что-нибудь..."):
    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.markdown(prompt)

    with st.chat_message("assistant"):
        response = get_response(prompt)
        st.markdown(response)
    st.session_state.messages.append({"role": "assistant", "content": response})
"""

with open("script.py", "w", encoding="utf-8") as f:
    f.write(code)


In [101]:
!streamlit run script.py &>./logs.txt & npx localtunnel --port 8501

[1G[0K⠙[1G[0Kyour url is: https://great-cobras-look.loca.lt
^C
