In [None]:
# ## 1. Установка зависимостей и компиляция llama.cpp

# %%
# Установка системных зависимостей
!apt-get update
!apt-get install -y build-essential python3-pip
!pip install huggingface-hub pandas matplotlib seaborn datasets

# Компиляция llama.cpp
!git clone https://github.com/ggerganov/llama.cpp
%cd llama.cpp
!mkdir -p build
%cd build
!cmake ..
!cmake --build . --config Release
%cd ../..


Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:3 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:5 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Get:6 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:7 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [75.2 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:9 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [8,842 kB]
Get:10 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [3,140 kB]
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:12 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1,542 kB]
Get:13 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,696 kB]
Get:14 ht

In [None]:
# ## 2. Настройка параметров эксперимента

# %%
import os
from huggingface_hub import snapshot_download
import subprocess
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Experiment configuration
MODEL_SIZE = "13B"  # one of "3B", "7B", "13B"
CONTEXT_LENGTHS = ["256"]  # context lengths to test (kept as strings)
QUANT_TYPES = [
    "q3_k",
    "q4_k",
    "q5_k",
    "q8_0",
]

# Filesystem layout: everything lives under BASE_DIR
BASE_DIR = "/content"
MODEL_NAME = f"open_llama_{MODEL_SIZE}"
MODEL_PATH = os.path.join(BASE_DIR, "models", MODEL_NAME)
DATA_FILE = os.path.join(BASE_DIR, "test.txt")

# Create the model directory, including missing parents; no error if it already exists
os.makedirs(MODEL_PATH, exist_ok=True)

In [None]:
# ## 3. Загрузка модели и данных

# %%
# Download the model repository from the Hugging Face Hub into MODEL_PATH.
# Text/markdown files and the *.safetensors, *.h5 and *.ot weight formats are
# excluded by the ignore patterns below.
# `resume_download` is intentionally not passed: it is a deprecated argument and
# interrupted downloads are resumed by default.
snapshot_download(
    repo_id=f"openlm-research/{MODEL_NAME}",
    local_dir=MODEL_PATH,
    ignore_patterns=["*.txt", "*.md", "*.safetensors", "*.h5", "*.ot"],
)


Fetching 10 files:   0%|          | 0/10 [00:00<?, ?it/s]

unzip:  cannot find or open wikitext-2-v1.zip, wikitext-2-v1.zip.zip or wikitext-2-v1.zip.ZIP.
cat: wikitext-2/wikitext-2-v1/test.txt: No such file or directory


README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/685k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.07M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/618k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Creating CSV from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

1268938

In [None]:
# ## 4. Конвертация и квантование

# %%
def _run_streaming(cmd, output_path):
    """Run ``cmd`` and echo its combined stdout/stderr into the cell output.

    Args:
        cmd: Command as a list of arguments (no shell is involved).
        output_path: File the command is expected to produce. It is deleted if
            the command fails or is interrupted, so a truncated file is never
            mistaken for a finished one by a later ``os.path.exists`` check.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero code.
        OSError: If the executable cannot be started.
    """
    succeeded = False
    try:
        with subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            errors="replace",
        ) as proc:
            for line in proc.stdout:
                print(line, end="")
        # Leaving the ``with`` block waits for the process, so returncode is set here.
        if proc.returncode != 0:
            raise subprocess.CalledProcessError(proc.returncode, cmd)
        succeeded = True
    finally:
        if not succeeded and os.path.exists(output_path):
            os.remove(output_path)


def convert_and_quantize():
    """Convert the downloaded model to an FP16 GGUF file and quantize it.

    Writes ``ggml-model-f16.gguf`` and one ``ggml-model-<quant>.gguf`` per entry
    of ``QUANT_TYPES`` into ``MODEL_PATH``. Files that already exist are reused,
    so the cell is cheap to re-run.

    Raises:
        subprocess.CalledProcessError: If conversion or quantization fails;
            the remaining steps are not attempted.
    """
    import sys  # local import: use the kernel's own interpreter for the converter

    # FP16 conversion
    fp16_path = os.path.join(MODEL_PATH, "ggml-model-f16.gguf")
    if not os.path.exists(fp16_path):
        _run_streaming(
            [
                sys.executable,
                "llama.cpp/convert_hf_to_gguf.py",
                MODEL_PATH,
                "--outfile", fp16_path,
                "--outtype", "f16",
            ],
            fp16_path,
        )

    # Quantization for every selected type
    for quant in QUANT_TYPES:
        quant_path = os.path.join(MODEL_PATH, f"ggml-model-{quant}.gguf")
        if not os.path.exists(quant_path):
            _run_streaming(
                ["./llama.cpp/build/bin/llama-quantize", fp16_path, quant_path, quant],
                quant_path,
            )

convert_and_quantize()

INFO:hf-to-gguf:Loading model: open_llama_13B
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'pytorch_model.bin.index.json'
INFO:hf-to-gguf:gguf: loading model part 'pytorch_model-00001-of-00003.bin'
INFO:hf-to-gguf:token_embd.weight,           torch.float16 --> F16, shape = {5120, 32000}
INFO:hf-to-gguf:blk.0.attn_q.weight,         torch.float16 --> F16, shape = {5120, 5120}
INFO:hf-to-gguf:blk.0.attn_k.weight,         torch.float16 --> F16, shape = {5120, 5120}
INFO:hf-to-gguf:blk.0.attn_v.weight,         torch.float16 --> F16, shape = {5120, 5120}
INFO:hf-to-gguf:blk.0.attn_output.weight,    torch.float16 --> F16, shape = {5120, 5120}
INFO:hf-to-gguf:blk.0.ffn_gate.weight,       torch.float16 --> F16, shape = {5120, 13824}
INFO:hf-to-gguf:blk.0.ffn_down.weight,       torch.float16 --> F16, shape = {13824, 5120}
INFO:hf-to-gguf:blk.0.ffn_up.weight,         torch.float16 --> F1

In [None]:
# ## 5. Измерение перплексии

# %%
results = []

def run_perplexity_test(model_file, quant_type, ctx_length,
                        data_file="/content/llama.cpp/prompts/mnemonics.txt"):
    """Run ``llama-perplexity`` on one model file and extract the reported PPL.

    Args:
        model_file: Path to the GGUF model to evaluate.
        quant_type: Quantization label; used only in error messages.
        ctx_length: Context length passed to ``-c``.
        data_file: Text file passed to ``-f``. Defaults to the prompt file the
            notebook has always used.

    Returns:
        ``(ppl, size_gb)`` where ``ppl`` is the text after the last ``" = "`` on
        the first output line containing ``"PPL"`` (for example
        ``"5.0728 +/- 0.40156"``) and ``size_gb`` is the model file size in GiB.
        ``(None, None)`` if the tool could not be started, the model file is
        missing, or no PPL line was printed. A 2-tuple is always returned so
        callers can unpack the result unconditionally.
    """
    cmd = [
        "./llama.cpp/build/bin/llama-perplexity",
        "-m", str(model_file),
        "-f", str(data_file),
        "-c", str(ctx_length),
    ]

    try:
        # stderr is merged into stdout so the PPL line is found wherever the tool logs it.
        proc = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            errors="replace",
        )
        size_gb = os.path.getsize(model_file) / (1024**3)
    except OSError as err:
        # Only start-up / filesystem errors are handled; KeyboardInterrupt still propagates.
        print(f"Ошибка для {quant_type} (ctx {ctx_length}): {err}")
        return None, None

    ppl = None
    for line in proc.stdout.splitlines():
        if "PPL" in line:
            ppl = line.split(" = ")[-1].strip()
            print(ppl)
            break

    if ppl is None:
        print(
            f"Ошибка для {quant_type} (ctx {ctx_length}): "
            f"PPL не найден, код возврата {proc.returncode}"
        )
        return None, None

    return ppl, size_gb

# Run the perplexity test for every quantized variant plus the FP16 baseline
for quant in QUANT_TYPES + ["f16"]:
    # Built from MODEL_PATH so this stays in sync with the config cell
    # instead of repeating the hard-coded /content/models/... location.
    model_file = os.path.join(MODEL_PATH, f"ggml-model-{quant}.gguf")

    if not os.path.exists(model_file):
        # Make the gap visible instead of silently dropping the variant
        print(f"Пропускаем {quant}: файл {model_file} не найден")
        continue

    for ctx in CONTEXT_LENGTHS:
        print(f"Тестируем {quant} (ctx {ctx})...")

        ppl, size = run_perplexity_test(model_file, quant, ctx)

        # A falsy ppl means the run failed; such runs are left out of the table
        if ppl:
            results.append({
                "Model": MODEL_SIZE,
                "Quant": quant,
                "Context": ctx,
                "Perplexity": ppl,
                "Size_GB": size
            })

# Save the results and display them as the cell output
df = pd.DataFrame(results)
df.to_csv("results.csv", index=False)
df



Тестируем q3_k (ctx 256)...
5.1690 +/- 0.40510
Тестируем q4_k (ctx 256)...
5.1165 +/- 0.40442
Тестируем q5_k (ctx 256)...
5.0548 +/- 0.39835
Тестируем q8_0 (ctx 256)...
5.0788 +/- 0.40222
Тестируем f16 (ctx 256)...
5.0728 +/- 0.40156


Unnamed: 0,Model,Quant,Context,Perplexity,Size_GB
0,13B,q3_k,256,5.1690 +/- 0.40510,5.90254
1,13B,q4_k,256,5.1165 +/- 0.40442,7.325775
2,13B,q5_k,256,5.0548 +/- 0.39835,8.596069
3,13B,q8_0,256,5.0788 +/- 0.40222,12.881452
4,13B,f16,256,5.0728 +/- 0.40156,24.245436
