In [1]:
from llama_cpp import Llama


# Instantiate the Phi-3-mini GGUF model through llama-cpp-python.
llm = Llama(
    # Layers to offload to the GPU when acceleration is available;
    # use 0 on a system without GPU acceleration.
    n_gpu_layers=64,
    # CPU threads to use; tune to the host and the performance you observe.
    n_threads=8,
    # Maximum sequence length; longer sequences need much more resources.
    n_ctx=4096,
    # Location of the GGUF weights file.
    model_path="../../models/Phi-3-mini-4k-instruct-q4.gguf",
)

llama_model_loader: loaded meta data with 24 key-value pairs and 195 tensors from ../../models/Phi-3-mini-4k-instruct-q4.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = phi3
llama_model_loader: - kv   1:                               general.name str              = Phi3
llama_model_loader: - kv   2:                        phi3.context_length u32              = 4096
llama_model_loader: - kv   3:                      phi3.embedding_length u32              = 3072
llama_model_loader: - kv   4:                   phi3.feed_forward_length u32              = 8192
llama_model_loader: - kv   5:                           phi3.block_count u32              = 32
llama_model_loader: - kv   6:                  phi3.attention.head_count u32              = 32
llama_model_loader: - kv   7:               phi3.attention.head_count_kv u

In [15]:
# Prompt for the question-answering demo: a context passage fenced in triple
# backticks, followed by an instruction and the question to answer.
# The string is not raw, so every literal "\n" inside the passage is turned
# into a real newline character.
prompt = """
Context: ```
We present an algorithm that produces the classification list of smooth Fano\nd-polytopes for any given d. The input of the algorithm is a single number,\nnamely the positive integer d. The algorithm has been used to classify smooth\nFano d-polytopes for d<=7. There are 7622 isomorphism classes of smooth Fano\n6-polytopes and 72256 isomorphism classes of smooth Fano 7-polytopes.\n
```

Given the context inside ``` answer the question using less than three sentences:
> What is presented in the article?
"""

In [20]:
%%time
# Streamed completion: the prompt is wrapped in the <|user|> / <|assistant|>
# chat markers and the reply is consumed piece by piece.
output = llm(
    f"<|user|>\n{prompt}<|end|>\n<|assistant|>",
    echo=False,  # do not echo the prompt back
    stream=True,  # iterate over partial results instead of one final value
    stop=["<|end|>"],  # stop sequence for generation
    max_tokens=256,  # generate at most 256 tokens
)

# Write each streamed piece as it arrives, with no separator between pieces.
for token in output:
    print(token["choices"][0]["text"], end="")

Llama.generate: prefix-match hit


 The article presents an algorithm for classifying smooth Fano d-polytopes, with a capability to determine isomorphism classes up to d=7, yielding 7622 and 72256 classes respectively for 6 and 7 dimensions


llama_print_timings:        load time =    7127.03 ms
llama_print_timings:      sample time =      14.82 ms /    56 runs   (    0.26 ms per token,  3779.44 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     0 tokens (     nan ms per token,      nan tokens per second)
llama_print_timings:        eval time =   24268.24 ms /    56 runs   (  433.36 ms per token,     2.31 tokens per second)
llama_print_timings:       total time =   24530.48 ms /    56 tokens


.CPU times: user 195 ms, sys: 676 ms, total: 872 ms
Wall time: 24.6 s


In [1]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model, referenced by its hub identifier.
model = SentenceTransformer(
    model_name_or_path="sentence-transformers/all-MiniLM-L6-v2",
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [9]:
import numpy as np

def cosine_similarity(v1, v2):
    """Return the cosine similarity of two vectors as a Python float.

    Parameters
    ----------
    v1, v2 : array-like
        Vectors with the same number of elements. Inputs of any shape are
        flattened first, so a ``(1, d)`` row and a ``(d,)`` vector both work.

    Returns
    -------
    float
        ``dot(v1, v2) / (||v1|| * ||v2||)``, clipped to ``[-1.0, 1.0]`` so
        that rounding error cannot push the result outside the valid range.
        ``0.0`` is returned when either vector has zero norm (this includes
        empty input), where the ratio is undefined.

    Raises
    ------
    ValueError
        If the flattened inputs do not have the same number of elements.
    """
    a = np.asarray(v1).ravel()
    b = np.asarray(v2).ravel()
    if a.shape != b.shape:
        raise ValueError(
            f"vectors must have the same number of elements, got {a.size} and {b.size}"
        )

    denom = np.linalg.norm(a) * np.linalg.norm(b)
    # Guard the 0/0 case: without it the division yields nan plus a RuntimeWarning.
    if denom == 0:
        return 0.0

    return float(np.clip(np.dot(a, b) / denom, -1.0, 1.0))

# Embed two short example sentences and flatten each result to one dimension.
v1 = model.encode("Hello, world!").reshape(-1)
v2 = model.encode("Goodbye, moon!").reshape(-1)

In [10]:
# Cosine similarity of the two example embeddings (displayed as the cell result).
cosine_similarity(v1=v1, v2=v2)

0.4691667854785919