From 40f4795ab8e1977eddb1ac4e942d04e9919eb96f Mon Sep 17 00:00:00 2001
From: Andrew Mshar
Date: Fri, 15 Sep 2023 09:29:48 -0400
Subject: [PATCH] Update llm_llama_cpp.py to support .gguf files. Update README to reflect that change. Closes #10.

---
 README.md        | 32 +++++++++++++++++++-------------
 llm_llama_cpp.py |  4 ++--
 2 files changed, 21 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index c06735c..8377828 100644
--- a/README.md
+++ b/README.md
@@ -19,10 +19,7 @@ If you have a C compiler available on your system you can install that like so:
 ```bash
 llm install llama-cpp-python
 ```
-If you are using Python 3.11 installed via Homebrew on an M1 or M2 Mac you may be able to install this wheel instead, which will install a lot faster as it will not need to run a C compiler:
-```bash
-llm install https://static.simonwillison.net/static/2023/llama_cpp_python-0.1.77-cp311-cp311-macosx_13_0_arm64.whl
-```
+
 ## Adding models
 
 After installation you will need to add or download some models.
@@ -33,20 +30,20 @@ The plugin can download models for you. Try running this command:
 
 ```bash
 llm llama-cpp download-model \
-  https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q8_0.bin \
+  https://huggingface.co/TheBloke/Llama-2-7b-Chat-GGUF/resolve/main/llama-2-7b-chat.Q6_K.gguf \
   --alias llama2-chat --alias l2c --llama2-chat
 ```
-This will download the Llama 2 7B Chat GGML model file (this one is 6.67GB), save it and register it with the plugin - with two aliases, `llama2-chat` and `l2c`.
+This will download the Llama 2 7B Chat GGUF model file (this one is 5.53GB), save it and register it with the plugin - with two aliases, `llama2-chat` and `l2c`.
 
 The `--llama2-chat` option configures it to run using a special Llama 2 Chat prompt format. You should omit this for models that are not Llama 2 Chat models.
 
 If you have already downloaded a `llama.cpp` compatible model you can tell the plugin to read it from its current location like this:
 
 ```bash
-llm llama-cpp add-model path/to/llama-2-7b-chat.ggmlv3.q8_0.bin \
+llm llama-cpp add-model path/to/llama-2-7b-chat.Q6_K.gguf \
   --alias l27c --llama2-chat
 ```
-The model filename (minus the `.bin` extension) will be registered as its ID for executing the model.
+The model filename (minus the `.gguf` extension) will be registered as its ID for executing the model.
 
 You can also set one or more aliases using the `--alias` option.
 
@@ -75,7 +72,7 @@ cd "$(llm llama-cpp models-dir)"
 Once you have downloaded and added a model, you can run a prompt like this:
 ```bash
-llm -m llama-2-7b-chat.ggmlv3.q8_0 'five names for a cute pet skunk'
+llm -m llama-2-7b-chat.Q6_K 'five names for a cute pet skunk'
 ```
 Or if you registered an alias you can use that instead:
 ```bash
 llm -m llama2-chat 'five creative names for a pet hedgehog'
@@ -89,8 +86,8 @@ llm -m llama2-chat 'five creative names for a pet hedgehog'
-This model is Llama 2 7B GGML without the chat training.
+This model is Llama 2 7B GGUF without the chat training.
 You'll need to prompt it slightly differently:
 ```bash
 llm llama-cpp download-model \
-  https://huggingface.co/TheBloke/Llama-2-7B-GGML/resolve/main/llama-2-7b.ggmlv3.q8_0.bin \
+  https://huggingface.co/TheBloke/Llama-2-7B-GGUF/resolve/main/llama-2-7b.Q6_K.gguf \
   --alias llama2
 ```
 Try prompts that expect to be completed by the model, for example:
@@ -98,12 +95,21 @@ llm -m llama2 'Three fancy names for a posh albatross are:'
 ```
 
 ### Llama 2 Chat 13B
-This model is the Llama 2 13B Chat GGML model - a 13.83GB download:
+This model is the Llama 2 13B Chat GGUF model - a 10.7GB download:
 ```bash
 llm llama-cpp download-model \
-  'https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/resolve/main/llama-2-13b-chat.ggmlv3.q8_0.bin'\
+  'https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/resolve/main/llama-2-13b-chat.Q6_K.gguf'\
   -a llama2-chat-13b --llama2-chat
 ```
+
+### Code Llama 13B Python
+
+This model is the Code Llama 13B Python GGUF model - a 9.24GB download. It is a code completion model, not a chat model, so `--llama2-chat` should be omitted:
+```bash
+llm llama-cpp download-model \
+  'https://huggingface.co/TheBloke/CodeLlama-13B-Python-GGUF/resolve/main/codellama-13b-python.Q5_K_M.gguf'\
+  -a llama2-python-13b
+```
 
 ## Development
 To set up this plugin locally, first checkout the code. Then create a new virtual environment:

diff --git a/llm_llama_cpp.py b/llm_llama_cpp.py
index 3c7bc58..83b1e93 100644
--- a/llm_llama_cpp.py
+++ b/llm_llama_cpp.py
@@ -91,8 +91,8 @@ def models_dir():
 )
 def download_model(url, aliases, llama2_chat):
     "Download and register a model from a URL"
-    if not url.endswith(".bin"):
-        raise click.BadParameter("URL must end with .bin")
+    if not url.endswith(".gguf"):
+        raise click.BadParameter("URL must end with .gguf")
     with httpx.stream("GET", url, follow_redirects=True) as response:
         total_size = response.headers.get("content-length")
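
A note on the new check in `download_model()`: `url.endswith(".gguf")` validates only the URL string, so a mislabeled file would still be accepted. A stricter follow-up could verify the downloaded file itself, since every GGUF file starts with the four-byte magic `b"GGUF"`. The sketch below is illustrative only and not part of this patch; `is_gguf` is a hypothetical helper name:

```python
# Illustrative sketch, not part of the patch: verify a downloaded file
# really is GGUF by checking its magic bytes rather than its name.
# GGUF files begin with the 4 bytes b"GGUF" (little-endian 0x46554747).
from pathlib import Path

GGUF_MAGIC = b"GGUF"

def is_gguf(path: Path) -> bool:
    """Return True if the file starts with the GGUF magic bytes."""
    try:
        with open(path, "rb") as fp:
            return fp.read(4) == GGUF_MAGIC
    except OSError:
        # Missing or unreadable file: treat as not a valid GGUF model.
        return False
```

Run after the download completes and before the model is registered, a check like this would also reject leftover `.ggmlv3` files, which GGUF-era builds of `llama-cpp-python` can no longer load.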