[Example] ggml: add CI for phi-3, yi-1.5-9b (#143)
* [Example] ggml: rename phi-3-mini to phi-3, test more phi-3 models

Signed-off-by: dm4 <dm4@secondstate.io>

* [CI] llama: use b2963

Signed-off-by: dm4 <dm4@secondstate.io>

* [Example] ggml: update chatml example for better CI testing

Signed-off-by: dm4 <dm4@secondstate.io>

* [CI] llama: use latest plugin with both wasmedge 0.13 and 0.14

Signed-off-by: dm4 <dm4@secondstate.io>

---------

Signed-off-by: dm4 <dm4@secondstate.io>
dm4 committed May 28, 2024
1 parent 8b3991c commit 654ffc5
Showing 8 changed files with 130 additions and 42 deletions.
73 changes: 62 additions & 11 deletions .github/workflows/llama.yml
@@ -24,7 +24,8 @@ jobs:
   build:
     strategy:
       matrix:
-        runner: [ubuntu-20.04, macos-13, macos-m1]
+        runner: [ubuntu-20.04, macos-m1]
+        wasmedge: ["0.13.5", "0.14.0"]
         plugin: [wasi_nn-ggml]
         job:
           - name: "Tiny Llama"
@@ -224,6 +225,34 @@ jobs:
                 default \
                 '<start_of_turn>user Where is the capital of Japan? <end_of_turn><start_of_turn>model'
+          - name: Yi 1.5 9B 16K
+            run: |
+              test -f ~/.wasmedge/env && source ~/.wasmedge/env
+              cd wasmedge-ggml/chatml
+              curl -LO https://huggingface.co/second-state/Yi-1.5-9B-Chat-16K-GGUF/resolve/main/Yi-1.5-9B-Chat-16K-Q5_K_M.gguf
+              cargo build --target wasm32-wasi --release
+              time wasmedge --dir .:. \
+                --env n_gpu_layers="$NGL" \
+                --env reverse_prompt='<|im_end|>' \
+                --nn-preload default:GGML:AUTO:Yi-1.5-9B-Chat-16K-Q5_K_M.gguf \
+                target/wasm32-wasi/release/wasmedge-ggml-chatml.wasm \
+                default \
+                $'<|im_start|>system\nYou are an AI assistant<|im_end|>\n<|im_start|>user\nWhere is the capital of Japan?<|im_end|>\n<|im_start|>assistant'
+          - name: Yi 1.5 9B
+            run: |
+              test -f ~/.wasmedge/env && source ~/.wasmedge/env
+              cd wasmedge-ggml/chatml
+              curl -LO https://huggingface.co/second-state/Yi-1.5-9B-Chat-GGUF/resolve/main/Yi-1.5-9B-Chat-Q5_K_M.gguf
+              cargo build --target wasm32-wasi --release
+              time wasmedge --dir .:. \
+                --env n_gpu_layers="$NGL" \
+                --env reverse_prompt='<|im_end|>' \
+                --nn-preload default:GGML:AUTO:Yi-1.5-9B-Chat-Q5_K_M.gguf \
+                target/wasm32-wasi/release/wasmedge-ggml-chatml.wasm \
+                default \
+                $'<|im_start|>system\nYou are an AI assistant<|im_end|>\n<|im_start|>user\nWhere is the capital of Japan?<|im_end|>\n<|im_start|>assistant'
           - name: Grammar Example
             run: |
               test -f ~/.wasmedge/env && source ~/.wasmedge/env
@@ -259,26 +288,48 @@ jobs:
                 default \
                 $'[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you do not know the answer to a question, please do not share false information.\n<</SYS>>\nWhat is the capital of Japan?[/INST]'
-          - name: Phi 3 Mini
+          - name: Phi 3 Mini 4k
             run: |
               test -f ~/.wasmedge/env && source ~/.wasmedge/env
-              cd wasmedge-ggml/test/phi-3-mini
+              cd wasmedge-ggml/test/phi-3
               curl -LO https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf
               cargo build --target wasm32-wasi --release
               time wasmedge --dir .:. \
                 --env n_gpu_layers="$NGL" \
                 --nn-preload default:GGML:AUTO:Phi-3-mini-4k-instruct-q4.gguf \
-                target/wasm32-wasi/release/wasmedge-ggml-phi-3-mini.wasm \
+                target/wasm32-wasi/release/wasmedge-ggml-phi-3.wasm \
                 default \
                 $'<|user|>\nWhat is the capital of Japan?<|end|>\n<|assistant|>'
-          - name: Build llama-stream
+          - name: Phi 3 Mini 128k
             run: |
-              cd wasmedge-ggml/llama-stream
+              test -f ~/.wasmedge/env && source ~/.wasmedge/env
+              cd wasmedge-ggml/test/phi-3
+              curl -LO https://huggingface.co/second-state/Phi-3-mini-128k-instruct-GGUF/resolve/main/Phi-3-mini-128k-instruct-Q5_K_M.gguf
               cargo build --target wasm32-wasi --release
+              time wasmedge --dir .:. \
+                --env n_gpu_layers="$NGL" \
+                --nn-preload default:GGML:AUTO:Phi-3-mini-128k-instruct-Q5_K_M.gguf \
+                target/wasm32-wasi/release/wasmedge-ggml-phi-3.wasm \
+                default \
+                $'<|user|>\nWhat is the capital of Japan?<|end|>\n<|assistant|>'
-          - name: Build chatml
+          - name: Phi 3 Medium 4k
             run: |
-              cd wasmedge-ggml/chatml
+              test -f ~/.wasmedge/env && source ~/.wasmedge/env
+              cd wasmedge-ggml/test/phi-3
+              curl -LO https://huggingface.co/second-state/Phi-3-medium-4k-instruct-GGUF/resolve/main/Phi-3-medium-4k-instruct-Q5_K_M.gguf
               cargo build --target wasm32-wasi --release
+              time wasmedge --dir .:. \
+                --env n_gpu_layers="$NGL" \
+                --nn-preload default:GGML:AUTO:Phi-3-medium-4k-instruct-Q5_K_M.gguf \
+                target/wasm32-wasi/release/wasmedge-ggml-phi-3.wasm \
+                default \
+                $'<|user|>\nWhat is the capital of Japan?<|end|>\n<|assistant|>'
+          - name: Build llama-stream
+            run: |
+              cd wasmedge-ggml/llama-stream
               cargo build --target wasm32-wasi --release
           - name: Build llava-base64-stream
@@ -290,6 +341,7 @@ jobs:
         - runner: macos-m1
           ngl: 100
         - runner: ubuntu-20.04
+          wasmedge: "0.14.0"
          plugin: wasi_nn-ggml
           job:
             name: C4AI Command-R v01
@@ -305,7 +357,7 @@ jobs:
                 default \
                 '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>What is the capital of the United States?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>'
-    name: ${{ matrix.runner }} - ${{ matrix.job.name }} - ${{ matrix.plugin }}
+    name: ${{ matrix.runner }} - ${{ matrix.job.name }} - ${{ matrix.wasmedge }} - ${{ matrix.plugin }}
     runs-on: ${{ matrix.runner }}
     steps:
       - uses: actions/checkout@v4
@@ -316,8 +368,7 @@ jobs:
       - name: Install WasmEdge + WASI-NN + GGML
         run: |
-          VERSION=0.13.5
-          curl -sSf https://raw.githubusercontent.com/WasmEdge/WasmEdge/master/utils/install.sh | bash -s -- -v $VERSION --plugins ${{ matrix.plugin }}
+          curl -sSf https://raw.githubusercontent.com/WasmEdge/WasmEdge/master/utils/install.sh | bash -s -- -v ${{ matrix.wasmedge }} --plugins ${{ matrix.plugin }}
       - name: Set environment variable
         run: echo "NGL=${{ matrix.ngl || 0 }}" >> $GITHUB_ENV
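The Yi 1.5 and Phi 3 steps above drive the examples non-interactively by passing a fully formed prompt as the third CLI argument. As a reference for the ChatML layout used in the Yi 1.5 steps, here is a minimal Rust sketch; the chatml_prompt helper is hypothetical and not part of this commit:

// Hypothetical helper (not in this repo) illustrating the ChatML prompt
// layout the Yi 1.5 CI steps pass to the chatml example.
fn chatml_prompt(system: &str, user: &str) -> String {
    format!(
        "<|im_start|>system\n{system}<|im_end|>\n<|im_start|>user\n{user}<|im_end|>\n<|im_start|>assistant"
    )
}

fn main() {
    // Reproduces the prompt string used by the "Yi 1.5 9B" step above.
    println!(
        "{}",
        chatml_prompt("You are an AI assistant", "Where is the capital of Japan?")
    );
}

In the workflow itself, bash ANSI-C quoting ($'...') is what turns the literal \n sequences into real newlines before the prompt reaches the wasm module.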
97 changes: 67 additions & 30 deletions wasmedge-ggml/chatml/src/main.rs
@@ -1,5 +1,4 @@
-use serde_json::Value;
-use std::collections::HashMap;
+use serde_json::{json, Value};
 use std::env;
 use std::io;
 use wasmedge_wasi_nn::{
@@ -19,6 +18,27 @@ fn read_input() -> String {
     }
 }
 
+fn get_options_from_env() -> Value {
+    let mut options = json!({});
+    if let Ok(val) = env::var("enable_log") {
+        options["enable-log"] = serde_json::from_str(val.as_str())
+            .expect("invalid value for enable-log option (true/false)")
+    }
+    if let Ok(val) = env::var("n_gpu_layers") {
+        options["n-gpu-layers"] =
+            serde_json::from_str(val.as_str()).expect("invalid ngl value (unsigned integer)")
+    }
+    if let Ok(val) = env::var("ctx_size") {
+        options["ctx-size"] =
+            serde_json::from_str(val.as_str()).expect("invalid ctx-size value (unsigned integer)")
+    }
+    if let Ok(val) = env::var("reverse_prompt") {
+        options["reverse-prompt"] = json!(val.as_str())
+    }
+
+    options
+}
+
 fn set_data_to_context(context: &mut GraphExecutionContext, data: Vec<u8>) -> Result<(), Error> {
     context.set_input(0, TensorType::U8, &[1], &data)
 }
@@ -56,11 +76,9 @@ fn main() {
     let args: Vec<String> = env::args().collect();
     let model_name: &str = &args[1];
 
-    // Set options for the graph. Check our README for more details.
-    let mut options = HashMap::new();
-    options.insert("enable-log", Value::from(false));
-    options.insert("n-gpu-layers", Value::from(0));
-    options.insert("ctx-size", Value::from(512));
+    // Set options for the graph. Check our README for more details:
+    // https://github.com/second-state/WasmEdge-WASINN-examples/tree/master/wasmedge-ggml#parameters
+    let options = get_options_from_env();
 
     // Create graph and initialize context.
     let graph = GraphBuilder::new(GraphEncoding::Ggml, ExecutionTarget::AUTO)
@@ -82,6 +100,48 @@ fn main() {
     //     )
     //     .expect("Failed to set metadata");
 
+    // If there is a third argument, use it as the prompt and enter non-interactive mode.
+    // This is mainly for the CI workflow.
+    if args.len() >= 3 {
+        let prompt = &args[2];
+        // Set the prompt.
+        println!("Prompt:\n{}", prompt);
+        let tensor_data = prompt.as_bytes().to_vec();
+        context
+            .set_input(0, TensorType::U8, &[1], &tensor_data)
+            .expect("Failed to set input");
+        println!("Response:");
+
+        // Get the number of input tokens and llama.cpp versions.
+        let input_metadata = get_metadata_from_context(&context);
+        println!("[INFO] llama_commit: {}", input_metadata["llama_commit"]);
+        println!(
+            "[INFO] llama_build_number: {}",
+            input_metadata["llama_build_number"]
+        );
+        println!(
+            "[INFO] Number of input tokens: {}",
+            input_metadata["input_tokens"]
+        );
+
+        // Get the output.
+        context.compute().expect("Failed to compute");
+        let output = get_output_from_context(&context);
+        println!("{}", output.trim());
+
+        // Retrieve the output metadata.
+        let metadata = get_metadata_from_context(&context);
+        println!(
+            "[INFO] Number of input tokens: {}",
+            metadata["input_tokens"]
+        );
+        println!(
+            "[INFO] Number of output tokens: {}",
+            metadata["output_tokens"]
+        );
+        std::process::exit(0);
+    }
+
     let mut saved_prompt = String::new();
     let system_prompt = String::from("You are a helpful, respectful and honest assistant. Always answer as short as possible, while being safe." );
 
@@ -101,18 +161,6 @@ fn main() {
         set_data_to_context(&mut context, saved_prompt.as_bytes().to_vec())
             .expect("Failed to set input");
 
-        // Get the number of input tokens and llama.cpp versions.
-        // let input_metadata = get_metadata_from_context(&context);
-        // println!("[INFO] llama_commit: {}", input_metadata["llama_commit"]);
-        // println!(
-        //     "[INFO] llama_build_number: {}",
-        //     input_metadata["llama_build_number"]
-        // );
-        // println!(
-        //     "[INFO] Number of input tokens: {}",
-        //     input_metadata["input_tokens"]
-        // );
-
         // Execute the inference.
         let mut reset_prompt = false;
         match context.compute() {
@@ -141,16 +189,5 @@ fn main() {
             output = output.trim().to_string();
             saved_prompt = format!("{}{}<|im_end|>\n", saved_prompt, output);
         }
-
-        // Retrieve the output metadata.
-        // let metadata = get_metadata_from_context(&context);
-        // println!(
-        //     "[INFO] Number of input tokens: {}",
-        //     metadata["input_tokens"]
-        // );
-        // println!(
-        //     "[INFO] Number of output tokens: {}",
-        //     metadata["output_tokens"]
-        // );
     }
 }
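The new get_options_from_env() replaces the hard-coded HashMap, so the workflow can tune each run through --env flags alone. Below is a condensed, self-contained Rust sketch of that mapping — only two of the four variables, with a hypothetical main() added for demonstration; it assumes the serde_json crate, as in the diff above:

// Condensed sketch of the env-to-options mapping introduced by this commit.
use serde_json::{json, Value};
use std::env;

fn get_options_from_env() -> Value {
    let mut options = json!({});
    if let Ok(val) = env::var("n_gpu_layers") {
        // "100" (a string in the environment) is parsed into the JSON number 100.
        options["n-gpu-layers"] =
            serde_json::from_str(val.as_str()).expect("invalid ngl value (unsigned integer)");
    }
    if let Ok(val) = env::var("reverse_prompt") {
        // Reverse prompts stay plain JSON strings.
        options["reverse-prompt"] = json!(val.as_str());
    }
    options
}

fn main() {
    // Mirrors `--env n_gpu_layers="$NGL" --env reverse_prompt='<|im_end|>'`
    // from the workflow steps above.
    env::set_var("n_gpu_layers", "100");
    env::set_var("reverse_prompt", "<|im_end|>");
    let options = get_options_from_env();
    assert_eq!(options["n-gpu-layers"], json!(100));
    assert_eq!(options["reverse-prompt"], json!("<|im_end|>"));
    println!("{}", options); // {"n-gpu-layers":100,"reverse-prompt":"<|im_end|>"}
}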
Binary file modified wasmedge-ggml/chatml/wasmedge-ggml-chatml.wasm
@@ -1,5 +1,5 @@
 [package]
-name = "wasmedge-ggml-phi-3-mini"
+name = "wasmedge-ggml-phi-3"
 version = "0.1.0"
 edition = "2021"
 
File renamed without changes.
File renamed without changes.
Binary file added wasmedge-ggml/test/phi-3/wasmedge-ggml-phi-3.wasm
