In [1]:
# Llama2 7B Parameter Model

In [2]:
# curl -L "https://replicate.fyi/install-llama-cpp" | bash
# wget https://huggingface.co/localmodels/Llama-2-7B-Chat-ggml/resolve/main/llama-2-7b-chat.ggmlv3.q4_K_S.bin ./backups/llama.cpp/models/llama-2-7b-chat.ggmlv3.q4_K_S.bin
# ./backups/llama.cpp/convert-llama-ggml-to-gguf.py --eps 1e-5 -i ./llama.cpp/models/llama-2-7b-chat.ggmlv3.q4_K_S.bin -o ./llama.cpp/models/llama-2-7b-chat.ggmlv3.q4_K_S.gguf.bin


In [3]:
from langchain.llms import LlamaCpp
from langchain.prompts import ChatPromptTemplate
from langchain.embeddings import LlamaCppEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser


In [5]:
llm = LlamaCpp(
    model_path="./backups/llama.cpp/models/llama-2-7b-chat.ggmlv3.q4_K_S.gguf.bin", 
    n_ctx=2048)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from ./backups/llama.cpp/models/llama-2-7b-chat.ggmlv3.q4_K_S.gguf.bin (version unknown)
llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:               output_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:                    output.weight q6_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.attn_k.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    5:              blk.0.attn_v.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    6:         blk.0.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    7:           blk.0.attn_norm.weight f32      [

In [6]:
llm("What is Llama.cpp?")


llama_print_timings:        load time =   580.72 ms
llama_print_timings:      sample time =   181.38 ms /   256 runs   (    0.71 ms per token,  1411.39 tokens per second)
llama_print_timings: prompt eval time =   580.63 ms /     8 tokens (   72.58 ms per token,    13.78 tokens per second)
llama_print_timings:        eval time = 13711.30 ms /   256 runs   (   53.56 ms per token,    18.67 tokens per second)
llama_print_timings:       total time = 14846.07 ms


'\nLlama.cpp is a C++ library for creating and manipulating 3D models, including geometry, materials, and animations. It is designed to be fast, flexible, and easy to use, making it a great choice for developers who want to add 3D capabilities to their applications without having to write all the code from scratch.\n\nHere are some of the key features of Llama.cpp:\n\n1. Geometry: Llama.cpp provides a wide range of geometric primitives, including points, lines, triangles, and more. These primitives can be combined to create complex 3D models.\n2. Materials: Llama.cpp includes support for materials, which define the appearance of a 3D object, including its color, texture, and other properties. Materials can be applied to individual objects or groups of objects.\n3. Animations: Llama.cpp allows developers to create animations by defining keyframe data, which defines the position and rotation of an object over time. Animations can be played back using a variety of rendering algorithms.\n4

In [7]:
llama = LlamaCppEmbeddings(
    model_path="./backups/llama.cpp/models/llama-2-7b-chat.ggmlv3.q4_K_S.gguf.bin")

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from ./backups/llama.cpp/models/llama-2-7b-chat.ggmlv3.q4_K_S.gguf.bin (version unknown)
llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:               output_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:                    output.weight q6_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.attn_k.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    5:              blk.0.attn_v.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    6:         blk.0.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    7:           blk.0.attn_norm.weight f32      [

In [8]:
clinical_note = """
A 28-year-old previously healthy adult patient presented with tachycardia, fever, and mental confusion. 
The symptoms started after a cut to his leg while gardening.
"""

In [9]:
vectorstore = FAISS.from_texts([clinical_note], embedding=llama)
retriever = vectorstore.as_retriever()


llama_print_timings:        load time =   521.55 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =  2498.63 ms /    48 tokens (   52.05 ms per token,    19.21 tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =  2504.90 ms


In [10]:
template = """Answer the Question by addressing the following 10 categories:
              1. General
              2. Skin
              3. HEENT
              4. Pulmonary
              5. Cardiovascular
              6. Gastrointestinal
              7. Genitourinary
              8. Musculoskeletal
              9. Neurologic
              10. Psychiatric
{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

In [11]:
chain = ({"context": retriever, "question": RunnablePassthrough()} 
         | prompt 
         | llm 
         | StrOutputParser())

In [12]:
chain.invoke("What is the history of present illness?")


llama_print_timings:        load time =   521.55 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =   497.25 ms /    10 tokens (   49.72 ms per token,    20.11 tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =   499.93 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time =   580.72 ms
llama_print_timings:      sample time =   180.71 ms /   256 runs   (    0.71 ms per token,  1416.63 tokens per second)
llama_print_timings: prompt eval time =  7939.88 ms /   160 tokens (   49.62 ms per token,    20.15 tokens per second)
llama_print_timings:        eval time = 13943.01 ms /   256 runs   (   54.46 ms per token,    18.36 tokens per second)
llama_print_timings:       total time = 22453.02 ms


"\nAnswer: \n\n1. General: The patient was previously healthy and had no underlying medical conditions.\n2. Skin: There were no skin lesions or rashes observed at the time of presentation.\n3. HEENT: The patient had a mild sinus tachycardia, but there were no other abnormalities observed in the head, eyes, ears, nose, or throat.\n4. Pulmonary: Chest examination revealed wheezing, and the patient had a minimal decrease in lung fields on both sides.\n5. Cardiovascular: The patient had a rapid heart rate (120 bpm) with no other cardiac abnormalities.\n6. Gastrointestinal: There were no abnormalities observed in the patient's abdomen or gastrointestinal system.\n7. Genitourinary: The patient had no abnormalities in the genital or urinary systems.\n8. Musculoskeletal: There were no musculoskeletal abnormalities observed during the examination.\n9. Neurologic: The patient had a mild confusion and disorientation, but there were"