In [3]:
# Point the Hugging Face cache at the shared scratch directory.
# NOTE(review): presumably this has to be set before med_llama imports transformers — confirm.
import os
os.environ["TRANSFORMERS_CACHE"] = "/scratch/megathon/cache/"

# NOTE(review): the star import hides where names come from. The cells below call
# quantize_model, get_vector_store and get_context, which are assumed to be provided here.
from med_llama import *

# Settings shared by the inference cells below.
kargs = dict(
    # Used by the llama.cpp invocations: quant_method selects the model file
    # suffix, n / penalty / ngl are passed as -n / --repeat_penalty / -ngl.
    quant_method="q5_k_m",
    n=4096,
    penalty=1.1,
    ngl=35,
    # Handed to get_vector_store(kargs, prompt); meaning assumed from the key
    # names (embedding device, context file, splitter sizes, embedding model)
    # — TODO confirm against med_llama.
    device="cuda",
    con_docs="webmd_context_docs.txt",
    chunk_size=200,
    chunk_overlap=10,
    embed_file="sentence-transformers/all-mpnet-base-v2",
)

# Question sent to the model in the comparison runs below.
prompt = "What should I do if I want to stop dialysis?"

In [None]:
# One-off conversion of the cached Llama-2-7b-chat snapshot to quantized GGML
# models (Q4 and Q5), via quantize_model from med_llama.
# NOTE(review): the second argument is presumably the output path prefix; it is
# "metaquant2" here while the inference cells load "metaquant" — confirm intent.
root = (
    "/scratch/megathon/cache/models--meta-llama--Llama-2-7b-chat-hf"
    "/snapshots/94b07a6e30c3292b8265ed32ffdeccfdadf434a8"
)
quantize_model(root, "/scratch/megathon/quant/metaquant2")

In [4]:
# outputs from vanilla llama
model_name = "/scratch/megathon/quant/metaquant"
ggml_version = "gguf"
qtype = f"{model_name}.{ggml_version}.{kargs['quant_method']}.bin"
print(f"Running with quantized model {qtype}")

!/scratch/megathon/quant/llama.cpp/main -m {qtype} -n {kargs['n']} --repeat_penalty {kargs['penalty']} --color -ngl {kargs['ngl']} -p f"\'{prompt}\'" > output.txt
with open('output.txt') as f:
    output = f.read()
os.remove("output.txt")

Running with quantized model /scratch/megathon/quant/metaquant.gguf.q5_k_m.bin
Log start
main: build = 1441 (ff3bad8)
main: built with cc (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0 for x86_64-linux-gnu
main: seed  = 1698566820
ggml_init_cublas: GGML_CUDA_FORCE_MMQ:   no
ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes
ggml_init_cublas: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6
llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from /scratch/megathon/quant/metaquant.gguf.q5_k_m.bin (version GGUF V3 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q5_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q6_K     [ 11008,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q5_K     [  4096, 11008,     1,     1 ]


In [5]:
# Show the text captured from the vanilla run; print() renders its newlines.
print(output)

f\'What should I do if I want to stop dialysis?\'
Here are some steps you can take if you want to stop dialysis:

1. Discuss your decision with your nephrologist or kidney doctor: It is important to discuss your decision with your nephrologist or kidney doctor before stopping dialysis. They can help you understand the potential risks and benefits of stopping dialysis and provide guidance on how to do so safely.
2. Review your medical history: Your nephrologist or kidney doctor will review your medical history to determine if stopping dialysis is safe for you. They may consider factors such as your age, overall health, and the cause of your kidney failure.
3. Check your blood chemistry and other test results: Your nephrologist or kidney doctor may perform additional tests to check your blood chemistry and other vital signs before stopping dialysis. This can help ensure that you are in a stable condition and that stopping dialysis will not cause any immediate harm.
4. Consider hospice ca

In [7]:
# outputs from llama with vectorstore
model_name = "/scratch/megathon/quant/metaquant"
ggml_version = "gguf"
qtype = f"{model_name}.{ggml_version}.{kargs['quant_method']}.bin"
print(f"Running with quantized model {qtype}")

print("Generating vectorstore")
vstore = get_vector_store(kargs, prompt)

print("Getting context")
new_prompt = get_context(vstore, prompt)

!/scratch/megathon/quant/llama.cpp/main -m {qtype} -n {kargs['n']} --repeat_penalty {kargs['penalty']} --color -ngl {kargs['ngl']} -p f"\'{new_prompt}\'" > output.txt
with open('output.txt') as f:
    output = f.read()
os.remove("output.txt")

Running with quantized model /scratch/megathon/quant/metaquant.gguf.q5_k_m.bin
Generating vectorstore
Getting context


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Log start
main: build = 1441 (ff3bad8)
main: built with cc (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0 for x86_64-linux-gnu
main: seed  = 1698567038
ggml_init_cublas: GGML_CUDA_FORCE_MMQ:   no
ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes
ggml_init_cublas: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6
llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from /scratch/megathon/quant/metaquant.gguf.q5_k_m.bin (version GGUF V3 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q5_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q6_K     [ 11008,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q5_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q5_K     [ 

In [4]:
# Print whatever follows the last occurrence of "Answer" in the captured text.
# NOTE(review): assumes the prompt built by get_context ends with an "Answer" marker and
# that the generation does not repeat the word, otherwise part of the reply is dropped — confirm.
print(output.split("Answer")[-1])

:\'stop hospice care, and start back on treatments. You can also keep up these habits to stay well: Eat right. Stick to a healthy, well-balanced diet that's low in salt, fat, and sugar; drink plenty of water; and avoid foods high in phosphorus. Exercise regularly. Doctors often recommend exercise for people with kidney disease because it can help keep your body healthy. Don't smoke. Smoking raises blood pressure and can damage blood vessels, which can make kidney disease worse. Limit alcohol to 2 drinks a day if you are an adult age 18 or older; limit is lower for children and teens. Don't take too many nonsteroidal anti-inflammatory drugs (NSAIDs), such as ibuprofen or naproxen, unless your doctor says it's okay. These meds can harm the kidneys if you take too much of them. Get enough sleep. Most adults need 7 to 8 hours of sleep each night. Follow these steps and talk with your healthcare provider about any changes in your lifestyle or treatment plan. And remember, it's important to 

## Rough Work :)

In [13]:
import os
os.environ['TRANSFORMERS_CACHE'] = '/scratch/megathon/cache/'

MODEL_ID = "/scratch/megathon/cache/models--meta-llama--Llama-2-7b-chat-hf/snapshots/94b07a6e30c3292b8265ed32ffdeccfdadf434a8"

MODEL_NAME = MODEL_ID.split('/')[-1]
GGML_VERSION = "gguf"

# Convert to fp16
fp16 = f"/scratch/megathon/quant/metaquant.{GGML_VERSION}.fp16.bin"
!python /scratch/megathon/quant/llama.cpp/convert.py {MODEL_ID} --outtype f16 --outfile {fp16}

Loading model file /scratch/megathon/cache/models--meta-llama--Llama-2-7b-chat-hf/snapshots/94b07a6e30c3292b8265ed32ffdeccfdadf434a8/model-00001-of-00002.safetensors
Loading model file /scratch/megathon/cache/models--meta-llama--Llama-2-7b-chat-hf/snapshots/94b07a6e30c3292b8265ed32ffdeccfdadf434a8/model-00001-of-00002.safetensors
Loading model file /scratch/megathon/cache/models--meta-llama--Llama-2-7b-chat-hf/snapshots/94b07a6e30c3292b8265ed32ffdeccfdadf434a8/model-00002-of-00002.safetensors
params = Params(n_vocab=32000, n_embd=4096, n_layer=32, n_ctx=4096, n_ff=11008, n_head=32, n_head_kv=32, f_norm_eps=1e-05, f_rope_freq_base=None, f_rope_scale=None, ftype=<GGMLFileType.MostlyF16: 1>, path_model=PosixPath('/scratch/megathon/cache/models--meta-llama--Llama-2-7b-chat-hf/snapshots/94b07a6e30c3292b8265ed32ffdeccfdadf434a8'))
Loading vocab file '/scratch/megathon/cache/models--meta-llama--Llama-2-7b-chat-hf/snapshots/94b07a6e30c3292b8265ed32ffdeccfdadf434a8/tokenizer.model', type 'spm'


In [14]:
# Quantization variants to produce from the fp16 file.
QUANTIZATION_METHODS = ["q4_k_m", "q5_k_m"]

# Relies on `fp16` and `GGML_VERSION` from the fp16 conversion cell.
# Runs llama.cpp's quantize once per method, writing metaquant.<version>.<method>.bin.
for method in QUANTIZATION_METHODS:
    qtype = f"/scratch/megathon/quant/metaquant.{GGML_VERSION}.{method}.bin"
    !/scratch/megathon/quant/llama.cpp/quantize {fp16} {qtype} {method}

ggml_init_cublas: GGML_CUDA_FORCE_MMQ:   no
ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes
ggml_init_cublas: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6
main: build = 1441 (ff3bad8)
main: built with cc (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0 for x86_64-linux-gnu
main: quantizing '/scratch/megathon/quant/metaquant.gguf.fp16.bin' to '/scratch/megathon/quant/metaquant.gguf.q4_k_m.bin' as Q4_K_M
llama_model_loader: loaded meta data with 18 key-value pairs and 291 tensors from /scratch/megathon/quant/metaquant.gguf.fp16.bin (version GGUF V3 (latest))
llama_model_loader: - tensor    0:                token_embd.weight f16      [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight f16      [ 11008,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight f16      [  4096, 11008,

In [15]:
import os

# Smoke test of the q4_k_m model: 128 new tokens for the fixed question.
# Relies on `GGML_VERSION` from the fp16 conversion cell.
# model_list: entries of the quant directory whose name contains GGML_VERSION;
# only referenced by the commented-out input() prompt below.
model_list = [file for file in os.listdir("/scratch/megathon/quant/") if GGML_VERSION in file]
prompt = "What should I do if I want to stop dialysis?"
chosen_method = "q4_k_m" #input("Please specify the quantization method to run the model (options: " + ", ".join(model_list) + "): ")

qtype = f"/scratch/megathon/quant/metaquant.{GGML_VERSION}.{chosen_method}.bin"
!/scratch/megathon/quant/llama.cpp/main -m {qtype} -n 128 --repeat_penalty 1.1 --color -ngl 35 -p "{prompt}"

Log start
main: build = 1441 (ff3bad8)
main: built with cc (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0 for x86_64-linux-gnu
main: seed  = 1698532407
ggml_init_cublas: GGML_CUDA_FORCE_MMQ:   no
ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes
ggml_init_cublas: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6
llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from /scratch/megathon/quant/metaquant.gguf.q4_k_m.bin (version GGUF V3 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q6_K     [ 11008,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q4_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q4_K     [ 

In [17]:
import os

# Smoke test of the q5_k_m model: 128 new tokens for the fixed question.
# Relies on `GGML_VERSION` from the fp16 conversion cell.
# NOTE(review): unlike the q4_k_m cell, no -ngl flag is passed here — confirm this is intentional.
# model_list: entries of the quant directory whose name contains GGML_VERSION;
# only referenced by the commented-out input() prompt below.
model_list = [file for file in os.listdir("/scratch/megathon/quant/") if GGML_VERSION in file]
prompt = "What should I do if I want to stop dialysis?"
chosen_method = "q5_k_m" #input("Please specify the quantization method to run the model (options: " + ", ".join(model_list) + "): ")

qtype = f"/scratch/megathon/quant/metaquant.{GGML_VERSION}.{chosen_method}.bin"
!/scratch/megathon/quant/llama.cpp/main -m {qtype} -n 128 --repeat_penalty 1.1 --color -p "{prompt}"

Log start
main: build = 1441 (ff3bad8)
main: built with cc (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0 for x86_64-linux-gnu
main: seed  = 1698532562
ggml_init_cublas: GGML_CUDA_FORCE_MMQ:   no
ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes
ggml_init_cublas: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6
llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from /scratch/megathon/quant/metaquant.gguf.q5_k_m.bin (version GGUF V3 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q5_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q6_K     [ 11008,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q5_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q5_K     [ 

In [1]:
# Point the Hugging Face cache at the shared scratch directory.
# NOTE(review): presumably needed before the embedding model is loaded further down — confirm.
import os
os.environ['TRANSFORMERS_CACHE'] = '/scratch/megathon/cache/'

In [1]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Read the context corpus from a single text file.
loader = TextLoader("train_webmd_squad_v2_consec.txt")
documents = loader.load()

# Split the loaded documents with chunk_size=200 and chunk_overlap=10.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=10,
)
all_splits = text_splitter.split_documents(documents)

In [2]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Embedding model id and the keyword arguments forwarded when it is loaded.
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = dict(device="cuda")

embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
)

# Build the FAISS vector store from the chunks using these embeddings.
vectorstore = FAISS.from_documents(all_splits, embeddings)



In [3]:
query = "What should I do if I want to stop dialysis?"
# Each hit is indexed as hit[0] (the document) and hit[1] (its score).
docs = vectorstore.similarity_search_with_score(query)

# Concatenate the text of every hit, each followed by a single space.
context = "".join(hit[0].page_content + " " for hit in docs)

In [4]:
# One line per retrieved hit: len() of its text, then the score returned with it.
for chunk, score in docs:
    print(len(chunk.page_content), score)

197 0.6995704
197 0.7874575
195 0.85200596
195 0.87893635


In [5]:
# Display the concatenated retrieved text that is prepended to the question in a later cell.
context

"kidneys are failing, you might choose the hospice program rather than continuing with dialysis. But you can still change your mind, stop hospice care, and start back on treatments. Other people may kidneys and keep them working as long as you can. Follow your doctor's advice carefully. You can also keep up these habits to stay well: Eat right. Stick to a healthy, well-balanced diet that's low Pain medicines If your kidneys fail, you'll need dialysis, which uses a machine to filter your blood and remove waste, like salt, extra water, and certain chemicals. You can also get on a waiting may eventually stop working. That's serious, and it can be life-threatening. Healthy kidneys: Keep a balance of water and minerals (such as sodium, potassium, and phosphorus) in your blood Remove "

In [20]:
import os

# model_list = [file for file in os.listdir("/scratch/megathon/quant/") if GGML_VERSION in file]
prompt = "What should I do if I want to stop dialysis?"
chosen_method = "q5_k_m" #input("Please specify the quantization method to run the model (options: " + ", ".join(model_list) + "): ")

qtype = f"/scratch/megathon/quant/metaquant.gguf.{q5_k_m}.bin"
!/scratch/megathon/quant/llama.cpp/main -m {qtype} -n 4096 --repeat_penalty 1.1 --color -ngl 35 -p "{prompt}"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Log start
main: build = 1441 (ff3bad8)
main: built with cc (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0 for x86_64-linux-gnu
main: seed  = 1698540553
ggml_init_cublas: GGML_CUDA_FORCE_MMQ:   no
ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes
ggml_init_cublas: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6
llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from /scratch/megathon/quant/metaquant.gguf.q5_k_m.bin (version GGUF V3 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q5_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q6_K     [ 11008,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q5_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q5_K     [ 

In [21]:
import os

# model_list = [file for file in os.listdir("/scratch/megathon/quant/") if GGML_VERSION in file]
prompt = f"{context} \n\n Question: What should I do if I want to stop dialysis? \n\n Answer:"
chosen_method = "q5_k_m" #input("Please specify the quantization method to run the model (options: " + ", ".join(model_list) + "): ")

qtype = f"/scratch/megathon/quant/metaquant.gguf.{chosen_method}.bin"
!/scratch/megathon/quant/llama.cpp/main -m {qtype} -n 4096 --repeat_penalty 1.1 --color -ngl 35 -p "{prompt}"

Log start
main: build = 1441 (ff3bad8)
main: built with cc (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0 for x86_64-linux-gnu
main: seed  = 1698540614
ggml_init_cublas: GGML_CUDA_FORCE_MMQ:   no
ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes
ggml_init_cublas: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from /scratch/megathon/quant/metaquant.gguf.q5_k_m.bin (version GGUF V3 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q5_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q6_K     [ 11008,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q5_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q5_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q5_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q5_K     [  4096,