In [11]:
from llama_cpp import Llama

# Download (or reuse from the local Hugging Face cache) the quantised GGUF
# weights and load them with llama.cpp.
#
# Only keyword arguments that llama_cpp.Llama.__init__ actually declares take
# effect here; unknown keywords are swallowed by its **kwargs. The previous
# arguments (temperature, max_new_tokens, context_window, generate_kwargs,
# model_kwargs) are options of the LlamaIndex `LlamaCPP` wrapper, so the
# context size and GPU offload were never applied.
#
# Sampling options are per-call arguments: pass e.g. temperature=0.0 and
# max_tokens=1024 to llm(...) / llm.create_completion(...), not here.
llm = Llama.from_pretrained(
    repo_id="TheBloke/Llama-2-13B-chat-GGUF",
    filename="*Q5_K_M.gguf",  # glob matched against the files in the repo
    verbose=True,

    # llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room
    n_ctx=3900,

    # set to at least 1 to use GPU (requires a GPU-enabled llama-cpp-python build)
    # TODO: experiment with this value and check whether it actually helps
    n_gpu_layers=4,
)


llama_model_loader: loaded meta data with 19 key-value pairs and 363 tensors from /home/sanka/.cache/huggingface/hub/models--TheBloke--Llama-2-13B-chat-GGUF/snapshots/4458acc949de0a9914c3eab623904d4fe999050a/./llama-2-13b-chat.Q5_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
llama_model_loader: - kv   4:                          llama.block_count u32              = 40
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
llama_model_loader: - kv   6:                 llama.rope.di

In [12]:
# Completion-style prompt: a question followed by an open "A: " slot for the model to fill in.
question_prompt = "Q: Who are the acors in  X Files TV series? A: "

# Generate a completion by calling the model object (create_completion can also be used).
output = llm(
    question_prompt,
    max_tokens=320,     # generate at most 320 tokens; None means "up to the end of the context window"
    stop=["Q:", "\n"],  # stop just before the model would start a new question or a new line
    echo=True,          # include the prompt itself at the start of the returned text
)

# The generated text sits in the first entry of the 'choices' list.
completion_text = output['choices'][0]['text']
print(completion_text)



llama_print_timings:        load time =    1404.89 ms
llama_print_timings:      sample time =      27.88 ms /    45 runs   (    0.62 ms per token,  1614.23 tokens per second)
llama_print_timings: prompt eval time =    1404.80 ms /    18 tokens (   78.04 ms per token,    12.81 tokens per second)
llama_print_timings:        eval time =    6606.98 ms /    44 runs   (  150.16 ms per token,     6.66 tokens per second)
llama_print_timings:       total time =    8065.84 ms /    62 tokens


Q: Who are the acors in  X Files TV series? A:  The main characters in the TV series "The X-Files" are FBI Agents Fox Mulder (played by David Duchovny) and Dana Scully (played by Gillian Anderson).


In [13]:
from transformers import AutoModel, AutoTokenizer
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Hugging Face checkpoint used to embed the transcript chunks.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
# The tokenizer is published under the same repository name as the model.
tokenizer_name = model_name

# Load the raw transformers model and tokenizer.
# NOTE(review): neither object is passed to HuggingFaceEmbedding below, which
# is given only the model name -- confirm whether `model` / `tokenizer` are
# needed by later cells.
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

# LlamaIndex embedding wrapper, configured with the same checkpoint name.
embedding = HuggingFaceEmbedding(model_name=model_name)



In [27]:
import os
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.core.callbacks import CallbackManager
callback_manager = CallbackManager()
transcript_directory = "/home/sanka/volttron/transcripts/ancient-aliens-official"
storage_directory = "/home/sanka/volttron/storage/ancient-aliens-official/local"


def filename_fn(filename):
    """Build the metadata attached to every chunk of a transcript.

    Args:
        filename: Path of the transcript file being loaded.

    Returns:
        A dict with one key, 'episode_title', set to the file's base name
        with its extension removed.
    """
    return {'episode_title': os.path.splitext(os.path.basename(filename))[0]}


# Load every transcript; filename_as_id=True derives document ids from the file name.
documents = SimpleDirectoryReader(
    transcript_directory,
    filename_as_id=True,
    file_metadata=filename_fn,
).load_data()

# Exclude metadata from the LLM, meaning it won't read it when generating a response.
# Future - consider looping over documents and setting the id_ to basename, instead of fullpath
#[document.excluded_llm_metadata_keys.append('episode_title') for document in documents]

# chunk_size - It defines the size of the chunks (or nodes) that documents are broken into when they are indexed by LlamaIndex
#service_context = ServiceContext.from_defaults(llm=llm, chunk_size=1024,embed_model="local")

# Build the index with the local embedding model.
# The keyword was previously misspelled as `allback_manager`, so the callback
# manager was silently dropped instead of being attached to the index.
# NOTE(review): the last recorded run of this cell ended with
# "Could not load OpenAI model"; that looks like a fallback to the default
# OpenAI LLM because no LLM is configured for LlamaIndex -- TODO confirm and
# configure a local LLM (or llm=None) before querying.
index = VectorStoreIndex.from_documents(
    documents,
    embed_model=embedding,
    callback_manager=callback_manager,
    show_progress=True,
)

# Persist the index to disk
#index.storage_context.persist(persist_dir=storage_directory)


Parsing nodes:   0%|          | 0/84 [00:00<?, ?it/s]

Parsing nodes: 100%|██████████| 84/84 [00:00<00:00, 138.63it/s]
Generating embeddings: 100%|██████████| 231/231 [00:00<00:00, 295.16it/s]


ValueError: 
******
Could not load OpenAI model. If you intended to use OpenAI, please check your OPENAI_API_KEY.
Original error:
No API key found for OpenAI.
Please set either the OPENAI_API_KEY environment variable or openai.api_key prior to initialization.
API keys can be found or created at https://platform.openai.com/account/api-keys

To disable the LLM entirely, set llm=None.
******