In [1]:
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.chroma import Chroma

In [2]:
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=400,
    chunk_overlap=100
)

In [3]:
loader = TextLoader("sample.txt")
docs = loader.load_and_split(
    text_splitter=text_splitter
)

embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [4]:
import joblib as jb

jb.dump(embeddings, "embeddings")

In [5]:
embeddings = jb.load("embeddings")

In [6]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [7]:
db = Chroma.from_documents(
    docs,
    embedding=embeddings,
    persist_directory="data"
)

In [8]:
from langchain.llms import LlamaCpp
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.chroma import Chroma
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough

In [9]:
llm = LlamaCpp(
    model_path=r"C:\Users\sulek\Documents\PythonScripts\zephyr-7b-beta.Q4_0.gguf",
    temperature=0.75,
    max_tokens=2000,
    top_p=1,
    verbose=True,
)

llama_model_loader: loaded meta data with 21 key-value pairs and 291 tensors from C:\Users\sulek\Documents\PythonScripts\zephyr-7b-beta.Q4_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = huggingfaceh4_zephyr-7b-beta
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv  

embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [10]:
vectorstore = Chroma(
    persist_directory="data",
    embedding_function=embeddings
)

In [11]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 1})

In [12]:
template = """Answer the question based only on the following context
 
{context}
 
Question:{question}
Answer:
"""

In [13]:
prompt = ChatPromptTemplate.from_template(template)

In [20]:
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
)

In [18]:
chain

{
  context: VectorStoreRetriever(tags=['Chroma', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000002CEFD2043D0>, search_kwargs={'k': 1}),
  question: RunnablePassthrough()
}
| ChatPromptTemplate(input_variables=['context', 'question'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template='Answer the question based only on the following context\n \n{context}\n \nQuestion:{question}\nAnswer:\n'))])
| LlamaCpp(client=<llama_cpp.llama.Llama object at 0x000002CEF909B9D0>, model_path='C:\\Users\\sulek\\Documents\\PythonScripts\\zephyr-7b-beta.Q4_0.gguf', max_tokens=2000, temperature=0.75, top_p=1.0)

In [15]:
for chunk in chain.stream("What is a chat model?"):
    print(chunk, end="", flush=True)

A chat model is a type of language model in which the underlying model accepts a list of messages as input and outputs a single message. The base interface for these messages is defined by BaseMessage, which requires two attributes - content, which is usually a string representing the content of the message, and any additional metadata that may be required.


llama_print_timings:        load time =     991.11 ms
llama_print_timings:      sample time =      11.67 ms /    68 runs   (    0.17 ms per token,  5828.40 tokens per second)
llama_print_timings: prompt eval time =   11563.65 ms /   131 tokens (   88.27 ms per token,    11.33 tokens per second)
llama_print_timings:        eval time =   11359.45 ms /    67 runs   (  169.54 ms per token,     5.90 tokens per second)
llama_print_timings:       total time =   23159.40 ms /   198 tokens


In [17]:
chain.invoke("What is AImessage?")

Llama.generate: prefix-match hit

llama_print_timings:        load time =     991.11 ms
llama_print_timings:      sample time =      16.15 ms /    84 runs   (    0.19 ms per token,  5201.24 tokens per second)
llama_print_timings: prompt eval time =   10633.17 ms /   106 tokens (  100.31 ms per token,     9.97 tokens per second)
llama_print_timings:        eval time =   13883.17 ms /    83 runs   (  167.27 ms per token,     5.98 tokens per second)
llama_print_timings:       total time =   24763.67 ms /   189 tokens


'AIMessage is a specific type of BaseMessage that is coming from an AI or an assistant, as distinguished from messages that originate from humans or systems. This is defined and provided by LangChain, an open-source library for building transformer-based models that process natural language text. The library offers various entities, such as HumanMessage and SystemMessage, to help categorize and distinguish different types of messages.'