## Generate Embeddings for a Text Corpus

In [15]:
! pip install -q -r requirements.txt

## Import packages

In [9]:
from langchain.llms import LlamaCpp
from langchain.embeddings import LlamaCppEmbeddings
from langchain.document_loaders import TextLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA

## Load the model and the LangChain embeddings


In [10]:
# Generation model: LangChain's LlamaCpp wrapper pointed at the local model
# file. This `llm` object is what the RetrievalQA chain uses further down.
llm = LlamaCpp(
    model_path="model/GPT4All-13B-snoozy.ggmlv3.q4_0.bin",
)

llama.cpp: loading model from model/GPT4All-13B-snoozy.ggmlv3.q4_0.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 512
llama_model_load_internal: n_embd     = 5120
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 40
llama_model_load_internal: n_layer    = 40
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: ftype      = 2 (mostly Q4_0)
llama_model_load_internal: n_ff       = 13824
llama_model_load_internal: n_parts    = 1
llama_model_load_internal: model size = 13B
llama_model_load_internal: ggml ctx size =    0.09 MB
llama_model_load_internal: mem required  = 9031.70 MB (+ 1608.00 MB per state)
....................................................................................................
llama_init_from_file: kv self size  =  400.00 MB
AVX = 1 | AVX2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | FMA = 1 | NEON = 0 | A

In [11]:
# Embedding model: the same model file, wrapped by LlamaCppEmbeddings so text
# can be turned into vectors. NOTE: per the load log, this is a second full
# load of the model in addition to `llm`.
llama_embeddings = LlamaCppEmbeddings(
    model_path="model/GPT4All-13B-snoozy.ggmlv3.q4_0.bin",
)

llama.cpp: loading model from model/GPT4All-13B-snoozy.ggmlv3.q4_0.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 512
llama_model_load_internal: n_embd     = 5120
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 40
llama_model_load_internal: n_layer    = 40
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: ftype      = 2 (mostly Q4_0)
llama_model_load_internal: n_ff       = 13824
llama_model_load_internal: n_parts    = 1
llama_model_load_internal: model size = 13B
llama_model_load_internal: ggml ctx size =    0.09 MB
llama_model_load_internal: mem required  = 9031.70 MB (+ 3216.00 MB per state)
....................................................................................................
llama_init_from_file: kv self size  =  800.00 MB
AVX = 1 | AVX2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | FMA = 1 | NEON = 0 | A

## Test the embeddings output

In [12]:
# Sanity check: embed one short string and display the length of the
# resulting vector.
text = "This is a test document"
query_result = llama_embeddings.embed_query(text)
len(query_result)



llama_print_timings:        load time =  1835.39 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =  1834.63 ms /     6 tokens (  305.77 ms per token,     3.27 tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =  1837.76 ms


5120

In [13]:
# embed_documents is called with a list of texts; here the list holds the
# single test string. The trailing expression displays the returned vectors.
doc_result = llama_embeddings.embed_documents(
    [text],
)
doc_result


llama_print_timings:        load time =  1835.39 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =  2096.06 ms /     6 tokens (  349.34 ms per token,     2.86 tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =  2098.74 ms


[[0.04978378117084503,
  -0.7217374444007874,
  -1.2726460695266724,
  -1.6391116380691528,
  0.3629855811595917,
  0.33249735832214355,
  1.3781789541244507,
  -0.49891749024391174,
  0.7961655259132385,
  -1.0492669343948364,
  0.0039496091194450855,
  -0.9673369526863098,
  0.9848967790603638,
  -0.2701074779033661,
  -1.8141454458236694,
  0.06683201342821121,
  1.1547869443893433,
  -0.42260056734085083,
  -0.037837572395801544,
  -0.40044209361076355,
  -0.5175232887268066,
  0.33070695400238037,
  -0.8144072890281677,
  -1.6178900003433228,
  -0.8005748987197876,
  -0.8041102290153503,
  1.0098347663879395,
  -0.6482396125793457,
  0.025442881509661674,
  -0.8520680665969849,
  -1.4002238512039185,
  -0.10291893035173416,
  -0.3081028163433075,
  -1.5070282220840454,
  0.222945898771286,
  0.43212032318115234,
  -0.9291085600852966,
  -0.3274846374988556,
  0.6551205515861511,
  -0.30113500356674194,
  -0.8965624570846558,
  0.41829153895378113,
  1.2816272974014282,
  1.4258791

## Load the text file and generate the index


In [14]:
  # Build a vector-store index over sample.txt in one step, embedding with
  # llama_embeddings and passing "db" to the vector store as its persist
  # directory. `loader` is reused by the next cell.
  loader = TextLoader("sample.txt")
  index = VectorstoreIndexCreator(
      embedding=llama_embeddings,
      vectorstore_kwargs={"persist_directory": "db"},
  ).from_loaders([loader])



llama_print_timings:        load time =  1835.39 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time = 40791.39 ms /   135 tokens (  302.16 ms per token,     3.31 tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time = 40840.61 ms

llama_print_timings:        load time =  1835.39 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time = 82283.13 ms /   263 tokens (  312.86 ms per token,     3.20 tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time = 82411.83 ms

llama_print_timings:        load time =  1835.39 ms
llama_print_timings:   

In [15]:
# Load the document(s) from the loader and split them with chunk_size=250
# and no overlap between consecutive chunks.
documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=250,
    chunk_overlap=0,
)
texts = text_splitter.split_documents(documents)

# Embed the chunks into a Chroma vector store.
# TODO: no persist directory is passed here; persist this store and work out
# how to reuse it instead of re-embedding on every run.
docsearch = Chroma.from_documents(documents=texts, embedding=llama_embeddings)


llama_print_timings:        load time =  1835.39 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time = 14238.31 ms /    47 tokens (  302.94 ms per token,     3.30 tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time = 14254.63 ms

llama_print_timings:        load time =  1835.39 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time = 16712.79 ms /    54 tokens (  309.50 ms per token,     3.23 tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time = 16733.28 ms

llama_print_timings:        load time =  1835.39 ms
llama_print_timings:   

## Inference

In [16]:
# Value passed to the retriever as the "k" search argument.
MIN_DOCS = 1

# Retrieval QA chain: the llama.cpp LLM answers using whatever the Chroma
# retriever returns for the query, combined via the "stuff" chain type.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(search_kwargs={"k": MIN_DOCS}),
)

# Alternative kept for reference (no chain_type or search_kwargs given):
# qa = RetrievalQA.from_chain_type(llm=llm, retriever= docsearch.as_retriever())

In [19]:
# Run one question through the retrieval QA chain; the value returned by
# qa.run is the cell's displayed output.
query = "skills available in gadgeon ?"
qa.run(query)


llama_print_timings:        load time =  1835.39 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =  3275.53 ms /     8 tokens (  409.44 ms per token,     2.44 tokens per second)
llama_print_timings:        eval time =   723.02 ms /     1 runs   (  723.02 ms per token,     1.38 tokens per second)
llama_print_timings:       total time =  4004.11 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time =  2495.35 ms
llama_print_timings:      sample time =    16.58 ms /    23 runs   (    0.72 ms per token,  1387.38 tokens per second)
llama_print_timings: prompt eval time = 14574.45 ms /    48 tokens (  303.63 ms per token,     3.29 tokens per second)
llama_print_timings:        eval time = 14366.01 ms /    22 runs   (  653.00 ms per token,     1.53 tokens per second)
llama_print_timings:       total time = 29104.57 ms


' I am not sure what "gadgeon" refers to, could you please provide more context or information?'