# GPT4All Model

In [1]:
from langchain import PromptTemplate, LLMChain
from langchain.llms import GPT4All
from langchain.callbacks.base import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [34]:
# Location of the quantized GPT4All weights on local disk; replace with your own file path.
local_path = "/Users/sacbe/GPT4All/chat/gpt4all-lora-quantized_LlamaNew.bin"

In [35]:
# Stream generated tokens to stdout one at a time through a callback handler.
callback_manager = CallbackManager(
    [StreamingStdOutCallbackHandler()],
)
# verbose must be enabled for output to be passed on to the callback manager.
llm = GPT4All(
    model=local_path,
    verbose=True,
    callback_manager=callback_manager,
)

llama_model_load: loading model from '/Users/sacbe/GPT4All/chat/gpt4all-lora-quantized_LlamaNew.bin' - please wait ...
llama_model_load: n_vocab = 32001
llama_model_load: n_ctx   = 512
llama_model_load: n_embd  = 4096
llama_model_load: n_mult  = 256
llama_model_load: n_head  = 32
llama_model_load: n_layer = 32
llama_model_load: n_rot   = 128
llama_model_load: f16     = 2
llama_model_load: n_ff    = 11008
llama_model_load: n_parts = 1
llama_model_load: type    = 1
llama_model_load: ggml map size = 4017.70 MB
llama_model_load: ggml ctx size =  81.25 KB
llama_model_load: mem required  = 5809.78 MB (+ 2052.00 MB per state)
llama_model_load: loading tensors from '/Users/sacbe/GPT4All/chat/gpt4all-lora-quantized_LlamaNew.bin'
llama_model_load: model size =  4017.27 MB / num tensors = 291
llama_init_from_file: kv self size  =  512.00 MB


In [36]:
# Minimal question/answer prompt; {question} is substituted when the chain runs.
template = "Question: {question}\nAnswer:"
prompt = PromptTemplate(input_variables=["question"], template=template)
# Bind the prompt to the `llm` created in an earlier cell.
llm_chain = LLMChain(llm=llm, prompt=prompt)

In [37]:
# Send one prompt through the chain and print the text it returns.
question = "make a joke"
answer = llm_chain.run(question)
print(answer)

llama_generate: seed = 1681149999

system_info: n_threads = 4 / 8 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 | 
sampling: temp = 0.800000, top_k = 40, top_p = 0.950000, repeat_last_n = 64, repeat_penalty = 1.300000
generate: n_ctx = 512, n_batch = 1, n_predict = 256, n_keep = 0




 Question: make a joke
Answer: Tell me if I'm being too sensitive.


 [end of text]

llama_print_timings:        load time = 10644.44 ms
llama_print_timings:      sample time =    25.06 ms /    11 runs   (    2.28 ms per run)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token)
llama_print_timings:        eval time =  5477.20 ms /    20 runs   (  273.86 ms per run)
llama_print_timings:       total time = 15136.54 ms


# Llama model 7B

In [1]:
from langchain.llms import LlamaCpp
from langchain import PromptTemplate, LLMChain

In [2]:
# Prompt whose answer is primed with "Let's think step by step."
template = (
    "Question: {question}\n"
    "\n"
    "Answer: Let's think step by step."
)

prompt = PromptTemplate(input_variables=["question"], template=template)

In [5]:
# Load the ggml model file through the LlamaCpp wrapper and attach it to the prompt.
local_path = "/Users/sacbe/Llama_cpp/models/ggml-model-q4_0.bin"
llm = LlamaCpp(
    model_path=local_path,
)
llm_chain = LLMChain(
    llm=llm,
    prompt=prompt,
)

llama.cpp: loading model from /Users/sacbe/Llama_cpp/models/ggml-model-q4_0.bin
llama.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this
llama_model_load_internal: format     = 'ggml' (old version with low tokenizer quality and no mmap support)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 512
llama_model_load_internal: n_embd     = 4096
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_layer    = 32
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: f16        = 2
llama_model_load_internal: n_ff       = 11008
llama_model_load_internal: n_parts    = 1
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size = 4113739.11 KB
llama_model_load_internal: mem required  = 5809.32 MB (+ 2052.00 MB per state)
...................................................................................................

In [4]:
# The bare final expression makes the notebook display the chain's answer.
question = (
    "What NFL team won the Super Bowl "
    "in the year Justin Bieber was born?"
)
llm_chain.run(question)


llama_print_timings:        load time =  1830.01 ms
llama_print_timings:      sample time =   269.24 ms /   114 runs   (    2.36 ms per run)
llama_print_timings: prompt eval time =  6276.46 ms /    34 tokens (  184.60 ms per token)
llama_print_timings:        eval time = 25777.45 ms /   113 runs   (  228.12 ms per run)
llama_print_timings:       total time = 32342.96 ms


" First, what is Justin's birthyear? (Justin was born in 1982) Then, what was the name of the winner of the Super Bowl held in the year Justin was born? (The winner of Super Bowl XXVII, held in 1983, was the Miami Dolphins.) Finally, what NFL team did the Miami Dolphins play for? (The Dallas Cowboys.) So, the answer is: The Dallas Cowboys won the Super Bowl in the year Justin Biber was born."

In [11]:
from langchain.embeddings import LlamaCppEmbeddings

# Pin the embeddings model explicitly instead of reusing the shared `local_path`
# variable, which other cells rebind to different model files; relying on it
# makes this cell's result depend on the order in which cells were run.
# The path below is the file that this cell's recorded load log reports.
embeddings_model_path = "/Users/sacbe/GPT4All/chat/gpt4all-lora-quantized.bin"
llama = LlamaCppEmbeddings(model_path=embeddings_model_path)

llama.cpp: loading model from /Users/sacbe/GPT4All/chat/gpt4all-lora-quantized.bin
llama_model_load_internal: format     = ggjt v1 (latest)
llama_model_load_internal: n_vocab    = 32001
llama_model_load_internal: n_ctx      = 512
llama_model_load_internal: n_embd     = 4096
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_layer    = 32
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: f16        = 2
llama_model_load_internal: n_ff       = 11008
llama_model_load_internal: n_parts    = 1
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size =  59.11 KB
llama_model_load_internal: mem required  = 5809.33 MB (+ 2052.00 MB per state)
AVX = 0 | AVX2 = 0 | AVX512 = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 | 
llama_init_from_file: kv self size  =  512.00 MB


In [12]:
# Read the source PDF into `documents` via LangChain's PDF loader.
from langchain.document_loaders import PyPDFLoader

pdf_path = "/Users/sacbe/Documents/Summary_LangChain/IA.pdf"
loader = PyPDFLoader(pdf_path)
documents = loader.load()

In [13]:
from langchain.chains import RetrievalQA
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import LatexTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import NLTKTextSplitter
from langchain.text_splitter import PythonCodeTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.vectorstores import FAISS

# Split the loaded documents into chunks of at most 1000 characters, no overlap.
# The documents come from a PDF loader, i.e. plain extracted text, so use the
# generic recursive splitter (paragraph, line, then word boundaries) rather than
# LatexTextSplitter, whose separators target LaTeX sectioning markup that
# extracted PDF prose does not contain.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)
# Select which embeddings to use: the local llama.cpp embeddings object.
embeddings = llama
# Create the Chroma vector store that serves as the index.
db = Chroma.from_documents(texts, embeddings)
# Expose the index through a retriever that returns only the single most
# similar chunk (k=1) for each query.
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 1})

[2023-04-10 11:40:45,312] {posthog.py:15} INFO - Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.
[2023-04-10 11:40:45,313] {__init__.py:80} INFO - Running Chroma using direct local API.
[2023-04-10 11:40:45,773] {ctypes.py:22} INFO - Successfully imported ClickHouse Connect C data optimizations
[2023-04-10 11:40:45,775] {ctypes.py:31} INFO - Successfully import ClickHouse Connect C/Numpy optimizations
[2023-04-10 11:40:45,780] {json_impl.py:45} INFO - Using python library for writing JSON byte strings



llama_print_timings:        load time =  1446.88 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per run)
llama_print_timings: prompt eval time = 40794.30 ms /   237 tokens (  172.13 ms per token)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per run)
llama_print_timings:       total time = 40796.86 ms


In [14]:
# Question to ask about the indexed document.
query = "Dame un resumen del texto"
# Build a question-answering chain on top of the retriever, using the
# `llm` currently bound in the kernel.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=False,
)
result = qa(dict(query=query))
print(result["result"])


llama_print_timings:        load time =  1446.88 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per run)
llama_print_timings: prompt eval time =  1453.09 ms /     8 tokens (  181.64 ms per token)
llama_print_timings:        eval time =   276.42 ms /     1 runs   (  276.42 ms per run)
llama_print_timings:       total time =  1730.26 ms
llama_generate: seed = 1681148494

system_info: n_threads = 4 / 8 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 | 
sampling: temp = 0.800000, top_k = 40, top_p = 0.950000, repeat_last_n = 64, repeat_penalty = 1.300000
generate: n_ctx = 512, n_batch = 1, n_predict = 256, n_keep = 0




 Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

Una Inteligencia Artificial (IA) es un sistema de computación diseñado para aprender y tomar decisiones por sí mismo. Para hacerlo, utiliza algoritmos matemáticos y modelos estadísticos que analizan grandes cantidades de datos para identificar patrones y relaciones entre ellos.Una IA se entrena mediante el proceso de aprendizaje automático, donde se le proporciona datos de entrada y se le enseña a asociarlos con resultados específicos. Con el tiempo, la IA aprende a hacer predicciones y tomar decisiones basadas en la información que ha sido alimentada.Las IA se utilizan en una amplia gama de aplicaciones, desde asistentes virtuales y chatbots hasta sistemas de diagnóstico médico y vehículos autónomos. A medida que la tecnología avanza, se espera que las IA sean cada vez más sofisticadas y capaces de realizar tareas cada vez

 [end of text]

llama_print_timings:        load time = 191860.62 ms
llama_print_timings:      sample time =   419.12 ms /   230 runs   (    1.82 ms per run)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token)
llama_print_timings:        eval time = 180431.45 ms /   544 runs   (  331.68 ms per run)
llama_print_timings:       total time = 474889.46 ms


In [15]:
# Follow-up question against the same QA chain; the answer is kept in `result`.
query = "De que habla el último parrafo?"
result = qa(dict(query=query))


llama_print_timings:        load time =  1446.88 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per run)
llama_print_timings: prompt eval time =  4903.21 ms /    11 tokens (  445.75 ms per token)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per run)
llama_print_timings:       total time =  4904.08 ms
llama_generate: seed = 1681148800

system_info: n_threads = 4 / 8 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 | 
sampling: temp = 0.800000, top_k = 40, top_p = 0.950000, repeat_last_n = 64, repeat_penalty = 1.300000
generate: n_ctx = 512, n_batch = 1, n_predict = 256, n_keep = 0


 [end of text]

llama_print_timings:        load time = 191860.62 ms
llama_print_timings:      sample time =   604.56 ms /   333 runs   (    1.82 ms per run)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per toke

In [20]:
# Show the answer text from the most recent QA call.
answer_text = result["result"]
print(answer_text)

 Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

Una Inteligencia Artificial (IA) es un sistema de computación diseñado para aprender y tomar decisiones por sí mismo. Para hacerlo, utiliza algoritmos matemáticos y modelos estadísticos que analizan grandes cantidades de datos para identificar patrones y relaciones entre ellos.Una IA se entrena mediante el proceso de aprendizaje automático, donde se le proporciona datos de entrada y se le enseña a asociarlos con resultados específicos. Con el tiempo, la IA aprende a hacer predicciones y tomar decisiones basadas en la información que ha sido alimentada.Las IA se utilizan en una amplia gama de aplicaciones, desde asistentes virtuales y chatbots hasta sistemas de diagnóstico médico y vehículos autónomos. A medida que la tecnología avanza, se espera que las IA sean cada vez más sofisticadas y capaces de realizar tareas cada vez