In [None]:
'''

How does GPT4All work?
It is trained on top of Facebook’s LLaMA model, which released its weights under a non-commercial license.
Still, running the mentioned architecture on your local PC is impossible due to the large (7 billion) number of parameters. 
The authors incorporated two tricks to do efficient fine-tuning and inference. 
We will focus on inference since the fine-tuning process is out of the scope of this course.

The main contribution of GPT4All models is the ability to run them on a CPU. 
Testing these models is practically free because recent PCs have powerful Central Processing Units (CPUs). 
The underlying algorithm that helps with making it happen is called Quantization.
It basically converts the pre-trained model weights to 4-bit precision using the GGML format.
So, the model uses fewer bits to represent the numbers. There are two main advantages to using this technique:

1. Reducing Memory Usage: It makes deploying the models more efficient on low-resource devices.
2. Faster Inference: The models will be faster during the generation process since there will be fewer computations.
'''

In [7]:
# Step 1: Convert the Model

'''
The first step is to download the weights and use a script from the llama.cpp repository to convert the weights from the old format to the new one.
It is a required step; otherwise, the LangChain library will not identify the checkpoint file.

We need to download the weights file. 
You can either head to [https://the-eye.eu/public/AI/models/nomic-ai/gpt4all/] and download the weights (make sure to download the one that ends with *.ggml.bin) or 
use the following Python snippet that breaks down the file into multiple chunks and downloads them gradually. 
The local_path variable is the destination file path; its parent folder is created if it does not exist.
'''

import requests
from pathlib import Path
from tqdm import tqdm

local_path = './models/gpt4all-lora-quantized-ggml.bin'
Path(local_path).parent.mkdir(parents=True, exist_ok=True)

url = 'https://the-eye.eu/public/AI/models/nomic-ai/gpt4all/gpt4all-lora-quantized-ggml.bin'

# The download is large and slow, so skip it when the weights are already on
# disk; this keeps "Restart Kernel -> Run All" feasible.
if not Path(local_path).exists():
    # Stream into a temporary ".part" file and rename it only after the whole
    # body has arrived, so an interrupted run never leaves a truncated file
    # under the final name.
    partial_path = Path(local_path + '.part')

    # stream=True avoids loading the whole body into memory; the timeout keeps
    # the cell from hanging forever on a stalled connection; the `with` block
    # releases the connection even if writing fails.
    with requests.get(url, stream=True, timeout=60) as response:
        # Fail loudly on 4xx/5xx instead of saving an error page as "weights".
        response.raise_for_status()

        # open the file in binary mode and write the contents of the response
        # to it in chunks.
        with open(partial_path, 'wb') as f:
            for chunk in tqdm(response.iter_content(chunk_size=8192)):
                if chunk:
                    f.write(chunk)

    # Publish the finished download under its final name.
    partial_path.replace(local_path)

514266it [1:25:27, 100.29it/s]


In [6]:
from langchain.llms import GPT4All
from langchain import PromptTemplate, LLMChain
# from langchain.callbacks.base import CallbackManager
from langchain.callbacks.manager import AsyncCallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [7]:
# Prompt with a single {question} slot; the trailing "Let's think step by
# step." is part of the text sent to the model.
template = (
    "Question: {question}\n"
    "\n"
    "Answer: Let's think step by step."
)
prompt = PromptTemplate(input_variables=["question"], template=template)

In [9]:
!pip install gpt4all

Collecting gpt4all
  Downloading gpt4all-2.6.0-py3-none-win_amd64.whl.metadata (4.2 kB)
Downloading gpt4all-2.6.0-py3-none-win_amd64.whl (6.3 MB)
   ---------------------------------------- 0.0/6.3 MB ? eta -:--:--
   ---------------------------------------- 0.0/6.3 MB 1.4 MB/s eta 0:00:05
    --------------------------------------- 0.1/6.3 MB 1.2 MB/s eta 0:00:06
   - -------------------------------------- 0.2/6.3 MB 1.4 MB/s eta 0:00:05
   - -------------------------------------- 0.2/6.3 MB 1.3 MB/s eta 0:00:05
   - -------------------------------------- 0.2/6.3 MB 1.3 MB/s eta 0:00:05
   -- ------------------------------------- 0.3/6.3 MB 1.4 MB/s eta 0:00:05
   -- ------------------------------------- 0.4/6.3 MB 1.3 MB/s eta 0:00:05
   -- ------------------------------------- 0.5/6.3 MB 1.4 MB/s eta 0:00:05
   --- ------------------------------------ 0.5/6.3 MB 1.3 MB/s eta 0:00:05
   --- ------------------------------------ 0.5/6.3 MB 1.4 MB/s eta 0:00:05
   --- ------------------

In [None]:
# Prints each generated token to stdout as soon as the model produces it.
callback_manager = AsyncCallbackManager([StreamingStdOutCallbackHandler()])

# LangChain's GPT4All wrapper receives the path to the weights file through
# `model`; it has no `model_path` field and rejects unknown keyword arguments.
# The path is relative to the notebook (same ./models folder used for the
# download) instead of a machine-specific directory. Keep forward slashes in
# this string, including on Windows.
# NOTE(review): the file must be a text-generation model in a format the
# installed gpt4all package can load (recent releases expect GGUF) - confirm.
llm = GPT4All(
    model="./models/ggml-model-q4_0.bin",
    callback_manager=callback_manager,
    verbose=True,
)

# Chain that fills the prompt template and sends the result to the local model.
llm_chain = LLMChain(prompt=prompt, llm=llm)

In [None]:
# Send one question through the chain; the bare name on the last line makes
# the notebook display the returned text as the cell output.
question = "What happens when it rains somewhere?"
answer = llm_chain.run(question)
answer