Step 1: Install All the Required Pakages (Llama2 + Langchain + Pinecone)

In [2]:
!pip install langchain
!pip install pypdf
!pip install unstructured
!pip install sentence_transformers
!pip install pinecone-client
!pip install llama-cpp-python
!pip install huggingface_hub

Collecting langchain
  Downloading langchain-0.0.353-py3-none-any.whl (803 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.1/803.1 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.3-py3-none-any.whl (28 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting langchain-community<0.1,>=0.0.2 (from langchain)
  Downloading langchain_community-0.0.7-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-core<0.2,>=0.1.4 (from langchain)
  Downloading langchain_core-0.1.4-py3-none-any.whl (205 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m205.7/205.7 kB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langsmith<0.1.0,>=0.0.70 (from langchain)
  Downloading langsmith-

### **Step 2: Import all required libraries**

In [3]:
from langchain.document_loaders import PyPDFLoader, OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
from sentence_transformers import SentenceTransformer
from langchain.chains.question_answering import load_qa_chain
import pinecone
import os

### **Step 3: Load the Data**

In [73]:
#loader = PyPDFLoader("/content/yolov7.pdf")
loader = PyPDFLoader("/content/Wells Fargo - Dataset.pdf")
data = loader.load()
data

[Document(page_content='2022\n \nAnnual Report \nWells Fargo & Company ', metadata={'source': '/content/Wells Fargo - Dataset.pdf', 'page': 0}),
 Document(page_content=' \n CEO Letter \nDear Shareholders, \nI’m proud to report that Wells Fargo continued to make \nprogress on our priorities in 2022. Our underlying financial performance is improving, we are moving forward on our risk, control and regulatory agenda, we are focusing on businesses where we can generate appropriate risk-adjusted returns, we continue to strengthen the leadership team, and we are executing on our strategic objectives. While we have made progress, our work is not complete and we remain focused on successful and timely execution of our multi-year journey to complete our risk and control work and to move forward with our businesses. \nStronger financial performance \nOur financial performance benefitted as we continued to drive improved efficiency, and it was positively impacted by both rising rates and a benign 

Step 4: Split the Text into **Chunks**

In [74]:
text_splitter=RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)

In [75]:
docs=text_splitter.split_documents(data)

In [76]:
len(docs)

2675

In [77]:
docs[0]

Document(page_content='2022\n \nAnnual Report \nWells Fargo & Company', metadata={'source': '/content/Wells Fargo - Dataset.pdf', 'page': 0})

## **Step 5: Setup the Environment**

In [78]:
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY', '5fd5141c-3765-46cd-a14e-f5576041c1b9')
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', 'gcp-starter')

### **Step 6: Downlaod the Embeddings**

In [79]:
embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

### Step 7: Initializing the **Pinecone**

In [80]:
# initialize pinecone
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_API_ENV  # next to api key in console
)
index_name = "langchainpinecone" # put in the name of your pinecone index here

## **Step 8: Create Embeddings for Each of the Text Chunk**

In [81]:
docsearch=Pinecone.from_texts([t.page_content for t in docs], embeddings, index_name=index_name)

#If you already have an index, you can load it like this
#docsearch = Pinecone.from_existing_index(index_name, embeddings)

## **Step 9: Similarity Search**

In [82]:
#query="YOLOv7 outperforms which models"
query="What was the net income of Wells Fargo in 2022?"

In [83]:
docs=docsearch.similarity_search(query, k=3)

In [84]:
docs

[Document(page_content='Wells Fargo net income for 2022 was $13.2 billion ($3.14 diluted'),
 Document(page_content='Wells Fargo & Company and Subsidiaries \nConsolidated Statement of Comprehensive Income \nYear ended December 31, \n(in millions) 2022 2021 2020 \nNet income before noncontrolling interests $  12,882  23,238 3,662 \nOther comprehensive income (loss), after tax: \nNet change in debt securities (10,500) (2,375) 1,487'),
 Document(page_content='$1.0 billion in 2022, and $1.1 billion in both 2021 and 2020. \nWells Fargo & Company 178')]

## **Step 9: Query the Docs to get the Answer Back (Llama 2 Model)**

In [35]:
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir --verbose

Using pip 23.1.2 from /usr/local/lib/python3.10/dist-packages/pip (python 3.10)
Collecting llama-cpp-python
  Downloading llama_cpp_python-0.2.26.tar.gz (8.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Running command pip subprocess to install build dependencies
  Using pip 23.1.2 from /usr/local/lib/python3.10/dist-packages/pip (python 3.10)
  Collecting scikit-build-core[pyproject]>=0.5.1
    Using cached scikit_build_core-0.7.1-py3-none-any.whl (136 kB)
  Collecting exceptiongroup (from scikit-build-core[pyproject]>=0.5.1)
    Using cached exceptiongroup-1.2.0-py3-none-any.whl (16 kB)
  Collecting packaging>=20.9 (from scikit-build-core[pyproject]>=0.5.1)
    Using cached packaging-23.2-py3-none-any.whl (53 kB)
  Collecting tomli>=1.1 (from scikit-build-core[pyproject]>=0.5.1)
    Using cached tomli-2.0.1-py3-none-any.whl (12 kB)
  Collecting pathspec>=0.10.1 (from scikit-build-core[pyproject]>=0.

## **Import All the Required Libraries**

In [45]:
from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from huggingface_hub import hf_hub_download
from langchain.chains.question_answering import load_qa_chain

In [85]:
# Callbacks support token-wise streaming
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
# Verbose is required to pass to the callback manager

**Quantized Models from the Hugging Face Community**
The Hugging Face community provides quantized models, which allow us to efficiently and effectively utilize the model on the T4 GPU. It is important to consult reliable sources before using any model.

There are several variations available, but the ones that interest us are based on the GGLM library.

We can see the different variations that Llama-2-13B-GGML has here.

In this case, we will use the model called Llama-2-13B-chat-GGML.

Quantization reduces precision to optimize resource usage.

Quantization is a technique to reduce the computational and memory costs of running inference by representing the weights and activations with low-precision data types like 8-bit integer ( int8 ) instead of the usual 32-bit floating point ( float32 ).

In [54]:
model_name_or_path = "TheBloke/CodeLlama-13B-Python-GGUF"
model_basename = "codellama-13b-python.Q5_K_M.gguf" # the model is in bin format

In [55]:
model_path = hf_hub_download(repo_id=model_name_or_path, filename=model_basename)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


codellama-13b-python.Q5_K_M.gguf:   0%|          | 0.00/9.23G [00:00<?, ?B/s]

In [60]:
n_gpu_layers = 40  # Change this value based on your model and your GPU VRAM pool.
n_batch = 256  # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.

# Loading model,
llm = LlamaCpp(
    model_path=model_path,
    max_tokens=256,
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    callback_manager=callback_manager,
    n_ctx=1024,
    verbose=True,
)

AVX = 1 | AVX2 = 1 | AVX512 = 1 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | 


In [86]:
chain=load_qa_chain(llm, chain_type="stuff")

In [87]:
#query="YOLOv7 outperforms which models"
query="What was the net income of Wells Fargo in 2022?"
docs=docsearch.similarity_search(query)

In [88]:
docs

[Document(page_content='Wells Fargo net income for 2022 was $13.2 billion ($3.14 diluted'),
 Document(page_content='Wells Fargo & Company and Subsidiaries \nConsolidated Statement of Comprehensive Income \nYear ended December 31, \n(in millions) 2022 2021 2020 \nNet income before noncontrolling interests $  12,882  23,238 3,662 \nOther comprehensive income (loss), after tax: \nNet change in debt securities (10,500) (2,375) 1,487'),
 Document(page_content='$1.0 billion in 2022, and $1.1 billion in both 2021 and 2020. \nWells Fargo & Company 178'),
 Document(page_content='Liabilities  (63)  (391) (14)  (71)  (501)  (22)      \n      \n      \n      \n       \n       \n      \n      \n      \n      \n       \n      \n      \n       \n      \n      \n      \n     \n     \n       \n    \n \n \n \n \nWells Fargo & Company 174')]

In [89]:
chain.run(input_documents=docs, question=query)

'$13.2 billion'

In [90]:
#query="YOLOv7 outperforms which models"
query="What was the net income of Wells Fargo in 2022?"
docs=docsearch.similarity_search(query)

In [91]:
chain.run(input_documents=docs, question=query)

'$13.2 billion'

## Step 10: Query the Docs to get the Answer Back (Hugging Face Model)**bold text**

In [68]:
from langchain.llms import HuggingFaceHub

In [69]:
llm=HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512})



In [70]:
chain=load_qa_chain(llm, chain_type="stuff")

In [71]:
query="What are examples of good data science teams?"
docs=docsearch.similarity_search(query)

In [72]:
chain.run(input_documents=docs, question=query)

'the Apple neural engine (Apple), the neural compute stick (Intel), Jetson AI edge devices (Nvidia), the edge TPU (Google), the neural processing engine (Qualcomm), the AI'