## Install All the Required Packages

In [1]:
!pip install langchain
!pip install pinecone-client
!pip install pypdf

Collecting pinecone-client
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting pinecone-plugin-inference<2.0.0,>=1.0.3 (from pinecone-client)
  Downloading pinecone_plugin_inference-1.1.0-py3-none-any.whl.metadata (2.2 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone-client)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Downloading pinecone_client-5.0.1-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.8/244.8 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_inference-1.1.0-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.4/85.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_interface-0.0.7-py3-none-any.whl (6.2 kB)
Installing collected packages: pinecone-plugin-interface, pinecone-plugin-inference, pinecone-client
Successfully installed pinecone-client-5.0.

In [5]:
!pip install openai
!pip install tiktoken
!pip install -U langchain-community


Collecting langchain-community
  Downloading langchain_community-0.3.17-py3-none-any.whl.metadata (2.4 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.7.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB

## Import All the Required Libraries

In [6]:
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.vectorstores import Pinecone
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
import os

## Load the PDF Files

In [7]:
!mkdir pdfs

In [8]:
!gdown 1hPQlXrX8FbaYaLypxTmeVOFNitbBMlEE -O pdfs/yolov7paper.pdf
!gdown 1vILwiv6nS2wI3chxNabMgry3qnV67TxM -O pdfs/rachelgreecv.pdf

Downloading...
From: https://drive.google.com/uc?id=1hPQlXrX8FbaYaLypxTmeVOFNitbBMlEE
To: /content/pdfs/yolov7paper.pdf
100% 2.27M/2.27M [00:00<00:00, 13.0MB/s]
Downloading...
From: https://drive.google.com/uc?id=1vILwiv6nS2wI3chxNabMgry3qnV67TxM
To: /content/pdfs/rachelgreecv.pdf
100% 271k/271k [00:00<00:00, 101MB/s]


## Extract the Text from the PDF's

In [9]:
loader = PyPDFDirectoryLoader("pdfs")
data = loader.load()

In [10]:
data

[Document(metadata={'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2022-07-07T00:21:22+00:00', 'author': '', 'keywords': '', 'moddate': '2022-07-07T00:21:22+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'pdfs/yolov7paper.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}, page_content='YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object\ndetectors\nChien-Yao Wang1, Alexey Bochkovskiy, and Hong-Yuan Mark Liao1\n1Institute of Information Science, Academia Sinica, Taiwan\nkinyiu@iis.sinica.edu.tw, alexeyab84@gmail.com, and liao@iis.sinica.edu.tw\nAbstract\nYOLOv7 surpasses all known object detectors in both\nspeed and accuracy in the range from 5 FPS to 160 FPS\nand has the highest accuracy 56.8% AP among all known\nreal-time object detectors with 30 FPS or higher on GPU\nV100. YOLOv7-E6 object d

## Split the Extracted Data into Text Chunks

In [11]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)

In [12]:
text_chunks = text_splitter.split_documents(data)

In [13]:
text_chunks

[Document(metadata={'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2022-07-07T00:21:22+00:00', 'author': '', 'keywords': '', 'moddate': '2022-07-07T00:21:22+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'pdfs/yolov7paper.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}, page_content='YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object\ndetectors\nChien-Yao Wang1, Alexey Bochkovskiy, and Hong-Yuan Mark Liao1\n1Institute of Information Science, Academia Sinica, Taiwan\nkinyiu@iis.sinica.edu.tw, alexeyab84@gmail.com, and liao@iis.sinica.edu.tw\nAbstract\nYOLOv7 surpasses all known object detectors in both\nspeed and accuracy in the range from 5 FPS to 160 FPS\nand has the highest accuracy 56.8% AP among all known'),
 Document(metadata={'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperr

In [14]:
len(text_chunks)

168

In [15]:
text_chunks[1]

Document(metadata={'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2022-07-07T00:21:22+00:00', 'author': '', 'keywords': '', 'moddate': '2022-07-07T00:21:22+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'pdfs/yolov7paper.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}, page_content='real-time object detectors with 30 FPS or higher on GPU\nV100. YOLOv7-E6 object detector (56 FPS V100, 55.9%\nAP) outperforms both transformer-based detector SWIN-\nL Cascade-Mask R-CNN (9.2 FPS A100, 53.9% AP) by\n509% in speed and 2% in accuracy, and convolutional-\nbased detector ConvNeXt-XL Cascade-Mask R-CNN (8.6\nFPS A100, 55.2% AP) by 551% in speed and 0.7% AP\nin accuracy, as well as YOLOv7 outperforms: YOLOR,\nYOLOX, Scaled-YOLOv4, YOLOv5, DETR, Deformable')

In [16]:
text_chunks[2]

Document(metadata={'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2022-07-07T00:21:22+00:00', 'author': '', 'keywords': '', 'moddate': '2022-07-07T00:21:22+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'pdfs/yolov7paper.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}, page_content='DETR, DINO-5scale-R50, ViT-Adapter-B and many other\nobject detectors in speed and accuracy. Moreover, we train\nYOLOv7 only on MS COCO dataset from scratch without\nusing any other datasets or pre-trained weights. Source\ncode is released in https://github.com/WongKinYiu/yolov7.\n1. Introduction\nReal-time object detection is a very important topic in\ncomputer vision, as it is often a necessary component in\ncomputer vision systems. For example, multi-object track-')

In [17]:
text_chunks[3]

Document(metadata={'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2022-07-07T00:21:22+00:00', 'author': '', 'keywords': '', 'moddate': '2022-07-07T00:21:22+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'pdfs/yolov7paper.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}, page_content='ing [94, 93], autonomous driving [40, 18], robotics [35, 58],\nmedical image analysis [34, 46], etc. The computing de-\nvices that execute real-time object detection is usually some\nmobile CPU or GPU, as well as various neural processing\nunits (NPU) developed by major manufacturers. For exam-\nple, the Apple neural engine (Apple), the neural compute\nstick (Intel), Jetson AI edge devices (Nvidia), the edge TPU\n(Google), the neural processing engine (Qualcomm), the AI')

## Downlaod the Embeddings

In [18]:
import os
os.environ['OPENAI_API_KEY']= 'openAiApiKey'


In [19]:
embeddings = OpenAIEmbeddings()

  embeddings = OpenAIEmbeddings()


In [20]:
result = embeddings.embed_query("How are you!")

In [21]:
result

[-0.00477863978311413,
 -0.009318661688887599,
 -0.0030078430987363442,
 -0.026047041116614954,
 -0.016138113711047668,
 0.012056490164278347,
 0.0011420697908606054,
 -0.013563550736321223,
 -0.01942853031779609,
 -0.0033814685205804424,
 0.028257397367335884,
 0.006762937041160885,
 -0.01389008080632728,
 -0.006436407436816106,
 0.007535306119843328,
 -0.022731506740533557,
 0.02747875024330403,
 -0.002370481797387976,
 0.01873779445510708,
 -0.027177337011308388,
 0.0020863378467159874,
 0.02186494649251375,
 0.00860908703353758,
 -0.006951319845496883,
 -0.00021153801510919145,
 0.00022272324266144988,
 0.011196208892930508,
 -0.005475656019458941,
 0.022467771093859928,
 -0.02778016161265456,
 0.011761356840277225,
 -0.0005851636406800984,
 0.0007680517583515305,
 0.0027362580327308403,
 0.010970149900256332,
 -0.0230705956952061,
 0.0007056500038917418,
 0.0078053212088502015,
 0.03265299209944477,
 -0.0007331225058920182,
 0.0255949231312666,
 -0.014241728645666307,
 0.002806901

In [22]:
len(result)

1536

## Initializing the Pinecone

In [24]:
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY', 'apikeyPinecone')
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', 'envkey')

In [28]:
# import pinecone
# pinecone.init(
#     api_key=PINECONE_API_KEY,
#     environment=PINECONE_API_ENV
# )
# index_name = "myproject"


## Create Embeddings for each of the Text Chunk

In [None]:
docsearch = Pinecone.from_texts([t.page_content for t in text_chunks], embeddings, index_name=index_name)

## If you already have an index, you can load it like this

In [None]:
docsearch = Pinecone.from_existing_index(index_name, embeddings)
docsearch

<langchain.vectorstores.pinecone.Pinecone at 0x78f9e0e67460>

## Similarity Search

In [None]:
query = "YOLOv7 outperforms which models"

In [None]:
docs = docsearch.similarity_search(query, k=3)

In [None]:
docs

[Document(page_content='YOLOv7-tiny 6.2 3.5 320 30.8% 47.3% 32.2% 10.0% 31.9% 52.2%\nimprovement -39% -49% - = = = -0.9 = +0.7\nYOLOR-E6 [81] 115.8M 683.2G 1280 55.7% 73.2% 60.7% 40.1% 60.4% 69.2%\nYOLOv7-E6 97.2M 515.2G 1280 55.9% 73.5% 61.1% 40.6% 60.3% 70.0%\nimprovement -19% -33% - +0.2 +0.3 +0.4 +0.5 -0.1 +0.8\nYOLOR-D6 [81] 151.7M 935.6G 1280 56.1% 73.9% 61.2% 42.4% 60.5% 69.9%\nYOLOv7-D6 154.7M 806.8G 1280 56.3% 73.8% 61.4% 41.3% 60.6% 70.1%\nYOLOv7-E6E 151.7M 843.2G 1280 56.8% 74.4% 62.1% 40.8% 62.1% 70.6%'),
 Document(page_content='YOLOv5-L6 (r6.1) [23] 76.8M 445.6G 1280 63 - / 53.7% - -\nYOLOX-X [21] 99.1M 281.9G 640 58 51.5% / 51.1% - -\nYOLOv7-E6 97.2M 515.2G 1280 56 56.0% /55.9% 73.5% 61.2%\nYOLOR-E6 [81] 115.8M 683.2G 1280 45 55.8% / 55.7% 73.4% 61.1%\nPPYOLOE-X [85] 98.4M 206.6G 640 45 52.2% / 51.9% 69.9% 56.5%\nYOLOv7-D6 154.7M 806.8G 1280 44 56.6% /56.3% 74.0% 61.8%\nYOLOv5-X6 (r6.1) [23] 140.7M 839.2G 1280 38 - / 55.0% - -\nYOLOv7-E6E 151.7M 843.2G 1280 36 56.8% /56.8

## Creating a LLM Model Wrapper

In [None]:
llm = OpenAI()

In [None]:
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=docsearch.as_retriever())


## Q/A

In [None]:
query = "YOLOv7 outperforms which models"

In [None]:
qa.run(query)

' YOLOv7 outperforms YOLOv5-L6 (r6.1), YOLOX-X, YOLOR-E6, PPYOLOE-X, YOLOv7-D6, YOLOv5-X6 (r6.1), YOLOv7-E6E, YOLOv5-X (r6.1), YOLOR-CSP, YOLOR-CSP-X, YOLOv7-tiny-SiLU, YOLOv7, and YOLOv7-X.'

In [None]:
query = "Rachel Green Experience"

In [None]:
qa.run(query)

' Rachel Green has a PhD in English from the University of Illinois at Urbana-Champaign. Her dissertation title was "Down on the Farm: World War One and the Emergence of Literary Modernism in the American South". She also holds an MA in English from Butler University, and has received a Summer Research Grant from the Center for Summer Studies, a Graduate College Conference Travel Grant from the University of Illinois, the Most Outstanding Butler Woman award from Butler University, and an Academic Scholarship from Butler University. She has published multiple works, and has presented at conferences.'

In [None]:
import sys

In [None]:
while True:
  user_input = input(f"Input Prompt: ")
  if user_input == 'exit':
    print('Exiting')
    sys.exit()
  if user_input == '':
    continue
  result = qa({'query': user_input})
  print(f"Answer: {result['result']}")

Input Prompt: what is yolo v7
Answer:  YOLOv7 is a real-time object detector which surpasses all known object detectors in both speed and accuracy. It has the highest accuracy of 56.8% AP among all known detectors and can run from 5 FPS to 160 FPS.
Input Prompt: tell me about Rechel Green
Answer:  Rachel Green is a PhD in English from the University of Illinois at Urbana-Champaign. Her dissertation title was “Down on the Farm: World War One and the Emergence of Literary Modernism in the American South.” She also has a MA in English and was awarded a Summer Research Grant, a Graduate College Conference Travel Grant, Most Outstanding Butler Woman, and an Academic Scholarship. She has published extensively and has given multiple conference presentations.
Input Prompt: exit
Exiting


SystemExit: ignored

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
