In [1]:
!pip install \
  langchain_community \
  langchain_pinecone \
  langchain_openai \
  unstructured \
  langchain-text-splitters

Collecting langchain_community
  Downloading langchain_community-0.2.12-py3-none-any.whl.metadata (2.7 kB)
Collecting langchain_pinecone
  Downloading langchain_pinecone-0.1.3-py3-none-any.whl.metadata (1.7 kB)
Collecting langchain_openai
  Downloading langchain_openai-0.1.21-py3-none-any.whl.metadata (2.6 kB)
Collecting unstructured
  Downloading unstructured-0.15.3-py3-none-any.whl.metadata (29 kB)
Collecting langchain-text-splitters
  Downloading langchain_text_splitters-0.2.2-py3-none-any.whl.metadata (2.1 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting langchain<0.3.0,>=0.2.13 (from langchain_community)
  Downloading langchain-0.2.14-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.3.0,>=0.2.30 (from langchain_community)
  Downloading langchain_core-0.2.32-py3-none-any.whl.metadata (6.2 kB)
Collecting langsmith<0.2.0,>=0.1.0 (from langchain_community)
  Downloadin

In [7]:
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os
import glob

In [12]:
# Build a loader over the files matching ./*.txt under ./full_data/.
# Each file is opened with TextLoader, with its encoding auto-detection
# switched on via the forwarded loader kwargs.
text_loader_kwargs = dict(autodetect_encoding=True)
loader = DirectoryLoader(
    "./full_data/",
    glob="./*.txt",
    loader_cls=TextLoader,
    loader_kwargs=text_loader_kwargs,
)

In [13]:
# Run the loader and keep the resulting documents; they are split into
# chunks further down before being embedded.
docstest = loader.load()

In [23]:
from google.colab import userdata

# Copy the secrets stored in Colab into the process environment under the
# same names. The OpenAI and Pinecone clients look their keys up from
# OPENAI_API_KEY and PINECONE_API_KEY respectively, so the environment
# variable name must match the secret name exactly.
for secret_name in ("OPENAI_API_KEY", "PINECONE_API_KEY"):
    os.environ[secret_name] = userdata.get(secret_name)

In [24]:
# Name of the Pinecone index the vector store is built against.
index_name = "pinecone-test"

# Embedding model applied to the document chunks.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Break the loaded documents into chunks, using the splitter's default settings.
text_splitter = RecursiveCharacterTextSplitter()
split_docs = text_splitter.split_documents(docstest)

In [27]:
# Embed the chunks and store them in the named Pinecone index; the returned
# store is what the search and QA cells query.
vectorstore = PineconeVectorStore.from_documents(
    documents=split_docs,
    embedding=embeddings,
    index_name=index_name,
)

In [44]:
# Question used for both the raw similarity lookup and the QA chain.
query = "What is Saanvi's last name?"

# Fetch the stored chunks the vector store ranks as most similar to the question.
similar_docs = vectorstore.similarity_search(query=query)

In [41]:
# Display the first document returned by the similarity search.
similar_docs[0]

Document(metadata={'source': 'full_data/3040PaperSaanviBhumpalle.txt'}, page_content='Deep learning in computer vision provides many levels of abstraction that mimic the deep neural networks in our brains. Deep neural networks in our brains are extremely “complex” and still not completely understood (Kriegeskorte, 2015). By contrast, computational models have artificial neural networks that have well-defined layers with matrix multiplication, stochastic pooling, convolution, and other mathematical operations. Humans are better at context and understanding the meaning behind a visual scene, unlike computers. Perhaps artificial neural networks can incorporate layers for deriving information from the context of a scene through semantical analysis. Humans all have different biases when gathering the gist from a visual scene. Still, computer vision would gather unbiased input that computer scientists would achieve by cleaning data sets and removing bias from those (Kyle-Davidson et al., 202

In [45]:
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA

# Chat model that writes the final answer; temperature 0 is used here.
llm = ChatOpenAI(model="gpt-4o", temperature=0)

# Retrieval QA chain of type "stuff", with the Pinecone-backed vector store
# acting as its retriever.
qa = RetrievalQA.from_chain_type(
    retriever=vectorstore.as_retriever(),
    chain_type="stuff",
    llm=llm,
)

# Ask the question; the returned mapping is shown as the cell output.
qa.invoke(query)

{'query': "What is Saanvi's last name?",
 'result': "Saanvi's last name is Bhumpalle."}