In [7]:
#pip install langchain langchain_community langchain-openai chromadb`.
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_openai.embeddings import OpenAIEmbeddings
import os
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document

Split text into chunks.
Convert it into embeddings using OpenAI.
Store the embeddings in a vector database (Chroma).
Perform question-answering by retrieving relevant documents from the database and processing them with OpenAI’s GPT model.

In [31]:
texts = [
    "My Name is Pradeep kumar Sharma.",
    "I have more 17 years experience in IT,Cloud computing and NLP  as well as  Deep learning using pytorch.",
    "We provide services in Chat board and Sentiment Analysis.",
]
# Convert texts to Document objects
'''List Comprehension:
1. The expression [Document(page_content=text) for text in texts] is a Python list comprehension.
It iterates over each text in the texts list and creates a Document object for each text.'''

docs = [Document(page_content=text) for text in texts]
# Initialize the text splitter (chunk_size=1000 and chunk_overlap=0)
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
# Split the documents into smaller chunks
docs = text_splitter.split_documents(docs) # Pass Document objects to split_documents


docs = [Document(page_content=text) for text in texts] # what is happening here
**In short, this line of code is converting plain text into a format (the Document class) that LangChain's vector store and other tools can use.**

List Comprehension:

The expression [Document(page_content=text) for text in texts] is a Python list comprehension.
It iterates over each text in the texts list and creates a Document object for each text.
Document(page_content=text):

This is creating an instance of the Document class from the langchain.schema module.
The Document class is designed to store structured data (in this case, text) to be processed by LangChain.
page_content=text means that for each text in the texts list, you are setting the content of the Document object to that text.
Result:

The result of the list comprehension is a new list, where each element is a Document object that contains one of the original text strings.

Purpose:
This step is necessary because many natural language processing (NLP) frameworks, such as LangChain, work with Document objects rather than raw text strings. The Document class can contain additional metadata (like titles, page numbers, etc.), making it easier to work with for tasks like retrieval, embeddings, or chunking.

In [11]:
from google.colab import drive
drive.mount('/content/drive')
with open('/content/drive/MyDrive/openai_key.txt', 'r') as file:
    openai_api_key = file.read().strip()
    os.environ["OPENAI_API_KEY"] = openai_api_key



Mounted at /content/drive


In [12]:
openai_embeddings = OpenAIEmbeddings()

In [16]:
import chromadb
import chromadb.config
# Generate embeddings for the documents and store them in Chroma
vector_store = Chroma.from_documents(docs, openai_embeddings)

In [29]:
# Retrieve relevant documents based on query (without LLM)
query = "how much experience you have"
query_embedding = openai_embeddings.embed_query(query)
print(query_embedding)


[0.014483030885457993, -0.016584128141403198, 0.025199946016073227, -0.03504470735788345, -0.020337030291557312, 0.01353159174323082, -0.044585537165403366, 0.005536852404475212, -0.04299980401992798, -0.0012066439958289266, 0.002626370871439576, 0.005137115251272917, -0.017945215106010437, 0.0033515135291963816, 0.0017443066462874413, -0.006372665986418724, 0.031053945422172546, -0.009600293822586536, -0.014218742027878761, -0.020601319149136543, -0.03298325464129448, -0.008629032410681248, -0.026402460411190987, -0.023244207724928856, -0.017033418640494347, -0.013339981436729431, 0.012943548150360584, -0.00022402613831218332, 0.004476393107324839, -0.04640913009643555, 0.019240232184529305, 0.002571861259639263, -0.023085635155439377, -0.01031387411057949, -0.020773107185959816, -0.009131181053817272, 0.018169861286878586, -0.0052130986005067825, 0.02777676284313202, 0.012930333614349365, 0.01733735203742981, 0.014496245421469212, -0.010782986879348755, 0.014919107779860497, -0.00156

In [30]:
# Perform similarity search in Chroma DB
retrieved_docs = vector_store.similarity_search_by_vector(query_embedding)
print(retrieved_docs[0].page_content)





I have more 17 years experience in IT,Cloud computing and NLP  as well as  Deep learning using pytorch.
