In [None]:
!pip install -Q langchain_community langchain_pinecone langchain_openai unstructured langchain-text-splitters

In [90]:
from langchain.docstore.document import Document
import json
import tiktoken
import os

data_folder = "./data"
json_file = "paragraphs.json"
file_path = os.path.join(data_folder, json_file)

# Decode explicitly as UTF-8 so parsing does not depend on the platform's
# default locale encoding.
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Flatten paragraphs into one Document per sentence. Each sentence's
# 'start' and 'end' values are carried along as metadata so they can be
# recovered after chunking and retrieval.
documents = [
    Document(
        page_content=sentence['text'],
        metadata={
            "start": sentence['start'],
            "end": sentence['end'],
        },
    )
    for paragraph in data
    for sentence in paragraph['sentences']
]



def count_tokens(text, model_name='text-embedding-ada-002'):
    """Return how many tokens `text` occupies under `model_name`'s tokenizer.

    Args:
        text: The string to tokenize.
        model_name: Model whose tiktoken encoding is looked up.

    Returns:
        The length of the encoded token sequence.
    """
    tokenizer = tiktoken.encoding_for_model(model_name)
    token_ids = tokenizer.encode(text)
    return len(token_ids)

# Pair every sentence with its token count so chunk sizes can be budgeted.
sentences = [
    {
        'text': doc.page_content,
        'tokens': count_tokens(doc.page_content),
        'metadata': doc.metadata,
    }
    for doc in documents
]

max_tokens = 1024  # Maximum tokens per chunk
overlap_tokens = 100  # Tokens to overlap between chunks


def _make_chunk_doc(chunk_sentences):
    """Merge sentence records into a single chunk Document.

    The text is the space-joined sentence texts; the metadata spans from the
    first sentence's 'start' to the last sentence's 'end'.
    `chunk_sentences` must be non-empty.
    """
    return Document(
        page_content=' '.join(s['text'] for s in chunk_sentences),
        metadata={
            'start': chunk_sentences[0]['metadata']['start'],
            'end': chunk_sentences[-1]['metadata']['end'],
        },
    )


chunks = []
current_chunk = []
current_token_count = 0

for sentence in sentences:
    sentence_tokens = sentence['tokens']

    # Flush only when there is something to flush: a first sentence that is
    # by itself larger than max_tokens would otherwise index an empty list.
    if current_chunk and current_token_count + sentence_tokens > max_tokens:
        chunks.append(_make_chunk_doc(current_chunk))

        # Seed the next chunk with trailing sentences from the one just
        # emitted, walking backwards until at least overlap_tokens are
        # collected. Whole sentences are kept, so the overlap may overshoot.
        overlap = []
        overlap_token_count = 0
        for previous in reversed(current_chunk):
            if overlap_token_count >= overlap_tokens:
                break
            overlap.append(previous)
            overlap_token_count += previous['tokens']
        overlap.reverse()  # restore original sentence order

        current_chunk = overlap
        current_token_count = overlap_token_count

    # Add current sentence to current_chunk
    current_chunk.append(sentence)
    current_token_count += sentence_tokens

# Add any remaining sentences as a chunk
if current_chunk:
    chunks.append(_make_chunk_doc(current_chunk))

len(chunks)

3

In [53]:
# setup pinecone
from pinecone import Pinecone
# Index settings. `dimension` is reused below to build a probe vector, so it
# needs to match the vector size of the target index.
index_name = "chatbot"
dimension = 1536

# The API key is read from the environment rather than hardcoded.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Handle used for all later upsert / query / delete calls.
index = pc.Index(index_name)

In [109]:
# Helper cell: wipe the entire Pinecone index.
# WARNING: destructive. `delete_all=True` asks Pinecone to remove every vector,
# so anything previously upserted is gone and the upsert cell has to be run
# again before querying.
index.delete(delete_all=True)
print(f"All vectors in the index '{index_name}' have been deleted.")

All vectors in the index 'chatbot' have been deleted.


In [92]:
# Use the dedicated langchain_openai package (installed above and already used
# for ChatOpenAI) instead of the deprecated langchain.embeddings.openai path.
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')  # Adjust model as needed

# Extract texts and metadata as parallel lists: position i in each list
# refers to chunks[i].
texts = [chunk.page_content for chunk in chunks]
metadatas = [chunk.metadata for chunk in chunks]

In [93]:
batch_size = 100  # Adjust based on your preference and API rate limits

# Embed and upload the chunks in batches of `batch_size`.
for i in range(0, len(texts), batch_size):
    batch_texts = texts[i:i+batch_size]
    batch_metadatas = metadatas[i:i+batch_size]
    batch_embeddings = embeddings.embed_documents(batch_texts)

    # Prepare (id, values, metadata) tuples for upsert. Ids are derived from
    # the chunk's global position, so re-running overwrites rather than
    # duplicates. A fresh metadata dict is built per vector: the dicts in
    # `metadatas` are the same objects as chunks[*].metadata, and writing
    # 'text' into them in place would silently alter the chunk Documents.
    vectors = [
        (f'vec_{i + idx}', embedding, {**metadata, 'text': text})
        for idx, (embedding, metadata, text) in enumerate(
            zip(batch_embeddings, batch_metadatas, batch_texts)
        )
    ]

    # Upsert vectors into Pinecone
    index.upsert(vectors=vectors)

In [94]:
def retrieve_relevant_chunks(query, top_k=5):
    """Embed `query`, search the Pinecone index, and return the best matches.

    Args:
        query: Text to search for.
        top_k: Maximum number of matches to request from the index.

    Returns:
        A list of dicts, one per match, with keys 'text', 'start', 'end'
        (each taken from the match metadata, '' when absent) and 'score'.
    """
    response = index.query(
        vector=embeddings.embed_query(query),
        top_k=top_k,
        include_metadata=True,
    )

    def _as_record(match):
        # Flatten one match into the plain dict shape returned to callers.
        meta = match['metadata']
        return {
            'text': meta.get('text', ''),
            'start': meta.get('start', ''),
            'end': meta.get('end', ''),
            'score': match['score'],
        }

    return [_as_record(match) for match in response['matches']]


In [84]:
# Helper cell: peek at what is stored in the index.
# An all-zero probe vector of the index's dimension is used as the query, so
# which records come back is not meaningful; this is only for eyeballing ids
# and metadata.
limit = 1
fetch_response = index.query(
    include_metadata=True,
    include_values=True,
    top_k=limit,
    vector=[0] * dimension,
)

for match in fetch_response['matches']:
    print(match["id"])
    for key, value in match['metadata'].items():
        print(f"{key}: {value}")

    # Visual divider between records.
    print("\n" + "-"*50 + "\n")

vec_86
end: 486.77002
start: 484.99002

--------------------------------------------------



In [98]:
# init the chain
from langchain_openai import ChatOpenAI
from langchain_pinecone import PineconeVectorStore
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# Wrap the already-populated Pinecone index as a LangChain vector store.
# Without this the notebook fails on a fresh kernel, because `vectorstore`
# was never defined. `text_key` must name the metadata field under which the
# upsert step stored each chunk's text ('text').
vectorstore = PineconeVectorStore(index=index, embedding=embeddings, text_key="text")

# Retrieve the 3 nearest chunks per question.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
llm = ChatOpenAI(model_name="gpt-4o", temperature=0)

# Prompt that restricts the model to the retrieved context.
template = """
Given the following information, answer the question. Use the information from the documents to support your answer. Do not use any external information or make up any information. If you don't know the answer, write "I don't know".


Context:
{context}

Question: {question}
Answer: 
"""

prompt = PromptTemplate(
    template=template,
    input_variables=["context", "question"]
)

# "stuff" places all retrieved chunks into a single prompt.
# return_source_documents=True exposes the retrieved chunks (and their
# start/end metadata) next to the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt}
)

In [66]:
# Run the query
question = "How to get my first saas customers?"
# Use .invoke(): calling the chain object directly (qa_chain({...})) is the
# deprecated Chain.__call__ entry point. The returned dict carries the same
# "result" and "source_documents" keys.
result = qa_chain.invoke({"query": question})

In [75]:
# Show the generated answer, then the metadata of each chunk it was based on.
print("Answer:", result["result"])
print("\nSource Documents:")
for source_doc in result["source_documents"]:
    # The metadata and the divider each go on their own line.
    print(source_doc.metadata, "---", sep="\n")

Answer: To get your first SaaS customers, follow these steps:

1. **Start Before Building the Product**: Begin by identifying any advantages you have before writing a single line of code. This includes considering if you have an existing audience or a network in a particular space that you can leverage.

2. **Leverage Your Network**: Utilize your network to help promote your product. This can include calling on people you know to do webinars, co-promote, or otherwise help sell your product.

3. **Choose a Marketing Approach**: Select one marketing approach that you believe is most likely to work for your product. This could be SEO, content marketing, cold outreach, partnerships, integrations, or pay-per-click ads.

4. **Experiment and Commit**: Dive into your chosen marketing approach and commit to it for a few months. Experiment and give it your all to see if it can help you reach the 100 customer mark.

By following these steps, you can strategically work towards acquiring your first