In [30]:
!pip install -q langchain pinecone-client tiktoken langchain-community openai spacy sentence-transformers ace_tools pandas


[notice] A new release of pip available: 22.3.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [12]:
# Block 1: Chunking Your Data

from langchain.docstore.document import Document
import json
import tiktoken

max_tokens = 1000  # Maximum tokens per chunk
overlap_tokens = 100  # Tokens to overlap between chunks



file_path = "./data/paragraphs.json"

with open(file_path, 'r') as file:
    data = json.load(file)

# Create Documents from sentences
documents = []
for paragraph in data:
    for sentence in paragraph['sentences']:
        doc = Document(
            page_content=sentence['text'],
            metadata={
                "start": sentence['start'],
                "end": sentence['end'],
            }
        )
        documents.append(doc)

def count_tokens(text, model_name='text-embedding-ada-002'):
    encoding = tiktoken.encoding_for_model(model_name)
    return len(encoding.encode(text))

sentences = []
token_count = 0
for doc in documents:
    text = doc.page_content
    tokens = count_tokens(text)
    sentences.append({'text': text, 'tokens': tokens, 'metadata': doc.metadata})


chunks = []
current_chunk = []
current_token_count = 0

for idx, sentence in enumerate(sentences):
    sentence_tokens = sentence['tokens']
    if current_token_count + sentence_tokens > max_tokens:
        chunk_text = ' '.join([s['text'] for s in current_chunk])
        chunk_metadata = {
            'start': current_chunk[0]['metadata']['start'],
            'end': current_chunk[-1]['metadata']['end'],
        }
        chunk_doc = Document(page_content=chunk_text, metadata=chunk_metadata)
        chunks.append(chunk_doc)

        # Start new chunk with overlap
        overlap = []
        overlap_token_count = 0
        i = len(current_chunk) - 1
        while i >= 0 and overlap_token_count < overlap_tokens:
            overlap.insert(0, current_chunk[i])
            overlap_token_count += current_chunk[i]['tokens']
            i -= 1
        current_chunk = overlap.copy()
        current_token_count = overlap_token_count

    # Add current sentence to current_chunk
    current_chunk.append(sentence)
    current_token_count += sentence_tokens

if current_chunk:
    chunk_text = ' '.join([s['text'] for s in current_chunk])
    chunk_metadata = {
        'start': current_chunk[0]['metadata']['start'],
        'end': current_chunk[-1]['metadata']['end'],
    }
    chunk_doc = Document(page_content=chunk_text, metadata=chunk_metadata)
    chunks.append(chunk_doc)

print(f"Number of chunks created: {len(chunks)}")
print(f"Total token count: {current_token_count}")

Number of chunks created: 3
Total token count: 926


In [16]:
# Block 2.1: Setting Up Pinecone with the simplified setup

from pinecone import Pinecone
import os
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index_name = "chatbot"
pinecone_index = pc.Index(index_name)

print("pinecone client setup complete")

pinecone client setup complete


In [17]:
# Block 2.2: Embed and upsert data into Pinecone

from langchain.embeddings.openai import OpenAIEmbeddings

# create batch embeddings
batch_size = 100 
all_embeddings = []
embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')
texts = [chunk.page_content for chunk in chunks]
metadatas = [chunk.metadata for chunk in chunks]
for i in range(0, len(texts), batch_size):
    batch_texts = texts[i:i+batch_size]
    batch_embeddings = embeddings.embed_documents(batch_texts)
    all_embeddings.extend(batch_embeddings)



# Upsert vectors into Pinecone in batches
batch_size = 100
vectors = []
for i, (embedding, metadata) in enumerate(zip(all_embeddings, metadatas)):
    vector_id = f"vec_{i}"
    # Include text and any other necessary metadata
    metadata['text'] = texts[i]
    vectors.append({'id': vector_id, 'values': embedding, 'metadata': metadata})

for i in range(0, len(vectors), batch_size):
    batch = vectors[i:i+batch_size]
    pinecone_index.upsert(vectors=batch)

print("Data successfully inserted into Pinecone.")


Data successfully inserted into Pinecone.


In [19]:
# Block 3: Implementing Retrieval Augmented Generation (RAG)

from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')
vectorstore = Pinecone(pinecone_index, embeddings.embed_query, "text")
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)


template = """
The answer should be short, concise and directly related to the question and not contain filler words. 
Given the following information, answer the question. 
Use the information from the documents to support your answer. 
Do not use any external information or make up any information. 
If you don't know the answer, write "I don't know".


Context:
{context}

Question: {question}
Answer:
"""

prompt = PromptTemplate(
    template=template,
    input_variables=["context", "question"]
)

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # You can also try "map_reduce" or "refine" if needed
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt}
)



In [41]:
query = """
How can i get my first saas customer?
"""

result = qa_chain(query)
print("Answer:", result['result'])

print("\n\n---------------------------------------------------------")
for doc in result['source_documents']:
    print(f"Text: {doc.page_content[:200]}")
    print(f"Metadata: {doc.metadata}")
    # print(f"Metadata: {doc}")
    print("---")


Answer: To get your first SaaS customer, start by validating your product idea with a landing page to capture emails and gauge interest. Engage in customer development by discussing your idea publicly and gathering feedback. Leverage your network or online communities to promote your product. Once you have an email list, launch to them with exclusive offers to convert them into customers. Consider using platforms like Product Hunt or Reddit for additional exposure.


---------------------------------------------------------
Text: In this video, I'm talking about how to get your first 100 customers for your SaaS product. I'm gonna be offering up actionable strategies and tactics on how you can get this done. I'm Rob Walling. I'
Metadata: {'end': 219.455, 'start': 0.08}
---
Text: And what this does is allows you to build that email launch list which not only provides some validation that people are interested in the value that you're talking about, but it allows you to do what
Metadata: 

In [48]:
import json
from sentence_transformers import SentenceTransformer, util
import pandas as pd
from IPython.display import display  # For displaying dataframes in Jupyter
import torch
import re

# Load the sentence transformer model
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')  # Upgraded model

# Load paragraphs.json
with open('./data/paragraphs.json', 'r', encoding='utf-8') as json_file:
    paragraphs = json.load(json_file)

# Extract sentences and their timestamps from paragraphs.json
text_sentences = []
timestamps = []

for paragraph in paragraphs:
    for sentence in paragraph['sentences']:
        text_sentences.append(sentence['text'])
        timestamps.append((sentence['start'], sentence['end']))

# Assuming 'result' contains the answer text that you want to find matches for
answer = result['result']

# Split the answer into sentences using a simple regex-based approach
def split_into_sentences(text):
    # This splits on punctuation that usually ends a sentence followed by space
    return re.split(r'(?<=[.!?])\s+', text)

# Split the answer into sentences
answer_sentences = split_into_sentences(answer)

# Encode sentences from the text once
text_embeddings = model.encode(text_sentences, convert_to_tensor=True)

# Prepare a list to hold all matching results
all_matches = []

# For each sentence in the answer, find the top 3 most similar sentences in the text
for ans_sentence in answer_sentences:
    ans_embedding = model.encode(ans_sentence, convert_to_tensor=True)
    # Compute cosine similarities between the answer sentence and all text sentences
    cosine_scores = util.cos_sim(ans_embedding, text_embeddings)[0]
    # Get the top 3 matches
    top_results = torch.topk(cosine_scores, k=3)
    for idx, score in zip(top_results.indices, top_results.values):
        all_matches.append({
            # 'matched': text_sentences[idx],
            'score': score.item(),
            'start': timestamps[idx][0],
            'end': timestamps[idx][1]
        })

# # Convert the matches into a DataFrame and sort by similarity score
# matching_data = pd.DataFrame(all_matches)
# matching_data = matching_data.sort_values(by='score', ascending=False)

# # Limit the results to top 10
# matching_data = matching_data.head(10)

# # Display the DataFrame in the notebook
# display(matching_data)


all_matches = sorted(all_matches, key=lambda x: x['score'])
top_matches = all_matches[:10]
top_matches_json = json.dumps(top_matches, indent=2)
print(top_matches_json)


[
  {
    "score": 0.5243916511535645,
    "start": 45.145,
    "end": 59.13
  },
  {
    "score": 0.5377623438835144,
    "start": 383.275,
    "end": 401.5
  },
  {
    "score": 0.5429914593696594,
    "start": 479.71002,
    "end": 484.99002
  },
  {
    "score": 0.5434920191764832,
    "start": 383.275,
    "end": 401.5
  },
  {
    "score": 0.5550068020820618,
    "start": 513.15,
    "end": 521.165
  },
  {
    "score": 0.5632674694061279,
    "start": 479.71002,
    "end": 484.99002
  },
  {
    "score": 0.6074326634407043,
    "start": 260.55,
    "end": 272.83502
  },
  {
    "score": 0.6269683837890625,
    "start": 260.55,
    "end": 272.83502
  },
  {
    "score": 0.6328544616699219,
    "start": 479.71002,
    "end": 484.99002
  },
  {
    "score": 0.6492415070533752,
    "start": 293.945,
    "end": 299.945
  }
]
