In [2]:
!pip install -Q --upgrade langchain pinecone-client


Usage:   
  pip install [options] <requirement specifier> [package-index-options] ...
  pip install [options] -r <requirements file> [package-index-options] ...
  pip install [options] [-e] <vcs project url> ...
  pip install [options] [-e] <local project path> ...
  pip install [options] <archive url/path> ...

no such option: -Q


In [55]:
# Block 1: Chunking Your Data

from langchain.docstore.document import Document
import json
import tiktoken
import os

# Load your data from the JSON file
data_folder = "./data"
json_file = "paragraphs.json"
file_path = os.path.join(data_folder, json_file)

# Read as UTF-8 explicitly: without `encoding=` Python falls back to the
# platform default (e.g. cp1252 on Windows), which can mis-decode or reject
# non-ASCII transcript text. The later cell that re-reads this same file
# already passes encoding='utf-8'.
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Create one Document per sentence. Each element of `data` is read for its
# 'sentences' list, and each sentence for its 'text', 'start' and 'end' keys;
# 'start'/'end' are carried through unchanged as metadata.
# NOTE(review): 'start'/'end' look like audio timestamps — confirm units.
documents = [
    Document(
        page_content=sentence['text'],
        metadata={
            "start": sentence['start'],
            "end": sentence['end'],
        },
    )
    for paragraph in data
    for sentence in paragraph['sentences']
]

# Token counting helper backed by tiktoken
def count_tokens(text, model_name='text-embedding-ada-002'):
    """Return how many tokens `text` encodes to.

    The tokenizer is whichever encoding tiktoken associates with
    `model_name` (looked up via `tiktoken.encoding_for_model`).
    """
    token_ids = tiktoken.encoding_for_model(model_name).encode(text)
    return len(token_ids)

# One record per sentence Document: its text, its token count (via
# count_tokens with the default model), and the Document's metadata dict
# (the same dict object, not a copy).
sentences = [
    {
        'text': doc.page_content,
        'tokens': count_tokens(doc.page_content),
        'metadata': doc.metadata,
    }
    for doc in documents
]

# Set maximum tokens per chunk and overlap tokens
max_tokens = 1000  # Maximum tokens per chunk
overlap_tokens = 100  # Tokens to overlap between chunks


def _build_chunk_document(chunk_sentences):
    """Join a non-empty list of sentence records into a single Document.

    The text is the sentences' 'text' values joined with single spaces. The
    metadata takes 'start' from the first sentence and 'end' from the last.
    """
    return Document(
        page_content=' '.join(s['text'] for s in chunk_sentences),
        metadata={
            'start': chunk_sentences[0]['metadata']['start'],
            'end': chunk_sentences[-1]['metadata']['end'],
        },
    )


def _trailing_overlap(chunk_sentences, overlap_tokens):
    """Pick the trailing sentences of a chunk to repeat in the next chunk.

    Walks backwards from the last sentence, taking whole sentences until their
    combined token count reaches `overlap_tokens` (so the total may overshoot
    it). Returns `(sentences_in_original_order, total_tokens)`.
    """
    overlap = []
    overlap_token_count = 0
    for record in reversed(chunk_sentences):
        if overlap_token_count >= overlap_tokens:
            break
        overlap.append(record)
        overlap_token_count += record['tokens']
    overlap.reverse()
    return overlap, overlap_token_count


# Group sentences into chunks
chunks = []
current_chunk = []
current_token_count = 0

for sentence in sentences:
    sentence_tokens = sentence['tokens']
    # Flush only when there is something to flush. Without the `current_chunk`
    # guard, a sentence that alone exceeds max_tokens arriving while the chunk
    # is still empty would index current_chunk[0] on an empty list (IndexError).
    # Such an oversized sentence is kept whole, so its chunk exceeds max_tokens.
    if current_chunk and current_token_count + sentence_tokens > max_tokens:
        chunks.append(_build_chunk_document(current_chunk))
        # Start the next chunk with the tail of the one just emitted.
        current_chunk, current_token_count = _trailing_overlap(current_chunk, overlap_tokens)

    # Add current sentence to current_chunk
    current_chunk.append(sentence)
    current_token_count += sentence_tokens

# Add any remaining sentences as a chunk
if current_chunk:
    chunks.append(_build_chunk_document(current_chunk))

print(f"Number of chunks created: {len(chunks)}")


Number of chunks created: 3


In [56]:
# Block 2: Setting Up Pinecone and Inserting Data

# Embedding client used for every chunk
from langchain.embeddings.openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')

# Parallel lists: texts[i] and metadatas[i] both come from chunks[i].
# metadatas holds the chunks' own metadata dicts (references, not copies).
texts, metadatas = [], []
for chunk in chunks:
    texts.append(chunk.page_content)
    metadatas.append(chunk.metadata)

# Embed in slices of `batch_size` texts and flatten the per-slice results
# into one list aligned with `texts`.
batch_size = 100  # Adjust based on your preference
all_embeddings = [
    vector
    for start in range(0, len(texts), batch_size)
    for vector in embeddings.embed_documents(texts[start:start + batch_size])
]

from pinecone import Pinecone
import os

# Initialize Pinecone (API key is read from the PINECONE_API_KEY env var)
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

index_name = "chatbot"
index = pc.Index(index_name)

# Prepare vectors for upsert: one record per chunk, with id "vec_<position>".
# The chunk text is stored under the 'text' metadata key alongside start/end.
# A NEW metadata dict is built per vector: `metadatas` holds the chunk
# Documents' own dicts, so writing metadata['text'] in place would silently
# add the text to every chunk's metadata as a side effect.
vectors = [
    {
        'id': f"vec_{i}",
        'values': embedding,
        'metadata': {**metadata, 'text': text},
    }
    for i, (embedding, metadata, text) in enumerate(zip(all_embeddings, metadatas, texts))
]

# Upsert vectors into Pinecone in batches
batch_size = 100  # Adjust based on your preference
for i in range(0, len(vectors), batch_size):
    batch = vectors[i:i+batch_size]
    index.upsert(vectors=batch)

print("Data successfully inserted into Pinecone.")


Data successfully inserted into Pinecone.


In [85]:
# Block 3: Implementing Retrieval Augmented Generation (RAG)

# Set up the retriever using LangChain's Pinecone integration.
# The vector store class is imported under an alias so it does not shadow the
# `Pinecone` client class imported from the `pinecone` package in the
# previous cell.
from langchain.vectorstores import Pinecone as PineconeVectorStore
from langchain.embeddings.openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')

# Use the same index and embedding model as before. "text" is the metadata key
# under which the chunk text was stored at upsert time. The Embeddings object
# itself is passed: handing over the bound `embed_query` callable is
# deprecated by LangChain.
vectorstore = PineconeVectorStore(index, embeddings, "text")

# Retrieve the 3 nearest chunks per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

# Language model, prompt and QA chain
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

# Prompt text with two placeholders: {context} (retrieved chunks) and
# {question} (the user query).
template = """
The answer should be short, concise and directly related to the question and not contain filler words. 
Given the following information, answer the question. 
Use the information from the documents to support your answer. 
Do not use any external information or make up any information. 
If you don't know the answer, write "I don't know".


Context:
{context}

Question: {question}
Answer:
"""

prompt = PromptTemplate(input_variables=["context", "question"], template=template)

# Chat model, queried at temperature 0
llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini")

# "stuff" chain: retrieved documents are inserted into the single prompt above.
# Source documents are returned next to the answer so they can be inspected.
qa_chain = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm,
    chain_type="stuff",  # You can also try "map_reduce" or "refine" if needed
    chain_type_kwargs={"prompt": prompt},
    return_source_documents=True,
)

# Example usage
query = "How can i get my first saas customer?"

# Get the answer. `invoke` is the supported entry point; calling the chain
# object directly (`qa_chain(query)`) is deprecated in current LangChain.
# The result dict is read below for 'result' and 'source_documents'.
result = qa_chain.invoke({"query": query})

# Print the answer
print("Answer:", result['result'])

# Optionally, print the source documents (text and metadata of each one).
# The former third print dumped the whole Document under a second
# "Metadata:" label, duplicating the two lines above it.
print("\nSource Documents:")
for doc in result['source_documents']:
    print(f"Text: {doc.page_content}")
    print(f"Metadata: {doc.metadata}")
    print("---")




Answer: To get your first SaaS customer, start by validating your product idea with a landing page to capture emails and gauge interest. Engage in customer development by discussing your idea publicly and seeking feedback. Leverage your network or online communities to promote your product. Once you have interested leads, launch to your email list with exclusive offers to convert them into customers. Consider using platforms like Product Hunt or Reddit for additional exposure.

Source Documents:
Text: In this video, I'm talking about how to get your first 100 customers for your SaaS product. I'm gonna be offering up actionable strategies and tactics on how you can get this done. I'm Rob Walling. I'm a startup founder with multiple exits, author of 3 books about building startups, and an investor in more than 100 companies. When most people think about building a product, they think about how can I build the product and then market it later? How can I build something and then go find pe

In [67]:
# Keep the chain's answer text under its own name.
answer = result["result"]

# Read the whole transcript file as UTF-8; both `text` and `data` end up
# bound to the same string.
with open('./data/large-data.txt', mode='r', encoding='utf-8') as file:
    text = data = file.read()


In this video, I'm talking about how to get your first 100 customers for your SaaS product. I'm gonn


In [79]:
!pip install -Q spacy sentence-transformers ace_tools


Usage:   
  pip install [options] <requirement specifier> [package-index-options] ...
  pip install [options] -r <requirements file> [package-index-options] ...
  pip install [options] [-e] <vcs project url> ...
  pip install [options] [-e] <local project path> ...
  pip install [options] <archive url/path> ...

no such option: -Q


In [76]:
!python -m -Q spacy download en_core_web_sm

c:\Users\Me\Desktop\chat-with-audio\venv\Scripts\python.exe: No module named -Q


In [105]:
import json
from sentence_transformers import SentenceTransformer, util
import pandas as pd
from IPython.display import display  # For displaying dataframes in Jupyter
import torch

# Sentence-embedding model used for the similarity search below
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Read the paragraph/sentence structure of the transcript
with open('./data/paragraphs.json', 'r', encoding='utf-8') as json_file:
    paragraphs = json.load(json_file)

# Flatten paragraphs -> sentences, then split into two parallel lists:
# text_sentences[i] is the sentence text, timestamps[i] its (start, end) pair.
sentence_records = [
    record
    for paragraph in paragraphs
    for record in paragraph['sentences']
]
text_sentences = [record['text'] for record in sentence_records]
timestamps = [(record['start'], record['end']) for record in sentence_records]

# `result` comes from the QA chain cell; its 'result' entry is the answer text
# whose sentences get matched against the transcript.
answer = result['result']

# Split the answer into sentences using a simple regex-based approach
def split_into_sentences(text):
    """Split `text` into sentences with a punctuation heuristic.

    A split happens at every run of whitespace that directly follows '.',
    '!' or '?'; the punctuation stays attached to the sentence before it.
    Leading/trailing whitespace is stripped first so that text ending in
    "... done. " does not produce a spurious empty last sentence.

    Returns a list of strings; blank input yields [''] (a single empty item).
    """
    # Imported here because no cell of this notebook imports `re` at the top,
    # so the function raised NameError on a fresh kernel.
    import re

    return re.split(r'(?<=[.!?])\s+', text.strip())

# Split the answer into sentences
answer_sentences = split_into_sentences(answer)

# Encode sentences from the text once
text_embeddings = model.encode(text_sentences, convert_to_tensor=True)

# Matches kept per answer sentence. torch.topk raises if k exceeds the number
# of candidates, so cap it for transcripts with fewer than 3 sentences.
top_k = min(3, len(text_sentences))

# Prepare a list to hold all matching results
all_matches = []

# For each sentence in the answer, find the most similar transcript sentences
for ans_sentence in answer_sentences:
    ans_embedding = model.encode(ans_sentence, convert_to_tensor=True)
    # Cosine similarity of this answer sentence against every transcript
    # sentence; row 0 is the only row because a single sentence was encoded.
    cosine_scores = util.cos_sim(ans_embedding, text_embeddings)[0]
    top_results = torch.topk(cosine_scores, k=top_k)
    # .tolist() turns the tensors into plain ints/floats for list indexing
    # and for storage in the result records.
    for idx, score in zip(top_results.indices.tolist(), top_results.values.tolist()):
        all_matches.append({
            'Answer Sentence': ans_sentence,
            'Matched Sentence': text_sentences[idx],
            'Similarity Score': score,
            'Start Time': timestamps[idx][0],
            'End Time': timestamps[idx][1]
        })

# Tabulate the matches, rank them from most to least similar, and keep only
# the ten best rows.
matching_data = (
    pd.DataFrame(all_matches)
    .sort_values(by='Similarity Score', ascending=False)
    .head(10)
)

# Render the table in the notebook
display(matching_data)




Unnamed: 0,Answer Sentence,Matched Sentence,Similarity Score,Start Time,End Time
0,"To get your first SaaS customer, start by vali...",You look at the most common b to b SaaS market...,0.672609,487.55002,495.835
6,Leverage your network or online communities to...,"I would recommend going on social media, podca...",0.662606,260.55,272.83502
1,"To get your first SaaS customer, start by vali...","If you don't, I would not invest the time to b...",0.659703,91.799995,96.505
2,"To get your first SaaS customer, start by vali...","However, building and selling a SaaS product, ...",0.65578,81.56,87.88
12,Consider using platforms like Product Hunt or ...,"I would recommend going on social media, podca...",0.626968,260.55,272.83502
9,"Once you have interested leads, launch to your...",And the idea is that that email launch list ge...,0.618134,324.20502,329.965
10,"Once you have interested leads, launch to your...",And what this does is allows you to build that...,0.602901,198.88,212.175
3,Engage in customer development by discussing y...,"I would recommend going on social media, podca...",0.601353,260.55,272.83502
11,"Once you have interested leads, launch to your...",So let's say you followed this advice and you ...,0.578584,284.13,288.55002
4,Engage in customer development by discussing y...,How do I find usually one marketing approach t...,0.559224,479.71002,484.99002
