In [7]:
!pip install -q langchain pinecone-client tiktoken langchain-community openai spacy sentence-transformers ace_tools pandas langchain-openai langchain-pinecone


[notice] A new release of pip available: 22.3.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [28]:
# Block 1: Chunking Your Data

from langchain.docstore.document import Document
import json
import tiktoken
import requests

max_tokens = 1000  # Maximum tokens per chunk
overlap_tokens = 100  # Tokens to overlap between chunks

url = "https://chatwithaudio.blob.core.windows.net/uploads/kp_07d5b59a68fe4b2285ed52c6ad1e32eb/transcription_5fd87ab0-4c85-49d3-9bd1-65103f1234f3.json?sp=r&st=2024-10-14T19:47:42Z&se=2024-10-15T03:47:42Z&spr=https&sv=2022-11-02&sr=b&sig=HZrdE5Umj6Cx5Ruhiqyu2ZRRamvtYrrd2Jnc9xpn08o%3D"
response = requests.get(url)
requestJson = response.json()

documents = []
for paragraph in requestJson.get('paragraphs', []):
    for sentence in paragraph.get('sentences', []):
        doc = Document(
            page_content=sentence['text'],
            metadata={
                "start": sentence['start'],
                "end": sentence['end'],
            }
        )
        documents.append(doc)

def count_tokens(text, model_name='text-embedding-ada-002'):
    encoding = tiktoken.encoding_for_model(model_name)
    return len(encoding.encode(text))

sentences = []
for doc in documents:
    text = doc.page_content
    tokens = count_tokens(text)
    sentences.append({'text': text, 'tokens': tokens, 'metadata': doc.metadata})


chunks = []
current_chunk = []
current_token_count = 0

for idx, sentence in enumerate(sentences):
    sentence_tokens = sentence['tokens']
    if current_token_count + sentence_tokens > max_tokens:
        chunk_text = ' '.join([s['text'] for s in current_chunk])
        chunk_metadata = {
            'start': current_chunk[0]['metadata']['start'],
            'end': current_chunk[-1]['metadata']['end'],
        }
        chunk_doc = Document(page_content=chunk_text, metadata=chunk_metadata)
        chunks.append(chunk_doc)

        # Start new chunk with overlap
        overlap = []
        overlap_token_count = 0
        i = len(current_chunk) - 1
        while i >= 0 and overlap_token_count < overlap_tokens:
            overlap.insert(0, current_chunk[i])
            overlap_token_count += current_chunk[i]['tokens']
            i -= 1
        current_chunk = overlap.copy()
        current_token_count = overlap_token_count
    current_chunk.append(sentence)
    current_token_count += sentence_tokens

if current_chunk:
    chunk_text = ' '.join([s['text'] for s in current_chunk])
    chunk_metadata = {
        'start': current_chunk[0]['metadata']['start'],
        'end': current_chunk[-1]['metadata']['end'],
    }
    chunk_doc = Document(page_content=chunk_text, metadata=chunk_metadata)
    chunks.append(chunk_doc)

print(f"Number of chunks created: {len(chunks)}")
print(f"Total token count: {current_token_count}")

Number of chunks created: 3
Total token count: 926


In [29]:
# Block 2.1: Setting Up Pinecone with the simplified setup

from pinecone import Pinecone
import os

# NOTE: there was an update and now the Pinecone client requires the following simplified setup
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index_name = "chatbot"
pinecone_index = pc.Index(index_name)

print("pinecone client setup complete")

pinecone client setup complete


In [30]:
# Block 2.2: Embed and upsert data into Pinecone

from langchain.embeddings.openai import OpenAIEmbeddings

project_id = "your-project-1234"

# create batch embeddings
batch_size = 100 
all_embeddings = []
embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')
texts = [chunk.page_content for chunk in chunks]
metadatas = [chunk.metadata for chunk in chunks]
for i in range(0, len(texts), batch_size):
    batch_texts = texts[i:i+batch_size]
    batch_embeddings = embeddings.embed_documents(batch_texts)
    all_embeddings.extend(batch_embeddings)

# Upsert vectors into Pinecone in batches, specifying the namespace
vectors = []
for i, (embedding, metadata) in enumerate(zip(all_embeddings, metadatas)):
    vector_id = f"vec_{i}"
    metadata['text'] = texts[i]
    vectors.append({'id': vector_id, 'values': embedding, 'metadata': metadata})

for i in range(0, len(vectors), batch_size):
    batch = vectors[i:i+batch_size]
    # Upsert the batch into the specified namespace (project_id)
    pinecone_index.upsert(vectors=batch, namespace=project_id)

print(f"Data successfully inserted into Pinecone for project {project_id}.")



Data successfully inserted into Pinecone for project your-project-1234.


In [16]:
# Block 3: Implementing Retrieval Augmented Generation (RAG)

from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')
vectorstore = Pinecone(pinecone_index, embeddings.embed_query, "text", namespace=project_id)
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)


template = """
The answer should be short, concise and directly related to the question and not contain filler words. 
Given the following information, answer the question. 
Use the information from the documents to support your answer. 
Do not use any external information or make up any information. 
If you don't know the answer, write "I don't know".


Context:
{context}

Question: {question}
Answer:
"""

prompt = PromptTemplate(
    template=template,
    input_variables=["context", "question"]
)

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # You can also try "map_reduce" or "refine" if needed
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt}
)



In [17]:
query = """
How can i get my first saas customer?
"""

result = qa_chain(query)
print("Answer:", result['result'])

print("\n\n---------------------------------------------------------")
for doc in result['source_documents']:
    print(f"Text: {doc.page_content[:200]}")
    print(f"Metadata: {doc.metadata}")
    # print(f"Metadata: {doc}")
    print("---")


  result = qa_chain(query)


Answer: Identify your target audience, create a compelling value proposition, leverage social media and online communities, offer free trials or demos, and network within relevant industry events.


---------------------------------------------------------


In [24]:
import json
import requests
from sentence_transformers import SentenceTransformer, util
import torch
import re

# Load the model for sentence embeddings
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Fetch the JSON data from a remote URL
url = "https://chatwithaudio.blob.core.windows.net/uploads/kp_07d5b59a68fe4b2285ed52c6ad1e32eb/transcription_5fd87ab0-4c85-49d3-9bd1-65103f1234f3.json?sp=r&st=2024-10-14T19:47:42Z&se=2024-10-15T03:47:42Z&spr=https&sv=2022-11-02&sr=b&sig=HZrdE5Umj6Cx5Ruhiqyu2ZRRamvtYrrd2Jnc9xpn08o%3D"
response = requests.get(url)
requestJson = response.json()

sentences = []
timestamps = []

for paragraph in requestJson.get('paragraphs', []):
    for sentence in paragraph.get('sentences', []):
        sentences.append(sentence["text"])
        timestamps.append((sentence["start"], sentence["end"]))

def split_into_sentences(text):
    return re.split(r'(?<=[.!?])\s+', text)

answer = result['result']
answer_sentences = split_into_sentences(answer)

text_embeddings = model.encode(sentences, convert_to_tensor=True)

all_matches = []

for ans_sentence in answer_sentences:
    ans_embedding = model.encode(ans_sentence, convert_to_tensor=True)
    cosine_scores = util.cos_sim(ans_embedding, text_embeddings)[0]
    top_results = torch.topk(cosine_scores, k=10)
    
    for idx, score in zip(top_results.indices, top_results.values):
        all_matches.append({
            'score': score.item(),
            'start': timestamps[idx][0],
            'end': timestamps[idx][1]
        })

all_matches = sorted(all_matches, key=lambda x: x['score'], reverse=True)

top_matches = all_matches[:10]
print(top_matches)


[{'score': 0.6792088747024536, 'start': 260.55, 'end': 272.83502}, {'score': 0.6435611248016357, 'start': 503.95, 'end': 509.23}, {'score': 0.6272023916244507, 'start': 383.275, 'end': 401.5}, {'score': 0.6216945648193359, 'start': 487.55002, 'end': 495.835}, {'score': 0.6165946125984192, 'start': 45.145, 'end': 59.13}, {'score': 0.5801970958709717, 'start': 479.71002, 'end': 484.99002}, {'score': 0.5770946145057678, 'start': 140, 'end': 161.015}, {'score': 0.5500840544700623, 'start': 525.14496, 'end': 534.1}, {'score': 0.5472249984741211, 'start': 22.015, 'end': 25.695}, {'score': 0.5417923927307129, 'start': 513.15, 'end': 521.165}]
