In [1]:
import sys, os
import time

from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
from datasets import load_dataset
import pandas as pd

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Pinecone as langchain_pinecone
from langchain_openai import OpenAIEmbeddings

# Load environment variables (e.g. PINECONE_API_KEY, read further down) from the
# .env file one directory above this notebook.
load_dotenv(dotenv_path="../.env")

True

In [None]:
# import nltk
# nltk.download('punkt')

In [2]:
# Load the FiQA corpus and its queries from the Hugging Face hub.
# `Dataset.to_pandas()` converts the underlying Arrow table directly, instead of
# letting `pd.DataFrame(...)` iterate the dataset one row at a time.
docs = load_dataset("BeIR/fiqa", "corpus", split="corpus").to_pandas()
queries = load_dataset("BeIR/fiqa", "queries", split="queries").to_pandas()
docs.head(2)

Unnamed: 0,_id,title,text
0,3,,I'm not saying I don't like the idea of on-the...
1,31,,So nothing preventing false ratings besides ad...


In [3]:
# Preview the first two query rows.
queries.iloc[:2]

Unnamed: 0,_id,title,text
0,0,,What is considered a business expense on a bus...
1,4,,Business Expense - Car Insurance Deductible Fo...


In [5]:
# Only the first two corpus passages are chunked here.
sample_texts = docs["text"].values[:2]

# 256-character chunks with a 100-character overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=256,
    chunk_overlap=100,
)
chunks = text_splitter.create_documents(sample_texts)
len(chunks), chunks

(6,
 [Document(page_content="I'm not saying I don't like the idea of on-the-job training too, but you can't expect the company to do that. Training workers is not their job - they're building software. Perhaps educational systems in the U.S. (or their students) should worry a little"),
  Document(page_content='software. Perhaps educational systems in the U.S. (or their students) should worry a little about getting marketable skills in exchange for their massive investment in education, rather than getting out with thousands in student debt and then complaining'),
  Document(page_content="in education, rather than getting out with thousands in student debt and then complaining that they aren't qualified to do anything."),
  Document(page_content='So nothing preventing false ratings besides additional scrutiny from the market/investors, but there are some newer controls in place to prevent institutions from using them. Under the DFA banks can no longer solely rely on credit ratings as du

In [6]:
index_name = "webinar"

# Instantiate a Pinecone client; the API key is read from the environment
# (expected to be provided via the ../.env file loaded in the first cell).
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create the index only if it does not exist yet, so re-running this cell is harmless.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        metric="cosine",
        dimension=1536,  # vector size of OpenAI's text-embedding-ada-002
        spec=ServerlessSpec(cloud="aws", region="us-west-2"),
    )
    # A freshly created index is not immediately usable: wait until Pinecone
    # reports it as ready, otherwise the upsert in the next cell can fail
    # when the notebook is executed with "Run All".
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

embeddings = OpenAIEmbeddings()

In [7]:
# Embed the chunks and store them in the Pinecone index.
# The OpenAI embedding model `text-embedding-ada-002` uses 1536 dimensions.
docsearch = langchain_pinecone.from_documents(
    documents=chunks,
    embedding=embeddings,
    index_name=index_name,
)

In [10]:
# Open a handle to the index by name ...
index = pc.Index(name=index_name)
# ... and display its statistics (dimension, vector counts per namespace).
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 6}},
 'total_vector_count': 6}

In [9]:
# # if you already have an index, you can load it like this
# # docsearch = langchain_pinecone.from_existing_index(index_name, embeddings)

# query = "What did the president say about Ketanji Brown Jackson"
# results = docsearch.similarity_search(query)  # not `docs`: that name holds the corpus DataFrame
# results