In [56]:
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI
from langchain.vectorstores import  Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings
from openai import OpenAI
import pinecone
import os
from dotenv import load_dotenv
import time
from pinecone import Pinecone
from pinecone import ServerlessSpec

In [57]:
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
# PINECONE_API_ENV = os.getenv('PINECONE_API_ENV')

MODEL = "text-embedding-3-small"
client = OpenAI(api_key=OPENAI_API_KEY)
embed = OpenAIEmbeddings(model=MODEL, openai_api_key=OPENAI_API_KEY)
# llm = OpenAI(api_key=OPENAI_API_KEY , model='gpt-3.5-turbo', temperature=0.0)
llm = ChatOpenAI(model="gpt-3.5-turbo")
pc = Pinecone(api_key=PINECONE_API_KEY)
spec = ServerlessSpec(cloud="aws", region="us-east-1")
index_name = 'arpa2'
indexes = []
for index in pc.list_indexes():
    indexes.append(index.name)

print(indexes)

['arpa', 'arpa2']


In [58]:
try:
    # Check if the index already exists
    if index_name not in indexes:
        # If it does not exist, create the index
        pc.create_index(
        name=index_name,
        dimension=1536,
        metric='cosine',
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )
        print(f"Index '{index_name}' is being created.")
        
        # Wait for the index to be initialized and ready
        while not pinecone.describe_index(index_name).status['ready']:
            time.sleep(1)
        
        print(f"Index '{index_name}' is ready for use.")
    else:
        print(f"Index '{index_name}' already exists.")
except Exception as e:
    print(f"An error occurred: {e}")



Index 'arpa2' already exists.


In [59]:
# connect to index
index = pc.Index(index_name)
time.sleep(1)
# view index stats
index.describe_index_stats()

vectorstore = Pinecone(index_name, "combined")

qa = RetrievalQA.from_chain_type(
    llm = llm,
    chain_type = "combined",
    retriever = vectorstore.as_retriever()
    )

AttributeError: 'Pinecone' object has no attribute 'as_retriever'

In [41]:
loader = PyPDFLoader("/Users/luisbarajas/Documents/AGI/Papers/weak-to-strong-generalization.pdf")
data = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(data)
# print(len(texts))
# for(text) in texts:
#     print(text.page_content)
# print (f'You have {len(data)} document(s) in your data')
# print (f'There are {len(data[0].page_content)} characters in your document')

In [42]:
count = 0  # we'll use the count to create unique IDs
for document in texts:
    text_chunk = document.page_content

    # create embeddings
    embeddings = client.embeddings.create(input=text_chunk, model=MODEL)
    embeds = [record.embedding for record in embeddings.data]

    # prep metadata and upsert batch
    meta = [{'text': text_chunk}]
    count_str = str(count)
    to_upsert = zip(count_str, embeds, meta)
    count += 1
    # print(to_upsert)
    # upsert to Pinecone
    index.upsert(vectors=list(to_upsert))

KeyboardInterrupt: 

In [43]:
query = "What is the article about? "
xq = client.embeddings.create(input=query, model=MODEL).data[0].embedding
print (xq)

[0.02276374213397503, 0.02391357347369194, -0.02878943830728531, 0.018251746892929077, -0.003600499127060175, -0.007466624025255442, -0.021351924166083336, 0.049893930554389954, -0.007077282294631004, -0.017916986718773842, 0.04046240821480751, -0.07126040756702423, -0.019270585849881172, -0.00720463739708066, 0.039851102977991104, 0.04040418565273285, -0.03152574598789215, 0.036066848784685135, -0.01341954618692398, 0.030012043192982674, -0.0011962244752794504, 0.027523169293999672, -0.032515473663806915, 0.0375223308801651, 0.009831782430410385, -0.037638772279024124, -0.008347190916538239, -0.002503429539501667, 0.004773981869220734, 0.013383159413933754, 0.01718924567103386, -0.03859939053654671, 0.017334792762994766, -0.018630173057317734, -0.0008050636388361454, 0.05085454881191254, -0.04494529217481613, -0.05100009962916374, 0.03574664518237114, 0.024422992020845413, 0.0024706812109798193, -0.02726118266582489, 0.013412268832325935, 0.031409308314323425, -0.025616487488150597, -

In [44]:
res = index.query(vector=xq, top_k=5, include_metadata=True)

In [45]:
for match in res['matches']:
    print(f"{match['score']:.2f}: {match['metadata']['text']}")

0.16: Figure 1: An illustration of our methodology. Traditional ML focuses on the setting where humans
supervise models that are weaker than humans. For the ultimate superalignment problem, humans
will have to supervise models much smarter than them. We study an analogous problem today:
using weak models to supervise strong models.
this problem, it is difficult to empirically study today. Most prior work on alignment has either
confronted this core challenge head-on—but been restricted to primarily theoretical frameworks and
toy problems (Irving et al., 2018; Christiano et al., 2018; Leike et al., 2018; Demski & Garrabrant,
2019; Hubinger et al., 2019), or empirically studied humans supervising today’s models—without
addressing the core challenges that may arise with superhuman models (Christiano et al., 2017; Wu
et al., 2021; Ouyang et al., 2022; Bowman et al., 2022; Saunders et al., 2022). In contrast, we would
0.15: eling (RM) or safety classification task, it is unclear how that mo