In [1]:
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

In [3]:
# Location of the PDF to index. Set the PDF_PATH environment variable to point
# the notebook at a different file; the default is the original location.
PDF_PATH = os.getenv(
    "PDF_PATH",
    "/Users/luisbarajas/Documents/AGI/Papers/weak-to-strong-generalization.pdf",
)
loader = PyPDFLoader(PDF_PATH)
# load() returns a list of documents (presumably one per PDF page - confirm
# against the loader's documentation).
data = loader.load()
print (f'You have {len(data)} document(s) in your data')
# NOTE: the count below is for the first document only (data[0]), not the whole PDF.
print (f'There are {len(data[0].page_content)} characters in your document')

You have 49 document(s) in your data
There are 3411 characters in your document


In [4]:
# Split the loaded documents using a chunk size of 1000 and no overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=1000)
texts = text_splitter.split_documents(data)
# Report how many chunks were produced, then print the text of each one.
print(len(texts))
for chunk in texts:
    print(chunk.page_content)


188
WEAK-TO-STRONG GENERALIZATION : ELICITING
STRONG CAPABILITIES WITHWEAK SUPERVISION
Collin Burns∗Pavel Izmailov∗Jan Hendrik Kirchner∗Bowen Baker∗Leo Gao∗
Leopold Aschenbrenner∗Yining Chen∗Adrien Ecoffet∗Manas Joglekar∗
Jan Leike Ilya Sutskever Jeff Wu∗
OpenAI
ABSTRACT
Widely used alignment techniques, such as reinforcement learning from human
feedback (RLHF), rely on the ability of humans to supervise model behavior—for
example, to evaluate whether a model faithfully followed instructions or generated
safe outputs. However, future superhuman models will behave in complex ways
too difficult for humans to reliably evaluate; humans will only be able to weakly
supervise superhuman models. We study an analogy to this problem: can weak
model supervision elicit the full capabilities of a much stronger model? We test
this using a range of pretrained language models in the GPT-4 family on natural
language processing (NLP), chess, and reward modeling tasks. We find that when
we naively finetu

In [5]:
from langchain.vectorstores import  Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings
from openai import OpenAI
import pinecone
import os
from dotenv import load_dotenv

In [6]:
# Run load_dotenv() first, then read both API keys from the process environment.
# Either value is None when the corresponding variable is not set.
load_dotenv()
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

In [7]:
# Name of the embedding model passed to the embeddings requests in this notebook.
MODEL = "text-embedding-3-small"
# OpenAI API client built with the key read from the environment.
client = OpenAI(api_key=OPENAI_API_KEY)

In [20]:
# embeds = [record.embedding for record in res.data]
# print (embeds)

[[0.02109885774552822, -0.010409812442958355, -0.0072544775903224945, 0.002194772707298398, -0.004258305765688419, -0.015089759603142738, 0.018328864127397537, 0.024304453283548355, -0.045704882591962814, 0.056472115218639374, 0.012118719518184662, -0.021791355684399605, -0.01961333677172661, -0.03551845625042915, 0.0033005359582602978, 0.013682425022125244, 0.021523291245102882, 0.03375370427966118, 0.005819219164550304, -3.2308147638104856e-05, 0.002589887473732233, -0.016139676794409752, 0.03009016439318657, -0.027454204857349396, 0.07434303313493729, 0.016396570950746536, -0.0010652744676917791, 0.009337556548416615, 0.039740461856126785, -0.006595487240701914, 0.009130924008786678, -0.015815766528248787, -0.10490231215953827, -0.01357073150575161, 0.004917296115309, 0.021713171154260635, -0.012051703408360481, 0.034870635718107224, -0.05137890204787254, 0.044409241527318954, -0.0031357884872704744, 0.002577322069555521, -0.032703787088394165, -0.02613622322678566, 0.00532497651875

In [8]:
import time
from pinecone import Pinecone
from pinecone import ServerlessSpec

In [12]:
# Pinecone client authenticated with the key read from the environment.
pc = Pinecone(api_key=PINECONE_API_KEY)
# Serverless target used if the index has to be created. The cloud identifier
# is written in lower case ("aws"), matching what list_indexes() reports below.
spec = ServerlessSpec(cloud="aws", region="us-east-1")
index_name = 'arpa2'
# Show the indexes that already exist for this API key.
print(pc.list_indexes())

{'indexes': [{'dimension': 1536,
              'host': 'arpa2-f7qlkj4.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'arpa2',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}},
             {'dimension': 1536,
              'host': 'arpa-f7qlkj4.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'arpa',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}


In [13]:
# Create the index only when it does not exist yet, so re-running this cell
# leaves an existing index untouched.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,  # the client's keyword is `name`; `index_name=` raises TypeError
        dimension=1536,  # dimensionality of text-embedding-3-small
        metric='cosine',
        spec=spec
    )
    # Poll once per second until the new index reports that it is ready.
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

In [14]:
# Open a handle to the index named by index_name.
index = pc.Index(index_name)
# Pause for one second before asking for statistics.
time.sleep(1)
# Fetch the index statistics and leave them as the cell's displayed value.
index_stats = index.describe_index_stats()
index_stats

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 54}},
 'total_vector_count': 54}

In [15]:
from datasets import load_dataset

# Restrict the download to the first 1000 rows of the TREC training split.
trec_split = 'train[:1000]'
trec = load_dataset('trec', split=trec_split)
# Print the dataset summary (features and row count).
print(trec)

Dataset({
    features: ['text', 'coarse_label', 'fine_label'],
    num_rows: 1000
})


In [16]:
from tqdm.auto import tqdm

batch_size = 32  # process everything in batches of 32
# Look the text column up once instead of on every loop iteration.
trec_texts = trec['text']
for i in tqdm(range(0, len(trec_texts), batch_size)):
    # end position of this batch, clamped to the number of rows
    i_end = min(i + batch_size, len(trec_texts))
    # batch of lines and their IDs; an ID is the row position as a string
    lines_batch = trec_texts[i:i_end]
    ids_batch = [str(n) for n in range(i, i_end)]
    # create one embedding per line in the batch
    res = client.embeddings.create(input=lines_batch, model=MODEL)
    embeds = [record.embedding for record in res.data]
    # keep the original text as metadata so query results can display it
    meta = [{'text': line} for line in lines_batch]
    # upsert (id, embedding, metadata) triples to Pinecone
    index.upsert(vectors=list(zip(ids_batch, embeds, meta)))

  0%|          | 0/32 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [34]:
# Question to search for, embedded with the same model as the stored vectors.
query = "What caused the 1929 Great Depression?"
query_response = client.embeddings.create(input=query, model=MODEL)
# A single input was sent, so the first entry of the response is used.
xq = query_response.data[0].embedding
print(xq)

[-0.048548728227615356, -0.005683126859366894, 0.013807527720928192, 0.03811154142022133, 0.0195104219019413, -0.027338311076164246, 0.017039496451616287, 0.06036963313817978, -0.03662898764014244, 0.017592983320355415, 0.025895291939377785, -0.0006900057196617126, -0.014074387028813362, -0.04728361591696739, -0.017207520082592964, -0.06088358536362648, 0.011415672488510609, 0.04068130627274513, -0.05258127674460411, 0.008771782740950584, 0.03058016486465931, 0.03253713622689247, -0.04431850463151932, -0.040523163974285126, -0.00894968956708908, 0.032616205513477325, 0.013669155538082123, -0.05629754811525345, -0.005767138209193945, 0.011633113957941532, 0.0161993820220232, -0.015517407096922398, 0.0450301319360733, 0.0020434546750038862, -0.02937435358762741, -0.05254174396395683, -0.04190688207745552, 0.011623229831457138, -0.02834644913673401, -0.013767993077635765, 0.028702260926365852, -0.044595248997211456, 0.02136855758726597, -0.020716233178973198, -0.007995912805199623, -0.035

In [35]:
# Ask the index for the 5 best matches to the query embedding, including their
# metadata so the stored text can be displayed.
res = index.query(vector=xq, top_k=5, include_metadata=True)

In [36]:
# Print one line per match: the score to two decimals, then the stored text.
for hit in res['matches']:
    score = hit['score']
    matched_text = hit['metadata']['text']
    print(f"{score:.2f}: {matched_text}")

0.75: Why did the world enter a global depression in 1929 ?
0.60: When was `` the Great Depression '' ?
0.37: What crop failure caused the Irish Famine ?
0.32: What were popular songs and types of songs in the 1920s ?
0.32: When did World War I start ?
