In [1]:
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

In [2]:
loader = PyPDFLoader("/Users/luisbarajas/Documents/AGI/small_deeplearning.pdf")
data = loader.load()
print (f'You have {len(data)} document(s) in your data')
print (f'There are {len(data[0].page_content)} characters in your document')

You have 168 document(s) in your data
There are 50 characters in your document


In [3]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(data)
print (f'Now you have {len(texts)} documents')
print(texts[23])

Now you have 215 documents
page_content='computation in batches ofsamples that can fit\nentirely in the GPU memory and are processed\nin parallel. When an operator combines a sample\nand model parameters, both have to be moved\nto the cache memory near the actual computing\n21' metadata={'source': '/Users/luisbarajas/Documents/AGI/small_deeplearning.pdf', 'page': 20}


In [20]:
from langchain.vectorstores import  Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings
from openai import OpenAI
import pinecone
import os
from dotenv import load_dotenv

In [25]:
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
PINECONE_API_ENV = os.getenv('PINECONE_API_ENV')

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


In [26]:
MODEL = "text-embedding-3-small"

client = OpenAI(
    api_key=OPENAI_API_KEY
)

res = client.embeddings.create(
    input=[
        "Sample document text goes here",
        "there will be several phrases in each batch"
    ], model=MODEL
)

In [27]:
embeds = [record.embedding for record in res.data]
print (embeds)

[[-0.0006736477953381836, 0.01784864068031311, 0.028474807739257812, -0.016548041254281998, -0.044690780341625214, -0.03376021981239319, 0.024296287447214127, -0.015496493317186832, 0.014804685488343239, -0.00604294054210186, 0.034147631376981735, -0.010307935066521168, 0.004617816768586636, -0.0065825507044792175, 0.06502992659807205, 0.07250145077705383, -0.0044102743268013, 0.013344971463084221, -0.026772959157824516, 0.03685951605439186, 0.0047181290574371815, 0.04671085998415947, -0.021349187940359116, 0.03442435339093208, 0.012708508409559727, -0.021874960511922836, -0.040512263774871826, -0.024891242384910583, 0.05113843083381653, -0.06253942102193832, -0.018512776121497154, -0.041204068809747696, 0.01910773105919361, -0.03038419596850872, -0.040152523666620255, 0.054680485278367996, 0.06558337807655334, 0.004022862296551466, -0.04975481331348419, -0.04817749187350273, -0.00958153698593378, -0.012507883831858635, 0.054846517741680145, 0.00950543861836195, -0.0009209690615534782,

In [28]:
import time
from pinecone import Pinecone
from pinecone import ServerlessSpec

In [29]:
pc = Pinecone(api_key="")
spec = ServerlessSpec(cloud="GCP", region="us-central1")
index_name = 'reto'

In [30]:
# check if index already exists (it shouldn't if this is your first run)
if index_name not in pc.list_indexes().names():
    # if does not exist, create index
    pc.create_index(
        index_name=index_name,
        dimension=len(embeds[0]),  # dimensionality of text-embed-3-small
        metric='cosine',
        spec=spec
    )
    # wait for index to be initialized
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

In [31]:
# connect to index
index = pc.Index(index_name)
time.sleep(1)
# view index stats
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.01,
 'namespaces': {'': {'vector_count': 1000}},
 'total_vector_count': 1000}

In [32]:
from datasets import load_dataset

# load the first 1K rows of the TREC dataset
trec = load_dataset('trec', split='train[:1000]')
print(trec)

Dataset({
    features: ['text', 'coarse_label', 'fine_label'],
    num_rows: 1000
})


In [33]:
from tqdm.auto import tqdm

count = 0  # we'll use the count to create unique IDs
batch_size = 32  # process everything in batches of 32
for i in tqdm(range(0, len(trec['text']), batch_size)):
    # set end position of batch
    i_end = min(i+batch_size, len(trec['text']))
    # get batch of lines and IDs
    lines_batch = trec['text'][i: i+batch_size]
    ids_batch = [str(n) for n in range(i, i_end)]
    # Print the text being processed
    print("Processing batch:")
    print('\n'.join(lines_batch))
    # create embeddings
    res = client.embeddings.create(input=lines_batch, model=MODEL)
    embeds = [record.embedding for record in res.data]
    # prep metadata and upsert batch
    meta = [{'text': line} for line in lines_batch]
    to_upsert = zip(ids_batch, embeds, meta)
    # upsert to Pinecone
    index.upsert(vectors=list(to_upsert))

  0%|          | 0/32 [00:00<?, ?it/s]

Processing batch:
How did serfdom develop in and then leave Russia ?
What films featured the character Popeye Doyle ?
How can I find a list of celebrities ' real names ?
What fowl grabs the spotlight after the Chinese Year of the Monkey ?
What is the full form of .com ?
What contemptible scoundrel stole the cork from my lunch ?
What team did baseball 's St. Louis Browns become ?
What is the oldest profession ?
What are liver enzymes ?
Name the scar-faced bounty hunter of The Old West .
When was Ozzy Osbourne born ?
Why do heavier objects travel downhill faster ?
Who was The Pride of the Yankees ?
Who killed Gandhi ?
What is considered the costliest disaster the insurance industry has ever faced ?
What sprawling U.S. state boasts the most airports ?
What did the only repealed amendment to the U.S. Constitution deal with ?
How many Jews were executed in concentration camps during WWII ?
What is `` Nine Inch Nails '' ?
What is an annotated bibliography ?
What is the date of Boxing Day ?
W

In [34]:
query = "What caused the 1929 Great Depression?"

xq = client.embeddings.create(input=query, model=MODEL).data[0].embedding
print (xq)

[-0.048548728227615356, -0.005683126859366894, 0.013807527720928192, 0.03811154142022133, 0.0195104219019413, -0.027338311076164246, 0.017039496451616287, 0.06036963313817978, -0.03662898764014244, 0.017592983320355415, 0.025895291939377785, -0.0006900057196617126, -0.014074387028813362, -0.04728361591696739, -0.017207520082592964, -0.06088358536362648, 0.011415672488510609, 0.04068130627274513, -0.05258127674460411, 0.008771782740950584, 0.03058016486465931, 0.03253713622689247, -0.04431850463151932, -0.040523163974285126, -0.00894968956708908, 0.032616205513477325, 0.013669155538082123, -0.05629754811525345, -0.005767138209193945, 0.011633113957941532, 0.0161993820220232, -0.015517407096922398, 0.0450301319360733, 0.0020434546750038862, -0.02937435358762741, -0.05254174396395683, -0.04190688207745552, 0.011623229831457138, -0.02834644913673401, -0.013767993077635765, 0.028702260926365852, -0.044595248997211456, 0.02136855758726597, -0.020716233178973198, -0.007995912805199623, -0.035

In [35]:
res = index.query(vector=xq, top_k=5, include_metadata=True)

In [36]:
for match in res['matches']:
    print(f"{match['score']:.2f}: {match['metadata']['text']}")

0.75: Why did the world enter a global depression in 1929 ?
0.60: When was `` the Great Depression '' ?
0.37: What crop failure caused the Irish Famine ?
0.32: What were popular songs and types of songs in the 1920s ?
0.32: When did World War I start ?
