In [1]:
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

In [2]:
# Read the paper from disk and report how many documents were loaded,
# plus the character count of the first one.
pdf_path = "/Users/luisbarajas/Documents/AGI/Papers/imagenet-classification-with-deep-convolutional-neural-networks.pdf"
loader = PyPDFLoader(pdf_path)
data = loader.load()
print(f'You have {len(data)} document(s) in your data')
print(f'There are {len(data[0].page_content)} characters in your document')

You have 9 document(s) in your data
There are 3461 characters in your document


In [31]:
# Split the loaded documents using a 1000-character chunk size and no
# overlap, then show the chunk count followed by every chunk's text.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
texts = text_splitter.split_documents(data)
print(len(texts))
for chunk in texts:
    print(chunk.page_content)


42
ImageNet Classiﬁcation with Deep Convolutional
Neural Networks
Alex Krizhevsky
University of Toronto
kriz@cs.utoronto.caIlya Sutskever
University of Toronto
ilya@cs.utoronto.caGeoffrey E. Hinton
University of Toronto
hinton@cs.utoronto.ca
Abstract
We trained a large, deep convolutional neural network to classify the 1.2 million
high-resolution images in the ImageNet LSVRC-2010 contest into the 1000 dif-
ferent classes. On the test data, we achieved top-1 and top-5 error rates of 37.5%
and 17.0% which is considerably better than the previous state-of-the-art. The
neural network, which has 60 million parameters and 650,000 neurons, consists
of ﬁve convolutional layers, some of which are followed by max-pooling layers,
and three fully-connected layers with a ﬁnal 1000-way softmax. To make train-
ing faster, we used non-saturating neurons and a very efﬁcient GPU implemen-
tation of the convolution operation. To reduce overﬁtting in the fully-connected
layers we employed a recently-devel

In [4]:
from langchain.vectorstores import  Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings
from openai import OpenAI
import pinecone
import os
from dotenv import load_dotenv

In [5]:
# load_dotenv() runs first (presumably to populate the environment from a
# local .env file); each setting is then read with os.getenv, which yields
# None when the variable is not set.
load_dotenv()
OPENAI_API_KEY, PINECONE_API_KEY, PINECONE_API_ENV = (
    os.getenv(var_name)
    for var_name in ('OPENAI_API_KEY', 'PINECONE_API_KEY', 'PINECONE_API_ENV')
)

In [19]:
# Embedding model name shared by every embeddings request in this notebook.
MODEL = "text-embedding-3-small"

# OpenAI client authenticated with the key read from the environment.
client = OpenAI(api_key=OPENAI_API_KEY)

# Smoke test: request an embedding for the third chunk of the split paper.
sample_text = texts[2].page_content
res = client.embeddings.create(model=MODEL, input=sample_text)


In [20]:
# Collect the embedding vector of every record in the response and print them.
embeds = list(map(lambda item: item.embedding, res.data))
print(embeds)

[[0.02109885774552822, -0.010409812442958355, -0.0072544775903224945, 0.002194772707298398, -0.004258305765688419, -0.015089759603142738, 0.018328864127397537, 0.024304453283548355, -0.045704882591962814, 0.056472115218639374, 0.012118719518184662, -0.021791355684399605, -0.01961333677172661, -0.03551845625042915, 0.0033005359582602978, 0.013682425022125244, 0.021523291245102882, 0.03375370427966118, 0.005819219164550304, -3.2308147638104856e-05, 0.002589887473732233, -0.016139676794409752, 0.03009016439318657, -0.027454204857349396, 0.07434303313493729, 0.016396570950746536, -0.0010652744676917791, 0.009337556548416615, 0.039740461856126785, -0.006595487240701914, 0.009130924008786678, -0.015815766528248787, -0.10490231215953827, -0.01357073150575161, 0.004917296115309, 0.021713171154260635, -0.012051703408360481, 0.034870635718107224, -0.05137890204787254, 0.044409241527318954, -0.0031357884872704744, 0.002577322069555521, -0.032703787088394165, -0.02613622322678566, 0.00532497651875

In [21]:
import time
from pinecone import Pinecone
from pinecone import ServerlessSpec

In [22]:
# Pass the key that was read from the environment explicitly instead of an
# empty string literal, so the credential source is visible in the notebook
# and no secret ever needs to be typed into a cell.
pc = Pinecone(api_key=PINECONE_API_KEY)
# NOTE(review): Pinecone's documentation spells cloud providers in lowercase
# ("gcp"); confirm that the uppercase "GCP" is accepted before relying on it.
spec = ServerlessSpec(cloud="GCP", region="us-central1")
# Name of the index that the following cells create, connect to and query.
index_name = 'reto'

In [23]:
# Create the index only when it does not exist yet (expected on a first run).
if index_name not in pc.list_indexes().names():
    # The client's keyword for the index name is `name`; passing `index_name=`
    # would fail with a TypeError the first time this branch is taken.
    pc.create_index(
        name=index_name,
        dimension=1536,  # dimensionality of text-embedding-3-small
        metric='cosine',
        spec=spec
    )
    # Poll once per second until the new index reports that it is ready.
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

In [24]:
# Open a handle to the index, pause for one second, then display its
# statistics as the cell's output.
index = pc.Index(index_name)
time.sleep(1)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.01,
 'namespaces': {'': {'vector_count': 1000}},
 'total_vector_count': 1000}

In [29]:
from datasets import load_dataset

# Fetch the TREC dataset, restricted to the first 1000 rows of the train
# split, and print its summary.
trec = load_dataset(
    'trec',
    split='train[:1000]',
)
print(trec)

Dataset({
    features: ['text', 'coarse_label', 'fine_label'],
    num_rows: 1000
})


In [39]:
from tqdm.auto import tqdm

batch_size = 32  # number of lines embedded and upserted per request

# Read the text column once up front instead of evaluating trec['text']
# several times on every iteration of the loop.
trec_lines = trec['text']
num_lines = len(trec_lines)

for i in tqdm(range(0, num_lines, batch_size)):
    # End position of this batch, clamped so the last batch may be shorter.
    i_end = min(i + batch_size, num_lines)
    lines_batch = trec_lines[i:i_end]
    # IDs are the row offsets as strings, so they are unique per line and
    # re-running the cell writes to the same IDs again.
    ids_batch = [str(n) for n in range(i, i_end)]
    # One embeddings request per batch; one vector per returned record.
    res = client.embeddings.create(input=lines_batch, model=MODEL)
    embeds = [record.embedding for record in res.data]
    # Store the original text as metadata next to each vector.
    meta = [{'text': line} for line in lines_batch]
    to_upsert = list(zip(ids_batch, embeds, meta))
    # Upsert the (id, vector, metadata) triples to Pinecone.
    index.upsert(vectors=to_upsert)

  0%|          | 0/32 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [34]:
# Embed a free-text question with the same model used for the indexed lines
# and print the resulting query vector.
query = "What caused the 1929 Great Depression?"

query_response = client.embeddings.create(model=MODEL, input=query)
xq = query_response.data[0].embedding
print(xq)

[-0.048548728227615356, -0.005683126859366894, 0.013807527720928192, 0.03811154142022133, 0.0195104219019413, -0.027338311076164246, 0.017039496451616287, 0.06036963313817978, -0.03662898764014244, 0.017592983320355415, 0.025895291939377785, -0.0006900057196617126, -0.014074387028813362, -0.04728361591696739, -0.017207520082592964, -0.06088358536362648, 0.011415672488510609, 0.04068130627274513, -0.05258127674460411, 0.008771782740950584, 0.03058016486465931, 0.03253713622689247, -0.04431850463151932, -0.040523163974285126, -0.00894968956708908, 0.032616205513477325, 0.013669155538082123, -0.05629754811525345, -0.005767138209193945, 0.011633113957941532, 0.0161993820220232, -0.015517407096922398, 0.0450301319360733, 0.0020434546750038862, -0.02937435358762741, -0.05254174396395683, -0.04190688207745552, 0.011623229831457138, -0.02834644913673401, -0.013767993077635765, 0.028702260926365852, -0.044595248997211456, 0.02136855758726597, -0.020716233178973198, -0.007995912805199623, -0.035

In [35]:
# Query the index with the question vector, asking for the top 5 matches
# together with their metadata. The stray bare name `pr` that followed this
# statement was removed: it is undefined and raised NameError.
res = index.query(vector=xq, top_k=5, include_metadata=True)

In [36]:
# Print one line per match: the score to two decimals, then the stored text.
for hit in res['matches']:
    print(f"{hit['score']:.2f}: {hit['metadata']['text']}")

0.75: Why did the world enter a global depression in 1929 ?
0.60: When was `` the Great Depression '' ?
0.37: What crop failure caused the Irish Famine ?
0.32: What were popular songs and types of songs in the 1920s ?
0.32: When did World War I start ?
