## Installing Dependencies

In [None]:
# We'll need to install the Pinecone client
!pip install pinecone-client
!pip install openai

!pip install tiktoken
!pip install pymupdf

In [2]:
import openai
import tiktoken

from typing import List, Iterator
import pandas as pd
import numpy as np
import os

# Pinecone's client library for Python
import pinecone

#embeddings model, this can be changed to the embedding model of your choice
EMBEDDING_MODEL = "text-embedding-ada-002"


In [3]:
client = openai.OpenAI(api_key=api_key)

In [4]:
import requests
import fitz
import io

url = "https://s2.q4cdn.com/299287126/files/doc_financials/2023/q1/Q1-2023-Amazon-Earnings-Release.pdf"
request = requests.get(url)
filestream = io.BytesIO(request.content)
with fitz.open(stream=filestream, filetype="pdf") as doc:
    text = ""
    for page in doc:
        text += page.get_text()
print(text[:10])

AMAZON.COM


## Document Embedding And Tokenizing

In [46]:
def get_embedding(text, model="text-embedding-ada-002"):
  text = text.replace("\n", " ")
  return client.embeddings.create(input = [text], model=model).data[0].embedding

def tokenize(text,max_tokens) -> pd.DataFrame:
    """ Function to split the text into chunks of a maximum number of tokens """

    # Load the cl100k_base tokenizer which is designed to work with the ada-002 model
    tokenizer = tiktoken.get_encoding("cl100k_base")

    df=pd.DataFrame(['0',text]).T
    df.columns = ['title', 'text']

    # Tokenize the text and save the number of tokens to a new column
    df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))

    # Visualize the distribution of the number of tokens per row using a histogram
    # df.n_tokens.hist()

    ################################################################################
    # Step 8
    ################################################################################

    shortened = []

    # Loop through the dataframe
    for row in df.iterrows():

        # If the text is None, go to the next row
        if row[1]['text'] is None:
            continue

        # If the number of tokens is greater than the max number of tokens, split the text into chunks
        if row[1]['n_tokens'] > max_tokens:
            shortened += split_into_many(row[1]['text'], tokenizer, max_tokens)

        # Otherwise, add the text to the list of shortened texts
        else:
            shortened.append(row[1]['text'])


    df = pd.DataFrame(shortened, columns=['text'])
    df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))


    df['embeddings'] = df.text.apply(lambda x: get_embedding(x))

    return df




def split_into_many(text: str, tokenizer: tiktoken.Encoding, max_tokens: int = 1024) -> list:
    """ Function to split a string into many strings of a specified number of tokens """

    # Split the text into sentences
    sentences = text.split(' ')

    # Get the number of tokens for each sentence
    n_tokens = [len(tokenizer.encode(" " + sentence))
                for sentence in sentences]

    chunks = []
    tokens_so_far = 0
    chunk = []

    # Loop through the sentences and tokens joined together in a tuple
    for sentence, token in zip(sentences, n_tokens):

        chunk.append(sentence)
        tokens_so_far += token + 1

        # If the number of tokens so far plus the number of tokens in the current sentence is greater
        # than the max number of tokens, then add the chunk to the list of chunks and reset
        # the chunk and tokens so far
        if tokens_so_far + token > max_tokens:
            chunks.append(" ".join(chunk))
            chunk = []
            tokens_so_far = 0

        # If the number of tokens in the current sentence is greater than the max number of
        # tokens, go to the next sentence
        # if token > max_tokens:
        #     continue



    return chunks

In [47]:
df = tokenize(text, 100)

In [48]:
df

Unnamed: 0,text,n_tokens,embeddings
0,AMAZON.COM ANNOUNCES FIRST QUARTER RESULTS\nSE...,73,"[-0.004386260639876127, -0.03638501837849617, ..."
1,"billion in the first quarter, compared with $1...",62,"[-0.022371012717485428, -0.01973203755915165, ..."
2,America segment sales increased 11% year-over-...,66,"[0.014955701306462288, -0.03206072375178337, 0..."
3,billion.\n•\nOperating income increased to $4....,64,"[-0.0075391982682049274, -0.03265468031167984,..."
4,"was $0.9 billion, compared with operating loss...",67,"[0.0008032217156141996, -0.026385251432657242,..."
...,...,...,...
195,"Customer accounts exclude certain customers, i...",54,"[-0.00028259423561394215, -0.02128153480589389..."
196,during the preceding twelve-month period.\nSel...,56,"[-0.01613679528236389, -0.023295223712921143, ..."
197,Customers\n•\nReferences to AWS customers mean...,56,"[-0.0028094155713915825, -0.021578947082161903..."
198,Customers are considered active when they have...,57,"[0.002970371162518859, -0.023911403492093086, ..."


In [49]:
df['vector_id'] = df.index
df['vector_id'] = df['vector_id'].apply(str)
df['content_vector'] = df.embeddings


## Adding Index And Uploading Data

In [50]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=api_key_pc)

In [51]:
# Models a simple batch generator that make chunks out of an input DataFrame
class BatchGenerator:


    def __init__(self, batch_size: int = 10) -> None:
        self.batch_size = batch_size

    # Makes chunks out of an input DataFrame
    def to_batches(self, df: pd.DataFrame) -> Iterator[pd.DataFrame]:
        splits = self.splits_num(df.shape[0])
        if splits <= 1:
            yield df
        else:
            for chunk in np.array_split(df, splits):
                yield chunk

    # Determines how many chunks DataFrame contains
    def splits_num(self, elements: int) -> int:
        return round(elements / self.batch_size)

    __call__ = to_batches

df_batcher = BatchGenerator(300)

In [52]:
index_name = 'amazon-q1-2023-3'

# Check whether the index with the same name already exists - if so, delete it
if index_name in pc.list_indexes():
    pc.delete_index(index_name)

pc.create_index(index_name, dimension=len(df['content_vector'][0]),spec=ServerlessSpec(
        cloud="aws",
        region="us-west-2"
    ))
index = pc.Index(name=index_name)



In [53]:
# Upsert content vectors
print("Uploading vectors to content namespace..")
for batch_df in df_batcher(df):
    index.upsert(vectors=zip(batch_df.vector_id, batch_df.content_vector))

Uploading vectors to content namespace..


In [61]:
index.describe_index_stats()


{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 200}},
 'total_vector_count': 200}

## Document Querying

In [62]:
content_mapped = dict(zip(df.vector_id,df.text))

In [63]:
def query_article(query, top_k=5):
    '''Queries an article using its title in the specified
     namespace and prints results.'''

    # Create vector embeddings based on the title column
    # embedded_query = openai.Embedding.create(
    #                                         input=query,
    #                                         model=EMBEDDING_MODEL,
    #                                         )["data"][0]['embedding']
    embedded_query = client.embeddings.create(input = [query], model="text-embedding-ada-002").data[0].embedding

    # Query namespace passed as parameter using title vector
    query_result = index.query(vector=embedded_query,
                                      top_k=5)

    # Print query results
    print(f'\nMost similar results to {query}')
    if not query_result.matches:
        print('no query result')

    matches = query_result.matches
    ids = [res.id for res in matches]
    scores = [res.score for res in matches]
    df = pd.DataFrame({'id':ids,
                       'score':scores,
                       'content': [content_mapped[_id] for _id in ids],
                       })

    counter = 0
    for k,v in df.iterrows():
        counter += 1
        print(f'{v.content} (score = {v.score})')

    print('\n')

    return df

In [64]:
query_output = query_article("""How did Amazon do?""",top_k=5)



Most similar results to How did Amazon do?
creators. For example, in the past quarter, Amazon:
•
Continued to delight customers with convenient delivery options and broad selection. Nearly 26 million customers 
ordered items with Same-Day Delivery in the quarter, an increase of 50% compared to last year. 
•
Increased (score = 0.850620866)
across all of its business areas. The company builds new products and services that 
customers ask for, and also invents new ones that customers didn’t know they wanted but make their lives or businesses better 
in some meaningful way. For example, this past quarter, Amazon: (score = 0.841900885)
AMAZON.COM ANNOUNCES FIRST QUARTER RESULTS
SEATTLE—(BUSINESS WIRE) April 27, 2023—Amazon.com, Inc. (NASDAQ: AMZN) today announced financial results 
for its first quarter ended March 31, 2023. 
•
Net sales increased 9% to $127.4 (score = 0.829579175)
(6) 
 1 
AWS
 37 
 16 
Consolidated
 7 
 9 
Net sales mix:
North America
 59 %
 60 %
International
 25 
 23 


In [65]:
query_output = query_article("""Reviews from Amazon""",top_k=5)



Most similar results to Reviews from Amazon
Reviews from Amazon, a new capability that enables Buy with Prime merchants to display customer ratings 
and reviews from Amazon.com within their own online stores at no additional cost. Nearly 20% of Buy with Prime 
merchants are already using this feature, and the early feedback is (score = 0.835087895)
principles: customer obsession rather than competitor focus, passion for invention, commitment to 
operational excellence, and long-term thinking. Amazon strives to be Earth’s Most Customer-Centric Company, Earth’s Best 
Employer, and Earth’s Safest Place to Work. Customer reviews, 1-Click (score = 0.804236233)
books. The company debuted additional Amazon Original films and series, including action comedy Shotgun 
Wedding, starring Jennifer Lopez and Josh Duhamel; romantic comedy Somebody I Used to Know, directed by Dave 
Franco; comedic thriller The Consultant, starring Christoph Waltz; and Swarm, from (score = 0.801856577)
debuted at No. 