### Initialize Libraries and Keys

In [None]:
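# Install libraries into the kernel (uncomment if not already installed).
# NOTE: this notebook calls pinecone.init(), which the 3.x pinecone client
# removed, so a 2.x release of pinecone-client is required.
# %pip install "pinecone-client<3"
# %pip install tqdm
# The notebook also imports pandas, openai and langchain, and loads PDFs with
# PyPDFLoader (which needs pypdf):
# %pip install pandas openai langchain pypdf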

In [24]:
# Base Python data handling environment imports 
import pandas as pd
import os
from tqdm.auto import tqdm
import time
import uuid

# Pinecone is a cloud-based Vector Database we'll use 
# to store embeddings
import pinecone

# OpenAI is used for the embedding LLM and GenAI model 
# used to generate responses
import openai

# Langchain is middleware that ties together the components 
# of the embedding and retrieval pipelines 

# The embedding chain creates searchable vectors of our data
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
# from langchain.vectorstores import DocArrayInMemorySearch
from langchain.vectorstores import Pinecone

# A link in the chain to operate a chat session
from langchain.chat_models import ChatOpenAI

# We'll maintain some memory of the chat so follow-up questions
# will be context-sensitive
from langchain.chains.conversation.memory \
import ConversationBufferWindowMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.chains import RetrievalQA

### Get Environment Variables

When using VSCode, install the dotenv extension and create a `.env` file with these contents:

OPENAI_KEY=YOUR_OPENAI_API_KEY

PINECONE_KEY=YOUR_PINECONE_KEY

In [25]:
# Secrets are read from the environment (e.g. a .env file); never hard-code them.
OPENAI_KEY = os.getenv("OPENAI_KEY")
openai.api_key = OPENAI_KEY
EMBEDDING_MODEL = "text-embedding-ada-002"
GENAI_MODEL = 'gpt-3.5-turbo'

PINECONE_KEY = os.getenv("PINECONE_KEY")
PINECONE_ENV = "gcp-starter"
PINECONE_INDEX_NAME = "default"  # this will be created below

# Report only whether each key was found. Printing the key itself would store
# the secret in the saved notebook output.
print(f"OPENAI_KEY loaded: {bool(OPENAI_KEY)}")
print(f"PINECONE_KEY loaded: {bool(PINECONE_KEY)}")

[output redacted: an OpenAI API key was printed here. Revoke and rotate that key.]


### Create the Pinecone Vector Database if it does not exist

In [26]:
# Connect to Pinecone with the credentials loaded from the environment.
pinecone.init(api_key=PINECONE_KEY, environment=PINECONE_ENV)
index_list = pinecone.list_indexes()

# Create the index only when one with this exact name is missing. Testing for
# an empty list instead would skip creation whenever any other index exists,
# and the describe_index() call below would then fail.
if PINECONE_INDEX_NAME not in index_list:
    print("Creating index...")
    # 1536 is the vector size produced by text-embedding-ada-002.
    pinecone.create_index(PINECONE_INDEX_NAME, dimension=1536, metric='dotproduct')

print(pinecone.describe_index(PINECONE_INDEX_NAME))
index = pinecone.Index(PINECONE_INDEX_NAME)

Creating index...
IndexDescription(name='default', metric='dotproduct', replicas=1, dimension=1536.0, shards=1, pods=1, pod_type='starter', status={'ready': True, 'state': 'Ready'}, metadata_config=None, source_collection='')


### Generate Embedding Index

In [27]:
# Shared OpenAI embedding client (the model name comes from EMBEDDING_MODEL).
# It is used twice: to index the grounding documents, and later to embed
# each query so matching documents can be retrieved for the LLM prompt.
embed = OpenAIEmbeddings(model=EMBEDDING_MODEL, openai_api_key=OPENAI_KEY)

### Define Function to Split PDF File into Vectors & UPSERT vectors to Pinecone

In [51]:
def load_db(file, batch_size=20, pause_seconds=2):
    """Split a PDF into text chunks, embed them, and upsert them to Pinecone.

    Relies on the notebook-level ``embed`` (embedding client) and ``index``
    (Pinecone index handle) objects created in earlier cells.

    Args:
        file: Path of the PDF file to load.
        batch_size: Number of chunks sent per embedding request and upsert.
        pause_seconds: Seconds to sleep after each batch to pace requests.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    print(f"Splitting and vectorizing file: {file}")

    # Load the document from disk
    documents = PyPDFLoader(file).load()

    # Split the document into overlapping text chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    docs = text_splitter.split_documents(documents)

    # OpenAI has rate limits, so chunks are embedded in batches with a pause
    # between them to slow the pace of embedding requests.
    for start in tqdm(range(0, len(docs), batch_size)):
        batch = docs[start:start + batch_size]

        # One random UUID per chunk serves as the vector key.
        ids = [str(uuid.uuid4()) for _ in batch]
        context_array = [chunk.page_content for chunk in batch]

        # When querying the vector DB for nearest vectors, the metadata is what
        # is returned and added to the LLM prompt (the "grounding knowledge").
        meta_data = [
            {
                'source': chunk.metadata["source"],
                # Stored as page + 1; presumably the loader's page index is
                # zero-based (confirm against PyPDFLoader).
                'page': chunk.metadata["page"] + 1,
                'context': chunk.page_content,
            }
            for chunk in batch
        ]

        # Embed the whole batch in one request
        emb_vectors = embed.embed_documents(context_array)

        # Add the keys, embeddings, and associated metadata to the vector DB.
        # Materialised as a list so the client receives a concrete sequence.
        to_upsert = list(zip(ids, emb_vectors, meta_data))
        index.upsert(vectors=to_upsert)

        # Pause after each batch to avoid rate limits
        time.sleep(pause_seconds)


In [None]:
# Index the emergency AD PDF (the path is relative to the notebook's working directory)
load_db('files/2019-21-51_Emergency.pdf')


### Submit a simple query to the Vector Index to ensure it works!

In [53]:
# Wrap the Pinecone index as a LangChain vector store. "context" is the
# metadata field that load_db() fills with each chunk's text.
vectorstore = Pinecone(index, embed, "context")
# Ask some question that's answerable with the content added to the Vector DB
query = "What model aircraft is affected by directive 2019-21-51?"
# Request the 3 closest matches for the query
vectorstore.similarity_search(query, k=3)


[Document(page_content='1  FAA \nAviation Safety  EMERGENCY \nAIRWORTHINESS DIRECTIVE  \nwww.faa.gov/aircraft/safety/alerts/  \nDATE: October  23, 2019  \nAD #: 2019 -21-51 \nEmergency Airworthiness Directive (AD) 2019- 21-51 is sent to owners and operators of \nGeneral Electric Company  (GE)  Model GE90 -115B  model turbofan e ngines  with certain  engine \nserial  numbers . \nBackground  \nThis emergency AD was prompted by an  event  that occurred  on October 20, 2019,  in which  a \nBoeing Model 777-300ER  airplane powered by GE GE90 -115B  model turbofan engines experienced \nan uncontained high-pressure turbine (HPT) failure  that resulted in an aborted takeoff . Debris \nimpacted the aircraft fuselage and the other engine. Uncontained HPT failure , if not addressed, could \nresult in release of high -energy debris, damage  to the engine, damage to the airplane, and possible \nloss of the airplane . \nRelevant Service Information  \nThe FAA reviewed GE Alert Service Bulletin GE90 

### Create a GPT 3.5 Turbo Chatbot with a 5-response memory

In [54]:
# Create a reference to the OpenAI chat model. Temperature 0 keeps answers
# as repeatable as the model allows.
llm = ChatOpenAI(
    openai_api_key=OPENAI_KEY,
    model_name=GENAI_MODEL,
    temperature=0.0)

# Keep the last 5 exchanges so follow-up questions are context-sensitive.
# ConversationalRetrievalChain reads prior turns from the "chat_history" key,
# so the memory must publish its buffer under that name.
conv_mem = ConversationBufferWindowMemory(
    memory_key='chat_history',
    k=5,
    return_messages=True)

# Create the chain to manage the chat session. A plain RetrievalQA chain
# answers every question in isolation; this chain is given the memory object
# so earlier turns are available when a follow-up question is asked.
# qa.run("...") still takes a single question string and returns the answer.
qa = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=vectorstore.as_retriever(),
    memory=conv_mem)

### Now have a conversation about the documents that were added to the grounding data vector database

In [55]:
# Same question as the similarity-search check, now answered by the LLM chain
qa.run("What model aircraft is affected by directive 2019-21-51?")

'Directive 2019-21-51 affects the Boeing Model 777-300ER airplane powered by GE GE90-115B model turbofan engines.'

In [56]:
# Second question about the directive, with an output-format instruction in the prompt
qa.run("what are the unsafe conditions? Format as a bulleted list")

'- Uncontained high-pressure turbine (HPT) failure\n- Release of high-energy debris\n- Damage to the engine\n- Damage to the airplane\n- Possible loss of the airplane'

In [None]:
# Question unrelated to the indexed FAA directive; presumably a check of how
# the chain responds to off-topic input
qa.run("Does dell make surfboards?")

In [None]:
# Follow-up whose "they" refers to the previous question, so it can only be
# resolved if the chain has access to the chat history
qa.run("Do they make laptops?")

In [None]:
# Another question outside the content of the indexed document
qa.run("Who founded Dell computer?")