In [1]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import T5Tokenizer, T5Model, T5ForConditionalGeneration, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM

from transformers import AutoTokenizer
import os
import nltk
from nltk.data import find

import transformers

import re
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Flag read by the index set-up cell further down: True asks for the existing
# Pinecone index to be deleted first; False keeps it, so the upload cell adds
# its vectors to whatever is already stored.
delete = False

In [3]:
# Code borrowed and adapted from: https://www.pinecone.io/learn/retrieval-augmented-generation/

# Import an encoder for sentence-level embeddings
from sentence_transformers import SentenceTransformer

# Smoke test: encode two short sentences and look at the resulting array shape
# (one row per sentence, one column per embedding dimension).
sentences = [
    "This is an example sentence",
    "Each sentence is converted",
]

# `encoder` is reused by embed_docs below; `embeddings` is kept around because
# the index set-up cell reads the embedding width from it.
encoder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = encoder.encode(sentences)
print(embeddings.shape)

(2, 384)


In [4]:
from typing import List
import numpy as np

def embed_docs(docs: List[str]) -> List[List[float]]:
    """Embed a list of texts with the module-level ``encoder``.

    Args:
        docs: The strings to encode.

    Returns:
        The result of ``encoder.encode(docs)`` converted with ``.tolist()``,
        i.e. one list of floats per input string.
    """
    return encoder.encode(docs).tolist()

In [5]:
import pinecone

In [6]:
# Pinecone credentials are read from the environment so that no secret is
# stored in the notebook. Get the key from app.pinecone.io and export it as
# PINECONE_API_KEY before starting the kernel.
# NOTE(review): an API key was previously hardcoded in this cell; because it
# has been saved with the notebook it should be revoked and replaced.
api_key = os.environ.get("PINECONE_API_KEY")
if not api_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; export it before running this cell."
    )

# Pinecone environment name - shown next to the API key in the console.
# Defaults to the value this notebook has been using.
env = os.environ.get("PINECONE_ENVIRONMENT", "gcp-starter")

pinecone.init(
    api_key=api_key,
    environment=env
)

In [7]:
import time

# Make sure the Pinecone index exists before anything is uploaded to it.
index_name = 'npc-rag'

index_exists = index_name in pinecone.list_indexes()

# When a fresh start is requested, drop the current index (if there is one).
if delete and index_exists:
    pinecone.delete_index(index_name)
    index_exists = False

# Create the index whenever it is missing - either because it was just deleted
# or because it never existed. Previously the create call was nested under the
# "already exists" check, so a missing index was never created.
if not index_exists:
    pinecone.create_index(
        name=index_name,
        # Dimension must match the embedding model; ask the encoder directly so
        # this cell does not depend on the `embeddings` variable, which the
        # upload cell may rebind.
        dimension=encoder.get_sentence_embedding_dimension(),
        # Using cosine similarity to query the docs
        metric='cosine',
    )

    # wait for the new index to finish initialization
    while not pinecone.describe_index(index_name).status['ready']:
        time.sleep(1)

# Otherwise (index present and delete is False) we just add to the existing DB

In [8]:
# Now we need to encode the data we want to use and upload that to the pinecone db
from tqdm.auto import tqdm

# Keeping defaults from code
batch_size = 2  # can increase but needs larger instance size otherwise instance runs out of memory
vector_limit = 1000  # NOTE(review): not referenced anywhere in this cell

# Record IDs are the position in `answers` shifted by this offset.
# NOTE(review): the reason for 900 is not visible here - presumably it keeps
# these IDs apart from records already in the index; confirm.
id_offset = 900

answers = ["The magic key is under the bridge", "The sword is in the cave next to town",
          "The magic wand is located at Olivander's wand shop", "The shield is in the spirit temple",
          "When paired together the sword and the shield can defeat the boss"]

index_name = 'npc-rag'
index = pinecone.Index(index_name)

# Embed and upload the answers batch by batch.
for i in tqdm(range(0, len(answers), batch_size)):
    # find end of batch
    i_end = min(i + batch_size, len(answers))
    # texts in this batch (sliced once and reused below)
    texts = answers[i:i_end]
    # create IDs batch
    ids = [str(x + id_offset) for x in range(i, i_end)]
    # create metadata batch - the original text is stored next to its vector
    metadatas = [{'text': text} for text in texts]
    # create embeddings; a batch-local name is used so the module-level
    # `embeddings` array from the encoder smoke test is not overwritten
    batch_embeddings = embed_docs(texts)
    # create records list of (id, vector, metadata) tuples for upsert
    records = list(zip(ids, batch_embeddings, metadatas))
    # upsert to Pinecone
    index.upsert(vectors=records)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:02<00:00,  1.01it/s]


In [14]:
# Display the index statistics; the output includes the per-namespace and
# total vector counts, which shows how many records are stored.
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 6e-05,
 'namespaces': {'': {'vector_count': 6}},
 'total_vector_count': 6}

In [15]:
# The query text. It is wrapped in a list because embed_docs takes a list of
# strings; the single resulting vector is picked out when querying.
question = ["Where is the magic key?"]

In [16]:
# Embed the question; embed_docs returns one vector per input, so take the first.
query_vec = embed_docs(question)[0]

# Ask Pinecone for up to 10 nearest records and include their stored metadata
# (the original answer text) in the response.
res = index.query(
    vector=query_vec,
    top_k=10,
    include_metadata=True,
)

# Leave the response as the cell's last expression so it is displayed.
res

{'matches': [{'id': '900',
              'metadata': {'text': 'The magic key is under the bridge'},
              'score': 0.758217931,
              'values': []},
             {'id': '0',
              'metadata': {'text': 'The magic key is under the bridge'},
              'score': 0.758217931,
              'values': []},
             {'id': '902',
              'metadata': {'text': "The magic wand is located at Olivander's "
                                   'wand shop'},
              'score': 0.497206271,
              'values': []},
             {'id': '901',
              'metadata': {'text': 'The sword is in the cave next to town'},
              'score': 0.320037067,
              'values': []},
             {'id': '903',
              'metadata': {'text': 'The shield is in the spirit temple'},
              'score': 0.312742978,
              'values': []},
             {'id': '904',
              'metadata': {'text': 'When paired together the sword and the '
             

In [None]:
# Remove the record with id '0'. In the query output above it carries the same
# text and score as id '900', i.e. it duplicates that record.
index.delete(ids=['0'])