### refactor metadata according to what is specified

In [16]:
import os
import textwrap
from uuid import uuid4
from dotenv import load_dotenv
import pinecone
import openai
from langchain.text_splitter import RecursiveCharacterTextSplitter

from util.files import save_json
from util.RateLimiter import Api
from util.MSGraphAPI import getActiveDirectoryToken, getUsers, listUserFileMetadata, listSiteFileMetadata, downloadFileContent
from util.Pinecone import getAllVectors
load_dotenv()

True

In [3]:
# openai setup
openai.organization = os.getenv("openai_org")
openai.api_key = os.getenv("openai_key")

In [4]:
# pinecone setup
vdb_index_name = os.getenv("pinecone_idx")
pinecone.init(
  api_key=os.getenv("pinecone_key"),
  environment=os.getenv("pinecone_env")
)
index = pinecone.Index(vdb_index_name)

In [5]:
# get all vectors
all_vectors = getAllVectors(vdb_index_name, 999, include_values=False)
matches = all_vectors['matches']
vector_ids = [v['id'] for v in matches]
print(f"num vectors: {len(matches)}\n")
existing_document_ids = {match['metadata']['document_id'] for match in matches}

# get Active Directory Application-Level Token
active_directory_token = getActiveDirectoryToken(
  os.getenv("active_directory_client_id"),
  os.getenv("active_directory_client_secret"),
  os.getenv("active_directory_tenant_id")
)

num vectors: 0



In [6]:
# get sharepoint metadata
users = getUsers(active_directory_token)
all_files = []
for user in users:
  user_files = listUserFileMetadata(active_directory_token, user['id'])
  all_files.extend(user_files)

site_files = listSiteFileMetadata(active_directory_token)
all_files.extend(site_files)

In [7]:
## download content as text for each document

for file in all_files:
  # add 'raw_text' attribute to each file
  raw_text = downloadFileContent(active_directory_token, file['drive_id'], file['item_id'])
  if (raw_text):
    file['raw_text'] = raw_text
  else:
    print('raw_text was falsey:', raw_text)
    exit()

In [14]:
### Chunk content
text_splitter = RecursiveCharacterTextSplitter(
  chunk_size = 1000,
  chunk_overlap  = 200,
)
for file in all_files:
  file['chunks'] = text_splitter.split_text(file['raw_text'])

In [15]:
for chunk in all_files[7]['chunks']:
  print(len(chunk))

518705


In [13]:
metadata_representation:str = textwrap.dedent(
f"""DOCUMENT INFORMATION:
  file name: {file['name']}
  Active Directory item id (document id): {file['item_id']}
  drive id: {file['drive_id']}
  location type: {file['location']}
---------------------
""")
chunk = metadata_representation + all_files[0]['chunks'][0]
print(len(chunk))

101197


In [12]:

# hold all pinecone payloads to be uploaded
payloads = []

# OpenAI embedding rate limit with 50% buffer
rpm_limit_openai = 3000 * 0.75
rpm_limit_pinecone = 3000 * 0.75
openai_request = Api(average_rate_limit=rpm_limit_openai, max_retries=5)
pinecone_request = Api(average_rate_limit=rpm_limit_pinecone, max_retries=3)
for file in all_files:
  metadata_representation:str = textwrap.dedent(
  f"""DOCUMENT INFORMATION:
    file name: {file['name']}
    Active Directory item id (document id): {file['item_id']}
    drive id: {file['drive_id']}
    location type: {file['location']}
  ---------------------
  """
  )
  payload = []
  for i, chunk in enumerate(file['chunks']):
    # create vector id
    vec_id = str(uuid4())

    # fix any UNICODE errors
    chunk = chunk.encode(encoding='ASCII',errors='ignore').decode()
    # add metadata to chunk to allow for semantic lookup of metadata
    chunk_with_metadata = metadata_representation + chunk
    print(f"final chunk length to embed: {len(chunk_with_metadata)}")

    # get vector representation of text chunk
    response = openai_request.send_request(openai.Embedding.create,input=chunk_with_metadata,engine='text-embedding-ada-002')
    vector_value = response['data'][0]['embedding']  # this is a normal list

    # vector metadata as dictionary
    metadata = {
      'name': file['name'],
      'drive_id': file['drive_id'],
      'item_id': file['item_id'],
      'chunk_index': i,
    }
    if 'user' in file:
      metadata['user'] = file['user']

    # create and append vector obj to the payload
    vector_obj = (vec_id, vector_value, metadata)
    payload.append(vector_obj)

    # status update
    if (i % 50 == 0 or i == len(file['chunks']) - 1):
      print(f"File {file['item_id']}, chunk {i}")
  
  # append current payload to payloads
  payloads.append(payload)

# save payloads in case of failure
save_json("test_payloads.json",payloads)

# upload pinecone payloads
for i, payload in enumerate(payloads):
  # push batched payload to pinecone ensuring payload contains less than 80 vectors (abide by 2MB Pinecone limit)
  pinecone_request.send_payload(index.upsert, payload, payload_length_limit=80)
  print(f"submitted pinecone payload {i}")

Exception: Request failed after 5 (max) attempts:
This model's maximum context length is 8191 tokens, however you requested 29106 tokens (29106 in your prompt; 0 for the completion). Please reduce your prompt; or completion length..