In [1]:
# Load API key

from dotenv import load_dotenv
import os

## Error Handling for API key retreival
try: 
            
    load_dotenv()

    PC_KEY = os.getenv('PINECONE_API_KEY')
    print(PC_KEY)

    if not PC_KEY:
        raise ValueError("PINECONE_API_KEY not found in .env file")


except Exception as e:
    print(f"Error: {e}")



623494db-40e1-44ee-9890-26f24e1dd55b


In [2]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=PC_KEY)

  from tqdm.autonotebook import tqdm


In [3]:
## View current indexes in Pinecone 
 
print(f"Indexes: {pc.list_indexes().names()}")
print(f"Collections: {pc.list_collections().names()}")

Indexes: ['idx-one', 'idx-two']
Collections: []


In [4]:
index_name = "idx-two"
dim = 768

## Embedding model: legal-bert-base-uncased (768 dimensions)
# https://huggingface.co/nlpaueb/legal-bert-base-uncased

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=dim,
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws', 
            region='us-east-1'
        ) 
    ) 
else: 
    print(f'Error: Could not create index. Index with name "{index_name}" already exists. ')

Error: Could not create index. Index with name "idx-two" already exists. 


In [5]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForPreTraining, AutoModel
import torch 


tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
# model = AutoModelForPreTraining.from_pretrained("nlpaueb/legal-bert-base-uncased")
model = AutoModel.from_pretrained("nlpaueb/legal-bert-base-uncased")


In [6]:
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [7]:
def pooler_embeding(text):
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.pooler_output.detach().cpu().numpy().tolist()[0] ## CLS is stored at the first position of a BERT Tensor


In [8]:
def get_average_last_hidden_state(text):
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad(): 
        outputs = model(**inputs)
        last_hidden_state = outputs.last_hidden_state
        avg_hidden_state = torch.mean(last_hidden_state, dim=1)
    return avg_hidden_state.detach().cpu().numpy().tolist()[0]


In [39]:
from utilities import textFileToString


def push(filepath, vector_id, metadata, dim = 768, embedding_func = None):
    
    file_text = textFileToString(filepath)
    print(file_text[:200]) ## Printing first 200 chars to check file
    vector = embedding_func(file_text) ## Converting text to a vector

    if len(vector) != dim:
        raise Exception(f"Vector dimension invalid.")
    

    upsert_data = [(vector_id, vector, metadata)]

    # Upsert the data to the Pinecone index

    index = pc.Index(index_name)
    index.upsert(upsert_data)



ImportError: cannot import name 'textFileToString' from 'utilities' (c:\Projects\legal_document_reccomender\utilities.py)

In [13]:
index_name

'idx-two'

In [37]:
## Vector 1
# File Description: Schedules of Controlled Substances: Rescheduling of Marijuana


filepath = "summaries\\2024-11137.txt"
vector_id = "2024-11137"

metadata = {

    'publication_date': '2024-05-21',
    'document_type': 'Proposed Rule', 
    'document_citation': "89 FR 44597",
    'page_start': 44597,
    'page_end':44622,
    'cfr': "21 CFR 1308", 
    'document_number': "2024-11137", 

}

push(filepath, vector_id, metadata, embedding_func=pooler_embeding)



AttributeError: module 'utilities' has no attribute 'textFileToString'

In [None]:
()