In [2]:
# Load API key

from dotenv import load_dotenv
import os

## Error Handling for API key retreival
try: 
            
    load_dotenv()

    PC_KEY = os.getenv('PINECONE_API_KEY')
    print(PC_KEY)

    if not PC_KEY:
        raise ValueError("PINECONE_API_KEY not found in .env file")


except Exception as e:
    print(f"Error: {e}")



623494db-40e1-44ee-9890-26f24e1dd55b


In [3]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=PC_KEY)

  from tqdm.autonotebook import tqdm


In [4]:
## View current indexes in Pinecone 
 
print(f"Indexes: {pc.list_indexes().names()}")
print(f"Collections: {pc.list_collections().names()}")

Indexes: ['idx-one', 'idx-two']
Collections: []


In [9]:
index_name = "idx-two"
dim = 768

## Embedding model: legal-bert-base-uncased (768 dimensions)
# https://huggingface.co/nlpaueb/legal-bert-base-uncased

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=dim,
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws', 
            region='us-east-1'
        ) 
    ) 
else: 
    print(f'Error: Could not create index. Index with name "{index_name}" already exists. ')

Error: Could not create index. Index with name "idx-two" already exists. 


In [32]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForPreTraining
import torch 


tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
model = AutoModelForPreTraining.from_pretrained("nlpaueb/legal-bert-base-uncased")


In [6]:
model

BertForPreTraining(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [34]:
def embed(str):
    inputs = tokenizer(str, return_tensors='pt', max_length=512, truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    last_hidden_states = outputs.last_hidden_state
    vector = last_hidden_state[:, 0, :].squeeze().numpy()
    return vector.tolist() ## return as list (len 768)

In [35]:
from utils import Utils

def push(filepath, vector_id, metadata, dim = 768):
    
    file_text = Utils.textFileToString(filepath)
    print(file_text[:200]) ## Printing first 200 chars to check file
    vector = embed(file_text) ## Converting text to a vector

    if len(vector) != dim:
        raise Exception(f"Vector dimension invalid.")
    

    upsert_data = [(vector_id, vector, metadata)]

    # Upsert the data to the Pinecone index

    index = pc.Index(index_name)
    index.upsert(upsert_data)



In [21]:
index_name

'idx-two'

In [36]:
## Vector 1
# File Description: Schedules of Controlled Substances: Rescheduling of Marijuana


filepath = "cleaned_texts\\2024-11137.txt"
vector_id = "2024-11137"

metadata = {

    'publication_date': '2024-05-21',
    'document_type': 'Proposed Rule', 
    'document_citation': "89 FR 44597",
    'page_start': 44597,
    'page_end':44622,
    'cfr': "21 CFR 1308", 
    'document_number': "2024-11137", 

}

push(filepath, vector_id, metadata)



DEPARTMENT JUSTICE Drug Enforcement Administration 21 CFR Part 1308 [ Docket . DEA–1362 ; A.G. Order . 5931–2024 ] Schedules Controlled Substances : Rescheduling Marijuana AGENCY : Drug Enforcement Ad


AttributeError: 'BertForPreTrainingOutput' object has no attribute 'last_hidden_state'

In [None]:
()