## Connecting to Pinecone

In [1]:
%pip install pinecone-client -quiet
%pip install python-dotenv -quiet


Note: you may need to restart the kernel to use updated packages.



Usage:   
  c:\Users\pratham.mehta\AppData\Local\Programs\Python\Python312\python.exe -m pip install [options] <requirement specifier> [package-index-options] ...
  c:\Users\pratham.mehta\AppData\Local\Programs\Python\Python312\python.exe -m pip install [options] -r <requirements file> [package-index-options] ...
  c:\Users\pratham.mehta\AppData\Local\Programs\Python\Python312\python.exe -m pip install [options] [-e] <vcs project url> ...
  c:\Users\pratham.mehta\AppData\Local\Programs\Python\Python312\python.exe -m pip install [options] [-e] <local project path> ...
  c:\Users\pratham.mehta\AppData\Local\Programs\Python\Python312\python.exe -m pip install [options] <archive url/path> ...

no such option: -u


Note: you may need to restart the kernel to use updated packages.



Usage:   
  c:\Users\pratham.mehta\AppData\Local\Programs\Python\Python312\python.exe -m pip install [options] <requirement specifier> [package-index-options] ...
  c:\Users\pratham.mehta\AppData\Local\Programs\Python\Python312\python.exe -m pip install [options] -r <requirements file> [package-index-options] ...
  c:\Users\pratham.mehta\AppData\Local\Programs\Python\Python312\python.exe -m pip install [options] [-e] <vcs project url> ...
  c:\Users\pratham.mehta\AppData\Local\Programs\Python\Python312\python.exe -m pip install [options] [-e] <local project path> ...
  c:\Users\pratham.mehta\AppData\Local\Programs\Python\Python312\python.exe -m pip install [options] <archive url/path> ...

no such option: -u


In [2]:
## Load API key from .env


from dotenv import load_dotenv
import os

## Error Handling for API key retreival
try: 
            
    load_dotenv()

    PC_KEY = os.getenv('PINECONE_API_KEY')
    print(PC_KEY)

    if not PC_KEY:
        raise ValueError("PINECONE_API_KEY not found in .env file")


except Exception as e:
    print(f"Error: {e}")



623494db-40e1-44ee-9890-26f24e1dd55b


In [3]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=PC_KEY)


  from tqdm.autonotebook import tqdm


## Create Index 

In [4]:
## View current indexes in Pinecone 
 
print(f"Indexes: {pc.list_indexes().names()}")
print(f"Collections: {pc.list_collections().names()}")

Indexes: ['idx-one']
Collections: []


In [5]:
## This POC currently only uses 1 index. As we scale up our data and decide how we can partition different legal documents, we can scale horizontally. 
## In this demo, there are only a handful of documents, enabling efficiency with a single index


index_name = "idx-one"

## Embedding model is [BERT large model (uncased)], which outputs vectors of [1024] dimensions
## Cosine similarity so search is not skewed by magnitude

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1024,
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws', 
            region='us-east-1'
        ) 
    ) 
else: 
    print(f'Error: Could not create index. Index with name "{index_name}" already exists. ')

Error: Could not create index. Index with name "idx-one" already exists. 


## Converting text file to string 

In [6]:
def textFileToString(filepath):

    with open(filepath, 'r', encoding='utf-8') as file:
        str = file.read()

    return str






## Converting Strings to Embeddings 

In [7]:
%pip install transformers torch -Q

Note: you may need to restart the kernel to use updated packages.



Usage:   
  c:\Users\pratham.mehta\AppData\Local\Programs\Python\Python312\python.exe -m pip install [options] <requirement specifier> [package-index-options] ...
  c:\Users\pratham.mehta\AppData\Local\Programs\Python\Python312\python.exe -m pip install [options] -r <requirements file> [package-index-options] ...
  c:\Users\pratham.mehta\AppData\Local\Programs\Python\Python312\python.exe -m pip install [options] [-e] <vcs project url> ...
  c:\Users\pratham.mehta\AppData\Local\Programs\Python\Python312\python.exe -m pip install [options] [-e] <local project path> ...
  c:\Users\pratham.mehta\AppData\Local\Programs\Python\Python312\python.exe -m pip install [options] <archive url/path> ...

no such option: -Q


In [8]:
from transformers import BertTokenizer, BertModel
import torch

# Load the pre-trained BERT model and tokenizer
model_name = 'bert-large-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)


In [9]:
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 1024, padding_idx=0)
    (position_embeddings): Embedding(512, 1024)
    (token_type_embeddings): Embedding(2, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-23): 24 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, 

In [10]:
def embed(str):
    inputs = tokenizer(str, return_tensors='pt', max_length=512, truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    last_hidden_state = outputs.last_hidden_state
    vector = last_hidden_state[:, 0, :].squeeze().numpy()
    return vector.tolist() ## return as list (len 1024)


## Add first vector to Pinecone

In [11]:
filepath = "texts\\2024-11137.txt"
# File Description: Schedules of Controlled Substances: Rescheduling of Marijuana

# Read text file into string
s = textFileToString(filepath)
print(s[:150])

# convert string to embedding
vector = embed(s)

# check embedding is correct size
if len(vector) != 1024:
    print("Vector length invalid")


vector_id = "2024-11137"

metadata = {

    'publication_date': '2024-05-21',
    'document_type': 'Proposed Rule', 
    'document_citation': "89 FR 44597",
    'page_start': 44597,
    'page_end':44622,
    'cfr': "21 CFR 1308", 
    'document_number': "2024-11137", 

}

upsert_data = [(vector_id, vector, metadata)]

# Upsert the data to the Pinecone index

index = pc.Index(index_name)
index.upsert(upsert_data)


DEPARTMENT OF JUSTICE
Drug Enforcement Administration
21 CFR Part 1308
[Docket No. DEA–1362; A.G. Order No.
5931–2024]
Schedules of Controlled Substan


{'upserted_count': 1}

## Add Second Vector to Pinecone

In [12]:
filepath = "texts\\2024-13446.txt"
# Request for Information on Identifying and Tracking Data Related to Early Childhood Education Providers


# Read text file into string
s = textFileToString(filepath)
print(s[:150])

# convert string to embedding
vector = embed(s)

# check embedding is correct size
if len(vector) != 1024:
    print("Vector length invalid")


vector_id = "2024-13446"

metadata = {

    'publication_date': '2024-06-20',
    'document_type': 'Notice', 
    'document_citation': "89 FR 51878",
    'page_start': 51878,
    'page_end':51878,
    'document_number': "2024-13446", 
    'agency': "DEPARTMENT OF EDUCATION"

}

upsert_data = [(vector_id, vector, metadata)]

# Upsert the data to the Pinecone index

index = pc.Index(index_name)
index.upsert(upsert_data)


DEPARTMENT OF EDUCATION
[Docket ID ED–2024–OPE–0072]
Request for Information on Identifying
and Tracking Data Related to Early
Childhood Education Pro


{'upserted_count': 1}

In [13]:
filepath = "texts\\2024-11424.txt"
# Greenhouse Gas Technical Assistance Provider and Third-Party Verifier Program


# Read text file into string
s = textFileToString(filepath)
print(s[:150])

# convert string to embedding
vector = embed(s)

# check embedding is correct size
if len(vector) != 1024:
    print("Vector length invalid")


vector_id = "2024-11424"

metadata = {

    'publication_date': '2024-05-29',
    'document_type': 'Proposed Rule', 
    'document_citation': "89 FR 46335",
    'page_start': 46335,
    'page_end':46336,
    'document_number': "2024-11424", 
    'agency': "DEPARTMENT OF AGRICULTURE"

}

upsert_data = [(vector_id, vector, metadata)]

# Upsert the data to the Pinecone index

index = pc.Index(index_name)
index.upsert(upsert_data)


DEPARTMENT OF AGRICULTURE
Agricultural Marketing Service
7 CFR Part 175
[Doc. No. AMS–LP–24–0012]
RIN 0581–AE29
Greenhouse Gas Technical Assistance
Pr


{'upserted_count': 1}

## Adding 4th doc to Pinecone


Executive Order 14123 of June 14, 2024
White House Council on Supply Chain Resilience 



In [19]:
filepath = "texts\\2024-13810.txt"

# Executive Order 14123 of June 14, 2024
# White House Council on Supply Chain Resilience 



# Read text file into string
s = textFileToString(filepath)
print(s[:150])

# convert string to embedding
vector = embed(s)

# check embedding is correct size
if len(vector) != 1024:
    print("Vector length invalid")


vector_id = "2024-13810"

metadata = {

    'publication_date': '2024-06-21',
    'document_type': 'Presidential Document', 
    'document_citation': "89 FR 51949",
    'page_start': 51949,
    'page_end':51953,
    'document_number': "2024-13810", 
    'agency': "EXECUTIVE OFFICE OF THE PRESIDENT"

}

upsert_data = [(vector_id, vector, metadata)]

# Upsert the data to the Pinecone index

index = pc.Index(index_name)
index.upsert(upsert_data)


Presidential Documents
51949
Federal Register
Vol. 89, No. 120
Friday, June 21, 2024
Title 3—
The President
Executive Order 14123 of June 14, 2024
Whi


{'upserted_count': 1}

## Adding 5th vector to Pinecone

Continuation of the National Emergency With Respect to Belarus

In [18]:
filepath = "texts\\2024-13361.txt"

# Executive Order 
# Continuation of the National Emergency With Respect to Belarus



# Read text file into string
s = textFileToString(filepath)
print(s[:150])

# convert string to embedding
vector = embed(s)

# check embedding is correct size
if len(vector) != 1024:
    print("Vector length invalid")


vector_id = "2024-13361"

metadata = {

    'publication_date': '2024-06-14',
    'document_type': 'Presidential Document', 
    'document_citation': "89 FR 51197",
    'page_start': 51197,
    'page_end':51198,
    'document_number': "2024-13361", 
    'agency': "EXECUTIVE OFFICE OF THE PRESIDENT"

}

upsert_data = [(vector_id, vector, metadata)]

# Upsert the data to the Pinecone index

index = pc.Index(index_name)
index.upsert(upsert_data)


Presidential Documents
51197
Federal Register
Vol. 89, No. 116
Friday, June 14, 2024
Title 3—
The President
Notice of June 13, 2024
Continuation of th


{'upserted_count': 1}

In [16]:
filepath = "texts\\2024-10166.txt"

# Proposed Rule from PTO 
# Terminal Disclaimer Practice To Obviate Nonstatutory Double Patenting



# Read text file into string
s = textFileToString(filepath)
print(s[:150])

# convert string to embedding
vector = embed(s)

# check embedding is correct size
if len(vector) != 1024:
    print("Vector length invalid")


vector_id = "2024-10166"

metadata = {

    'publication_date': '2024-05-10',
    'document_type': 'Proposed Rule', 
    'document_citation': "89 FR 40439",
    'page_start': 40439,
    'page_end':40449,
    'document_number': "2024-10166", 
    'agency': ["DEPARTMENT OF COMMERCE", "PATENT TRADEMARK OFFICE"], 

}

upsert_data = [(vector_id, vector, metadata)]

# Upsert the data to the Pinecone index

index = pc.Index(index_name)
index.upsert(upsert_data)


Federal Register / Vol. 89, No. 92 / Friday, May 10, 2024 / Proposed Rules 40439
DEPARTMENT OF COMMERCE
Patent and Trademark Office
37 CFR Part 1
[Doc


{'upserted_count': 1}

In [17]:
## 9-11 Response and Biometric Entry-Exit Fee for H-1B and L-1 Visas

## Department of Homeland Security
## US Customs and Border Protection



filepath = "texts\\2024-12396.txt"


# Read text file into string
s = textFileToString(filepath)
print(s[:150])

# convert string to embedding
vector = embed(s)

# check embedding is correct size
if len(vector) != 1024:
    print("Vector length invalid")


vector_id = "2024-12396"

metadata = {

    'publication_date': '2024-07-08',
    'document_type': 'Proposed Rule', 
    'document_citation': "89 FR 48339",
    'page_start': 48339,
    'page_end':48348,
    'document_number': "2024-12396", 
    'agency': ["Department of Homeland Security", "U.S. Customs and Border Protection"]
    
}

upsert_data = [(vector_id, vector, metadata)]

# Upsert the data to the Pinecone index

index = pc.Index(index_name)
index.upsert(upsert_data)



This section of the FEDERAL REGISTER
contains notices to the public of the proposed
issuance of rules and regulations. The
purpose of these notices is


{'upserted_count': 1}

### Querying new ocument to find relevant documents in our database

In [24]:
print(f"Indexes: {pc.list_indexes().names()}")
print(f"Collections: {pc.list_collections().names()}")

Indexes: ['idx-one']
Collections: []


In [25]:
filepath = "texts\\2024-12240.txt"


# Read text file into string
s = textFileToString(filepath)
print(s[:150])

# convert string to embedding
query_vector = embed(s)

# check embedding is correct size
if len(vector) != 1024:
    print("Vector length invalid")


# # Print the IDs of similar documents and their distances (similarity scores)
# for result in results:
#     document_id = result.id
#     distance = result.distance
#     print(f"Document ID: {document_id}, Distance: {distance}")





47536 Federal Register / Vol. 89, No. 107 / Monday, June 3, 2024 / Notices
DEPARTMENT OF COMMERCE
International Trade Administration
[Docket No. 24053


In [29]:
index

<pinecone.data.index.Index at 0x1f7a158e2d0>

In [31]:
# Send the query to Pinecone to find similar documents
index.query(
    vector=query_vector,
    top_k=3,
    include_values=False
)

{'matches': [{'id': '2024-11137', 'score': 0.972805679, 'values': []},
             {'id': '2024-10166', 'score': 0.97222805, 'values': []},
             {'id': '2024-11424', 'score': 0.972106159, 'values': []}],
 'namespace': '',
 'usage': {'read_units': 5}}

In [32]:
# Send the query to Pinecone to find similar documents
index.query(
    vector=query_vector,
    top_k=10,
    include_values=False
)

{'matches': [{'id': '2024-11137', 'score': 0.972921729, 'values': []},
             {'id': '2024-10166', 'score': 0.97222805, 'values': []},
             {'id': '2024-11424', 'score': 0.972106159, 'values': []},
             {'id': '2024-13810', 'score': 0.959383905, 'values': []},
             {'id': '2024-13361', 'score': 0.956893742, 'values': []},
             {'id': '2024-13446', 'score': 0.944576442, 'values': []},
             {'id': '2024-12396', 'score': 0.921917617, 'values': []}],
 'namespace': '',
 'usage': {'read_units': 5}}