## Load in Example file 

In [1]:
file_path = 'marijuana.txt'

def textFileToString(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        str = file.read()
    return str

file_content = textFileToString(file_path=file_path)
print(file_content)




DEPARTMENT OF JUSTICE
Drug Enforcement Administration
21 CFR Part 1308
[Docket No. DEA–1362; A.G. Order No.
5931–2024]
Schedules of Controlled Substances:
Rescheduling of Marijuana
AGENCY: Drug Enforcement
Administration, Department of Justice.
ACTION: Notice of proposed rulemaking.
SUMMARY: The Department of Justice
(‘‘DOJ’’) proposes to transfer marijuana
from schedule I of the Controlled
Substances Act (‘‘CSA’’) to schedule III
of the CSA, consistent with the view of
the Department of Health and Human
Services (‘‘HHS’’) that marijuana has a
currently accepted medical use as well
as HHS’s views about marijuana’s abuse
potential and level of physical or
psychological dependence. The CSA
requires that such actions be made
through formal rulemaking on the
record after opportunity for a hearing. If
the transfer to schedule III is finalized,
the regulatory controls applicable to
schedule III controlled substances
would apply, as appropriate, along with
existing marijuana-specific
requirem

## Connecting to Pinecone

In [2]:
%pip install pinecone-client
%pip install python-dotenv


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
## Load API key from .env


from dotenv import load_dotenv
import os

## Error Handling for API key retreival
try: 
            
    load_dotenv()

    PC_KEY = os.getenv('PINECONE_API_KEY')
    print(PC_KEY)

    if not PC_KEY:
        raise ValueError("PINECONE_API_KEY not found in .env file")


except Exception as e:
    print(f"Error: {e}")



623494db-40e1-44ee-9890-26f24e1dd55b


In [4]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=PC_KEY)


  from tqdm.autonotebook import tqdm


## Create Index 

In [5]:
print(f"Indexes: {pc.list_indexes().names()}")
print(f"Collections: {pc.list_collections().names()}")

Indexes: ['idx-one']
Collections: []


In [6]:
## This POC currently only uses 1 index. As we scale up our data and decide how we can partition different legal documents, we can scale horizontally. 
## In this demo, there are only a handful of documents, enabling efficiency with a single index


index_name = "idx-one"

## Embedding model is [BERT large model (uncased)], which outputs vectors of [1024] dimensions
## Cosine similarity so search is not skewed by magnitude

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1024,
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws', 
            region='us-east-1'
        ) 
    ) 
else:
    print(f'Error creating index "{index_name}". Index with the name "{index_name}" already exists.')

Error creating index "idx-one". Index with the name "idx-one" already exists.


In [7]:
print(f"Indexes: {pc.list_indexes().names()}") # 'idx-one' added successfully
print(f"Collections: {pc.list_collections().names()}")

Indexes: ['idx-one']
Collections: []


## Converting Text to Embeddings

In [8]:
%pip install transformers torch 




[notice] A new release of pip is available: 24.0 -> 24.1
[notice] To update, run: python.exe -m pip install --upgrade pip





In [9]:
from transformers import BertTokenizer, BertModel
import torch

# Load the pre-trained BERT model and tokenizer
model_name = 'bert-large-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

In [10]:

def strToVector(file_content):
    inputs = tokenizer(file_content, return_tensors='pt', max_length=512, truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    last_hidden_state = outputs.last_hidden_state
    vector = last_hidden_state[:, 0, :].squeeze().numpy()
    return vector



In [11]:
vector = strToVector(file_content) 

print(vector)


[ 0.16149496 -0.5803626  -0.04409358 ... -0.73962927 -0.66389704
  0.4203385 ]


In [12]:
print(vector.shape)

(1024,)


## Add vector to Pinecone


TODO: Insert above vector with metadata\
TODO: repeat process for several more vectors, potentially streanline via api

In [13]:
doc_one_metadata =  {
    'publication_date': '2024-05-21',
    'document_type': 'Proposed Rule', 
    'document_citation': "89 FR 44597",
    'page_start': 44597,
    'page_end':44622,
    'cfr': "21 CFR 1308", 
    'document_number': "2024-11137", 
}

In [14]:
print(f"First 100 chars of file: {file_content[0:100]}")

print(f"Last 100 chars of file: {file_content[-100:]}")

First 100 chars of file: DEPARTMENT OF JUSTICE
Drug Enforcement Administration
21 CFR Part 1308
[Docket No. DEA–1362; A.G. Or
Last 100 chars of file: k B. Garland,
Attorney General.
[FR Doc. 2024–11137 Filed 5–17–24; 11:15 am]
BILLING CODE 4410–09–P 


In [15]:
# Unique ID for the vector
vector_id = 'marijuana-doc-1'

# Prepare the upsert data
upsert_data = [(vector_id, vector.tolist(), doc_one_metadata)]

# Upsert the data to the Pinecone index

index = pc.Index(index_name)
index.upsert(upsert_data)


{'upserted_count': 1}

In [16]:
response = index.fetch(ids=[vector_id])
print(response)


{'namespace': '', 'usage': {'read_units': 1}, 'vectors': {}}


In [17]:
file_two = "hhs.txt"
doc_two_text = textFileToString(file_two)

v = strToVector(doc_two_text)

doc_two_metadata =  {
    'publication_date': '2024-03-26',
    'document_type': 'Proposed Rule', 
    'document_citation': "89 FR 20882",
    'page_start': 20882,
    'page_end':20897,
    'cfr': "21 CFR 882",  ## doc has two cfrs, I added one for simplicity
    'document_number': "2024-06037"
}


# Unique ID for the vector
vector_id = 'hhs_doc'

# Prepare the upsert data
upsert_data = [(vector_id, v.tolist(), doc_two_metadata)]

# Upsert the data to the Pinecone index

index = pc.Index(index_name)
index.upsert(upsert_data)



{'upserted_count': 1}

## PDF Reccomending

Pinecone has two documents; one about marijuana, and one about the FDA and the Department of HHS

Given another document about marijuana, the returned vector should be the the marijuana document first entered into Pinecone.  

In [31]:
file_three_path = "demo.txt"
doc_three_text = textFileToString(file_three_path)



demo_vec = strToVector(doc_three_text)


demo_metadata =  {
    'publication_date': '2023-12-86',
    'document_type': 'Presidential Document', 
    'document_citation': "88 FR 90083",
    'page_start': 90083,
    'page_end':90084,
    'document_number': "2023-28805"
}


# Unique ID for the vector
vector_id = 'demo-doc'



In [None]:
print(doc_three_text)

In [34]:
query_vector = demo_vec.tolist()

# Perform a query to find similar vectors
index.query(    
    vector=query_vector, 
    top_k=2,
)


{'matches': [{'id': 'hhs_doc', 'score': 0.0361101, 'values': []},
             {'id': 'marijuana-doc-1', 'score': 0.0221999604, 'values': []}],
 'namespace': '',
 'usage': {'read_units': 5}}

In [40]:
marijuana_two_file_path = "marijuana_two.txt"
str = textFileToString(marijuana_two_file_path)
print(str[:100])

Federal Register / Vol. 89, No. 22 / Thursday, February 1, 2024 / Proposed Rules 6455 

DEPARTMENT O


In [41]:
query_vector = strToVector(str).tolist()

# Perform a query to find similar vectors
index.query(    
    vector=query_vector, 
    top_k=2,
)


{'matches': [{'id': 'hhs_doc', 'score': 0.971591, 'values': []},
             {'id': 'marijuana-doc-1', 'score': 0.958133221, 'values': []}],
 'namespace': '',
 'usage': {'read_units': 5}}