## Load in Example file 

In [6]:
# Location of the sample document that is embedded later in this notebook.
file_path = 'marijuana.txt'

# Read the whole document into a single UTF-8 decoded string.
with open(file_path, mode='r', encoding='utf-8') as source_file:
    file_content = source_file.read()

# Report the character count as a quick check that the read succeeded.
print(len(file_content))


190425


## Connecting to Pinecone

In [1]:
# Install the Pinecone client and python-dotenv; both are imported by the cells that follow.
%pip install pinecone-client
%pip install python-dotenv


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
## Load API key from .env

from dotenv import load_dotenv
import os

# Load variables from a local .env file into the process environment.
load_dotenv()

PC_KEY = os.getenv('PINECONE_API_KEY')

# Fail fast here rather than letting a missing key surface later, when the
# Pinecone client is constructed with an empty value.
if not PC_KEY:
    raise ValueError("PINECONE_API_KEY not found in .env file")

# Confirm success without echoing the secret: anything printed here is stored
# in the notebook's saved output.
print("PINECONE_API_KEY loaded")



[REDACTED - API key removed from saved output]


In [4]:
from pinecone import Pinecone, ServerlessSpec

# Pinecone client authenticated with PC_KEY; the index and collection calls in later cells go through it.
pc = Pinecone(api_key=PC_KEY)


## Create Index 

In [8]:
# Show which indexes and collections already exist before creating anything.
index_names = pc.list_indexes().names()
print(f"Indexes: {index_names}")

collection_names = pc.list_collections().names()
print(f"Collections: {collection_names}")

Indexes: []
Collections: []


In [9]:
# This proof of concept uses a single index: with only a handful of documents,
# one index is efficient enough. As the data grows and we decide how to
# partition the different legal documents, we can scale horizontally.
index_name = "idx-one"

# The embedding model is BERT large (uncased), which outputs 1024-dimensional
# vectors. Cosine similarity keeps the search from being skewed by magnitude.
existing_index_names = pc.list_indexes().names()

# Only create the index when it is missing, so this cell can be re-run safely.
if index_name not in existing_index_names:
    serverless_spec = ServerlessSpec(cloud='aws', region='us-east-1')
    pc.create_index(
        name=index_name,
        dimension=1024,
        metric="cosine",
        spec=serverless_spec,
    )


In [10]:
# List indexes and collections again; 'idx-one' should now appear among the indexes.
index_names = pc.list_indexes().names()
print(f"Indexes: {index_names}")

collection_names = pc.list_collections().names()
print(f"Collections: {collection_names}")

Indexes: ['idx-one']
Collections: []


## Converting Text to Embeddings

In [11]:
# Install Hugging Face Transformers and PyTorch, which the embedding cells import.
%pip install transformers torch 

Defaulting to user installation because normal site-packages is not writeable
Collecting transformers
  Downloading transformers-4.41.2-py3-none-any.whl.metadata (43 kB)
     ---------------------------------------- 0.0/43.8 kB ? eta -:--:--
     ---------------------------------------- 43.8/43.8 kB ? eta 0:00:00
Collecting torch
  Downloading torch-2.3.1-cp312-cp312-win_amd64.whl.metadata (26 kB)
Collecting filelock (from transformers)
  Downloading filelock-3.15.4-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.23.0 (from transformers)
  Downloading huggingface_hub-0.23.4-py3-none-any.whl.metadata (12 kB)
Collecting pyyaml>=5.1 (from transformers)
  Downloading PyYAML-6.0.1-cp312-cp312-win_amd64.whl.metadata (2.1 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.5.15-cp312-cp312-win_amd64.whl.metadata (41 kB)
     ---------------------------------------- 0.0/42.0 kB ? eta -:--:--
     ---------------------------------------- 42.0/42.0

In [12]:
from transformers import BertTokenizer, BertModel
import torch

# Load the pre-trained BERT model and tokenizer.
# Both are loaded from the same checkpoint name; `tokenizer` and `model` are
# used by the embedding cell that follows.
model_name = 'bert-large-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [13]:
# Tokenize the document. truncation=True with max_length=512 caps the encoded
# input at 512 tokens, so text beyond that point does not contribute to the
# embedding.
inputs = tokenizer(
    file_content,
    return_tensors='pt',
    max_length=512,
    truncation=True,
    padding=True,
)

# Forward pass with gradient tracking disabled (inference only).
with torch.no_grad():
    outputs = model(**inputs)

# Take the hidden state at token position 0 as the document embedding, then
# squeeze out the size-1 dimensions and convert it to a NumPy array.
last_hidden_state = outputs.last_hidden_state
first_token_state = last_hidden_state[:, 0, :]
vector = first_token_state.squeeze().numpy()


In [14]:
# Quick look at the embedding values; NumPy abbreviates long arrays when printing.
print(vector)

[ 0.16149496 -0.5803626  -0.04409358 ... -0.73962927 -0.66389704
  0.4203385 ]


In [18]:
# Check that the embedding length matches the 1024 dimensions the index was created with.
print(vector.shape)

(1024,)


## Add vector to Pinecone


TODO: Insert the above vector with its metadata\
TODO: Repeat the process for several more vectors, potentially streamlining it via the API

In [19]:
# Metadata stored alongside the first document's vector when it is upserted.
doc_one_metadata = dict(
    publication_date='2024-05-21',
    document_type='Proposed Rule',
    document_citation="89 FR 44597",
    page_start=44597,
    page_end=44622,
    cfr="21 CFR 1308",
    document_number="2024-11137",
)

In [21]:
# Preview only the beginning of the document; echoing the full text would
# flood the notebook output and bloat the saved file.
file_content[:500]



In [23]:
# Identifier under which this document's vector is stored.
vector_id = 'example-doc-1'

# Handle to the target index.
index = pc.Index(index_name)

# A single (id, values, metadata) record; the NumPy vector is converted to a plain list.
upsert_data = [(vector_id, vector.tolist(), doc_one_metadata)]

# Write the record; keeping the call as the last expression displays the response.
index.upsert(vectors=upsert_data)


{'upserted_count': 1}

In [24]:
# Read the record back by ID to check that the upsert stored its values and metadata.
response = index.fetch(ids=[vector_id])
print(response)


{'namespace': '',
 'usage': {'read_units': 1},
 'vectors': {'example-doc-1': {'id': 'example-doc-1',
                               'metadata': {'cfr': '21 CFR 1308',
                                            'document_citation': '89 FR 44597',
                                            'document_number': '2024-11137',
                                            'document_type': 'Proposed Rule',
                                            'page_end': 44622.0,
                                            'page_start': 44597.0,
                                            'publication_date': '2024-05-21'},
                               'values': [0.161494955,
                                          -0.580362618,
                                          -0.0440935753,
                                          -0.239859223,
                                          0.0751856565,
                                          -0.0476518236,
                                          -0.04627