## Connecting to Pinecone

In [1]:
%pip install pinecone-client -quiet
%pip install python-dotenv -quiet


Note: you may need to restart the kernel to use updated packages.



Usage:   
  c:\Program Files\Python312\python.exe -m pip install [options] <requirement specifier> [package-index-options] ...
  c:\Program Files\Python312\python.exe -m pip install [options] -r <requirements file> [package-index-options] ...
  c:\Program Files\Python312\python.exe -m pip install [options] [-e] <vcs project url> ...
  c:\Program Files\Python312\python.exe -m pip install [options] [-e] <local project path> ...
  c:\Program Files\Python312\python.exe -m pip install [options] <archive url/path> ...

no such option: -u


Note: you may need to restart the kernel to use updated packages.



Usage:   
  c:\Program Files\Python312\python.exe -m pip install [options] <requirement specifier> [package-index-options] ...
  c:\Program Files\Python312\python.exe -m pip install [options] -r <requirements file> [package-index-options] ...
  c:\Program Files\Python312\python.exe -m pip install [options] [-e] <vcs project url> ...
  c:\Program Files\Python312\python.exe -m pip install [options] [-e] <local project path> ...
  c:\Program Files\Python312\python.exe -m pip install [options] <archive url/path> ...

no such option: -u


In [3]:
## Load API key from .env


from dotenv import load_dotenv
import os

## Error Handling for API key retreival
try: 
            
    load_dotenv()

    PC_KEY = os.getenv('PINECONE_API_KEY')
    print(PC_KEY)

    if not PC_KEY:
        raise ValueError("PINECONE_API_KEY not found in .env file")


except Exception as e:
    print(f"Error: {e}")



623494db-40e1-44ee-9890-26f24e1dd55b


In [4]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=PC_KEY)


## Create Index 

In [5]:
## View current indexes in Pinecone 
 
print(f"Indexes: {pc.list_indexes().names()}")
print(f"Collections: {pc.list_collections().names()}")

Indexes: ['idx-one']
Collections: []


In [8]:
## This POC currently only uses 1 index. As we scale up our data and decide how we can partition different legal documents, we can scale horizontally. 
## In this demo, there are only a handful of documents, enabling efficiency with a single index


index_name = "idx-one"

## Embedding model is [BERT large model (uncased)], which outputs vectors of [1024] dimensions
## Cosine similarity so search is not skewed by magnitude

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1024,
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws', 
            region='us-east-1'
        ) 
    ) 
else: 
    print(f'Error: Could not create index. Index with name "{index_name}" already exists. ')

Error: Could not create index. Index with name "idx-one" already exists. 


## Converting text file to string 

In [20]:
def textFileToString(filepath):

    with open(filepath, 'r', encoding='utf-8') as file:
        str = file.read()

    return str






## Converting Strings to Embeddings 

In [9]:
%pip install transformers torch -Q

Note: you may need to restart the kernel to use updated packages.



Usage:   
  c:\Program Files\Python312\python.exe -m pip install [options] <requirement specifier> [package-index-options] ...
  c:\Program Files\Python312\python.exe -m pip install [options] -r <requirements file> [package-index-options] ...
  c:\Program Files\Python312\python.exe -m pip install [options] [-e] <vcs project url> ...
  c:\Program Files\Python312\python.exe -m pip install [options] [-e] <local project path> ...
  c:\Program Files\Python312\python.exe -m pip install [options] <archive url/path> ...

no such option: -Q


In [10]:
from transformers import BertTokenizer, BertModel
import torch

# Load the pre-trained BERT model and tokenizer
model_name = 'bert-large-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)


In [11]:
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 1024, padding_idx=0)
    (position_embeddings): Embedding(512, 1024)
    (token_type_embeddings): Embedding(2, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-23): 24 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, 

In [12]:
def embed(str):
    inputs = tokenizer(str, return_tensors='pt', max_length=512, truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    last_hidden_state = outputs.last_hidden_state
    vector = last_hidden_state[:, 0, :].squeeze().numpy()
    return vector.tolist() ## return as list (len 1024)


## Add first vector to Pinecone

In [22]:
filepath = "texts\2024-11137.txt"

s = textFileToString(filepath)


# def addVectorToPinecone(filepath):
#     file_content = textFileToString(filepath = filepath)
#     print(file_content[:150])
#     vector = embed(file_content)
#     print(len(vector))
#     vector_id = filepath[6:]
#     print(vector_id)

# addVectorToPinecone(filepath=filepath)
    



FileNotFoundError: [Errno 2] No such file or directory: 'texts\x824-11137.txt'