In [1]:
import os
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv

# Load environment variables from the .env file
load_dotenv()

# Fetch the API key from environment variables. A variable that is set but
# blank (e.g. `PINECONE_API_KEY=` in .env) is treated like a missing one so the
# problem is reported here rather than at the first Pinecone request.
api_key = (os.getenv("PINECONE_API_KEY") or "").strip()

if not api_key:
    raise ValueError("API key not found. Please set the PINECONE_API_KEY environment variable.")

In [2]:
# Initialize Pinecone with the API key
# `api_key` is the value read from PINECONE_API_KEY in the first cell; the
# resulting client `pc` is what later cells use to create and open the index.
pc = Pinecone(api_key=api_key)

In [4]:
# Create the serverless index only when it does not exist yet. Creating an
# index whose name is already taken fails, which would break re-running this
# cell (or Restart & Run All) after the first successful run.
index_name = "il-legal"

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # embedding size of all-MiniLM-L6-v2; must match the embedding model
        metric="cosine",  # similarity metric used when the index is queried
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

In [5]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_path: Path of the PDF file, handed to ``fitz.open``.

    Returns:
        The ``page.get_text()`` result of each page, concatenated in page
        order with nothing inserted between pages.
    """
    with fitz.open(pdf_path) as document:
        page_texts = [page.get_text() for page in document]
    return "".join(page_texts)

# Source documents, given as paths relative to the notebook's working directory
pdf_files = [
    "../ilcs/720_ILCS_CRIMINAL_OFFENSES.pdf",
    "../ilcs/725_ILCS_CRIMINAL_PROCEDURE.pdf",
    "../ilcs/Illinois Safe-T Act Full Text.pdf",
]

# Map each PDF path to the full text extracted from it
texts = {
    pdf_path: extract_text_from_pdf(pdf_path)
    for pdf_path in pdf_files
}

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Separators handed to RecursiveCharacterTextSplitter, listed from the coarsest
# boundary (headings) down to the finest (single characters).
# NOTE(review): the heading and horizontal-rule entries use regex syntax
# (`{1,6}`, `\*`, `+`); the splitter presumably only treats separators as
# patterns when it is built with is_separator_regex=True -- verify at the call site.
MARKDOWN_SEPARATORS = [
    "\n#{1,6} ",  # heading markers: one to six '#' followed by a space
    "```\n",  # code fence
    "\n\\*\\*\\*+\n",  # horizontal rule made of asterisks
    "\n---+\n",  # horizontal rule made of dashes
    "\n___+\n",  # horizontal rule made of underscores
    "\n\n",  # blank line
    "\n",  # line break
    " ",  # single space
    "",  # empty string: last entry in the list
]

def chunk_text(text, chunk_size=1000, chunk_overlap=100):
    """Split ``text`` into overlapping chunks using ``MARKDOWN_SEPARATORS``.

    Args:
        text: The string to split.
        chunk_size: Maximum chunk length passed to the splitter (default 1000).
        chunk_overlap: Overlap between consecutive chunks passed to the
            splitter (default 100).

    Returns:
        The list of chunk strings produced by ``split_text``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # NOTE(review): add_start_index appears to only affect Document metadata
        # from create_documents, so it likely has no effect on split_text -- confirm.
        add_start_index=True,
        strip_whitespace=True,
        separators=MARKDOWN_SEPARATORS,
        # Several separators are regular expressions (e.g. "\n#{1,6} ",
        # "\n---+\n"). Without this flag they are matched as escaped literal
        # strings and never fire.
        is_separator_regex=True,
    )
    return text_splitter.split_text(text)

# Map each PDF path to the list of chunks produced from its extracted text
chunked_texts = {
    pdf_path: chunk_text(texts[pdf_path])
    for pdf_path in pdf_files
}

In [7]:
from transformers import AutoTokenizer, AutoModel
import torch

# Tokenizer and encoder are loaded from the same Hugging Face checkpoint
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
model = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME)

def embed_text(text):
    """Embed ``text`` by mean-pooling the model's token embeddings.

    The mean is weighted by the tokenizer's attention mask, so padding
    positions (present whenever the tokenizer pads, e.g. for a list of texts
    of different lengths) do not dilute the result. For a single string no
    padding is added and the result equals the plain mean over all tokens.

    Args:
        text: The input handed to ``tokenizer``; inputs longer than 512 tokens
            are truncated.

    Returns:
        The pooled embedding as a NumPy array with singleton dimensions
        squeezed out (a 1-D vector for a single input string).
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        token_embeddings = model(**inputs).last_hidden_state
        # Broadcast the mask over the hidden dimension: 1.0 for real tokens, 0.0 for padding
        mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
        summed = (token_embeddings * mask).sum(dim=1)
        # clamp guards against division by zero for an all-padding row
        token_counts = mask.sum(dim=1).clamp(min=1e-9)
        embeddings = summed / token_counts
    return embeddings.squeeze().numpy()

# Compute one embedding per chunk, keeping the vectors grouped by source PDF
embeddings = {
    source_pdf: [embed_text(piece) for piece in pieces]
    for source_pdf, pieces in chunked_texts.items()
}

In [12]:
# Define index name
index_name = "il-legal"
index = pc.Index(index_name)

# Batch size for upserts
batch_size = 100  # Adjust based on your data size and limit

# Prepare and insert vectors into Pinecone.
# Each record carries its source path, chunk position and chunk text as
# metadata; without it a query hit would return only an ID and the matching
# passage could not be recovered from the index.
for pdf, pdf_vectors in embeddings.items():
    pdf_chunks = chunked_texts[pdf]
    # Guard against stale kernel state where chunks and vectors come from different runs
    if len(pdf_chunks) != len(pdf_vectors):
        raise ValueError(
            f"{pdf}: {len(pdf_chunks)} chunks but {len(pdf_vectors)} embeddings; "
            "re-run the chunking and embedding cells."
        )

    vectors = [
        {
            "id": f"{pdf}_{i}",
            "values": vector.tolist(),
            "metadata": {"source": pdf, "chunk_index": i, "text": chunk},
        }
        for i, (vector, chunk) in enumerate(zip(pdf_vectors, pdf_chunks))
    ]

    # Batch upserts
    for start in range(0, len(vectors), batch_size):
        index.upsert(vectors=vectors[start:start + batch_size])

print("Embeddings inserted into Pinecone.")

Embeddings inserted into Pinecone.
