In [None]:
# Install Python dependencies into the kernel's own environment (%pip rather than !pip).
# The extras are quoted so the shell cannot interpret the square brackets as a glob pattern.
%pip install pinecone "unstructured[pdf]" "unstructured[local-inference]"

In [None]:
# System-level dependency, installed non-interactively (-y).
# NOTE(review): presumably needed by the PDF partitioning imported below — confirm.
!apt-get install -y poppler-utils

In [None]:
import os
import sys
import glob
import shutil
import pandas as pd
import kagglehub
from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title
from collections import Counter

# import pinecone
from pinecone import Pinecone
import numpy as np
from sentence_transformers import SentenceTransformer
from typing import List, Dict, Any, Optional
import json
from tqdm import tqdm
import hashlib
import time
import pickle

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Pinecone configuration.
# The API key is read from the PINECONE_API_KEY environment variable so the
# secret never has to be pasted into (and saved with) the notebook. The
# placeholder is kept as a fallback so the previous edit-in-place workflow
# still works.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "[your-pinecone-api-key]")
PINECONE_INDEX_NAME = "legal-contract-search"
# NOTE(review): not referenced by the cells visible in this notebook — confirm it is still needed.
PINECONE_ENVIRONMENT = "us-east-1"

# Embedding model configuration.
EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"
# Vector dimension the Pinecone index is expected to have; compared against
# the model's actual output dimension after the model is loaded.
EXPECTED_DIMENSION = 768

# Pinecone namespaces: one for per-contract metadata records, one for text chunks.
METADATA_NAMESPACE = "contract_metadata"
CHUNKS_NAMESPACE = "contract_chunks"

print("✅ Configuration set up!")
print(f"Embedding model: {EMBEDDING_MODEL}")
print(f"Pinecone index: {PINECONE_INDEX_NAME}")
print(f"Expected dimensions: {EXPECTED_DIMENSION}")

In [None]:
def connect_to_pinecone(api_key: str, index_name: str) -> tuple:
    """Create a Pinecone client and open a handle to the named index.

    The connection is verified by requesting the index statistics, which are
    printed on success.

    Args:
        api_key: Pinecone API key passed to the client constructor.
        index_name: Name of the index to open.

    Returns:
        A ``(client, index)`` tuple.

    Raises:
        Exception: Any error raised while creating the client, opening the
            index, or fetching its statistics is reported and re-raised.
    """
    try:
        client = Pinecone(api_key=api_key)
        index_handle = client.Index(index_name)

        # Fetching the stats doubles as a connectivity check.
        index_stats = index_handle.describe_index_stats()
        print("✅ Successfully connected to Pinecone!")
        print(f"Index stats: {index_stats}")
    except Exception as error:
        print(f"❌ Failed to connect to Pinecone: {error}")
        raise

    return client, index_handle

# Open the connection once at notebook level; later cells use the resulting
# `pc` client and `index` handle as globals.
pc, index = connect_to_pinecone(PINECONE_API_KEY, PINECONE_INDEX_NAME)


In [None]:
# Load the sentence-embedding model named in the configuration cell.
print("Loading embedding model...")
model = SentenceTransformer(EMBEDDING_MODEL)
print(f"✅ Loaded {EMBEDDING_MODEL}")

# Smoke test: embed one sentence and read the vector width from the result.
test_text = "This is a test sentence for embedding."
test_embedding = model.encode([test_text])
print(f"Model output shape: {test_embedding.shape}")

EMBEDDING_DIMENSION = test_embedding.shape[1]
print(f"Embedding dimension: {EMBEDDING_DIMENSION}")

# Compare the model's output width with the dimension expected for the index.
# A mismatch is only reported here; it does not stop the notebook.
if EMBEDDING_DIMENSION == EXPECTED_DIMENSION:
    print(f"✅ Model dimension matches Pinecone index: {EMBEDDING_DIMENSION}")
else:
    print(f"⚠️  Warning: Model dimension ({EMBEDDING_DIMENSION}) doesn't match Pinecone index ({EXPECTED_DIMENSION})")


In [None]:
def generate_id(text: str, prefix: str = "") -> str:
    """Return ``prefix`` followed by the MD5 hex digest of ``text``.

    The same text always yields the same ID, so two records with identical
    text share an ID.
    """
    digest = hashlib.md5(text.encode()).hexdigest()
    return prefix + digest

def chunk_list(lst: List, chunk_size: int) -> List[List]:
    """Split ``lst`` into consecutive slices of at most ``chunk_size`` items.

    The final slice may be shorter. An empty input yields an empty list.
    """
    pieces = []
    for start in range(0, len(lst), chunk_size):
        pieces.append(lst[start:start + chunk_size])
    return pieces

def clean_field_name(field_name: str) -> str:
    """Lower-case a field name and normalize its separators.

    Spaces, slashes and hyphens become underscores; ``&`` becomes ``and``.
    """
    cleaned = field_name.lower()
    # Applied in this fixed order, one replacement after another.
    for old, new in ((' ', '_'), ('/', '_'), ('-', '_'), ('&', 'and')):
        cleaned = cleaned.replace(old, new)
    return cleaned

# Confirmation message so the cell output shows the helpers above were defined.
print("✅ Utility functions defined!")

In [None]:
# (output key, source column) pairs, in the order the keys are emitted.
_CONTRACT_METADATA_COLUMNS = (
    ('filename', 'Filename'),
    ('document_name', 'Document Name'),
    ('parties', 'Parties'),
    ('agreement_date', 'Agreement Date'),
    ('effective_date', 'Effective Date'),
    ('expiration_date', 'Expiration Date'),
    ('renewal_term', 'Renewal Term'),
    ('notice_period_terminate_renewal', 'Notice Period To Terminate Renewal'),
    ('governing_law', 'Governing Law'),
    ('competitive_restriction_exception', 'Competitive Restriction Exception'),
    ('non_compete', 'Non-Compete'),
    ('exclusivity', 'Exclusivity'),
    ('no_solicit_customers', 'No-Solicit Of Customers'),
    ('no_solicit_employees', 'No-Solicit Of Employees'),
    ('non_disparagement', 'Non-Disparagement'),
    ('termination_for_convenience', 'Termination For Convenience'),
    ('rofr_rofo_rofn', 'Rofr/Rofo/Rofn'),
    ('change_of_control', 'Change Of Control'),
    ('anti_assignment', 'Anti-Assignment'),
    ('revenue_profit_sharing', 'Revenue/Profit Sharing'),
    ('minimum_commitment', 'Minimum Commitment'),
    ('volume_restriction', 'Volume Restriction'),
    ('ip_ownership_assignment', 'Ip Ownership Assignment'),
    ('joint_ip_ownership', 'Joint Ip Ownership'),
    ('license_grant', 'License Grant'),
    ('affiliate_license_licensor', 'Affiliate License-Licensor'),
    ('affiliate_license_licensee', 'Affiliate License-Licensee'),
    ('unlimited_license', 'Unlimited/All-You-Can-Eat-License'),
    ('irrevocable_perpetual_license', 'Irrevocable Or Perpetual License'),
    ('post_termination_services', 'Post-Termination Services'),
    ('audit_rights', 'Audit Rights'),
    ('uncapped_liability', 'Uncapped Liability'),
    ('cap_on_liability', 'Cap On Liability'),
    ('liquidated_damages', 'Liquidated Damages'),
    ('warranty_duration', 'Warranty Duration'),
    ('insurance', 'Insurance'),
    ('covenant_not_to_sue', 'Covenant Not To Sue'),
)


def prepare_metadata_for_pinecone(metadata_row: Dict[str, Any]) -> Dict[str, Any]:
    """Map one metadata row onto a flat, snake_case, all-string dict.

    NOTE: a later cell in this notebook defines another function with the same
    name; once that cell has run, it replaces this definition.

    Args:
        metadata_row: Mapping from source column names (e.g. ``'Document Name'``)
            to cell values.

    Returns:
        Dict with a fixed ``'type'`` of ``'contract_metadata'`` plus one string
        entry per known column. Columns that are absent, ``None``, NaN/NaT or
        the literal string ``'nan'`` are returned as ``''``.
    """

    def is_missing(value: Any) -> bool:
        if value is None:
            return True
        if isinstance(value, str):
            return value == 'nan'
        # pd.isna returns an array for list-like input, so only scalars are tested.
        return bool(pd.api.types.is_scalar(value) and pd.isna(value))

    metadata = {'type': 'contract_metadata'}
    for key, column in _CONTRACT_METADATA_COLUMNS:
        value = metadata_row.get(column, '')
        # Missing values are detected before stringifying; afterwards None/NaT
        # would already have become the strings 'None'/'NaT'.
        metadata[key] = '' if is_missing(value) else str(value)

    return metadata

def prepare_chunk_for_pinecone(chunk_data: Dict[str, Any]) -> Dict[str, Any]:
    """Build the metadata dict stored alongside a chunk's vector.

    Args:
        chunk_data: Dict with a ``'text'`` entry (the chunk text) and a
            ``'metadata'`` entry (a mapping of chunk/contract attributes).

    Returns:
        Flat dict with the chunk type, a text preview limited to 1000
        characters, the full text length, and selected attributes. Missing
        attributes (absent, ``None``, NaN, or the string ``'nan'``) become
        ``''``; a missing or unparsable page number becomes ``0``.

    Raises:
        KeyError: If ``chunk_data`` has no ``'text'`` or ``'metadata'`` entry.
    """
    metadata = chunk_data['metadata']
    text = chunk_data['text']

    def as_clean_str(value: Any) -> str:
        # Test for missing values before stringifying, otherwise None would
        # be stored as the literal 'None'.
        if value is None:
            return ''
        if not isinstance(value, str) and pd.api.types.is_scalar(value) and pd.isna(value):
            return ''
        rendered = str(value)
        return '' if rendered == 'nan' else rendered

    def as_page_number(value: Any) -> int:
        if value is None:
            return 0
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            # int() rejects NaN, infinities and non-numeric values.
            return 0

    preview = text[:1000]  # Limit text length in metadata
    if preview == 'nan':
        preview = ''

    return {
        'type': 'contract_chunk',
        'filename': as_clean_str(metadata.get('Filename', '')),
        'text': preview,
        'full_text_length': len(text),
        'element_type': as_clean_str(metadata.get('element_type', '')),
        'chunk_type': as_clean_str(metadata.get('chunk_type', '')),
        'page_number': as_page_number(metadata.get('page_number', 0)),
        'document_name': as_clean_str(metadata.get('Document Name', '')),
        'parties': as_clean_str(metadata.get('Parties', '')),
        'governing_law': as_clean_str(metadata.get('Governing Law', '')),
    }

# Confirmation message so the cell output shows the helpers above were defined.
print("✅ Data preparation functions defined!")

In [None]:
def get_metadata_size(metadata: Dict[str, Any]) -> int:
    """Return the byte length of ``metadata`` serialized as JSON and UTF-8 encoded.

    Values JSON cannot serialize natively are rendered with ``str``.
    """
    return len(json.dumps(metadata, default=str).encode('utf-8'))


def _truncate_value_to_fit(chunk: Dict[str, Any], field: str, value: Any,
                           max_size: int, marker: str = "...[TRUNCATED]") -> Optional[str]:
    """Return the longest prefix of ``str(value)`` plus ``marker`` that keeps
    ``chunk`` within ``max_size`` when stored under ``field``, or ``None`` when
    not even the bare marker fits."""
    text = str(value)

    def fits(candidate: str) -> bool:
        return get_metadata_size({**chunk, field: candidate}) <= max_size

    if not fits(marker):
        return None

    # Binary search on the prefix length. The serialized size is measured
    # rather than estimated, because JSON escaping makes bytes != characters.
    low, high = 0, len(text)
    while low < high:
        middle = (low + high + 1) // 2
        if fits(text[:middle] + marker):
            low = middle
        else:
            high = middle - 1
    return text[:low] + marker


def chunk_large_metadata(metadata: Dict[str, Any], max_size: int = 40000) -> List[Dict[str, Any]]:
    """
    Split large metadata into smaller chunks while preserving essential context.

    Metadata that already fits is returned unchanged as a one-element list
    (the same dict object, not a copy). Otherwise the essential fields are
    repeated in every chunk, the remaining fields are packed greedily in their
    original order, and each chunk is tagged with ``is_chunked``,
    ``chunk_number`` (1-based) and ``total_chunks``.

    A single field too large for a chunk of its own is cut down to the longest
    prefix that fits, suffixed with ``...[TRUNCATED]``. It is dropped when not
    even the marker fits. If the essential fields alone exceed ``max_size``,
    the returned chunk still exceeds it and the caller must handle that.

    Args:
        metadata: The metadata dictionary to potentially chunk
        max_size: Maximum size in bytes (default 40KB with some buffer)

    Returns:
        List of metadata chunks
    """
    if get_metadata_size(metadata) <= max_size:
        return [metadata]

    # Essential fields that should be in every chunk
    essential_fields = [
        'Document Name', 'Filename', 'Parties', 'Governing Law',
        'embedding_text', 'is_chunked', 'chunk_number', 'total_chunks'
    ]
    base_metadata = {k: metadata[k] for k in essential_fields if k in metadata}

    def start_chunk(number: int) -> Dict[str, Any]:
        chunk = base_metadata.copy()
        chunk['is_chunked'] = True
        chunk['chunk_number'] = number
        # Placeholder until the final count is known.
        chunk['total_chunks'] = 'TBD'
        return chunk

    chunks = []
    current_chunk = start_chunk(1)
    has_payload = False  # True once current_chunk holds a non-essential field

    remaining_fields = [k for k in metadata.keys() if k not in essential_fields]

    for field in remaining_fields:
        value = metadata[field]

        if get_metadata_size({**current_chunk, field: value}) <= max_size:
            current_chunk[field] = value
            has_payload = True
            continue

        # Close the current chunk only if it carries data, so no chunk
        # consisting solely of the essential fields is emitted.
        if has_payload:
            chunks.append(current_chunk)
            current_chunk = start_chunk(len(chunks) + 1)
            has_payload = False
            if get_metadata_size({**current_chunk, field: value}) <= max_size:
                current_chunk[field] = value
                has_payload = True
                continue

        # The field does not fit even in a fresh chunk: truncate it.
        truncated_value = _truncate_value_to_fit(current_chunk, field, value, max_size)
        if truncated_value is not None:
            current_chunk[field] = truncated_value
            has_payload = True

    # Always return at least one chunk, even when there was nothing to pack.
    if has_payload or not chunks:
        chunks.append(current_chunk)

    total_chunks = len(chunks)
    for chunk in chunks:
        chunk['total_chunks'] = total_chunks

    return chunks

def prepare_metadata_for_pinecone(metadata: Dict[str, Any]) -> Dict[str, Any]:
    """
    Prepare metadata for Pinecone, handling None values and data types.

    Keys are kept as given. Entries whose value is ``None`` or a scalar
    NaN/NaT are dropped. Dicts are JSON-encoded, strings are kept as-is, and
    every other value (numbers, booleans, lists, ...) is converted with ``str``.

    NOTE: this definition reuses the name of a function defined in an earlier
    cell of this notebook and replaces it once this cell has run.

    Args:
        metadata: Raw metadata mapping, e.g. one DataFrame row as a dict.

    Returns:
        New dict containing only string values.
    """
    clean_metadata = {}

    for key, value in metadata.items():
        if value is None:
            continue
        if isinstance(value, dict):
            clean_metadata[key] = json.dumps(value)
            continue
        # pd.isna returns an array for list-like values, which cannot be used
        # in a boolean test; only scalars are checked for missingness.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            continue
        clean_metadata[key] = value if isinstance(value, str) else str(value)

    return clean_metadata

def generate_chunked_id(original_text: str, prefix: str = "meta_", chunk_num: int = 1) -> str:
    """Build an ID of the form ``<prefix><12 hex chars>_chunk_<chunk_num>``.

    The hex part is the first 12 characters of the MD5 digest of
    ``original_text``, so all chunks of one text share the same stem.
    """
    short_digest = hashlib.md5(original_text.encode()).hexdigest()[:12]
    return "{}{}_chunk_{}".format(prefix, short_digest, chunk_num)

# Columns whose value 'yes' (case-insensitive) marks a clause as present in a contract.
_CLAUSE_FLAG_FIELDS = ['Non-Compete', 'Exclusivity', 'No-Solicit Of Customers',
                       'No-Solicit Of Employees', 'Termination For Convenience',
                       'Anti-Assignment', 'Revenue/Profit Sharing', 'Audit Rights',
                       'Uncapped Liability', 'Cap On Liability', 'Insurance']


def _metadata_embedding_text(row: pd.Series) -> str:
    """Build the sentence that is embedded for one metadata row."""
    text_parts = []

    if pd.notna(row.get('Document Name')):
        text_parts.append(f"Document: {row['Document Name']}")
    if pd.notna(row.get('Parties')):
        text_parts.append(f"Parties: {row['Parties']}")
    if pd.notna(row.get('Governing Law')):
        text_parts.append(f"Governing Law: {row['Governing Law']}")

    active_clauses = [field.replace('-', ' ').replace('/', ' ') for field in _CLAUSE_FLAG_FIELDS
                      if field in row and str(row[field]).lower() == 'yes']
    if active_clauses:
        text_parts.append(f"Contract includes: {', '.join(active_clauses)}")

    if text_parts:
        return '. '.join(text_parts)
    return f"Contract metadata for {row.get('Filename', 'unknown')}"


def ingest_metadata_to_pinecone(metadata_df: pd.DataFrame,
                               index,
                               model: SentenceTransformer,
                               namespace: str = METADATA_NAMESPACE,
                               batch_size: int = 100) -> int:
    """Ingest metadata to Pinecone with embeddings, handling large metadata by chunking.

    For every row a short descriptive sentence is embedded. The row's metadata
    is split with ``chunk_large_metadata`` and each resulting chunk is upserted
    as its own vector carrying the row's embedding. Failed batches are reported
    and skipped rather than aborting the run.

    Args:
        metadata_df: One row per contract.
        index: Pinecone index handle exposing ``upsert``.
        model: Embedding model exposing ``encode``.
        namespace: Pinecone namespace to write to.
        batch_size: Number of vectors sent per upsert call.

    Returns:
        Number of vectors sent in successful upsert calls. Vectors sharing an
        ID overwrite each other, so the number of distinct stored vectors can
        be lower; collisions are reported.
    """
    # Upper bound enforced just before upserting; vectors above it are skipped.
    max_metadata_bytes = 40960

    print(f"Starting metadata ingestion for {len(metadata_df)} rows...")

    all_vectors = []
    chunked_records = 0
    seen_ids = set()
    repeated_ids = 0

    for _, row in tqdm(metadata_df.iterrows(), total=len(metadata_df), desc="Preparing metadata"):
        metadata_text = _metadata_embedding_text(row)
        # Every chunk of a row shares one embedding; convert it once.
        embedding_values = model.encode([metadata_text])[0].tolist()

        initial_metadata = prepare_metadata_for_pinecone(row.to_dict())
        initial_metadata['embedding_text'] = metadata_text

        metadata_chunks = chunk_large_metadata(initial_metadata)

        if len(metadata_chunks) > 1:
            chunked_records += 1
            print(f"Chunked large metadata for {row.get('Filename', 'unknown')} into {len(metadata_chunks)} chunks")

        for chunk_idx, metadata_chunk in enumerate(metadata_chunks, 1):
            # The ID depends only on the embedded text, so rows producing the
            # same text share an ID.
            vector_id = generate_chunked_id(metadata_text, "meta_", chunk_idx)
            if vector_id in seen_ids:
                repeated_ids += 1
            seen_ids.add(vector_id)

            all_vectors.append({
                'id': vector_id,
                'values': embedding_values,
                'metadata': metadata_chunk
            })

    print(f"Total vectors to upsert: {len(all_vectors)}")
    print(f"Records that required chunking: {chunked_records}")
    if repeated_ids:
        print(f"⚠️  {repeated_ids} vector IDs were generated more than once; "
              f"later records overwrite earlier ones with the same ID")

    print("Upserting metadata to Pinecone...")
    total_upserted = 0
    failed_batches = 0
    skipped_oversized = 0

    for batch in tqdm(chunk_list(all_vectors, batch_size), desc="Upserting batches"):
        try:
            valid_vectors = []
            for vector in batch:
                metadata_size = get_metadata_size(vector['metadata'])
                if metadata_size > max_metadata_bytes:
                    print(f"Warning: Vector {vector['id']} still exceeds size limit ({metadata_size} bytes)")
                    skipped_oversized += 1
                    continue
                valid_vectors.append(vector)

            if valid_vectors:
                index.upsert(vectors=valid_vectors, namespace=namespace)
                total_upserted += len(valid_vectors)
                time.sleep(0.1)  # Small delay to avoid rate limits

        except Exception as e:
            # Best effort: report the failed batch and carry on with the next.
            print(f"Error upserting batch: {e}")
            failed_batches += 1
            continue

    print(f"✅ Successfully ingested {total_upserted} metadata records!")
    print(f"❌ Failed batches: {failed_batches}")
    if skipped_oversized:
        print(f"Skipped {skipped_oversized} vectors whose metadata exceeded {max_metadata_bytes} bytes")
    return total_upserted

In [None]:
# Load the contract metadata spreadsheet (one row per contract) into a DataFrame.
# The path is relative to the notebook's working directory.
print("Loading metadata from Excel file...")
metadata_df = pd.read_excel("master_clauses_cleaned-modified.xlsx")
print(f"Loaded {len(metadata_df)} metadata rows")

In [None]:
# Ingest metadata: embed each row and upsert it using the function's default
# namespace and batch size.
ingested_count = ingest_metadata_to_pinecone(metadata_df, index, model)
print(f"📊 Ingested {ingested_count} metadata records to Pinecone")

In [None]:
def ingest_chunks_to_pinecone(chunks_data: List[Dict[str, Any]],
                            index,
                            model: SentenceTransformer,
                            namespace: str = CHUNKS_NAMESPACE,
                            batch_size: int = 100) -> int:
    """Ingest chunks to Pinecone with embeddings.

    Chunks are processed in batches of ``batch_size``. Each batch is prepared,
    embedded with a single ``model.encode`` call and upserted. Chunks that
    cannot be prepared and batches that fail to embed or upsert are reported
    and skipped rather than aborting the run.

    Args:
        chunks_data: Dicts with a ``'text'`` entry and a ``'metadata'`` entry.
        index: Pinecone index handle exposing ``upsert``.
        model: Embedding model exposing ``encode``.
        namespace: Pinecone namespace to write to.
        batch_size: Number of chunks embedded and upserted per round trip.

    Returns:
        Number of vectors sent in successful upsert calls. IDs are derived from
        the chunk text alone, so chunks with identical text overwrite each
        other; such repeats are counted and reported at the end.
    """

    print(f"Starting chunks ingestion for {len(chunks_data)} chunks...")

    total_upserted = 0
    seen_ids = set()
    repeated_ids = 0

    # Process in batches
    for i in tqdm(range(0, len(chunks_data), batch_size), desc="Processing chunks"):
        batch_chunks = chunks_data[i:i + batch_size]

        # Prepare IDs and metadata per chunk so one malformed chunk does not
        # discard the rest of the batch.
        prepared = []
        for chunk in batch_chunks:
            try:
                text = chunk['text']
                vector_id = generate_id(text, "chunk_")
                prepared.append((vector_id, text, prepare_chunk_for_pinecone(chunk)))
            except Exception as e:
                print(f"Error processing chunk: {e}")
                continue

        # Embed the whole batch in one call instead of one call per chunk.
        # Texts that reach this point already passed the per-chunk preparation.
        batch_vectors = []
        if prepared:
            try:
                embeddings = model.encode([text for _, text, _ in prepared])
            except Exception as e:
                print(f"Error embedding chunk batch: {e}")
                embeddings = []

            for (vector_id, _, chunk_metadata), embedding in zip(prepared, embeddings):
                if vector_id in seen_ids:
                    repeated_ids += 1
                seen_ids.add(vector_id)
                batch_vectors.append({
                    'id': vector_id,
                    'values': embedding.tolist(),
                    'metadata': chunk_metadata
                })

        # Upsert batch
        if batch_vectors:
            try:
                index.upsert(vectors=batch_vectors, namespace=namespace)
                total_upserted += len(batch_vectors)
                time.sleep(0.1)  # Small delay to avoid rate limits
            except Exception as e:
                print(f"Error upserting chunk batch: {e}")
                continue

        # Progress update each time another 1000 chunks have been processed,
        # whatever the batch size.
        processed = i + len(batch_chunks)
        if processed // 1000 > i // 1000:
            print(f"\nUpserted {total_upserted} chunks so far...")

    if repeated_ids:
        print(f"⚠️  {repeated_ids} chunks share an ID with an earlier chunk (identical text); "
              f"later vectors overwrite earlier ones")

    print(f"✅ Successfully ingested {total_upserted} chunks!")
    return total_upserted

In [None]:
# Load the previously saved chunks (each with its metadata) from disk.
# NOTE(security): unpickling executes code embedded in the file — only load
# pickle files you created yourself or otherwise trust.
# The context manager closes the file handle once loading is done.
with open('/content/chunks_with_metadata.pkl', 'rb') as chunks_file:
    chunks_with_metadata = pickle.load(chunks_file)

print(f"Ready to ingest {len(chunks_with_metadata)} chunks...")
print("Note: This will take some time depending on the number of chunks")

# Ingest chunks
ingested_chunks = ingest_chunks_to_pinecone(chunks_with_metadata, index, model)
print(f"📊 Ingested {ingested_chunks} chunks to Pinecone")