In [None]:
# Final version: removes duplicate indices (after chunking) and duplicate sentences in the document (via set()).

import re
from typing import Any, Dict, List, Set, Tuple

import numpy as np
import requests

def get_embeddings_via_api(sentence: str, timeout: float = 30.0) -> List[float]:
    """Fetch the embedding vector of one sentence from the embedding service.

    Posts the sentence to the ``/embeddings`` endpoint, requesting the
    ``sentence-transformers/all-MiniLM-L6-v2`` model, and returns the
    ``embedding`` field of the first item in the response's ``data`` list.

    Args:
        sentence: Text to embed.
        timeout: Seconds to wait for the service before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled server.

    Returns:
        The embedding as returned by the service.

    Raises:
        RuntimeError: If the service answers with a non-200 status code.
        requests.exceptions.RequestException: On connection errors or timeout.
    """
    # NOTE(review): 0.0.0.0 as a *destination* address only works on some
    # platforms; presumably the service runs on the same host -- confirm.
    response = requests.post(
        "http://0.0.0.0:5002/embeddings",  # Endpoint of your embedding API
        json={"model": "sentence-transformers/all-MiniLM-L6-v2", "input": [sentence]},
        timeout=timeout,
    )
    if response.status_code != 200:
        raise RuntimeError(f"Error in API call: {response.status_code} - {response.text}")
    return response.json()["data"][0]["embedding"]

def split_into_sentences(text: str) -> List[Dict[str, Any]]:
    """Split text into unique, non-blank sentences and attach neighbour context.

    The text is split on whitespace that follows ``.``, ``?`` or ``!``. Each
    kept sentence becomes ``{'sentence': str, 'index': int}``, where ``index``
    is the fragment's position in the raw split (so gaps appear wherever a
    fragment was dropped). The result is passed through ``combine_sentences``.

    Args:
        text: Raw document text.

    Returns:
        The sentence dicts produced by ``combine_sentences``.
    """
    kept: List[Dict[str, Any]] = []
    seen = set()  # exact sentence strings already emitted

    for position, fragment in enumerate(re.split(r'(?<=[.?!])\s+', text)):
        # re.split yields '' for empty input or text ending in whitespace;
        # a blank fragment carries no content worth embedding.
        if not fragment.strip():
            continue
        if fragment in seen:
            continue
        seen.add(fragment)
        kept.append({'sentence': fragment, 'index': position})

    return combine_sentences(kept)

def combine_sentences(sentences: List[Dict[str, Any]], buffer_size: int = 1) -> List[Dict[str, Any]]:
    """Attach a sliding-window context string to every sentence dict.

    For each entry, the texts of up to ``buffer_size`` preceding entries, the
    entry itself, and up to ``buffer_size`` following entries are joined with
    single spaces and stored under ``'combined_sentence'``. The input dicts
    are modified in place; a new list holding the same dicts is returned.
    """
    total = len(sentences)
    for pos, entry in enumerate(sentences):
        first = max(0, pos - buffer_size)
        stop = min(total, pos + buffer_size + 1)
        preceding = [sentences[k]['sentence'] for k in range(first, pos)]
        following = [sentences[k]['sentence'] for k in range(pos + 1, stop)]
        entry['combined_sentence'] = ' '.join(preceding + [entry['sentence']] + following)
    return list(sentences)

def create_chunks(sentences: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Group consecutive sentences into chunks at large semantic jumps.

    Each sentence's ``'combined_sentence'`` is embedded through
    ``get_embeddings_via_api``. The cosine distance between every pair of
    adjacent embeddings is computed, and a chunk boundary is placed after
    sentence ``i`` whenever the distance between ``i`` and ``i + 1`` exceeds
    the 95th percentile of all adjacent distances.

    Args:
        sentences: Dicts carrying ``'sentence'``, ``'index'`` and
            ``'combined_sentence'`` keys.

    Returns:
        A list of ``{'chunk': str, 'indices': List[int]}`` dicts, where
        ``chunk`` is the space-joined sentence text and ``indices`` the
        ``'index'`` values of the sentences it contains.
    """
    if not sentences:
        return []
    if len(sentences) == 1:
        # No adjacent pair exists, so there is nothing to compare and the
        # percentile of an empty distance list is undefined. Skip the
        # embedding call and emit the sentence as one chunk.
        only = sentences[0]
        return [{'chunk': only['sentence'], 'indices': [only['index']]}]

    embeddings = [get_embeddings_via_api(s['combined_sentence']) for s in sentences]

    # Cosine distance (1 - cosine similarity) between neighbouring sentences.
    distances = []
    for left, right in zip(embeddings, embeddings[1:]):
        similarity = np.dot(left, right) / (np.linalg.norm(left) * np.linalg.norm(right))
        distances.append(1 - similarity)

    threshold = np.percentile(distances, 95)

    # Exclusive end positions of each chunk; the last chunk always runs to
    # the end of the sentence list.
    boundaries = [i + 1 for i, distance in enumerate(distances) if distance > threshold]
    boundaries.append(len(sentences))

    chunks = []
    start = 0
    for end in boundaries:
        group = sentences[start:end]
        chunks.append({
            'chunk': ' '.join(s['sentence'] for s in group),
            'indices': [s['index'] for s in group],  # Track indices
        })
        start = end
    return chunks

def estimate_tokens(text: str) -> int:
    """Roughly estimate the number of tokens in ``text``.

    Uses 1.3 tokens per whitespace-delimited word and rounds up, so the
    result is an ``int`` as the signature declares.

    Args:
        text: Text to measure.

    Returns:
        ``ceil(1.3 * word_count)``; ``0`` for empty or whitespace-only text.
    """
    word_count = len(text.split())
    # Integer form of ceil(word_count * 13 / 10); avoids float rounding noise.
    return (word_count * 13 + 9) // 10

def chunk_by_tokens(
    semantic_chunks: List[Dict[str, Any]], max_chunk_tokens: int
) -> Tuple[List[str], Set[int]]:
    """Pack consecutive semantic chunks into texts within a token budget.

    Semantic chunks are appended to the current text until adding the next
    one would push the estimated token total above ``max_chunk_tokens``; the
    current text is then emitted and a new one started. A single semantic
    chunk is never split, so one that alone exceeds the budget is emitted
    as-is.

    Args:
        semantic_chunks: Dicts with ``'chunk'`` (text) and ``'indices'``
            (sentence indices) keys, as produced by ``create_chunks``.
        max_chunk_tokens: Estimated-token budget per packed text.

    Returns:
        A ``(texts, used_indices)`` tuple: the packed texts in document order
        and the set of every sentence index that went into them.
    """
    packed: List[str] = []
    pending: List[str] = []
    pending_tokens = 0
    used_indices: Set[int] = set()  # a set, so repeated indices collapse

    for piece in semantic_chunks:
        piece_tokens = estimate_tokens(piece['chunk'])

        # Flush only when something is pending; an oversized first piece
        # simply starts (and later ends) its own text.
        if pending and pending_tokens + piece_tokens > max_chunk_tokens:
            packed.append(" ".join(pending))
            pending = []
            pending_tokens = 0

        pending.append(piece['chunk'])
        pending_tokens += piece_tokens
        used_indices.update(piece['indices'])

    if pending:
        packed.append(" ".join(pending))

    return packed, used_indices

def process_document(file_path: str, max_chunk_tokens: int) -> List[str]:
    """Read a UTF-8 text file and return its token-bounded chunks.

    The file content goes through sentence splitting, semantic chunking and
    token-based packing. Any exception raised along the way is reported on
    stdout and an empty list is returned instead.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            raw_text = handle.read()

        semantic = create_chunks(split_into_sentences(raw_text))
        # The set of used sentence indices is returned too but not needed here.
        packed_texts, _ = chunk_by_tokens(semantic, max_chunk_tokens)
    except Exception as err:
        print(f"Error processing document {file_path}: {err}")
        return []
    return packed_texts

# Example usage: chunk one document and print the resulting texts.
file = "/data/searchEnhancement/sumit/78804029.txt"

# Token budget handed to process_document; adjust to your requirements.
max_chunk_tokens = 100_000

chunks = process_document(file, max_chunk_tokens)
print(chunks)


In [None]:
#sumit code

import os
import re
import warnings
import numpy as np
from typing import Dict, List, Any
from sentence_transformers import SentenceTransformer
from langchain.schema import Document
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks import get_openai_callback
from concurrent.futures import ThreadPoolExecutor, as_completed

# Silence all warnings for the rest of the session.
warnings.filterwarnings("ignore")

# GPU-related environment variables (presumably: expose only CUDA device 1
# and enable device-side assertions -- confirm against the deployment).
os.environ.update({
    "CUDA_VISIBLE_DEVICES": "1",
    'TORCH_USE_CUDA_DSA': "1",
})

class TenderAnalyzer:
    """Analyze tender documents with semantic chunking and LLM question answering.

    A document is split into sentences, grouped into chunks at large semantic
    jumps between neighbouring sentences, cut into bounded-size pieces, and
    each configured query is then asked of the recombined text through a
    LangChain "stuff" QA chain.
    """

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """Load the sentence-embedding model and set up the QA chain.

        Args:
            model_name: Name passed to ``SentenceTransformer``.
        """
        self.model = SentenceTransformer(model_name)
        self.llm = ChatOpenAI(
            model_name="meta-llama/Llama-3.1-8B-Instruct",
            openai_api_base="http://localhost:8000/v1",
            openai_api_key="FAKE",
            max_tokens=1024,
            temperature=0.1
        )
        self.chain = load_qa_chain(self.llm, chain_type='stuff')
        # Maps each query sent to the LLM to the title its answer is stored under.
        self.queries = {
            # "What are the functional requirements, also known as the scope of work, mentioned in the document?": "Scope of Work"
            "Extract clauses that specify Pre-Qualification Criteria or eligibility criteria.": "Prequalification Criteria"
            # "List all supporting documents required for this tender.": "Supporting Documents",
            # "List of all the dates mentioned in the tender document which should include Bid submission end date or due date of tender, Bid validity, Opening date, closing date, pre bid meeting date, EMD amount,tender fee, tender value": "Important Dates",
            # "Extract the contact details of the officer from this document, including their name, email ID, and contact number.": "Contact Details"
        }
        # Number of queries sent so far; only used for progress output.
        self.request_count = 0

    def process_document(self, file_path: str) -> List[str]:
        """Read a UTF-8 text file and return its bounded-size text pieces.

        Args:
            file_path: Path of the document to read.

        Returns:
            The pieces produced by ``_chunk_by_tokens`` in document order.
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        sentences = self._split_into_sentences(text)
        chunks = self._create_chunks(sentences)
        return self._chunk_by_tokens(chunks)

    def _split_into_sentences(self, text: str) -> List[Dict[str, Any]]:
        """Split text into sentence dicts and attach neighbour context.

        The text is split on whitespace following ``.``, ``?`` or ``!``.
        Blank fragments (``re.split`` yields ``''`` for empty input or text
        ending in whitespace) are dropped; ``'index'`` keeps each fragment's
        position in the raw split.
        """
        sentences = [
            {'sentence': fragment, 'index': position}
            for position, fragment in enumerate(re.split(r'(?<=[.?!])\s+', text))
            if fragment.strip()
        ]
        return self._combine_sentences(sentences)

    def _combine_sentences(self, sentences: List[Dict[str, Any]], buffer_size: int = 1) -> List[Dict[str, Any]]:
        """Store each sentence joined with its neighbours under ``'combined_sentence'``.

        Up to ``buffer_size`` sentences on each side are included. The input
        dicts are modified in place; a new list of the same dicts is returned.
        """
        combined = []
        for i, sent in enumerate(sentences):
            context = []
            # Add previous sentences
            for j in range(max(0, i - buffer_size), i):
                context.append(sentences[j]['sentence'])
            # Add current and next sentences
            context.append(sent['sentence'])
            for j in range(i + 1, min(len(sentences), i + buffer_size + 1)):
                context.append(sentences[j]['sentence'])
            sent['combined_sentence'] = ' '.join(context)
            combined.append(sent)
        return combined

    def _create_chunks(self, sentences: List[Dict[str, Any]]) -> List[str]:
        """Group consecutive sentences into chunks at large semantic jumps.

        A boundary is placed after sentence ``i`` when the cosine distance
        between the embeddings of sentences ``i`` and ``i + 1`` exceeds the
        95th percentile of all adjacent distances.

        Returns:
            The chunk texts (sentences joined with single spaces).
        """
        if not sentences:
            return []
        if len(sentences) == 1:
            # No adjacent pair to compare: the percentile of an empty
            # distance list is undefined, so return the sentence directly.
            return [sentences[0]['sentence']]

        # Create embeddings
        embeddings = self.model.encode([s['combined_sentence'] for s in sentences])

        # Cosine distance between each pair of neighbouring sentences
        distances = []
        for left, right in zip(embeddings, embeddings[1:]):
            similarity = np.dot(left, right) / (
                np.linalg.norm(left) * np.linalg.norm(right))
            distances.append(1 - similarity)

        # Split into chunks
        threshold = np.percentile(distances, 95)
        chunks = []
        start_idx = 0

        for i, distance in enumerate(distances):
            if distance > threshold:
                chunk = ' '.join([s['sentence'] for s in sentences[start_idx:i + 1]])
                chunks.append(chunk)
                start_idx = i + 1

        if start_idx < len(sentences):
            chunk = ' '.join([s['sentence'] for s in sentences[start_idx:]])
            chunks.append(chunk)

        return chunks

    def _chunk_by_tokens(self, texts: List[str], max_tokens: int = 1000) -> List[str]:
        """Cut each text into pieces of at most ``max_tokens * 2`` characters.

        The cut is by character position, so a piece may end mid-word.
        """
        max_chars = max_tokens * 2
        chunks = []
        for text in texts:
            text_chunks = [text[i:i + max_chars]
                           for i in range(0, len(text), max_chars)]
            chunks.extend(text_chunks)
        return chunks

    def process_query(self, query: str, text: str) -> str:
        """Ask one query of the text through the QA chain.

        Args:
            query: Question sent to the LLM.
            text: Document text supplied as the only input document.

        Returns:
            The stripped LLM answer, or ``"Error: <message>"`` if anything
            raised while running the chain.
        """
        try:
            # NOTE(review): this runs on worker threads (see analyze_tender)
            # and the increment is not synchronized; the counter only feeds
            # the progress output below.
            self.request_count += 1  # Increment the request counter

            # Print the current request details
            print(f"Request {self.request_count}:")
            print(f"Query: {query}")

            with get_openai_callback():
                response = self.chain.run(
                    input_documents=[Document(page_content=text)],
                    question=query
                )
            return response.strip()
        except Exception as e:
            print(f"Error processing query: {e}")
            return f"Error: {str(e)}"

    def analyze_tender(self, file_path: str) -> Dict[str, str]:
        """Run every configured query against a tender document.

        Args:
            file_path: Path of the document to analyze.

        Returns:
            A dict mapping each query's title to its answer (or to an
            ``"Error: ..."`` string). Empty when no queries are configured.
        """
        if not self.queries:
            # ThreadPoolExecutor rejects max_workers=0, and without queries
            # there is nothing to ask, so skip the document work entirely.
            return {}

        # Process document
        chunks = self.process_document(file_path)
        # NOTE(review): pieces cut mid-word by _chunk_by_tokens are rejoined
        # with a space here, which can insert a space inside a word.
        combined_text = " ".join(chunks)

        # Process queries in parallel, one worker per query
        results = {}
        with ThreadPoolExecutor(max_workers=len(self.queries)) as executor:
            future_to_query = {
                executor.submit(self.process_query, query, combined_text): title
                for query, title in self.queries.items()
            }

            for future in as_completed(future_to_query):
                title = future_to_query[future]
                try:
                    response = future.result()
                    results[title] = response
                except Exception as e:
                    results[title] = f"Error: {str(e)}"

        return results

def analyze_tender_document(file_path: str) -> Dict[str, str]:
    """
    Analyze a tender document with a freshly constructed TenderAnalyzer.

    Args:
        file_path (str): Path to the tender document

    Returns:
        Dict[str, str]: Whatever TenderAnalyzer.analyze_tender returns for
        the given path
    """
    return TenderAnalyzer().analyze_tender(file_path)

def main():
    """Analyze the hard-coded sample tender, print the results as JSON, and return them."""
    import json

    tender_path = "/data/Pqmatch/testing/78804029/78804029.txt"
    analysis = analyze_tender_document(tender_path)

    # Pretty-print the analysis for inspection.
    print(json.dumps(analysis, indent=4))

    return analysis


if __name__ == "__main__":
    main()