## single file path

In [3]:
import os
import re
import warnings
import numpy as np
from typing import Dict, List, Any
from sentence_transformers import SentenceTransformer
from langchain.schema import Document
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks import get_openai_callback
from concurrent.futures import ThreadPoolExecutor, as_completed

# Suppress warnings
warnings.filterwarnings("ignore")

# Environment setup
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ['TORCH_USE_CUDA_DSA'] = "1"

class TenderAnalyzer:
    """Main class for analyzing tender documents"""
    
    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)
        self.llm = ChatOpenAI(
             model_name="meta-llama/Llama-3.1-8B-Instruct",
            openai_api_base="http://localhost:8000/v1",
            openai_api_key="FAKE",
            max_tokens=1024,
            temperature=0.1
        )
        self.chain = load_qa_chain(self.llm, chain_type='stuff')
        self.queries = {
            # "What are the functional requirements, also known as the scope of work, mentioned in the document?": "Scope of Work"
            "Extract clauses that specify Pre-Qualification Criteria or eligibility criteria.": "Prequalification Criteria"
            # "List all supporting documents required for this tender.": "Supporting Documents",
            # "List of all the dates mentioned in the tender document which should include Bid submission end date or due date of tender, Bid validity, Opening date, closing date, pre bid meeting date, EMD amount,tender fee, tender value": "Important Dates",
            # "Extract the contact details of the officer from this document, including their name, email ID, and contact number.": "Contact Details"
        }
        self.request_count = 0 
    def process_document(self, file_path: str) -> List[str]:
        """Process document and split into chunks"""
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
            # print("text:::::::",text)
        
        # Split into sentences and create chunks
        sentences = self._split_into_sentences(text)
        # print("sentences:::::::",sentences)
        chunks = self._create_chunks(sentences)
        # print("chunks:::::::",chunks)
        return self._chunk_by_tokens(chunks)

    def _split_into_sentences(self, text: str) -> List[Dict[str, Any]]:
        """Split text into sentences with metadata"""
        sentences = [{'sentence': s, 'index': i} 
                    for i, s in enumerate(re.split(r'(?<=[.?!])\s+', text))]
        return self._combine_sentences(sentences)

    def _combine_sentences(self, sentences: List[Dict[str, Any]], buffer_size: int = 1) -> List[Dict[str, Any]]:
        """Combine sentences with context"""
        combined = []
        for i, sent in enumerate(sentences):
            context = []
            # Add previous sentences
            for j in range(max(0, i - buffer_size), i):
                context.append(sentences[j]['sentence'])
            # Add current and next sentences
            context.append(sent['sentence'])
            for j in range(i + 1, min(len(sentences), i + buffer_size + 1)):
                context.append(sentences[j]['sentence'])
            sent['combined_sentence'] = ' '.join(context)
            combined.append(sent)
        return combined

    def _create_chunks(self, sentences: List[Dict[str, Any]]) -> List[str]:
        """Create document chunks based on semantic similarity"""
        # Create embeddings
        embeddings = self.model.encode([s['combined_sentence'] for s in sentences])
        
        # Calculate distances
        distances = []
        for i in range(len(embeddings) - 1):
            similarity = np.dot(embeddings[i], embeddings[i + 1]) / (
                np.linalg.norm(embeddings[i]) * np.linalg.norm(embeddings[i + 1]))
            distances.append(1 - similarity)
        
        # Split into chunks
        threshold = np.percentile(distances, 95)
        chunks = []
        start_idx = 0
        
        for i, distance in enumerate(distances):
            if distance > threshold:
                chunk = ' '.join([s['sentence'] for s in sentences[start_idx:i + 1]])
                chunks.append(chunk)
                start_idx = i + 1
        
        if start_idx < len(sentences):
            chunk = ' '.join([s['sentence'] for s in sentences[start_idx:]])
            chunks.append(chunk)
        
        return chunks

    def _chunk_by_tokens(self, texts: List[str], max_tokens: int = 1000) -> List[str]:
        """Split texts into smaller chunks based on token count"""
        max_chars = max_tokens * 2
        chunks = []
        for text in texts:
            text_chunks = [text[i:i + max_chars] 
                         for i in range(0, len(text), max_chars)]
            chunks.extend(text_chunks)
        return chunks

    def process_query(self, query: str, text: str) -> str:
        """Process a single query against the text"""
        try:
            self.request_count += 1  # Increment the request counter
            
            # Print the current request details
            print(f"Request {self.request_count}:")
            print(f"Query: {query}")
            
            with get_openai_callback() as cb:
                response = self.chain.run(
                    input_documents=[Document(page_content=text)],
                    question=query
                )
            return response.strip()
        except Exception as e:
            print(f"Error processing query: {e}")
            return f"Error: {str(e)}"

    def analyze_tender(self, file_path: str) -> Dict[str, str]:
        """Main analysis function"""
        # Process document
        chunks = self.process_document(file_path)
        # print("chunks::::",chunks)
        combined_text = " ".join(chunks)
        # print("combined_text::::",combined_text)
        
        # Process queries in parallel
        results = {}
        with ThreadPoolExecutor(max_workers=len(self.queries)) as executor:
            future_to_query = {
                executor.submit(self.process_query, query, combined_text): title
                for query, title in self.queries.items()
            }
            
            for future in as_completed(future_to_query):
                title = future_to_query[future]
                try:
                    response = future.result()
                    results[title] = response
                except Exception as e:
                    results[title] = f"Error: {str(e)}"
        
        return results

def analyze_tender_document(file_path: str) -> Dict[str, str]:
    """
    Top-level function to analyze a tender document
    
    Args:
        file_path (str): Path to the tender document
    
    Returns:
        Dict[str, str]: Dictionary of analysis results
    """
    analyzer = TenderAnalyzer()
    return analyzer.analyze_tender(file_path)

def main():
    """Main execution function"""
    # Process tender document
    input_file = "/data/Pqmatch/testing/78804029/78804029.txt"
    
    # Analyze and get results
    results = analyze_tender_document(input_file)
    
    # Print results (optional)
    import json
    print(json.dumps(results, indent=4))
    
    return results

if __name__ == "__main__":
    main()

Request 1:
Query: Extract clauses that specify Pre-Qualification Criteria or eligibility criteria.
{
    "Prequalification Criteria": "Here are the extracted clauses that specify Pre-Qualification Criteria or eligibility criteria:\n\n**Section 3 - Evaluation and Qualification Criteria**\n\n1. **Eligibility** (Criteria Compliance Requirements Documents)\n\t* 2.1.1 Nationality: The Bidder must meet the requirement of nationality in accordance with ITB Sub-Clause 4.2.\n\t* 2.1.2 Conflict of Interest: The Bidder must meet the requirement of no conflicts of interest in accordance with ITB Sub-Clause 4.3.\n\t* 2.1.3 Government-owned Entity: The Bidder must meet the requirements of ITB Sub-Clause 4.5.\n\t* 2.1.4 Government-owned Entity: The Bidder must meet the requirements of ITB Sub-Clause 4.5.\n\t* 2.1.5 UN Eligibility: The Bidder must meet the requirement of not being declared ineligible by the UN.\n2. **Pending Litigation** (Criteria Compliance Requirements Documents)\n\t* 2.2.1 Pending 

## prompt template

In [6]:
import os
import re
import warnings
import numpy as np
from typing import Dict, List, Any
from sentence_transformers import SentenceTransformer
from langchain.schema import Document
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks import get_openai_callback
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain.prompts import PromptTemplate

# Suppress warnings
warnings.filterwarnings("ignore")

# Environment setup
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ['TORCH_USE_CUDA_DSA'] = "1"

class TenderAnalyzer:
    """Main class for analyzing tender documents"""
    
    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)
        
        # Create a custom prompt template
        self.prompt_template = PromptTemplate(
            input_variables=["context", "question"],
            template="""You are an expert tender document analyzer. 
Carefully extract the exact relevant information based on the query from the given context. 
Follow these guidelines:
1. Be precise and concise
2. Avoid repeating sentences
3. Focus on unique, key information
4. If no relevant information is found, respond with "No specific information found"

Context: {context}

Query: {question}

Extracted Information:"""
        )
        
        self.llm = ChatOpenAI(
            model_name="meta-llama/Llama-3.1-8B-Instruct",
            openai_api_base="http://localhost:8000/v1",
            openai_api_key="FAKE",
            max_tokens=2048,
            temperature=0.1
        )
        
        # Update the chain to use the custom prompt template
        self.chain = load_qa_chain(
            self.llm, 
            chain_type='stuff', 
            prompt=self.prompt_template
        )
        
        self.queries = {
            "Extract the most critical pre-qualification or eligibility criteria mentioned in the document.": "Prequalification Criteria",
            # Add more queries as needed
        }
        self.request_count = 0 
        
        
    def process_document(self, file_path: str) -> List[str]:
        """Process document and split into chunks"""
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
            # print("text:::::::",text)
        
        # Split into sentences and create chunks
        sentences = self._split_into_sentences(text)
        # print("sentences:::::::",sentences)
        chunks = self._create_chunks(sentences)
        # print("chunks:::::::",chunks)
        return self._chunk_by_tokens(chunks)

    def _split_into_sentences(self, text: str) -> List[Dict[str, Any]]:
        """Split text into sentences with metadata"""
        sentences = [{'sentence': s, 'index': i} 
                    for i, s in enumerate(re.split(r'(?<=[.?!])\s+', text))]
        return self._combine_sentences(sentences)

    def _combine_sentences(self, sentences: List[Dict[str, Any]], buffer_size: int = 1) -> List[Dict[str, Any]]:
        """Combine sentences with context"""
        combined = []
        for i, sent in enumerate(sentences):
            context = []
            # Add previous sentences
            for j in range(max(0, i - buffer_size), i):
                context.append(sentences[j]['sentence'])
            # Add current and next sentences
            context.append(sent['sentence'])
            for j in range(i + 1, min(len(sentences), i + buffer_size + 1)):
                context.append(sentences[j]['sentence'])
            sent['combined_sentence'] = ' '.join(context)
            combined.append(sent)
        return combined

    def _create_chunks(self, sentences: List[Dict[str, Any]]) -> List[str]:
        """Create document chunks based on semantic similarity"""
        # Create embeddings
        embeddings = self.model.encode([s['combined_sentence'] for s in sentences])
        
        # Calculate distances
        distances = []
        for i in range(len(embeddings) - 1):
            similarity = np.dot(embeddings[i], embeddings[i + 1]) / (
                np.linalg.norm(embeddings[i]) * np.linalg.norm(embeddings[i + 1]))
            distances.append(1 - similarity)
        
        # Split into chunks
        threshold = np.percentile(distances, 95)
        chunks = []
        start_idx = 0
        
        for i, distance in enumerate(distances):
            if distance > threshold:
                chunk = ' '.join([s['sentence'] for s in sentences[start_idx:i + 1]])
                chunks.append(chunk)
                start_idx = i + 1
        
        if start_idx < len(sentences):
            chunk = ' '.join([s['sentence'] for s in sentences[start_idx:]])
            chunks.append(chunk)
        
        return chunks

    def _chunk_by_tokens(self, texts: List[str], max_tokens: int = 1000) -> List[str]:
        """Split texts into smaller chunks based on token count"""
        max_chars = max_tokens * 2
        chunks = []
        for text in texts:
            text_chunks = [text[i:i + max_chars] 
                         for i in range(0, len(text), max_chars)]
            chunks.extend(text_chunks)
        return chunks

    def process_query(self, query: str, text: str) -> str:
        """Process a single query against the text with improved extraction"""
        try:
            self.request_count += 1  # Increment the request counter
            
            # Print the current request details
            print(f"Request {self.request_count}:")
            print(f"Query: {query}")
            
            with get_openai_callback() as cb:
                response = self.chain.run(
                    input_documents=[Document(page_content=text)],
                    question=query
                )
                
                # Post-process response to remove duplicates and improve clarity
                unique_sentences = list(dict.fromkeys(response.split('. ')))
                processed_response = '. '.join(unique_sentences)
                
            return processed_response.strip()
        except Exception as e:
            print(f"Error processing query: {e}")
            return f"Error: {str(e)}"
        
    def analyze_tender(self, file_path: str) -> Dict[str, str]:
            """Main analysis function"""
            # Process document
            chunks = self.process_document(file_path)
            # print("chunks::::",chunks)
            combined_text = " ".join(chunks)
            # print("combined_text::::",combined_text)
            
            # Process queries in parallel
            results = {}
            with ThreadPoolExecutor(max_workers=len(self.queries)) as executor:
                future_to_query = {
                    executor.submit(self.process_query, query, combined_text): title
                    for query, title in self.queries.items()
                }
                
                for future in as_completed(future_to_query):
                    title = future_to_query[future]
                    try:
                        response = future.result()
                        results[title] = response
                    except Exception as e:
                        results[title] = f"Error: {str(e)}"
            
            return results

def analyze_tender_document(file_path: str) -> Dict[str, str]:
    """
    Top-level function to analyze a tender document
    
    Args:
        file_path (str): Path to the tender document
    
    Returns:
        Dict[str, str]: Dictionary of analysis results
    """
    analyzer = TenderAnalyzer()
    return analyzer.analyze_tender(file_path)

def main():
    """Main execution function"""
    # Process tender document
    input_file = "/data/Pqmatch/testing/78804029/78804029.txt"
    
    # Analyze and get results
    results = analyze_tender_document(input_file)
    
    # Print results (optional)
    import json
    print(json.dumps(results, indent=4))
    
    return results

if __name__ == "__main__":
    main()


Request 1:
Query: Extract the most critical pre-qualification or eligibility criteria mentioned in the document.
{
    "Prequalification Criteria": "Based on the provided document, the most critical pre-qualification or eligibility criteria mentioned are:\n\n1. **Nationality**: The Bidder must meet the requirement of nationality as specified in ITB 4.2.\n2. **Conflict of Interest**: The Bidder must not have any conflict of interest as specified in ITB 4.3.\n3. **Government-owned Entity**: The Bidder must meet the requirements of ITB 4.5 if it is a government-owned entity.\n4. **Eligibility**: The Bidder must not have been declared ineligible by Government of Uttarakhand as specified in ITB 4.4.\n5. **Financial Situation**: The Bidder must demonstrate a sound financial position and prospective long-term profitability as specified in ITB 2.3.1.\n6. **Average Annual Construction Turnover**: The Bidder must have a minimum average annual construction turnover of INR 183.06 Lakhs as specifie

In [2]:
import os
import requests
import re
import warnings
import numpy as np
from typing import Dict, List, Any
from sentence_transformers import SentenceTransformer
from langchain.schema import Document
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks import get_openai_callback
from concurrent.futures import ThreadPoolExecutor, as_completed

# Suppress warnings
warnings.filterwarnings("ignore")

# Environment setup
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ['TORCH_USE_CUDA_DSA'] = "1"

def get_embeddings_via_api(sentence):
    """Get embeddings from API (using all-mpnet-base-v2 model)"""
    response = requests.post(
        "http://0.0.0.0:5002/embeddings",
        json={"model": "sentence-transformers/all-MiniLM-L6-v2", "input": [sentence]}
    )
    return response.json()["data"][0]["embedding"]



class TenderAnalyzer:
    """Main class for analyzing tender documents"""
    
    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)
        self.llm = ChatOpenAI(
             model_name="meta-llama/Llama-3.1-8B-Instruct",
            openai_api_base="http://localhost:8000/v1",
            openai_api_key="FAKE",
            max_tokens=1024,
            temperature=0.1
        )
        self.chain = load_qa_chain(self.llm, chain_type='stuff')
        self.queries = {
            # "What are the functional requirements, also known as the scope of work, mentioned in the document?": "Scope of Work"
            "Extract clauses that specify Pre-Qualification Criteria or eligibility criteria.": "Prequalification Criteria"
            # "List all supporting documents required for this tender.": "Supporting Documents",
            # "List of all the dates mentioned in the tender document which should include Bid submission end date or due date of tender, Bid validity, Opening date, closing date, pre bid meeting date, EMD amount,tender fee, tender value": "Important Dates",
            # "Extract the contact details of the officer from this document, including their name, email ID, and contact number.": "Contact Details"
        }
        self.request_count = 0 
    def process_document(self, file_path: str) -> List[str]:
        """Process document and split into chunks"""
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        
        # Split into sentences and create chunks
        sentences = self._split_into_sentences(text)
        semantic_chunks = self._create_chunks(sentences)
        chunked_texts, all_used_indices = self._chunk_by_tokens(semantic_chunks)
        return chunked_texts

    def _split_into_sentences(self, text: str) -> List[Dict[str, Any]]:
        """Split text into sentences with metadata"""
        sentences = [{'sentence': s, 'index': i} 
                    for i, s in enumerate(re.split(r'(?<=[.?!])\s+', text))]
        # Remove exact duplicates by using a set to track unique sentences
        unique_sentences = []
        seen_sentences = set()  # Set to track seen sentences (exact match)

        for sentence in sentences:
            # If the sentence hasn't been seen before, add it
            if sentence['sentence'] not in seen_sentences:
                unique_sentences.append(sentence)
                seen_sentences.add(sentence['sentence'])
        return self._combine_sentences(sentences)
    
    def _combine_sentences(self, sentences: List[Dict[str, Any]], buffer_size: int = 1) -> List[Dict[str, Any]]:
        """Combine sentences with context"""
        combined = []
        for i, sent in enumerate(sentences):
            context = []
            # Add previous sentences
            for j in range(max(0, i - buffer_size), i):
                context.append(sentences[j]['sentence'])
            # Add current and next sentences
            context.append(sent['sentence'])
            for j in range(i + 1, min(len(sentences), i + buffer_size + 1)):
                context.append(sentences[j]['sentence'])
            sent['combined_sentence'] = ' '.join(context)
            combined.append(sent)
        return combined

    def _create_chunks(self, sentences: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Create document chunks based on semantic similarity"""
        # Create embeddings
        embeddings = [get_embeddings_via_api(s['combined_sentence']) for s in sentences]
        distances = []
        for i in range(len(embeddings) - 1):
            similarity = np.dot(embeddings[i], embeddings[i + 1]) / (
                np.linalg.norm(embeddings[i]) * np.linalg.norm(embeddings[i + 1])
            )
            distances.append(1 - similarity)

        threshold = np.percentile(distances, 95)
        chunks = []
        start_idx = 0
        for i, distance in enumerate(distances):
            if distance > threshold:
                chunk = {
                    'chunk': ' '.join([s['sentence'] for s in sentences[start_idx:i + 1]]),
                    'indices': [s['index'] for s in sentences[start_idx:i + 1]]  # Track indices
                }
                chunks.append(chunk)
                start_idx = i + 1
        if start_idx < len(sentences):
            chunk = {
                'chunk': ' '.join([s['sentence'] for s in sentences[start_idx:]]),
                'indices': [s['index'] for s in sentences[start_idx:]]  # Track indices
            }
            chunks.append(chunk)
        return chunks

    def _estimate_tokens(self, text: str) -> int:
        """Estimate number of tokens in text (rough approximation)"""
        return len(text.split()) * 1.3  # Rough estimate of tokens
    
    
    
    def _chunk_by_tokens(self, semantic_chunks: List[Dict[str, Any]], max_tokens: int = 1000) -> (List[str], set):
        """Split texts into smaller chunks based on token count"""
        chunks = []
        current_chunk = []
        current_tokens = 0
        all_used_indices = set()  # To track used indices

        for chunk in semantic_chunks:
            estimated_tokens = self._estimate_tokens(chunk['chunk'])  # Use the proper function to estimate tokens

            if current_tokens + estimated_tokens > max_tokens:
                if current_chunk:
                    chunks.append(" ".join(current_chunk))
                current_chunk = [chunk['chunk']]
                current_tokens = estimated_tokens
                all_used_indices.update(chunk['indices'])  # Keep track of indices in the current chunk
            else:
                current_chunk.append(chunk['chunk'])
                current_tokens += estimated_tokens
                all_used_indices.update(chunk['indices'])  # Add current chunk's indices to the set

        if current_chunk:
            chunks.append(" ".join(current_chunk))

        # Return final chunks along with the set of all used indices
        return chunks, all_used_indices
    
    

    
    # def _chunk_by_tokens(self, texts: List[str], max_tokens: int = 1000) -> List[str]:
    #     """Split texts into smaller chunks based on token count"""
    #     chunks = []
    #     current_chunk = []
    #     current_tokens = 0
    #     all_used_indices = set()  # To track used indices
        
    #     for chunk in semantic_chunks:
    #         estimated_tokens = estimate_tokens(chunk['chunk'])
            
    #         if current_tokens + estimated_tokens > max_chunk_tokens:
    #             if current_chunk:
    #                 chunks.append(" ".join(current_chunk))
    #             current_chunk = [chunk['chunk']]
    #             current_tokens = estimated_tokens
    #             all_used_indices.update(chunk['indices'])  # Keep track of indices in the current chunk
    #         else:
    #             current_chunk.append(chunk['chunk'])
    #             current_tokens += estimated_tokens
    #             all_used_indices.update(chunk['indices'])  # Add current chunk's indices to the set
        
    #     if current_chunk:
    #         chunks.append(" ".join(current_chunk))
        
    #     # Return final chunks along with the set of all used indices
    #     return chunks, all_used_indices
    

    def process_query(self, query: str, text: str) -> str:
        """Process a single query against the text"""
        try:
            self.request_count += 1  # Increment the request counter
            
            # Print the current request details
            print(f"Request {self.request_count}:")
            print(f"Query: {query}")
            
            with get_openai_callback() as cb:
                response = self.chain.run(
                    input_documents=[Document(page_content=text)],
                    question=query
                )
            return response.strip()
        except Exception as e:
            print(f"Error processing query: {e}")
            return f"Error: {str(e)}"

    def analyze_tender(self, file_path: str) -> Dict[str, str]:
        """Main analysis function"""
        # Process document
        chunks = self.process_document(file_path)
        # print("chunks::::",chunks)
        combined_text = " ".join(chunks)
        # print("combined_text::::",combined_text)
        
        # Process queries in parallel
        results = {}
        with ThreadPoolExecutor(max_workers=len(self.queries)) as executor:
            future_to_query = {
                executor.submit(self.process_query, query, combined_text): title
                for query, title in self.queries.items()
            }
            
            for future in as_completed(future_to_query):
                title = future_to_query[future]
                try:
                    response = future.result()
                    results[title] = response
                except Exception as e:
                    results[title] = f"Error: {str(e)}"
        
        return results

def analyze_tender_document(file_path: str) -> Dict[str, str]:
    """
    Top-level function to analyze a tender document
    
    Args:
        file_path (str): Path to the tender document
    
    Returns:
        Dict[str, str]: Dictionary of analysis results
    """
    analyzer = TenderAnalyzer()
    return analyzer.analyze_tender(file_path)

def main():
    """Main execution function"""
    # Process tender document
    input_file = "/data/Pqmatch/testing/78804029/78804029.txt"
    
    # Analyze and get results
    results = analyze_tender_document(input_file)
    
    # Print results (optional)
    import json
    print(json.dumps(results, indent=4))
    
    return results

if __name__ == "__main__":
    main()

Request 1:
Query: Extract clauses that specify Pre-Qualification Criteria or eligibility criteria.
{
    "Prequalification Criteria": "Here are the clauses that specify Pre-Qualification Criteria or eligibility criteria:\n\n**Section 3 - Evaluation and Qualification Criteria**\n\n1. **Nationality** (Clause 2.1.1): The Bidder must meet the requirement of nationality as specified in ITB 4.2.\n2. **Conflict of Interest** (Clause 2.1.2): The Bidder must not have any conflict of interest as specified in ITB 4.3.\n3. **ADB Eligibility** (Clause 2.1.3): The Bidder must meet the eligibility criteria specified in ITB 4.4.\n4. **Government-owned Entity** (Clause 2.1.4): The Bidder must meet the requirements specified in ITB 4.5.\n5. **UN Eligibility** (Clause 2.1.5): The Bidder must meet the eligibility criteria specified in ITB 4.6.\n6. **Pending Litigation** (Clause 2.2.1): The Bidder must not have any pending litigation as specified in ITB 4.7.\n7. **Financial Situation** (Clause 2.3.1): The 

## remove duplicate

In [1]:
import os
import re
import warnings
import numpy as np
from typing import Dict, List, Any
from sentence_transformers import SentenceTransformer
from langchain.schema import Document
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks import get_openai_callback
from concurrent.futures import ThreadPoolExecutor, as_completed

# Suppress warnings
warnings.filterwarnings("ignore")

# Environment setup
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ['TORCH_USE_CUDA_DSA'] = "1"

class TenderAnalyzer:
    """Main class for analyzing tender documents"""
    
    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)
        self.llm = ChatOpenAI(
            model_name="meta-llama/Llama-3.1-8B-Instruct",
            openai_api_base="http://localhost:8000/v1",
            openai_api_key="FAKE",
            max_tokens=1024,
            temperature=0.1
        )
        self.chain = load_qa_chain(self.llm, chain_type='stuff')
        self.queries = {
            "Analyze the document and extract ONLY the functional requirements exactly as they are mentioned in the tender document. Provide a precise and concise list of requirements without adding any external information or interpretation.": "Scope of Work",
            "Extract ONLY the pre-qualification or eligibility criteria directly stated in the tender document. Do not include any additional context or explanations beyond what is explicitly written.": "Prequalification Criteria",
            "List ONLY the supporting documents that are explicitly required in the tender document. Include no additional commentary or external suggestions.": "Supporting Documents",
            "Extract ALL dates mentioned in the tender document, including but not limited to: Bid submission end date, Opening date, Closing date, Pre-bid meeting date, EMD amount, Tender fee, and Tender value. Provide the exact dates as they appear in the document.": "Important Dates",
            "Extract the EXACT contact details of the officer mentioned in the document. This should include only the name, email ID, and contact number as they are written in the original text.": "Contact Details"
        }
        self.request_count = 0 

    def process_document(self, file_path: str) -> List[str]:
        """Process document and split into chunks"""
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        
        # Split into sentences 
        sentences = self._split_into_sentences(text)
        chunks = self._create_chunks(sentences)
        return self._chunk_by_tokens(chunks)
    
    def _split_into_sentences(self, text: str) -> List[Dict[str, Any]]:
        """Split text into sentences"""
        # Use regex to split sentences, but handle common abbreviations
        sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text)
        
        # Remove empty and whitespace-only sentences
        sentences = [s.strip() for s in sentences if s.strip()]
        
        return [{'sentence': s, 'index': i} for i, s in enumerate(sentences)]
    
    def _combine_sentences(self, sentences: List[Dict[str, Any]], buffer_size: int = 2) -> List[Dict[str, Any]]:
        """Combine sentences with context"""
        combined = []
        for i, sent in enumerate(sentences):
            context = []
            # Add previous sentences
            for j in range(max(0, i - buffer_size), i):
                context.append(sentences[j]['sentence'])
            # Add current sentence
            context.append(sent['sentence'])
            # Add next sentences
            for j in range(i + 1, min(len(sentences), i + buffer_size + 1)):
                context.append(sentences[j]['sentence'])
            
            sent['combined_sentence'] = ' '.join(context)
            combined.append(sent)
        return combined
    
    def _create_chunks(self, sentences: List[Dict[str, Any]]) -> List[str]:
        """Create document chunks based on semantic similarity"""
        if not sentences:
            return []

        # Combine sentences with context
        sentences_with_context = self._combine_sentences(sentences)

        # Encode sentences with context
        embeddings = self.model.encode([s['combined_sentence'] for s in sentences_with_context])
        
        # Calculate pairwise distances
        distances = [
            1 - np.dot(embeddings[i], embeddings[i + 1]) / (
                np.linalg.norm(embeddings[i]) * np.linalg.norm(embeddings[i + 1]))
            for i in range(len(embeddings) - 1)
        ]
        
        # Use adaptive thresholding
        threshold = np.percentile(distances, 90)  # Slightly lower threshold for more granular chunking
        
        chunks = []
        start_idx = 0

        for i, distance in enumerate(distances):
            if distance > threshold or i == len(distances) - 1:
                # Collect sentences for this chunk
                chunk_sentences = [
                    sent['sentence'] for sent in sentences_with_context[start_idx:i+1]
                ]
                
                # Join sentences into a chunk
                if chunk_sentences:
                    chunks.append(' '.join(chunk_sentences))
                
                start_idx = i + 1

        return chunks
        
    def _chunk_by_tokens(self, texts: List[str], max_tokens: int = 1000) -> List[str]:
        """Split texts into smaller chunks based on token count"""
        max_chars = max_tokens * 2
        chunks = []
        for text in texts:
            text_chunks = [text[i:i + max_chars] 
                         for i in range(0, len(text), max_chars)]
            chunks.extend(text_chunks)
        return chunks

    def process_query(self, query: str, text: str) -> str:
        """Process a single query against the text"""
        try:
            self.request_count += 1  # Increment the request counter
            
            # Print the current request details
            print(f"Request {self.request_count}:")
            print(f"Query: {query}")
            
            with get_openai_callback() as cb:
                response = self.chain.run(
                    input_documents=[Document(page_content=text)],
                    question=query
                )
            return response.strip()
        except Exception as e:
            print(f"Error processing query: {e}")
            return f"Error: {str(e)}"

    def analyze_tender(self, file_path: str) -> Dict[str, str]:
        """Main analysis function"""
        # Process document
        chunks = self.process_document(file_path)
        combined_text = " ".join(chunks)
        
        # Process queries in parallel
        results = {}
        with ThreadPoolExecutor(max_workers=len(self.queries)) as executor:
            future_to_query = {
                executor.submit(self.process_query, query, combined_text): title
                for query, title in self.queries.items()
            }
            
            for future in as_completed(future_to_query):
                title = future_to_query[future]
                try:
                    response = future.result()
                    results[title] = response
                except Exception as e:
                    results[title] = f"Error: {str(e)}"
        
        return results

def analyze_tender_document(file_path: str) -> Dict[str, str]:
    """
    Top-level function to analyze a tender document
    
    Args:
        file_path (str): Path to the tender document
    
    Returns:
        Dict[str, str]: Dictionary of analysis results
    """
    analyzer = TenderAnalyzer()
    return analyzer.analyze_tender(file_path)

def main():
    """Main execution function"""
    # Process tender document
    input_file = "/data/Pqmatch/testing/78804029/78804029.txt"
    
    # Analyze and get results
    results = analyze_tender_document(input_file)
    
    # Print results (optional)
    import json
    print(json.dumps(results, indent=4))
    
    return results

if __name__ == "__main__":
    main()

  from tqdm.autonotebook import tqdm, trange


Request 1:
Query: Analyze the document and extract ONLY the functional requirements exactly as they are mentioned in the tender document. Provide a precise and concise list of requirements without adding any external information or interpretation.
Request 2:
Query: Extract ONLY the pre-qualification or eligibility criteria directly stated in the tender document. Do not include any additional context or explanations beyond what is explicitly written.
Request 3:
Query: List ONLY the supporting documents that are explicitly required in the tender document. Include no additional commentary or external suggestions.
Request 4:
Query: Extract ALL dates mentioned in the tender document, including but not limited to: Bid submission end date, Opening date, Closing date, Pre-bid meeting date, EMD amount, Tender fee, and Tender value. Provide the exact dates as they appear in the document.
Request 5:
Query: Extract the EXACT contact details of the officer mentioned in the document. This should inc

## vaishnavi more code 

In [None]:
import os
import re
import warnings
import numpy as np
from typing import Dict, List, Any, Union
from sentence_transformers import SentenceTransformer
from langchain.schema import Document
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks import get_openai_callback
from concurrent.futures import ThreadPoolExecutor, as_completed
import os 

# # Set environment variables
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ['TORCH_USE_CUDA_DSA'] = "1"
warnings.filterwarnings("ignore")

warnings.filterwarnings("ignore")

class TenderAnalyzer:
    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)
        self.llm = ChatOpenAI(
            model_name="meta-llama/Llama-3.1-8B-Instruct",
            openai_api_base="http://localhost:8000/v1",
            openai_api_key="FAKE",
            max_tokens=512,
            temperature=0.1
        )
        self.chain = load_qa_chain(self.llm, chain_type='stuff')
        self.queries = {
            "What are the functional requirements, also known as the scope of work, mentioned in the document?": "Scope of Work",
            "Extract clauses that specify Pre-Qualification Criteria or eligibility criteria.": "Prequalification Criteria",
            "List all supporting documents required for this tender.": "Supporting Documents",
            # "Extract a comprehensive list of all dates, times, and monetary values, along with their specific labels or descriptions as mentioned in the document.": "Important Dates",
            "List of all the  important dates and times mentioned in the tender document which should include Bid submission end date or due date of tender, Bid validity, Opening date, closing date, pre bid meeting date, EMD amount,tender fee, tender value":"Importants Date",
            "Extract the contact details of the officer from this document, including their name, email ID, and contact number.": "Contact Details"
        }
        self.max_chunk_tokens = 100000  # Safe limit below model's maximum

    def _split_into_sentences(self, text: str) -> List[Dict[str, Any]]:
        """Split text into sentences with metadata"""
        sentences = [{'sentence': s, 'index': i} 
                    for i, s in enumerate(re.split(r'(?<=[.?!])\s+', text))]
        
        # Remove exact duplicates by using a set to track unique sentences
        unique_sentences = []
        seen_sentences = set()  # Set to track seen sentences (exact match)

        for sentence in sentences:
            # If the sentence hasn't been seen before, add it
            if sentence['sentence'] not in seen_sentences:
                unique_sentences.append(sentence)
                seen_sentences.add(sentence['sentence'])
        return self._combine_sentences(sentences)

    def _combine_sentences(self, sentences: List[Dict[str, Any]], buffer_size: int = 1) -> List[Dict[str, Any]]:
        """Combine sentences with context"""
        combined = []
        for i, sent in enumerate(sentences):
            context = []
            for j in range(max(0, i - buffer_size), i):
                context.append(sentences[j]['sentence'])
            context.append(sent['sentence'])
            for j in range(i + 1, min(len(sentences), i + buffer_size + 1)):
                context.append(sentences[j]['sentence'])
            sent['combined_sentence'] = ' '.join(context)
            combined.append(sent)
        return combined

    def _create_chunks(self, sentences: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Create document chunks based on semantic similarity"""
        embeddings = [get_embeddings_via_api(s['combined_sentence']) for s in sentences]
        distances = []
        for i in range(len(embeddings) - 1):
            similarity = np.dot(embeddings[i], embeddings[i + 1]) / (
                np.linalg.norm(embeddings[i]) * np.linalg.norm(embeddings[i + 1])
            )
            distances.append(1 - similarity)

        threshold = np.percentile(distances, 95)
        chunks = []
        start_idx = 0
        for i, distance in enumerate(distances):
            if distance > threshold:
                chunk = {
                    'chunk': ' '.join([s['sentence'] for s in sentences[start_idx:i + 1]]),
                    'indices': [s['index'] for s in sentences[start_idx:i + 1]]  # Track indices
                }
                chunks.append(chunk)
                start_idx = i + 1
        if start_idx < len(sentences):
            chunk = {
                'chunk': ' '.join([s['sentence'] for s in sentences[start_idx:]]),
                'indices': [s['index'] for s in sentences[start_idx:]]  # Track indices
            }
            chunks.append(chunk)
        return chunks

    def _estimate_tokens(self, text: str) -> int:
        """Estimate number of tokens in text (rough approximation)"""
        return len(text.split()) * 1.3  # Rough estimate of tokens

    def _chunk_by_tokens(self, semantic_chunks: List[str]) -> List[str]:
        """Split texts into smaller chunks based on token count"""
        chunks = []
        current_chunk = []
        current_tokens = 0
        all_used_indices = set()  # To track used indices
        
        for chunk in semantic_chunks:
            estimated_tokens = estimate_tokens(chunk['chunk'])
            
            if current_tokens + estimated_tokens > max_chunk_tokens:
                if current_chunk:
                    chunks.append(" ".join(current_chunk))
                current_chunk = [chunk['chunk']]
                current_tokens = estimated_tokens
                all_used_indices.update(chunk['indices'])  # Keep track of indices in the current chunk
            else:
                current_chunk.append(chunk['chunk'])
                current_tokens += estimated_tokens
                all_used_indices.update(chunk['indices'])  # Add current chunk's indices to the set
        
        if current_chunk:
            chunks.append(" ".join(current_chunk))
        
        # Return final chunks along with the set of all used indices
        return chunks, all_used_indices

    def process_document(self, file_path: str) -> List[str]:
        """Process document and split into chunks"""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()
            
            sentences = self._split_into_sentences(text)
            semantic_chunks = self._create_chunks(sentences)
            chunked_texts, all_used_indices = self._chunk_by_tokens(semantic_chunks)
            return chunked_texts
        except Exception as e:
            print(f"Error processing document {file_path}: {e}")
            return []

    # def _merge_responses(self, responses: List[str]) -> str:
    #     """Merge multiple responses into a coherent summary"""
    #     # Remove duplicates while maintaining order
    #     unique_lines = []
    #     seen = set()
    #     for response in responses:
    #         for line in response.split('\n'):
    #             line = line.strip()
    #             if line and line not in seen:
    #                 seen.add(line)
    #                 unique_lines.append(line)
        
    #     return '\n'.join(unique_lines)
    
    
    
    
    def _merge_responses(self, responses: List[str]) -> str:
                                                """Merge multiple responses into a coherent summary and remove duplicates."""
        unique_lines = []
        seen = set()

        for response in responses:
            for line in response.split('\n'):
                line = line.strip()
                if line and line not in seen:
                    seen.add(line)
                    unique_lines.append(line)
        
        # Additional deduplication for repeated sections within a single line
        cleaned_lines = []
        for line in unique_lines:
            parts = line.split()  # Tokenize the line
            deduped_line = " ".join(dict.fromkeys(parts))  # Remove repeated words
            cleaned_lines.append(deduped_line)
        
        return '\n'.join(cleaned_lines)


    def process_query(self, query: str, chunks: List[str]) -> str:
        """Process a single query against multiple text chunks"""
        try:
            responses = []
            for chunk in chunks:
                with get_openai_callback() as cb:
                    response = self.chain.run(
                        input_documents=[Document(page_content=chunk)],
                        question=query
                    )
                    responses.append(response.strip())
            
            return self._merge_responses(responses)
        except Exception as e:
            print(f"Error processing query: {e}")
            return f"Error: {str(e)}"

    def analyze_tender(self, file_path: str) -> List[Dict[str, str]]:
        """Main analysis function"""
        try:
            chunks = self.process_document(file_path)
            if not chunks:
                return [{"title": title, "response": "Error: Failed to process document"} 
                        for title in self.queries.values()]
            
            results = []
            with ThreadPoolExecutor(max_workers=len(self.queries)) as executor:
                future_to_query = {
                    executor.submit(self.process_query, query, chunks): title
                    for query, title in self.queries.items()
                }
                
                for future in as_completed(future_to_query):
                    title = future_to_query[future]
                    try:
                        response = future.result()
                        results.append({
                            "title": title,
                            "response": response
                        })
                    except Exception as e:
                        results.append({
                            "title": title,
                            "response": f"Error: {str(e)}"
                        })
            
            return results
        except Exception as e:
            return [{"title": title, "response": f"Error: {str(e)}"} 
                    for title in self.queries.values()]

from opensearchpy import OpenSearch

def process_folder(base_folder: str) -> Dict[str, Any]:
    """Process all text files in the given folder and its subfolders, and index results into OpenSearch."""
    analyzer = TenderAnalyzer()
    all_results = []

    # Set up OpenSearch client
    index_name = 'tprocanswers'
    opensearch_client = OpenSearch(
        hosts=['https://localhost:9200'],
        http_auth=("admin", "4Z*lwtz,,2T:0TGu"),
        use_ssl=True,
        verify_certs=False,
        ssl_show_warn=False
    )

    with ThreadPoolExecutor(max_workers=32) as executor:
        future_to_file = {}
        
        for root, _, files in os.walk(base_folder):
            for file in files:
                if file.endswith('.txt'):
                    file_path = os.path.join(root, file)
                    future_to_file[executor.submit(analyzer.analyze_tender, file_path)] = file_path
        
        for future in as_completed(future_to_file):
            file_path = future_to_file[future]
            try:
                results = future.result()
                tcno = os.path.basename(os.path.dirname(file_path))
                all_results.append({
                    "tcno": tcno,
                    # "file_path": file_path,
                    "results": results
                })
                
                # Index results into OpenSearch
                opensearch_client.index(index=index_name, id=tcno, body={"file_path": file_path, "results": results})
                print(f"Indexed results for {tcno} in OpenSearch.")
            
            except Exception as e:
                all_results.append({
                    "tcno": "Unknown",
                    # "file_path": file_path,
                    "results": [{"title": "Error", "response": f"Failed to process file: {str(e)}"}]
                })
    
    return {"results": all_results}

def main():
    """Main execution function"""
    # Base folder path
    date_str = "22-11-24"  # You can modify this as needed
    folder_path = f"/data/txtfolder/dailydocument_23-11-24_txt"
    
    # Process all documents in the folder
    results = process_folder(folder_path)
    
    # Print results (optional)
    import json
    print(json.dumps(results, indent=4))
    
    return results

if __name__ == "__main__":
    main()
    
    
    
    
    
    

In [None]:
import os
import re
import warnings
import numpy as np
from typing import Dict, List, Any
from sentence_transformers import SentenceTransformer
from langchain.schema import Document
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks import get_openai_callback
from concurrent.futures import ThreadPoolExecutor, as_completed

# Suppress warnings
warnings.filterwarnings("ignore")

# Environment setup
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ['TORCH_USE_CUDA_DSA'] = "1"

class TenderAnalyzer:
    """Main class for analyzing tender documents"""
    
    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)
        self.llm = ChatOpenAI(
             model_name="meta-llama/Llama-3.1-8B-Instruct",
            openai_api_base="http://localhost:8000/v1",
            openai_api_key="FAKE",
            max_tokens=1024,
            temperature=0.1
        )
        self.chain = load_qa_chain(self.llm, chain_type='stuff',verbose=True)
        self.queries = {
            "Extract clauses that specify Pre-Qualification Criteria or eligibility criteria.": "Prequalification Criteria"
        }

    def process_document(self, file_path: str) -> List[str]:
        """Process document and split into chunks"""
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        
        # Split into sentences and create chunks
        sentences = self._split_into_sentences(text)
        chunks = self._create_chunks(sentences)
        return self._chunk_by_tokens(chunks)

    def _split_into_sentences(self, text: str) -> List[Dict[str, Any]]:
        """Split text into sentences with metadata"""
        sentences = [{'sentence': s, 'index': i} 
                    for i, s in enumerate(re.split(r'(?<=[.?!])\s+', text))]
        return self._combine_sentences(sentences)

    def _combine_sentences(self, sentences: List[Dict[str, Any]], buffer_size: int = 1) -> List[Dict[str, Any]]:
        """Combine sentences with context"""
        combined = []
        for i, sent in enumerate(sentences):
            context = []
            # Add previous sentences
            for j in range(max(0, i - buffer_size), i):
                context.append(sentences[j]['sentence'])
            # Add current and next sentences
            context.append(sent['sentence'])
            for j in range(i + 1, min(len(sentences), i + buffer_size + 1)):
                context.append(sentences[j]['sentence'])
            sent['combined_sentence'] = ' '.join(context)
            combined.append(sent)
        return combined

    def _create_chunks(self, sentences: List[Dict[str, Any]]) -> List[str]:
        """Create document chunks based on semantic similarity"""
        # Create embeddings
        embeddings = self.model.encode([s['combined_sentence'] for s in sentences])
        
        # Calculate distances
        distances = []
        for i in range(len(embeddings) - 1):
            similarity = np.dot(embeddings[i], embeddings[i + 1]) / (
                np.linalg.norm(embeddings[i]) * np.linalg.norm(embeddings[i + 1]))
            distances.append(1 - similarity)
        
        # Split into chunks
        threshold = np.percentile(distances, 95)
        chunks = []
        start_idx = 0
        
        for i, distance in enumerate(distances):
            if distance > threshold:
                chunk = ' '.join([s['sentence'] for s in sentences[start_idx:i + 1]])
                chunks.append(chunk)
                start_idx = i + 1
        
        if start_idx < len(sentences):
            chunk = ' '.join([s['sentence'] for s in sentences[start_idx:]])
            chunks.append(chunk)
        
        return chunks

    def _chunk_by_tokens(self, texts: List[str], max_tokens: int = 3500) -> List[str]:
        """Split texts into smaller chunks based on token count"""
        max_chars = max_tokens * 2
        chunks = []
        unique_chunks = set()  # Track unique chunks
        
        for text in texts:
            text_chunks = [text[i:i + max_chars] 
                         for i in range(0, len(text), max_chars)]
            
            # Add only unique chunks
            for chunk in text_chunks:
                if chunk not in unique_chunks:
                    chunks.append(chunk)
                    unique_chunks.add(chunk)
        
        return chunks

    def process_query(self, query: str, text: str) -> str:
        """Process a single query against the text"""
        try:
            with get_openai_callback() as cb:
                response = self.chain.run(
                    input_documents=[Document(page_content=text)],
                    question=query
                )
            return self._remove_duplicates(response.strip())
        except Exception as e:
            print(f"Error processing query: {e}")
            return f"Error: {str(e)}"

    def _remove_duplicates(self, text: str) -> str:
        """Remove duplicate points from the response"""
        # Split the text into points
        points = [point.strip() for point in re.split(r'\n|\d+\.', text) if point.strip()]
        
        # Remove duplicates while preserving order
        unique_points = []
        seen = set()
        for point in points:
            # Normalize point by lowercasing and removing extra whitespace
            normalized_point = ' '.join(point.lower().split())
            
            if normalized_point not in seen:
                unique_points.append(point)
                seen.add(normalized_point)
        
        # Reconstruct the text with unique points
        return '\n'.join(f"{i+1}. {point}" for i, point in enumerate(unique_points))

    def analyze_tender(self, file_path: str) -> Dict[str, str]:
        """Main analysis function"""
        # Process document
        chunks = self.process_document(file_path)
        combined_text = " ".join(chunks)
        
        # Process queries in parallel
        results = {}
        with ThreadPoolExecutor(max_workers=len(self.queries)) as executor:
            future_to_query = {
                executor.submit(self.process_query, query, combined_text): title
                for query, title in self.queries.items()
            }
            
            for future in as_completed(future_to_query):
                title = future_to_query[future]
                try:
                    response = future.result()
                    results[title] = response
                except Exception as e:
                    results[title] = f"Error: {str(e)}"
        
        return results

def analyze_tender_document(file_path: str) -> Dict[str, str]:
    """
    Top-level function to analyze a tender document
    
    Args:
        file_path (str): Path to the tender document
    
    Returns:
        Dict[str, str]: Dictionary of analysis results
    """
    analyzer = TenderAnalyzer()
    return analyzer.analyze_tender(file_path)

def main():
    """Main execution function"""
    # Process tender document
    input_file = "/data/Pqmatch/testing/78804029/78804029.txt"
    # Analyze and get results
    results = analyze_tender_document(input_file)
    
    # Print results (optional)
    import json
    print(json.dumps(results, indent=4))
    
    return results

if __name__ == "__main__":
    main()

## store response in excel

In [None]:
# import pandas as pd

# def main():
#     """Main execution function"""
#     # Process tender document
#     input_file = "/data/txtfolder/dailydoc_test/70398187/70398187.txt"
    
#     # Analyze and get results
#     results = analyze_tender_document(input_file)
    
#     # Convert the results to a DataFrame
#     results_df = pd.DataFrame(list(results.items()), columns=["Key", "Value"])

#     # Save the results to an Excel file
#     output_file = "tender_analysis_results.xlsx"
#     results_df.to_excel(output_file, index=False)

#     # Print results (optional)
#     print(f"Results saved to {output_file}")
    
#     return results

# if __name__ == "__main__":
#     main()


In [None]:
import requests
from typing import List
from langchain.embeddings.base import Embeddings
import os
import re
import warnings
import numpy as np
from typing import Dict, List, Any
from sentence_transformers import SentenceTransformer
from langchain.schema import Document
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks import get_openai_callback
from concurrent.futures import ThreadPoolExecutor, as_completed

# Suppress warnings
warnings.filterwarnings("ignore")

# Environment setup
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ['TORCH_USE_CUDA_DSA'] = "1"

def get_embedding(text: str) -> List[float]:
    response = requests.post("http://0.0.0.0:5002/embeddings",
        json={"model": "sentence-transformers/all-mpnet-base-v2", "input": [text]})
    if response.status_code == 200:
        data = response.json()
        return data['data'][0]['embedding']
    else:
        raise Exception(f"API request failed with status code {response.status_code}")

class CustomEmbeddings(Embeddings):
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [get_embedding(text) for text in texts]

    def embed_query(self, text: str) -> List[float]:
        return get_embedding(text)



class TenderAnalyzer:
    """Main class for analyzing tender documents"""
    
    def __init__(self):
        self.model =self.model = CustomEmbeddings()
        self.llm = ChatOpenAI(
             model_name="meta-llama/Llama-3.1-8B-Instruct",
            openai_api_base="http://localhost:8000/v1",
            openai_api_key="FAKE",
            max_tokens=1024,
            temperature=0.1
        )
        self.chain = load_qa_chain(self.llm, chain_type='stuff')
        self.queries = {
            "What are the functional requirements, also known as the scope of work, mentioned in the document?": "Scope of Work",
            "Extract clauses that specify Pre-Qualification Criteria or eligibility criteria.": "Prequalification Criteria",
            "List all supporting documents required for this tender.": "Supporting Documents",
            "Extract a comprehensive list of all dates, times, and monetary values, along with their specific labels or descriptions as mentioned in the document.": "Important Dates",
            "Extract the contact details of the officer from this document, including their name, email ID, and contact number.": "Contact Details"
        }

    def process_document(self, file_path: str) -> List[str]:
        """Process document and split into chunks"""
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        
        # Split into sentences and create chunks
        sentences = self._split_into_sentences(text)
        chunks = self._create_chunks(sentences)
        return self._chunk_by_tokens(chunks)

    def _split_into_sentences(self, text: str) -> List[Dict[str, Any]]:
        """Split text into sentences with metadata"""
        sentences = [{'sentence': s, 'index': i} 
                    for i, s in enumerate(re.split(r'(?<=[.?!])\s+', text))]
        return self._combine_sentences(sentences)

    def _combine_sentences(self, sentences: List[Dict[str, Any]], buffer_size: int = 1) -> List[Dict[str, Any]]:
        """Combine sentences with context"""
        combined = []
        for i, sent in enumerate(sentences):
            context = []
            # Add previous sentences
            for j in range(max(0, i - buffer_size), i):
                context.append(sentences[j]['sentence'])
            # Add current and next sentences
            context.append(sent['sentence'])
            for j in range(i + 1, min(len(sentences), i + buffer_size + 1)):
                context.append(sentences[j]['sentence'])
            sent['combined_sentence'] = ' '.join(context)
            combined.append(sent)
        return combined

    def _create_chunks(self, sentences: List[Dict[str, Any]]) -> List[str]:
        """Create document chunks based on semantic similarity"""
        # Create embeddings
        embeddings = self.model.encode([s['combined_sentence'] for s in sentences])
        
        # Calculate distances
        distances = []
        for i in range(len(embeddings) - 1):
            similarity = np.dot(embeddings[i], embeddings[i + 1]) / (
                np.linalg.norm(embeddings[i]) * np.linalg.norm(embeddings[i + 1]))
            distances.append(1 - similarity)
        
        # Split into chunks
        threshold = np.percentile(distances, 95)
        chunks = []
        start_idx = 0
        
        for i, distance in enumerate(distances):
            if distance > threshold:
                chunk = ' '.join([s['sentence'] for s in sentences[start_idx:i + 1]])
                chunks.append(chunk)
                start_idx = i + 1
        
        if start_idx < len(sentences):
            chunk = ' '.join([s['sentence'] for s in sentences[start_idx:]])
            chunks.append(chunk)
        
        return chunks

    def _chunk_by_tokens(self, texts: List[str], max_tokens: int = 3500) -> List[str]:
        """Split texts into smaller chunks based on token count"""
        max_chars = max_tokens * 2
        chunks = []
        for text in texts:
            text_chunks = [text[i:i + max_chars] 
                         for i in range(0, len(text), max_chars)]
            chunks.extend(text_chunks)
        return chunks

    def process_query(self, query: str, text: str) -> str:
        """Process a single query against the text"""
        try:
            with get_openai_callback() as cb:
                response = self.chain.run(
                    input_documents=[Document(page_content=text)],
                    question=query
                )
            return response.strip()
        except Exception as e:
            print(f"Error processing query: {e}")
            return f"Error: {str(e)}"

    def analyze_tender(self, file_path: str) -> Dict[str, str]:
        """Main analysis function"""
        # Process document
        chunks = self.process_document(file_path)
        combined_text = " ".join(chunks)
        
        # Process queries in parallel
        results = {}
        with ThreadPoolExecutor(max_workers=len(self.queries)) as executor:
            future_to_query = {
                executor.submit(self.process_query, query, combined_text): title
                for query, title in self.queries.items()
            }
            
            for future in as_completed(future_to_query):
                title = future_to_query[future]
                try:
                    response = future.result()
                    results[title] = response
                except Exception as e:
                    results[title] = f"Error: {str(e)}"
        
        return results

def analyze_tender_document(file_path: str) -> Dict[str, str]:
    """
    Top-level function to analyze a tender document
    
    Args:
        file_path (str): Path to the tender document
    
    Returns:
        Dict[str, str]: Dictionary of analysis results
    """
    analyzer = TenderAnalyzer()
    return analyzer.analyze_tender(file_path)

def main():
    """Main execution function"""
    # Process tender document
    input_file =    input_file = "/data/Pqmatch/testing/78804029/78804029.txt"
    
    # Analyze and get results
    results = analyze_tender_document(input_file)
    
    # Print results (optional)
    import json
    print(json.dumps(results, indent=4))
    
    return results

if __name__ == "__main__":
    main()

In [None]:
import os
import numpy as np
import re
import warnings
import json
from datetime import datetime
from typing import Dict, List, Any
from concurrent.futures import ThreadPoolExecutor, as_completed
from sentence_transformers import SentenceTransformer
from langchain.schema import Document
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks import get_openai_callback
import multiprocessing

# Suppress warnings
warnings.filterwarnings("ignore")

# Environment setup
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ['TORCH_USE_CUDA_DSA'] = "1"


class TenderAnalyzer:
    """Main class for analyzing tender documents"""

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)
        self.llm = ChatOpenAI(
            model_name="meta-llama/Llama-3.1-8B-Instruct",
            openai_api_base="http://localhost:8000/v1",
            openai_api_key="FAKE",
            max_tokens=500,
            temperature=0.1
        )
        self.chain = load_qa_chain(self.llm, chain_type='stuff')
        self.queries = {
            # "Identify the functional requirements, also referred to as the scope of work, specified in the document.": "Scope of Work",
            "Extract clauses that specify Pre-Qualification Criteria or eligibility criteria.": "Prequalification Criteria"
        #     "List all mandatory qualification criteria, including blacklisting status and required certifications.": "Mandatory Qualification Criteria",
        #     "Summarize the work specifications that bidders must meet to fulfill the tender requirements.": "Specifications",
        #     "List all supporting documents required for this tender.": "Supporting Documents",
        #     "Extract a comprehensive list of all dates, times, and monetary values, along with their specific labels or descriptions as mentioned in the document.": "Important Dates",
        #     "Extract the contact details of the officer from this document, including their name, email ID, and contact number.": "Contact Details"
        }

    def process_document(self, file_path: str) -> List[str]:
        """Process document and split into chunks"""
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        if not text:  # Skip empty files
            print(f"Empty file: {file_path}")
            return []

        sentences = self._split_into_sentences(text)
        chunks = self._create_chunks(sentences)
        return self._chunk_by_tokens(chunks)

	
    def _split_into_sentences(self, text: str) -> List[Dict[str, Any]]:
        """Split text into sentences with metadata"""
        sentences = [{'sentence': s, 'index': i}
                     for i, s in enumerate(re.split(r'(?<=[.?!])\s+', text))]
        return self._combine_sentences(sentences)

    def _combine_sentences(self, sentences: List[Dict[str, Any]], buffer_size: int = 1) -> List[Dict[str, Any]]:
        """Combine sentences with context"""
        combined = []
        for i, sent in enumerate(sentences):
            context = []
            # Add previous sentences
            for j in range(max(0, i - buffer_size), i):
                context.append(sentences[j]['sentence'])
            # Add current and next sentences
            context.append(sent['sentence'])
            for j in range(i + 1, min(len(sentences), i + buffer_size + 1)):
                context.append(sentences[j]['sentence'])
            sent['combined_sentence'] = ' '.join(context)
            combined.append(sent)
        return combined

    def _create_chunks(self, sentences: List[Dict[str, Any]]) -> List[str]:
        """Create document chunks based on semantic similarity"""
        embeddings = self.model.encode([s['combined_sentence'] for s in sentences])

        distances = []
        for i in range(len(embeddings) - 1):
            similarity = np.dot(embeddings[i], embeddings[i + 1]) / (
                np.linalg.norm(embeddings[i]) * np.linalg.norm(embeddings[i + 1]))
            distances.append(1 - similarity)

        threshold = np.percentile(distances, 95)
        chunks = []
        start_idx = 0

        for i, distance in enumerate(distances):
            if distance > threshold:
                chunk = ' '.join([s['sentence'] for s in sentences[start_idx:i + 1]])
                chunks.append(chunk)
                start_idx = i + 1

        if start_idx < len(sentences):
            chunk = ' '.join([s['sentence'] for s in sentences[start_idx:]])
            chunks.append(chunk)

        return chunks

    def _chunk_by_tokens(self, texts: List[str], max_tokens: int = 3500) -> List[str]:
        """Split texts into smaller chunks based on token count"""
        max_chars = max_tokens * 2
        chunks = []
        for text in texts:
            text_chunks = [text[i:i + max_chars]
                           for i in range(0, len(text), max_chars)]
            chunks.extend(text_chunks)
        return chunks

    def process_query(self, query: str, text: str) -> str:
        """Process a single query against the text"""
        try:
            with get_openai_callback() as cb:
                response = self.chain.run(
                    input_documents=[Document(page_content=text)],
                    question=query
                )
            return response.strip()
        except Exception as e:
            print(f"Error processing query: {e}")
            return f"Error: {str(e)}"

    def analyze_tender(self, file_path: str) -> Dict[str, str]:
      """Analyze a single tender document"""
      chunks = self.process_document(file_path)
      if not chunks:  # Skip empty documents
        print(f"No content to process in: {file_path}")
        return {title: "No content" for title in self.queries.values()}

      results = {title: "" for title in self.queries.values()}
      query_tasks = {}

      with ThreadPoolExecutor(max_workers=4) as executor:
        for chunk in chunks:
            for query, title in self.queries.items():
                future = executor.submit(self.process_query, query, chunk)
                query_tasks[future] = title

        for future in as_completed(query_tasks):
            title = query_tasks[future]
            try:
                response = future.result()
                results[title] += f"{response}\n"
            except Exception as e:
                print(f"Error processing query for {title}: {e}")

      return results



def process_folder(tcno):
    """Process a single subfolder containing .txt files"""
    try:
        folder_path = f"/data/txtfolder/dailydoc_test/{tcno}"
        if not os.path.exists(folder_path):
            print(f"Folder not found: {folder_path}")
            return

        analyzer = TenderAnalyzer()
        folder_results = []

        txt_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.txt')]

        with ThreadPoolExecutor() as executor:
            futures = {executor.submit(analyzer.analyze_tender, file): file for file in txt_files}
            for future in as_completed(futures):
                file_name = os.path.basename(futures[future])
                try:
                    analysis_result = future.result()
                    folder_results.append({"file_name": file_name, "analysis": analysis_result})
                except Exception as e:
                    print(f"Failed to process file {file_name}: {e}")

        # Format response
        json_response = {
            "tcno": tcno,
            "results": folder_results
        }

        print(f"Processed folder: {tcno}")
        return json_response

    except Exception as e:
        print(f"Failed to process folder {tcno}: {str(e)}")


def process_folders_in_parallel():
    """Process all subfolders in parallel"""
    try:
        base_folder_path = "/data/txtfolder/dailydoc_test"
        if not os.path.exists(base_folder_path):
            print(f"Base folder not found: {base_folder_path}")
            return

        tcno_folders = [tcno for tcno in os.listdir(base_folder_path) if os.path.isdir(os.path.join(base_folder_path, tcno))]

        results = []
        with ThreadPoolExecutor(max_workers=2) as executor:
            futures = {executor.submit(process_folder, tcno): tcno for tcno in tcno_folders}
            for future in as_completed(futures):
                tcno = futures[future]
                try:
                    folder_result = future.result()
                    if folder_result:
                        results.append(folder_result)
                except Exception as e:
                    print(f"Failed to process folder {tcno}: {e}")

        # Save results to a JSON file
        output_path = f"analysis_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=4)
        print(f"All results saved to {output_path}")

    except Exception as e:
        print(f"Error in processing folders: {e}")


if __name__ == '__main__':
    process_folders_in_parallel()


In [None]:
import os
import re
import warnings
import numpy as np
from typing import Dict, List, Any, Union
from sentence_transformers import SentenceTransformer
from langchain.schema import Document
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks import get_openai_callback
from concurrent.futures import ThreadPoolExecutor, as_completed

# Suppress warnings
warnings.filterwarnings("ignore")

# Environment setup
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ['TORCH_USE_CUDA_DSA'] = "1"

class TenderAnalyzer:
    """Main class for analyzing tender documents"""
    
    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)
        self.llm = ChatOpenAI(
            model_name="meta-llama/Llama-3.1-8B-Instruct",
            openai_api_base="http://localhost:8000/v1",
            openai_api_key="FAKE",
            max_tokens=500,
            temperature=0.1
        )
        self.chain = load_qa_chain(self.llm, chain_type='stuff')
        self.queries = {
            "What are the functional requirements, also known as the scope of work, mentioned in the document?": "Scope of Work",
            "Extract clauses that specify Pre-Qualification Criteria or eligibility criteria.": "Prequalification Criteria",
            "List all supporting documents required for this tender.": "Supporting Documents",
            "Extract a comprehensive list of all dates, times, and monetary values, along with their specific labels or descriptions as mentioned in the document.": "Important Dates",
            "Extract the contact details of the officer from this document, including their name, email ID, and contact number.": "Contact Details"
        }

    def process_document(self, file_path: str) -> List[str]:
        """Process document and split into chunks"""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()
            
            sentences = self._split_into_sentences(text)
            chunks = self._create_chunks(sentences)
            return self._chunk_by_tokens(chunks)
        except Exception as e:
            print(f"Error processing document {file_path}: {e}")
            return []

    
    def _split_into_sentences(self, text: str) -> List[Dict[str, Any]]:
        """Split text into sentences with metadata"""
        sentences = [{'sentence': s, 'index': i} 
                    for i, s in enumerate(re.split(r'(?<=[.?!])\s+', text))]
        return self._combine_sentences(sentences)

    def _combine_sentences(self, sentences: List[Dict[str, Any]], buffer_size: int = 1) -> List[Dict[str, Any]]:
        """Combine sentences with context"""
        combined = []
        for i, sent in enumerate(sentences):
            context = []
            # Add previous sentences
            for j in range(max(0, i - buffer_size), i):
                context.append(sentences[j]['sentence'])
            # Add current and next sentences
            context.append(sent['sentence'])
            for j in range(i + 1, min(len(sentences), i + buffer_size + 1)):
                context.append(sentences[j]['sentence'])
            sent['combined_sentence'] = ' '.join(context)
            combined.append(sent)
        return combined

    def _create_chunks(self, sentences: List[Dict[str, Any]]) -> List[str]:
        """Create document chunks based on semantic similarity"""
        # Create embeddings
        embeddings = self.model.encode([s['combined_sentence'] for s in sentences])
        
        # Calculate distances
        distances = []
        for i in range(len(embeddings) - 1):
            similarity = np.dot(embeddings[i], embeddings[i + 1]) / (
                np.linalg.norm(embeddings[i]) * np.linalg.norm(embeddings[i + 1]))
            distances.append(1 - similarity)
        
        # Split into chunks
        threshold = np.percentile(distances, 95)
        chunks = []
        start_idx = 0
        
        for i, distance in enumerate(distances):
            if distance > threshold:
                chunk = ' '.join([s['sentence'] for s in sentences[start_idx:i + 1]])
                chunks.append(chunk)
                start_idx = i + 1
        
        if start_idx < len(sentences):
            chunk = ' '.join([s['sentence'] for s in sentences[start_idx:]])
            chunks.append(chunk)
        
        return chunks

    def _chunk_by_tokens(self, texts: List[str], max_tokens: int = 3500) -> List[str]:
        """Split texts into smaller chunks based on token count"""
        max_chars = max_tokens * 2
        chunks = []
        for text in texts:
            text_chunks = [text[i:i + max_chars] 
                         for i in range(0, len(text), max_chars)]
            chunks.extend(text_chunks)
        return chunks

    def process_query(self, query: str, text: str) -> str:
        """Process a single query against the text"""
        try:
            with get_openai_callback() as cb:
                response = self.chain.run(
                    input_documents=[Document(page_content=text)],
                    question=query
                )
            return response.strip()
        except Exception as e:
            print(f"Error processing query: {e}")
            return f"Error: {str(e)}"

    # ... [Previous methods remain the same until analyze_tender] ...

    def analyze_tender(self, file_path: str) -> List[Dict[str, str]]:
        """Main analysis function with modified response format"""
        try:
            chunks = self.process_document(file_path)
            if not chunks:
                return [{"title": title, "response": "Error: Failed to process document"} 
                        for title in self.queries.values()]
            
            combined_text = " ".join(chunks)
            results = []
            
            with ThreadPoolExecutor(max_workers=len(self.queries)) as executor:
                future_to_query = {
                    executor.submit(self.process_query, query, combined_text): title
                    for query, title in self.queries.items()
                }
                
                for future in as_completed(future_to_query):
                    title = future_to_query[future]
                    try:
                        response = future.result()
                        results.append({
                            "title": title,
                            "response": response
                        })
                    except Exception as e:
                        results.append({
                            "title": title,
                            "response": f"Error: {str(e)}"
                        })
            
            return results
        except Exception as e:
            return [{"title": title, "response": f"Error: {str(e)}"} 
                    for title in self.queries.values()]

from concurrent.futures import ThreadPoolExecutor, as_completed

def process_folder(base_folder: str) -> Dict[str, Any]:
    """
    Process all text files in the given folder and its subfolders in parallel.
    
    Args:
        base_folder (str): Base folder path containing subfolders with text files
        
    Returns:
        Dict[str, Any]: Dictionary containing results for all processed files
    """
    analyzer = TenderAnalyzer()
    all_results = []
    
    def analyze_file(file_path: str) -> Dict[str, Any]:
        """Analyze a single file and return the result"""
        try:
            # Get tender number from folder name
            tcno = os.path.basename(os.path.dirname(file_path))
            
            # Analyze the tender document
            results = analyzer.analyze_tender(file_path)
            
            return {
                "tcno": tcno,
                "file_path": file_path,
                "results": results
            }
        except Exception as e:
            return {
                "tcno": "Unknown",
                "file_path": file_path,
                "results": [{"title": "Error", "response": f"Failed to process file: {str(e)}"}]
            }
    
    # Create ThreadPoolExecutor to process each file in parallel
    with ThreadPoolExecutor(max_workers=8) as executor:  # You can adjust max_workers based on your system's capacity
        future_to_file = {}
        
        # Walk through all subfolders and submit tasks for files ending with .txt
        for root, _, files in os.walk(base_folder):
            for file in files:
                if file.endswith('.txt'):
                    file_path = os.path.join(root, file)
                    future_to_file[executor.submit(analyze_file, file_path)] = file_path
        
        # Collect the results from all futures as they complete
        for future in as_completed(future_to_file):
            result = future.result()
            all_results.append(result)
    
    return {"results": all_results}


def main():
    """Main execution function"""
    # Base folder path
    date_str = "22-11-24"  # You can modify this as needed
    folder_path = f"/data/txtfolder/dailydoc_test"
    
    # Process all documents in the folder
    results = process_folder(folder_path)
    
    # Print results (optional)
    import json
    print(json.dumps(results, indent=4))
    
    return results

if __name__ == "__main__":
    main()

In [None]:
import os
import re
import warnings
import numpy as np
from typing import Dict, List, Any
from sentence_transformers import SentenceTransformer
from langchain.schema import Document
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks import get_openai_callback
from concurrent.futures import ThreadPoolExecutor, as_completed
import tiktoken

# Suppress warnings
warnings.filterwarnings("ignore")

# Environment setup
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ['TORCH_USE_CUDA_DSA'] = "1"

# Define the maximum token limit
MAX_TOKENS = 131072

# Function to calculate token count
def calculate_token_count(text: str, model: str = 'meta-llama/Llama-3.1-8B-Instruct') -> int:
    """Calculate the number of tokens in a text."""
    enc = tiktoken.get_encoding("cl100k_base")  # For Llama models
    return len(enc.encode(text))

def chunk_by_tokens_with_limit(texts: List[str], max_tokens: int = MAX_TOKENS) -> List[str]:
    """Split texts into smaller chunks based on token count."""
    chunks = []
    current_chunk = ""
    
    for text in texts:
        token_count = calculate_token_count(current_chunk + text)
        
        if token_count > max_tokens:
            # If adding this text would exceed the limit, start a new chunk
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = text  # Start new chunk with current text
        else:
            current_chunk += text  # Add text to the current chunk
    
    if current_chunk:  # Add remaining chunk if any
        chunks.append(current_chunk)
    
    return chunks

class TenderAnalyzer:
    """Main class for analyzing tender documents"""
    
    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)
        self.llm = ChatOpenAI(
            model_name="meta-llama/Llama-3.1-8B-Instruct",
            openai_api_base="http://localhost:8000/v1",
            openai_api_key="FAKE",
            max_tokens=500,
            temperature=0.1
        )
        self.chain = load_qa_chain(self.llm, chain_type='stuff')
        self.queries = {
            "What are the functional requirements, also known as the scope of work, mentioned in the document?": "Scope of Work",
            "Extract clauses that specify Pre-Qualification Criteria or eligibility criteria.": "Prequalification Criteria",
            "List all supporting documents required for this tender.": "Supporting Documents",
            "Extract a comprehensive list of all dates, times, and monetary values, along with their specific labels or descriptions as mentioned in the document.": "Important Dates",
            "Extract the contact details of the officer from this document, including their name, email ID, and contact number.": "Contact Details"
        }

    def process_document(self, file_path: str) -> List[str]:
        """Process document and split into chunks"""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()
            
            sentences = self._split_into_sentences(text)
            chunks = self._create_chunks(sentences)
            
            # Apply chunking based on token limits
            return chunk_by_tokens_with_limit(chunks, max_tokens=MAX_TOKENS)
    
        except Exception as e:
            print(f"Error processing document {file_path}: {e}")
            return []

    def _split_into_sentences(self, text: str) -> List[Dict[str, Any]]:
        """Split text into sentences with metadata"""
        sentences = [{'sentence': s, 'index': i} 
                    for i, s in enumerate(re.split(r'(?<=[.?!])\s+', text))]
        return self._combine_sentences(sentences)

    def _combine_sentences(self, sentences: List[Dict[str, Any]], buffer_size: int = 1) -> List[Dict[str, Any]]:
        """Combine sentences with context"""
        combined = []
        for i, sent in enumerate(sentences):
            context = []
            # Add previous sentences
            for j in range(max(0, i - buffer_size), i):
                context.append(sentences[j]['sentence'])
            # Add current and next sentences
            context.append(sent['sentence'])
            for j in range(i + 1, min(len(sentences), i + buffer_size + 1)):
                context.append(sentences[j]['sentence'])
            sent['combined_sentence'] = ' '.join(context)
            combined.append(sent)
        return combined

    def _create_chunks(self, sentences: List[Dict[str, Any]]) -> List[str]:
        """Create document chunks based on semantic similarity"""
        # Create embeddings
        embeddings = self.model.encode([s['combined_sentence'] for s in sentences])
        
        # Calculate distances
        distances = []
        for i in range(len(embeddings) - 1):
            similarity = np.dot(embeddings[i], embeddings[i + 1]) / (
                np.linalg.norm(embeddings[i]) * np.linalg.norm(embeddings[i + 1]))
            distances.append(1 - similarity)
        
        # Split into chunks
        threshold = np.percentile(distances, 95)
        chunks = []
        start_idx = 0
        
        for i, distance in enumerate(distances):
            if distance > threshold:
                chunk = ' '.join([s['sentence'] for s in sentences[start_idx:i + 1]])
                chunks.append(chunk)
                start_idx = i + 1
        
        if start_idx < len(sentences):
            chunk = ' '.join([s['sentence'] for s in sentences[start_idx:]])
            chunks.append(chunk)
        
        return chunks

    def process_query(self, query: str, text: str) -> str:
        """Process a single query against the text"""
        try:
            # Split text into smaller chunks to avoid token limit errors
            chunks = chunk_by_tokens_with_limit([text], max_tokens=MAX_TOKENS)
            
            full_response = ""
            
            # Process each chunk and combine the responses
            for chunk in chunks:
                with get_openai_callback() as cb:
                    response = self.chain.run(
                        input_documents=[Document(page_content=chunk)],
                        question=query
                    )
                full_response += response.strip() + " "
            
            return full_response.strip()  # Combine all responses from chunks
        
        except Exception as e:
            print(f"Error processing query: {e}")
            return f"Error: {str(e)}"

    def analyze_tender(self, file_path: str) -> List[Dict[str, str]]:
        """Main analysis function with modified response format"""
        try:
            chunks = self.process_document(file_path)
            if not chunks:
                return [{"title": title, "response": "Error: Failed to process document"} 
                        for title in self.queries.values()]
            
            combined_text = " ".join(chunks)
            results = []
            
            with ThreadPoolExecutor(max_workers=len(self.queries)) as executor:
                future_to_query = {
                    executor.submit(self.process_query, query, combined_text): title
                    for query, title in self.queries.items()
                }
                
                for future in as_completed(future_to_query):
                    title = future_to_query[future]
                    try:
                        response = future.result()
                        results.append({
                            "title": title,
                            "response": response
                        })
                    except Exception as e:
                        results.append({
                            "title": title,
                            "response": f"Error: {str(e)}"
                        })
            
            return results
        except Exception as e:
            return [{"title": title, "response": f"Error: {str(e)}"} 
                    for title in self.queries.values()]

def process_folder(base_folder: str) -> Dict[str, Any]:
    """
    Process all text files in the given folder and its subfolders
    
    Args:
        base_folder (str): Base folder path containing subfolders with text files
        
    Returns:
        Dict[str, Any]: Dictionary containing results for all processed files
    """
    analyzer = TenderAnalyzer()
    all_results = []
    
    # Walk through all subfolders
    for root, _, files in os.walk(base_folder):
        for file in files:
            if file.endswith('.txt'):
                file_path = os.path.join(root, file)
                try:
                    # Get tender number from folder name
                    tcno = os.path.basename(os.path.dirname(file_path))
                    
                    # Analyze the tender document
                    results = analyzer.analyze_tender(file_path)
                    
                    # Add file information to results
                    all_results.append({
                        "tcno": tcno,
                        "file_path": file_path,
                        "results": results
                    })
                    
                except Exception as e:
                    print(f"Error processing {file_path}: {e}")
                    all_results.append({
                        "tcno": tcno,
                        "file_path": file_path,
                        "results": [{"title": "Error", "response": f"Failed to process file: {str(e)}"}]
                    })
    
    return {"results": all_results}

def main():
    """Main execution function"""
    # Base folder path
    date_str = "22-11-24"  # You can modify this as needed
    folder_path = f"/data/txtfolder/dailydoc_test"
    
    # Process all documents in the folder
    results = process_folder(folder_path)
    
    # Print final results
    print(f"Processed {len(results['results'])} files.")
    for file_result in results['results']:
        print(f"Results for file {file_result['file_path']}:")
        for query_result in file_result["results"]:
            print(f"{query_result['title']} -> {query_result['response']}")

if __name__ == "__main__":
    main()


  from tqdm.autonotebook import tqdm, trange


Error processing query: Error code: 400 - {'object': 'error', 'message': "This model's maximum context length is 131072 tokens. However, you requested 146481 tokens (145981 in the messages, 500 in the completion). Please reduce the length of the messages or completion.", 'type': 'BadRequestError', 'param': None, 'code': 400}
Error processing query: Error code: 400 - {'object': 'error', 'message': "This model's maximum context length is 131072 tokens. However, you requested 146495 tokens (145995 in the messages, 500 in the completion). Please reduce the length of the messages or completion.", 'type': 'BadRequestError', 'param': None, 'code': 400}
Error processing query: Error code: 400 - {'object': 'error', 'message': "This model's maximum context length is 131072 tokens. However, you requested 146487 tokens (145987 in the messages, 500 in the completion). Please reduce the length of the messages or completion.", 'type': 'BadRequestError', 'param': None, 'code': 400}
Error processing qu

In [None]:
import os
import re
import numpy as np
from typing import Dict, List, Any
from sentence_transformers import SentenceTransformer
from langchain.schema import Document
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks import get_openai_callback
from concurrent.futures import ThreadPoolExecutor, as_completed
import tiktoken

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

# Define constants
MAX_TOKENS = 131072
MAX_COMPLETION_TOKENS = 500  # Maximum tokens allowed for model's completion

# Function to calculate token count
def calculate_token_count(text: str) -> int:
    """Calculate the number of tokens in a text."""
    enc = tiktoken.get_encoding("cl100k_base")  # For Llama models
    return len(enc.encode(text))

def chunk_by_tokens(text: str, max_tokens: int = MAX_TOKENS) -> List[str]:
    """Chunk a long text into smaller chunks based on token limits."""
    tokens = tiktoken.get_encoding("cl100k_base").encode(text)
    chunks = []
    
    while len(tokens) > max_tokens:
        # Find the cutoff point where token limit is reached
        cutoff = max_tokens
        while cutoff > 0 and tokens[cutoff] not in [b' ', b'.', b'?', b'!']:
            cutoff -= 1
        chunks.append(tiktoken.get_encoding("cl100k_base").decode(tokens[:cutoff]))
        tokens = tokens[cutoff:]
    
    # Add the final chunk (which is within limit)
    if tokens:
        chunks.append(tiktoken.get_encoding("cl100k_base").decode(tokens))
    
    return chunks

class TenderAnalyzer:
    """Main class for analyzing tender documents"""
    
    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)
        self.llm = ChatOpenAI(
            model_name="meta-llama/Llama-3.1-8B-Instruct",
            openai_api_base="http://localhost:8000/v1",
            openai_api_key="FAKE",
            max_tokens=MAX_COMPLETION_TOKENS,
            temperature=0.1
        )
        self.chain = load_qa_chain(self.llm, chain_type='stuff')
        self.queries = {
            "What are the functional requirements, also known as the scope of work, mentioned in the document?": "Scope of Work",
            "Extract clauses that specify Pre-Qualification Criteria or eligibility criteria.": "Prequalification Criteria",
            "List all supporting documents required for this tender.": "Supporting Documents",
            "Extract a comprehensive list of all dates, times, and monetary values, along with their specific labels or descriptions as mentioned in the document.": "Important Dates",
            "Extract the contact details of the officer from this document, including their name, email ID, and contact number.": "Contact Details"
        }

    def process_document(self, file_path: str) -> List[str]:
        """Process document and split into chunks"""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()
            
            # Chunk the document into smaller parts
            return chunk_by_tokens(text, max_tokens=MAX_TOKENS)
    
        except Exception as e:
            print(f"Error processing document {file_path}: {e}")
            return []

    def process_query(self, query: str, text: str) -> str:
        """Process a single query against the text"""
        try:
            # Split text into smaller chunks to avoid token limit errors
            chunks = chunk_by_tokens(text, max_tokens=MAX_TOKENS)
            
            full_response = ""
            
            # Process each chunk and combine the responses
            for chunk in chunks:
                with get_openai_callback() as cb:
                    response = self.chain.run(
                        input_documents=[Document(page_content=chunk)],
                        question=query
                    )
                full_response += response.strip() + " "
            
            return full_response.strip()  # Combine all responses from chunks
        
        except Exception as e:
            print(f"Error processing query: {e}")
            return f"Error: {str(e)}"

    def analyze_tender(self, file_path: str) -> List[Dict[str, str]]:
        """Main analysis function with modified response format"""
        try:
            # Process document and get chunks
            chunks = self.process_document(file_path)
            if not chunks:
                return [{"title": title, "response": "Error: Failed to process document"} 
                        for title in self.queries.values()]
            
            combined_text = " ".join(chunks)
            results = []
            
            with ThreadPoolExecutor(max_workers=len(self.queries)) as executor:
                future_to_query = {
                    executor.submit(self.process_query, query, combined_text): title
                    for query, title in self.queries.items()
                }
                
                for future in as_completed(future_to_query):
                    title = future_to_query[future]
                    try:
                        response = future.result()
                        results.append({
                            "title": title,
                            "response": response
                        })
                    except Exception as e:
                        results.append({
                            "title": title,
                            "response": f"Error: {str(e)}"
                        })
            
            return results
        except Exception as e:
            return [{"title": title, "response": f"Error: {str(e)}"} 
                    for title in self.queries.values()]

def process_folder(base_folder: str) -> Dict[str, Any]:
    """
    Process all text files in the given folder and its subfolders
    
    Args:
        base_folder (str): Base folder path containing subfolders with text files
        
    Returns:
        Dict[str, Any]: Dictionary containing results for all processed files
    """
    analyzer = TenderAnalyzer()
    all_results = []
    
    # Walk through all subfolders
    for root, _, files in os.walk(base_folder):
        for file in files:
            if file.endswith('.txt'):
                file_path = os.path.join(root, file)
                try:
                    # Get tender number from folder name
                    tcno = os.path.basename(os.path.dirname(file_path))
                    
                    # Analyze the tender document
                    results = analyzer.analyze_tender(file_path)
                    
                    # Add file information to results
                    all_results.append({
                        "tcno": tcno,
                        "file_path": file_path,
                        "results": results
                    })
                    
                except Exception as e:
                    print(f"Error processing {file_path}: {e}")
                    all_results.append({
                        "tcno": tcno,
                        "file_path": file_path,
                        "results": [{"title": "Error", "response": f"Failed to process file: {str(e)}"}]
                    })
    
    return {"results": all_results}

def main():
    """Main execution function"""
    # Base folder path
    date_str = "22-11-24"  # You can modify this as needed
    folder_path = f"/data/txtfolder/dailydoc_test"
    
    # Process all documents in the folder
    results = process_folder(folder_path)
    
    # Print final results
    print(f"Processed {len(results['results'])} files.")
    for file_result in results['results']:
        print(f"Results for file {file_result['file_path']}:")
        for query_result in file_result["results"]:
            print(f"{query_result['title']} -> {query_result['response']}")

if __name__ == "__main__":
    main()


In [None]:
import os
import re
import warnings
import numpy as np
from typing import Dict, List, Any, Union
from sentence_transformers import SentenceTransformer
from langchain.schema import Document
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks import get_openai_callback
from concurrent.futures import ThreadPoolExecutor, as_completed

warnings.filterwarnings("ignore")

class TenderAnalyzer:
    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)
        self.llm = ChatOpenAI(
            model_name="meta-llama/Llama-3.1-8B-Instruct",
            openai_api_base="http://localhost:8000/v1",
            openai_api_key="FAKE",
            max_tokens=500,
            temperature=0.1
        )
        self.chain = load_qa_chain(self.llm, chain_type='stuff')
        self.queries = {
            "What are the functional requirements, also known as the scope of work, mentioned in the document?": "Scope of Work",
            "Extract clauses that specify Pre-Qualification Criteria or eligibility criteria.": "Prequalification Criteria",
            "List all supporting documents required for this tender.": "Supporting Documents",
            "Extract a comprehensive list of all dates, times, and monetary values, along with their specific labels or descriptions as mentioned in the document.": "Important Dates",
            "Extract the contact details of the officer from this document, including their name, email ID, and contact number.": "Contact Details"
        }
        self.max_chunk_tokens = 100000  # Safe limit below model's maximum

    def _split_into_sentences(self, text: str) -> List[Dict[str, Any]]:
        """Split text into sentences with metadata"""
        sentences = [{'sentence': s, 'index': i} 
                    for i, s in enumerate(re.split(r'(?<=[.?!])\s+', text))]
        return self._combine_sentences(sentences)

    def _combine_sentences(self, sentences: List[Dict[str, Any]], buffer_size: int = 1) -> List[Dict[str, Any]]:
        """Combine sentences with context"""
        combined = []
        for i, sent in enumerate(sentences):
            context = []
            for j in range(max(0, i - buffer_size), i):
                context.append(sentences[j]['sentence'])
            context.append(sent['sentence'])
            for j in range(i + 1, min(len(sentences), i + buffer_size + 1)):
                context.append(sentences[j]['sentence'])
            sent['combined_sentence'] = ' '.join(context)
            combined.append(sent)
        return combined

    def _create_chunks(self, sentences: List[Dict[str, Any]]) -> List[str]:
        """Create document chunks based on semantic similarity"""
        embeddings = self.model.encode([s['combined_sentence'] for s in sentences])
        distances = []
        for i in range(len(embeddings) - 1):
            similarity = np.dot(embeddings[i], embeddings[i + 1]) / (
                np.linalg.norm(embeddings[i]) * np.linalg.norm(embeddings[i + 1]))
            distances.append(1 - similarity)
        
        threshold = np.percentile(distances, 95)
        chunks = []
        start_idx = 0
        
        for i, distance in enumerate(distances):
            if distance > threshold:
                chunk = ' '.join([s['sentence'] for s in sentences[start_idx:i + 1]])
                chunks.append(chunk)
                start_idx = i + 1
        
        if start_idx < len(sentences):
            chunk = ' '.join([s['sentence'] for s in sentences[start_idx:]])
            chunks.append(chunk)
        
        return chunks

    def _estimate_tokens(self, text: str) -> int:
        """Estimate number of tokens in text (rough approximation)"""
        return len(text.split()) * 1.3  # Rough estimate of tokens

    def _chunk_by_tokens(self, texts: List[str]) -> List[str]:
        """Split texts into smaller chunks based on token count"""
        chunks = []
        current_chunk = []
        current_tokens = 0
        
        for text in texts:
            estimated_tokens = self._estimate_tokens(text)
            
            if current_tokens + estimated_tokens > self.max_chunk_tokens:
                if current_chunk:
                    chunks.append(" ".join(current_chunk))
                current_chunk = [text]
                current_tokens = estimated_tokens
            else:
                current_chunk.append(text)
                current_tokens += estimated_tokens
        
        if current_chunk:
            chunks.append(" ".join(current_chunk))
        
        return chunks

    def process_document(self, file_path: str) -> List[str]:
        """Process document and split into chunks"""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()
            
            sentences = self._split_into_sentences(text)
            semantic_chunks = self._create_chunks(sentences)
            return self._chunk_by_tokens(semantic_chunks)
        except Exception as e:
            print(f"Error processing document {file_path}: {e}")
            return []

    def _merge_responses(self, responses: List[str]) -> str:
        """Merge multiple responses into a coherent summary"""
        # Remove duplicates while maintaining order
        unique_lines = []
        seen = set()
        for response in responses:
            for line in response.split('\n'):
                line = line.strip()
                if line and line not in seen:
                    seen.add(line)
                    unique_lines.append(line)
        
        return '\n'.join(unique_lines)

    def process_query(self, query: str, chunks: List[str]) -> str:
        """Process a single query against multiple text chunks"""
        try:
            responses = []
            for chunk in chunks:
                with get_openai_callback() as cb:
                    response = self.chain.run(
                        input_documents=[Document(page_content=chunk)],
                        question=query
                    )
                    responses.append(response.strip())
            
            return self._merge_responses(responses)
        except Exception as e:
            print(f"Error processing query: {e}")
            return f"Error: {str(e)}"

    def analyze_tender(self, file_path: str) -> List[Dict[str, str]]:
        """Main analysis function"""
        try:
            chunks = self.process_document(file_path)
            if not chunks:
                return [{"title": title, "response": "Error: Failed to process document"} 
                        for title in self.queries.values()]
            
            results = []
            with ThreadPoolExecutor(max_workers=len(self.queries)) as executor:
                future_to_query = {
                    executor.submit(self.process_query, query, chunks): title
                    for query, title in self.queries.items()
                }
                
                for future in as_completed(future_to_query):
                    title = future_to_query[future]
                    try:
                        response = future.result()
                        results.append({
                            "title": title,
                            "response": response
                        })
                    except Exception as e:
                        results.append({
                            "title": title,
                            "response": f"Error: {str(e)}"
                        })
            
            return results
        except Exception as e:
            return [{"title": title, "response": f"Error: {str(e)}"} 
                    for title in self.queries.values()]

def process_folder(base_folder: str) -> Dict[str, Any]:
    """Process all text files in the given folder and its subfolders"""
    analyzer = TenderAnalyzer()
    all_results = []
    
    with ThreadPoolExecutor(max_workers=8) as executor:
        future_to_file = {}
        
        for root, _, files in os.walk(base_folder):
            for file in files:
                if file.endswith('.txt'):
                    file_path = os.path.join(root, file)
                    future_to_file[executor.submit(analyzer.analyze_tender, file_path)] = file_path
        
        for future in as_completed(future_to_file):
            file_path = future_to_file[future]
            try:
                results = future.result()
                tcno = os.path.basename(os.path.dirname(file_path))
                all_results.append({
                    "tcno": tcno,
                    "file_path": file_path,
                    "results": results
                })
            except Exception as e:
                all_results.append({
                    "tcno": "Unknown",
                    "file_path": file_path,
                    "results": [{"title": "Error", "response": f"Failed to process file: {str(e)}"}]
                })
    
    return {"results": all_results}

def main():
    """Main execution function"""
    # Base folder path
    date_str = "22-11-24"  # You can modify this as needed
    folder_path = f"/data/txtfolder/dailydoc_test"
    
    # Process all documents in the folder
    results = process_folder(folder_path)
    
    # Print results (optional)
    import json
    print(json.dumps(results, indent=4))
    
    return results

if __name__ == "__main__":
    main()

## working with storepq in opensearch

In [None]:
import os
import re
import warnings
import numpy as np
from typing import Dict, List, Any, Union
from sentence_transformers import SentenceTransformer
from langchain.schema import Document
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks import get_openai_callback
from concurrent.futures import ThreadPoolExecutor, as_completed
import os 

# # Set environment variables
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ['TORCH_USE_CUDA_DSA'] = "1"
warnings.filterwarnings("ignore")

warnings.filterwarnings("ignore")

class TenderAnalyzer:
    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)
        self.llm = ChatOpenAI(
            model_name="meta-llama/Llama-3.1-8B-Instruct",
            openai_api_base="http://localhost:8000/v1",
            openai_api_key="FAKE",
            max_tokens=512,
            temperature=0.1
        )
        self.chain = load_qa_chain(self.llm, chain_type='stuff')
        self.queries = {
            "What are the functional requirements, also known as the scope of work, mentioned in the document?": "Scope of Work",
            "Extract clauses that specify Pre-Qualification Criteria or eligibility criteria.": "Prequalification Criteria",
            "List all supporting documents required for this tender.": "Supporting Documents",
            # "Extract a comprehensive list of all dates, times, and monetary values, along with their specific labels or descriptions as mentioned in the document.": "Important Dates",
            "List of all the dates mentioned in the tender document which should include Bid submission end date or due date of tender, Bid validity, Opening date, closing date, pre bid meeting date, EMD amount,tender fee, tender value":"Importants Date",
            "Extract the contact details of the officer from this document, including their name, email ID, and contact number.": "Contact Details"
        }
        self.max_chunk_tokens = 100000  # Safe limit below model's maximum

    def _split_into_sentences(self, text: str) -> List[Dict[str, Any]]:
        """Split text into sentences with metadata"""
        sentences = [{'sentence': s, 'index': i} 
                    for i, s in enumerate(re.split(r'(?<=[.?!])\s+', text))]
        return self._combine_sentences(sentences)

    def _combine_sentences(self, sentences: List[Dict[str, Any]], buffer_size: int = 1) -> List[Dict[str, Any]]:
        """Combine sentences with context"""
        combined = []
        for i, sent in enumerate(sentences):
            context = []
            for j in range(max(0, i - buffer_size), i):
                context.append(sentences[j]['sentence'])
            context.append(sent['sentence'])
            for j in range(i + 1, min(len(sentences), i + buffer_size + 1)):
                context.append(sentences[j]['sentence'])
            sent['combined_sentence'] = ' '.join(context)
            combined.append(sent)
        return combined

    def _create_chunks(self, sentences: List[Dict[str, Any]]) -> List[str]:
        """Create document chunks based on semantic similarity"""
        embeddings = self.model.encode([s['combined_sentence'] for s in sentences])
        distances = []
        for i in range(len(embeddings) - 1):
            similarity = np.dot(embeddings[i], embeddings[i + 1]) / (
                np.linalg.norm(embeddings[i]) * np.linalg.norm(embeddings[i + 1]))
            distances.append(1 - similarity)
        
        threshold = np.percentile(distances, 95)
        chunks = []
        start_idx = 0
        
        for i, distance in enumerate(distances):
            if distance > threshold:
                chunk = ' '.join([s['sentence'] for s in sentences[start_idx:i + 1]])
                chunks.append(chunk)
                start_idx = i + 1
        
        if start_idx < len(sentences):
            chunk = ' '.join([s['sentence'] for s in sentences[start_idx:]])
            chunks.append(chunk)
        
        return chunks

    def _estimate_tokens(self, text: str) -> int:
        """Estimate number of tokens in text (rough approximation)"""
        return len(text.split()) * 1.3  # Rough estimate of tokens

    def _chunk_by_tokens(self, texts: List[str]) -> List[str]:
        """Split texts into smaller chunks based on token count"""
        chunks = []
        current_chunk = []
        current_tokens = 0
        
        for text in texts:
            estimated_tokens = self._estimate_tokens(text)
            
            if current_tokens + estimated_tokens > self.max_chunk_tokens:
                if current_chunk:
                    chunks.append(" ".join(current_chunk))
                current_chunk = [text]
                current_tokens = estimated_tokens
            else:
                current_chunk.append(text)
                current_tokens += estimated_tokens
        
        if current_chunk:
            chunks.append(" ".join(current_chunk))
        
        return chunks

    def process_document(self, file_path: str) -> List[str]:
        """Process document and split into chunks"""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()
            
            sentences = self._split_into_sentences(text)
            semantic_chunks = self._create_chunks(sentences)
            return self._chunk_by_tokens(semantic_chunks)
        except Exception as e:
            print(f"Error processing document {file_path}: {e}")
            return []

    # def _merge_responses(self, responses: List[str]) -> str:
    #     """Merge multiple responses into a coherent summary"""
    #     # Remove duplicates while maintaining order
    #     unique_lines = []
    #     seen = set()
    #     for response in responses:
    #         for line in response.split('\n'):
    #             line = line.strip()
    #             if line and line not in seen:
    #                 seen.add(line)
    #                 unique_lines.append(line)
        
    #     return '\n'.join(unique_lines)
    def _merge_responses(self, responses: List[str]) -> str:
        """Merge multiple responses into a coherent summary and remove duplicates."""
        unique_lines = []
        seen = set()

        for response in responses:
            for line in response.split('\n'):
                line = line.strip()
                if line and line not in seen:
                    seen.add(line)
                    unique_lines.append(line)
        
        # Additional deduplication for repeated sections within a single line
        cleaned_lines = []
        for line in unique_lines:
            parts = line.split()  # Tokenize the line
            deduped_line = " ".join(dict.fromkeys(parts))  # Remove repeated words
            cleaned_lines.append(deduped_line)
        
        return '\n'.join(cleaned_lines)


    def process_query(self, query: str, chunks: List[str]) -> str:
        """Process a single query against multiple text chunks"""
        try:
            responses = []
            for chunk in chunks:
                with get_openai_callback() as cb:
                    response = self.chain.run(
                        input_documents=[Document(page_content=chunk)],
                        question=query
                    )
                    responses.append(response.strip())
            
            return self._merge_responses(responses)
        except Exception as e:
            print(f"Error processing query: {e}")
            return f"Error: {str(e)}"

    def analyze_tender(self, file_path: str) -> List[Dict[str, str]]:
        """Main analysis function"""
        try:
            chunks = self.process_document(file_path)
            if not chunks:
                return [{"title": title, "response": "Error: Failed to process document"} 
                        for title in self.queries.values()]
            
            results = []
            with ThreadPoolExecutor(max_workers=len(self.queries)) as executor:
                future_to_query = {
                    executor.submit(self.process_query, query, chunks): title
                    for query, title in self.queries.items()
                }
                
                for future in as_completed(future_to_query):
                    title = future_to_query[future]
                    try:
                        response = future.result()
                        results.append({
                            "title": title,
                            "response": response
                        })
                    except Exception as e:
                        results.append({
                            "title": title,
                            "response": f"Error: {str(e)}"
                        })
            
            return results
        except Exception as e:
            return [{"title": title, "response": f"Error: {str(e)}"} 
                    for title in self.queries.values()]

from opensearchpy import OpenSearch

def process_folder(base_folder: str) -> Dict[str, Any]:
    """Process all text files in the given folder and its subfolders, and index results into OpenSearch."""
    analyzer = TenderAnalyzer()
    all_results = []

    # Set up OpenSearch client
    index_name = 'tprocanswers'
    opensearch_client = OpenSearch(
        hosts=['https://localhost:9200'],
        http_auth=("admin", "4Z*lwtz,,2T:0TGu"),
        use_ssl=True,
        verify_certs=False,
        ssl_show_warn=False
    )

    with ThreadPoolExecutor(max_workers=8) as executor:
        future_to_file = {}
        
        for root, _, files in os.walk(base_folder):
            for file in files:
                if file.endswith('.txt'):
                    file_path = os.path.join(root, file)
                    future_to_file[executor.submit(analyzer.analyze_tender, file_path)] = file_path
        
        for future in as_completed(future_to_file):
            file_path = future_to_file[future]
            try:
                results = future.result()
                tcno = os.path.basename(os.path.dirname(file_path))
                all_results.append({
                    "tcno": tcno,
                    # "file_path": file_path,
                    "results": results
                })
                
                # Index results into OpenSearch
                opensearch_client.index(index=index_name, id=tcno, body={"file_path": file_path, "results": results})
                print(f"Indexed results for {tcno} in OpenSearch.")
            
            except Exception as e:
                all_results.append({
                    "tcno": "Unknown",
                    # "file_path": file_path,
                    "results": [{"title": "Error", "response": f"Failed to process file: {str(e)}"}]
                })
    
    return {"results": all_results}

def main():
    """Main execution function"""
    # Base folder path
    date_str = "22-11-24"  # You can modify this as needed
    folder_path = f"/data/txtfolder/dailydocument_23-11-24_txt"
    
    # Process all documents in the folder
    results = process_folder(folder_path)
    
    # Print results (optional)
    import json
    print(json.dumps(results, indent=4))
    
    return results

if __name__ == "__main__":
    main()

  from tqdm.autonotebook import tqdm, trange


Indexed results for 78356751 in OpenSearch.
Indexed results for 78341034 in OpenSearch.
Indexed results for 78363562 in OpenSearch.
Indexed results for 78353126 in OpenSearch.
Indexed results for 78331655 in OpenSearch.
Indexed results for 78353138 in OpenSearch.
Indexed results for 78349785 in OpenSearch.
Indexed results for 77909955 in OpenSearch.
Indexed results for 78357275 in OpenSearch.
Indexed results for 78348519 in OpenSearch.
Indexed results for 78347571 in OpenSearch.
Indexed results for 78362677 in OpenSearch.
Indexed results for 78366106 in OpenSearch.
Indexed results for 77968697 in OpenSearch.
Indexed results for 78339728 in OpenSearch.
Indexed results for 78346492 in OpenSearch.
Indexed results for 78359456 in OpenSearch.
Indexed results for 78359556 in OpenSearch.
Indexed results for 78350030 in OpenSearch.
Indexed results for 77495147 in OpenSearch.
Indexed results for 78145210 in OpenSearch.
Indexed results for 78326902 in OpenSearch.
Indexed results for 78342527 in 

## working with our hosted model 

In [None]:
import os
import re
import warnings
import numpy as np
import requests
from typing import Dict, List, Any, Union
from langchain.schema import Document
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks import get_openai_callback
from concurrent.futures import ThreadPoolExecutor, as_completed

# Set environment variables
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ['TORCH_USE_CUDA_DSA'] = "1"
warnings.filterwarnings("ignore")

def get_embeddings_via_api(sentence):
    """Get embeddings from API (using all-mpnet-base-v2 model)"""
    response = requests.post(
        "http://0.0.0.0:5002/embeddings",
        json={"model": "sentence-transformers/all-MiniLM-L6-v2", "input": [sentence]}
    )
    return response.json()["data"][0]["embedding"]

class TenderAnalyzer:
    def __init__(self):
        self.llm = ChatOpenAI(
            model_name="meta-llama/Llama-3.1-8B-Instruct",
            openai_api_base="http://localhost:8000/v1",
            openai_api_key="FAKE",
            max_tokens=512,
            temperature=0.1
        )
        self.chain = load_qa_chain(self.llm, chain_type='stuff')
        self.queries = {
            "What are the functional requirements, also known as the scope of work, mentioned in the document?": "Scope of Work",
            "Extract clauses that specify Pre-Qualification Criteria or eligibility criteria.": "Prequalification Criteria",
            "List all supporting documents required for this tender.": "Supporting Documents",
            "List of all the dates mentioned in the tender document which should include Bid submission end date or due date of tender, Bid validity, Opening date, closing date, pre bid meeting date, EMD amount,tender fee, tender value": "Important Dates",
            "Extract the contact details of the officer from this document, including their name, email ID, and contact number.": "Contact Details"
        }
        self.max_chunk_tokens = 100000  # Safe limit below model's maximum

    def _split_into_sentences(self, text: str) -> List[Dict[str, Any]]:
        """Split text into sentences with metadata"""
        sentences = [{'sentence': s, 'index': i} 
                    for i, s in enumerate(re.split(r'(?<=[.?!])\s+', text))]
        return self._combine_sentences(sentences)

    def _combine_sentences(self, sentences: List[Dict[str, Any]], buffer_size: int = 1) -> List[Dict[str, Any]]:
        """Combine sentences with context"""
        combined = []
        for i, sent in enumerate(sentences):
            context = []
            for j in range(max(0, i - buffer_size), i):
                context.append(sentences[j]['sentence'])
            context.append(sent['sentence'])
            for j in range(i + 1, min(len(sentences), i + buffer_size + 1)):
                context.append(sentences[j]['sentence'])
            sent['combined_sentence'] = ' '.join(context)
            combined.append(sent)
        return combined

    def _create_chunks(self, sentences: List[Dict[str, Any]]) -> List[str]:
        """Create document chunks based on semantic similarity using API for embeddings"""
        embeddings = [get_embeddings_via_api(s['combined_sentence']) for s in sentences]
        distances = []
        for i in range(len(embeddings) - 1):
            similarity = np.dot(embeddings[i], embeddings[i + 1]) / (
                np.linalg.norm(embeddings[i]) * np.linalg.norm(embeddings[i + 1]))
            distances.append(1 - similarity)
        
        threshold = np.percentile(distances, 95)
        chunks = []
        start_idx = 0
        
        for i, distance in enumerate(distances):
            if distance > threshold:
                chunk = ' '.join([s['sentence'] for s in sentences[start_idx:i + 1]])
                chunks.append(chunk)
                start_idx = i + 1
        
        if start_idx < len(sentences):
            chunk = ' '.join([s['sentence'] for s in sentences[start_idx:]])
            chunks.append(chunk)
        
        return chunks

    def _estimate_tokens(self, text: str) -> int:
        """Estimate number of tokens in text (rough approximation)"""
        return len(text.split()) * 1.3  # Rough estimate of tokens

    def _chunk_by_tokens(self, texts: List[str]) -> List[str]:
        """Split texts into smaller chunks based on token count"""
        chunks = []
        current_chunk = []
        current_tokens = 0
        
        for text in texts:
            estimated_tokens = self._estimate_tokens(text)
            
            if current_tokens + estimated_tokens > self.max_chunk_tokens:
                if current_chunk:
                    chunks.append(" ".join(current_chunk))
                current_chunk = [text]
                current_tokens = estimated_tokens
            else:
                current_chunk.append(text)
                current_tokens += estimated_tokens
        
        if current_chunk:
            chunks.append(" ".join(current_chunk))
        
        return chunks

    def process_document(self, file_path: str) -> List[str]:
        """Process document and split into chunks"""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()
            
            sentences = self._split_into_sentences(text)
            semantic_chunks = self._create_chunks(sentences)
            return self._chunk_by_tokens(semantic_chunks)
        except Exception as e:
            print(f"Error processing document {file_path}: {e}")
            return []

    def _merge_responses(self, responses: List[str]) -> str:
        """Merge multiple responses into a coherent summary and remove duplicates."""
        unique_lines = []
        seen = set()

        for response in responses:
            for line in response.split('\n'):
                line = line.strip()
                if line and line not in seen:
                    seen.add(line)
                    unique_lines.append(line)
        
        cleaned_lines = []
        for line in unique_lines:
            parts = line.split()  # Tokenize the line
            deduped_line = " ".join(dict.fromkeys(parts))  # Remove repeated words
            cleaned_lines.append(deduped_line)
        
        return '\n'.join(cleaned_lines)

    def process_query(self, query: str, chunks: List[str]) -> str:
        """Process a single query against multiple text chunks"""
        try:
            responses = []
            for chunk in chunks:
                with get_openai_callback() as cb:
                    response = self.chain.run(
                        input_documents=[Document(page_content=chunk)],
                        question=query
                    )
                    responses.append(response.strip())
            
            return self._merge_responses(responses)
        except Exception as e:
            print(f"Error processing query: {e}")
            return f"Error: {str(e)}"

    def analyze_tender(self, file_path: str) -> List[Dict[str, str]]:
        """Main analysis function"""
        try:
            chunks = self.process_document(file_path)
            if not chunks:
                return [{"title": title, "response": "Error: Failed to process document"} 
                        for title in self.queries.values()]
            
            results = []
            with ThreadPoolExecutor(max_workers=len(self.queries)) as executor:
                future_to_query = {
                    executor.submit(self.process_query, query, chunks): title
                    for query, title in self.queries.items()
                }
                
                for future in as_completed(future_to_query):
                    title = future_to_query[future]
                    try:
                        response = future.result()
                        results.append({
                            "title": title,
                            "response": response
                        })
                    except Exception as e:
                        results.append({
                            "title": title,
                            "response": f"Error: {str(e)}"
                        })
            
            return results
        except Exception as e:
            return [{"title": title, "response": f"Error: {str(e)}"} 
                    for title in self.queries.values()]
        

from opensearchpy import OpenSearch

def process_folder(base_folder: str) -> Dict[str, Any]:
    """Process all text files in the given folder and its subfolders, and index results into OpenSearch."""
    analyzer = TenderAnalyzer()
    all_results = []

    # Set up OpenSearch client
    index_name = 'tprocanswers'
    opensearch_client = OpenSearch(
        hosts=['https://localhost:9200'],
        http_auth=("admin", "4Z*lwtz,,2T:0TGu"),
        use_ssl=True,
        verify_certs=False,
        ssl_show_warn=False
    )

    with ThreadPoolExecutor(max_workers=8) as executor:
        future_to_file = {}
        
        for root, _, files in os.walk(base_folder):
            for file in files:
                if file.endswith('.txt'):
                    file_path = os.path.join(root, file)
                    future_to_file[executor.submit(analyzer.analyze_tender, file_path)] = file_path
        
        for future in as_completed(future_to_file):
            file_path = future_to_file[future]
            try:
                results = future.result()
                tcno = os.path.basename(os.path.dirname(file_path))
                all_results.append({
                    "tcno": tcno,
                    # "file_path": file_path,
                    "results": results
                })
                
                # Index results into OpenSearch
                opensearch_client.index(index=index_name, id=tcno, body={"file_path": file_path, "results": results})
                print(f"Indexed results for {tcno} in OpenSearch.")
            
            except Exception as e:
                all_results.append({
                    "tcno": "Unknown",
                    # "file_path": file_path,
                    "results": [{"title": "Error", "response": f"Failed to process file: {str(e)}"}]
                })
    
    return {"results": all_results}

def main():
    """Main execution function"""
    # Base folder path
    date_str = "22-11-24"  # You can modify this as needed
    folder_path = f"/data/txtfolder/dailydocument_23-11-24_txt"
    
    # Process all documents in the folder
    results = process_folder(folder_path)
    
    # Print results (optional)
    import json
    print(json.dumps(results, indent=4))
    
    return results

if __name__ == "__main__":
    main()       


Indexed results for 78356751 in OpenSearch.
Indexed results for 78363562 in OpenSearch.
Indexed results for 78341034 in OpenSearch.
Indexed results for 78353138 in OpenSearch.
Indexed results for 78353126 in OpenSearch.
Indexed results for 78366106 in OpenSearch.
Indexed results for 78331655 in OpenSearch.
Indexed results for 78348519 in OpenSearch.


## parrlel processing in embedding generation

In [None]:
import os
import re
import warnings
import numpy as np
import requests
from typing import Dict, List, Any, Union
from langchain.schema import Document
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks import get_openai_callback
from concurrent.futures import ThreadPoolExecutor, as_completed

# Set environment variables
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ['TORCH_USE_CUDA_DSA'] = "1"
warnings.filterwarnings("ignore")

def get_embeddings_via_api(sentences: List[str]) -> List[List[float]]:
    """Get embeddings from API concurrently for a list of sentences"""
    with ThreadPoolExecutor(max_workers=40) as executor:
        results = list(executor.map(lambda sentence: requests.post(
            "http://0.0.0.0:5002/embeddings",
            json={"model": "sentence-transformers/all-mpnet-base-v2", "input": [sentence]}
        ).json()["data"][0]["embedding"], sentences))
    
    return results

class TenderAnalyzer:
    def __init__(self):
        self.llm = ChatOpenAI(
            model_name="meta-llama/Llama-3.1-8B-Instruct",
            openai_api_base="http://localhost:8000/v1",
            openai_api_key="FAKE",
            max_tokens=1024,
            temperature=0.1
        )
        self.chain = load_qa_chain(self.llm, chain_type='stuff')
        self.queries = {
            "What are the functional requirements, also known as the scope of work, mentioned in the document?": "Scope of Work",
            "Extract clauses that specify Pre-Qualification Criteria or eligibility criteria.": "Prequalification Criteria",
            "List all supporting documents required for this tender.": "Supporting Documents",
            "List of all the dates mentioned in the tender document which should include Bid submission end date or due date of tender, Bid validity, Opening date, closing date, pre bid meeting date, EMD amount,tender fee, tender value": "Important Dates",
            "Extract the contact details of the officer from this document, including their name, email ID, and contact number.": "Contact Details"
        }
        self.max_chunk_tokens = 100000  # Safe limit below model's maximum

    def _split_into_sentences(self, text: str) -> List[Dict[str, Any]]:
        """Split text into sentences with metadata"""
        sentences = [{'sentence': s, 'index': i} 
                    for i, s in enumerate(re.split(r'(?<=[.?!])\s+', text))]
        return self._combine_sentences(sentences)

    def _combine_sentences(self, sentences: List[Dict[str, Any]], buffer_size: int = 1) -> List[Dict[str, Any]]:
        """Combine sentences with context"""
        combined = []
        for i, sent in enumerate(sentences):
            context = []
            for j in range(max(0, i - buffer_size), i):
                context.append(sentences[j]['sentence'])
            context.append(sent['sentence'])
            for j in range(i + 1, min(len(sentences), i + buffer_size + 1)):
                context.append(sentences[j]['sentence'])
            sent['combined_sentence'] = ' '.join(context)
            combined.append(sent)
        return combined

    def _create_chunks(self, sentences: List[Dict[str, Any]]) -> List[str]:
        """Create document chunks based on semantic similarity using API for embeddings"""
        sentences_text = [s['combined_sentence'] for s in sentences]
        embeddings = get_embeddings_via_api(sentences_text)
        
        distances = []
        for i in range(len(embeddings) - 1):
            similarity = np.dot(embeddings[i], embeddings[i + 1]) / (
                np.linalg.norm(embeddings[i]) * np.linalg.norm(embeddings[i + 1]))
            distances.append(1 - similarity)
        
        threshold = np.percentile(distances, 95)
        chunks = []
        start_idx = 0
        
        for i, distance in enumerate(distances):
            if distance > threshold:
                chunk = ' '.join([s['sentence'] for s in sentences[start_idx:i + 1]])
                chunks.append(chunk)
                start_idx = i + 1
        
        if start_idx < len(sentences):
            chunk = ' '.join([s['sentence'] for s in sentences[start_idx:]])
            chunks.append(chunk)
        
        return chunks

    def _estimate_tokens(self, text: str) -> int:
        """Estimate number of tokens in text (rough approximation)"""
        return len(text.split()) * 1.3  # Rough estimate of tokens

    def _chunk_by_tokens(self, texts: List[str]) -> List[str]:
        """Split texts into smaller chunks based on token count"""
        chunks = []
        current_chunk = []
        current_tokens = 0
        
        for text in texts:
            estimated_tokens = self._estimate_tokens(text)
            
            if current_tokens + estimated_tokens > self.max_chunk_tokens:
                if current_chunk:
                    chunks.append(" ".join(current_chunk))
                current_chunk = [text]
                current_tokens = estimated_tokens
            else:
                current_chunk.append(text)
                current_tokens += estimated_tokens
        
        if current_chunk:
            chunks.append(" ".join(current_chunk))
        
        return chunks

    def process_document(self, file_path: str) -> List[str]:
        """Process document and split into chunks"""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()
            
            sentences = self._split_into_sentences(text)
            semantic_chunks = self._create_chunks(sentences)
            return self._chunk_by_tokens(semantic_chunks)
        except Exception as e:
            print(f"Error processing document {file_path}: {e}")
            return []

    def _merge_responses(self, responses: List[str]) -> str:
        """Merge multiple responses into a coherent summary and remove duplicates."""
        unique_lines = []
        seen = set()

        for response in responses:
            for line in response.split('\n'):
                line = line.strip()
                if line and line not in seen:
                    seen.add(line)
                    unique_lines.append(line)
        
        cleaned_lines = []
        for line in unique_lines:
            parts = line.split()  # Tokenize the line
            deduped_line = " ".join(dict.fromkeys(parts))  # Remove repeated words
            cleaned_lines.append(deduped_line)
        
        return '\n'.join(cleaned_lines)

    def process_query(self, query: str, chunks: List[str]) -> str:
        """Process a single query against multiple text chunks"""
        try:
            responses = []
            for chunk in chunks:
                with get_openai_callback() as cb:
                    response = self.chain.run(
                        input_documents=[Document(page_content=chunk)],
                        question=query
                    )
                    responses.append(response.strip())
            
            return self._merge_responses(responses)
        except Exception as e:
            print(f"Error processing query: {e}")
            return f"Error: {str(e)}"

    def analyze_tender(self, file_path: str) -> List[Dict[str, str]]:
        """Main analysis function"""
        try:
            chunks = self.process_document(file_path)
            if not chunks:
                return [{"title": title, "response": "Error: Failed to process document"} 
                        for title in self.queries.values()]
            
            results = []
            with ThreadPoolExecutor(max_workers=len(self.queries)) as executor:
                future_to_query = {
                    executor.submit(self.process_query, query, chunks): title
                    for query, title in self.queries.items()
                }
                
                for future in as_completed(future_to_query):
                    title = future_to_query[future]
                    try:
                        response = future.result()
                        results.append({
                            "title": title,
                            "response": response
                        })
                    except Exception as e:
                        results.append({
                            "title": title,
                            "response": f"Error: {str(e)}"
                        })
            
            return results
        except Exception as e:
            return [{"title": title, "response": f"Error: {str(e)}"} 
                    for title in self.queries.values()]
        

from opensearchpy import OpenSearch

def process_folder(base_folder: str) -> Dict[str, Any]:
    """Process all text files in the given folder and its subfolders, and index results into OpenSearch."""
    analyzer = TenderAnalyzer()
    all_results = []

    # Set up OpenSearch client
    index_name = 'tprocanswers'
    opensearch_client = OpenSearch(
        hosts=['https://localhost:9200'],
        http_auth=("admin", "4Z*lwtz,,2T:0TGu"),
        use_ssl=True,
        verify_certs=False,
        ssl_show_warn=False
    )

    with ThreadPoolExecutor(max_workers=8) as executor:
        future_to_file = {}
        
        for root, _, files in os.walk(base_folder):
            for file in files:
                if file.endswith('.txt'):
                    file_path = os.path.join(root, file)
                    future_to_file[executor.submit(analyzer.analyze_tender, file_path)] = file_path
        
        for future in as_completed(future_to_file):
            file_path = future_to_file[future]
            try:
                results = future.result()
                tcno = os.path.basename(os.path.dirname(file_path))
                all_results.append({
                    "tcno": tcno,
                    # "file_path": file_path,
                    "results": results
                })
                
                # Index results into OpenSearch
                opensearch_client.index(index=index_name, id=tcno, body={"file_path": file_path, "results": results})
                print(f"Indexed results for {tcno} in OpenSearch.")
            
            except Exception as e:
                all_results.append({
                    "tcno": "Unknown",
                    # "file_path": file_path,
                    "results": [{"title": "Error", "response": f"Failed to process file: {str(e)}"}]
                })
    
    return {"results": all_results}

def main():
    """Main execution function"""
    # Base folder path
    date_str = "22-11-24"  # You can modify this as needed
    folder_path = f"/data/txtfolder/dailydocument_23-11-24_txt"
    
    # Process all documents in the folder
    results = process_folder(folder_path)
    
    # Print results (optional)
    import json
    print(json.dumps(results, indent=4))
    
    return results

if __name__ == "__main__":
    main()       


Indexed results for 78356751 in OpenSearch.
Indexed results for 78363562 in OpenSearch.
Indexed results for 78341034 in OpenSearch.


## remove the duplicate 

In [3]:
import os
import re
import warnings
import numpy as np
import requests
import logging
import json
from typing import Dict, List, Any, Union
from langchain.schema import Document
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks import get_openai_callback
from concurrent.futures import ThreadPoolExecutor, as_completed
from opensearchpy import OpenSearch

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s: %(message)s',
    handlers=[
        logging.FileHandler('tender_analyzer.log'),
        logging.StreamHandler()
    ]
)

# Set environment variables
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ['TORCH_USE_CUDA_DSA'] = "1"
warnings.filterwarnings("ignore")

def get_embeddings_via_api(sentence):
    """Get embeddings from API (using all-mpnet-base-v2 model)"""
    try:
        response = requests.post(
            "http://0.0.0.0:5002/embeddings",
            json={"model": "sentence-transformers/all-MiniLM-L6-v2", "input": [sentence]}
        )
        return response.json()["data"][0]["embedding"]
    except Exception as e:
        logging.error(f"Embedding API error: {e}")
        return None

class TenderAnalyzer:
    def __init__(self):
        self.llm = ChatOpenAI(
            model_name="meta-llama/Meta-Llama-3-8B-Instruct",
            openai_api_base="http://localhost:8000/v1",
            openai_api_key="FAKE",
            max_tokens=1024,
            temperature=0.1
        )
        self.chain = load_qa_chain(self.llm, chain_type='stuff')
        self.queries = {
            "What are the functional requirements, also known as the scope of work, mentioned in the document?": "Scope of Work",
            "Extract clauses that specify Pre-Qualification Criteria or eligibility criteria.": "Prequalification Criteria",
            "List all supporting documents required for this tender.": "Supporting Documents",
            "List of all the dates mentioned in the tender document which should include Bid submission end date or due date of tender, Bid validity, Opening date, closing date, pre bid meeting date, EMD amount,tender fee, tender value": "Important Dates",
            "Extract the contact details of the officer from this document, including their name, email ID, and contact number.": "Contact Details"
        }
        self.max_chunk_tokens = 100000  # Safe limit below model's maximum

    def _split_into_sentences(self, text: str) -> List[Dict[str, Any]]:
        """Split text into sentences with metadata"""
        sentences = [{'sentence': s, 'index': i} 
                    for i, s in enumerate(re.split(r'(?<=[.?!])\s+', text))]
        return self._combine_sentences(sentences)

    def _combine_sentences(self, sentences: List[Dict[str, Any]], buffer_size: int = 1) -> List[Dict[str, Any]]:
        """Combine sentences with context"""
        combined = []
        for i, sent in enumerate(sentences):
            context = []
            for j in range(max(0, i - buffer_size), i):
                context.append(sentences[j]['sentence'])
            context.append(sent['sentence'])
            for j in range(i + 1, min(len(sentences), i + buffer_size + 1)):
                context.append(sentences[j]['sentence'])
            sent['combined_sentence'] = ' '.join(context)
            combined.append(sent)
        return combined

    def _create_chunks(self, sentences: List[Dict[str, Any]]) -> List[str]:
        """Create document chunks based on semantic similarity using API for embeddings"""
        embeddings = [get_embeddings_via_api(s['combined_sentence']) for s in sentences]
        embeddings = [emb for emb in embeddings if emb is not None]
        
        if not embeddings:
            logging.warning("No embeddings could be generated")
            return [' '.join([s['sentence'] for s in sentences])]
        
        distances = []
        for i in range(len(embeddings) - 1):
            similarity = np.dot(embeddings[i], embeddings[i + 1]) / (
                np.linalg.norm(embeddings[i]) * np.linalg.norm(embeddings[i + 1]))
            distances.append(1 - similarity)
        
        threshold = np.percentile(distances, 95)
        chunks = []
        start_idx = 0
        
        for i, distance in enumerate(distances):
            if distance > threshold:
                chunk = ' '.join([s['sentence'] for s in sentences[start_idx:i + 1]])
                chunks.append(chunk)
                start_idx = i + 1
        
        if start_idx < len(sentences):
            chunk = ' '.join([s['sentence'] for s in sentences[start_idx:]])
            chunks.append(chunk)
        
        return chunks

    def _estimate_tokens(self, text: str) -> int:
        """Estimate number of tokens in text (rough approximation)"""
        return len(text.split()) * 1.3  # Rough estimate of tokens

    def _chunk_by_tokens(self, texts: List[str]) -> List[str]:
        """Split texts into smaller chunks based on token count"""
        chunks = []
        current_chunk = []
        current_tokens = 0
        
        for text in texts:
            estimated_tokens = self._estimate_tokens(text)
            
            if current_tokens + estimated_tokens > self.max_chunk_tokens:
                if current_chunk:
                    chunks.append(" ".join(current_chunk))
                current_chunk = [text]
                current_tokens = estimated_tokens
            else:
                current_chunk.append(text)
                current_tokens += estimated_tokens
        
        if current_chunk:
            chunks.append(" ".join(current_chunk))
        
        return chunks

    def _merge_responses(self, responses: List[str]) -> str:
        """
        Merge multiple responses into a coherent summary with improved deduplication.
        
        Args:
            responses (List[str]): List of response strings to merge
        
        Returns:
            str: A merged, deduplicated response string
        """
        # Remove empty responses
        responses = [resp.strip() for resp in responses if resp.strip()]
        
        if not responses:
            return ""
        
        # If only one response, return it directly
        if len(responses) == 1:
            return responses[0]
        
        # Tokenize and clean each response
        cleaned_responses = []
        for response in responses:
            # Split into lines and clean each line
            lines = response.split('\n')
            cleaned_lines = []
            
            for line in lines:
                # Remove extra whitespace and convert to lowercase for comparison
                cleaned_line = ' '.join(line.split())
                
                # Skip if line is too short or already seen
                if len(cleaned_line) > 3 and cleaned_line not in cleaned_lines:
                    cleaned_lines.append(cleaned_line)
            
            # Combine cleaned lines for this response
            cleaned_responses.append('\n'.join(cleaned_lines))
        
        # Merge unique responses
        final_lines = []
        seen_lines = set()
        
        for response in cleaned_responses:
            for line in response.split('\n'):
                # Further clean and normalize the line
                normalized_line = ' '.join(line.split())
                
                # Add line if it's not a duplicate and brings new information
                if normalized_line and normalized_line not in seen_lines:
                    final_lines.append(line)
                    seen_lines.add(normalized_line)
        
        return '\n'.join(final_lines)

    def process_document(self, file_path: str) -> List[str]:
        """Process document and split into chunks"""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()
            
            sentences = self._split_into_sentences(text)
            semantic_chunks = self._create_chunks(sentences)
            return self._chunk_by_tokens(semantic_chunks)
        except Exception as e:
            logging.error(f"Error processing document {file_path}: {e}")
            return []

    def process_query(self, query: str, chunks: List[str]) -> str:
        """ Process a single query against multiple text chunks with strict response handling.for duplicated"""
        try:
            responses = []
            for chunk in chunks:
                try:
                    with get_openai_callback() as cb:
                        response = self.chain.run(
                            input_documents=[Document(page_content=chunk)],
                            question=query
                        )
                        responses.append(response.strip())
                except Exception as e:
                    logging.warning(f"Error processing chunk: {e}")
            
            return self._merge_responses(responses)
        except Exception as e:
            logging.error(f"Error processing query: {e}")
            return f"Error: {str(e)}"

    # def process_query(self, query: str, chunks: List[str]) -> str:
    #     """
    #     Process a single query against multiple text chunks.
        
    #     Args:
    #         query (str): The specific query to process
    #         chunks (List[str]): Text chunks to analyze
        
    #     Returns:
    #         str: Consolidated unique response
    #     """
    #     try:
    #         responses = []
            
    #         # Process each chunk with a timeout and error handling
    #         for chunk in chunks:
    #             try:
    #                 # Use a shorter context window if possible
    #                 if len(chunk) > self.max_chunk_tokens:
    #                     chunk = chunk[:self.max_chunk_tokens]
                    
    #                 with get_openai_callback() as cb:
    #                     # Add query refinement to improve precision
    #                     refined_query = f"Precisely and concisely {query}"
                        
    #                     response = self.chain.run(
    #                         input_documents=[Document(page_content=chunk)],
    #                         question=refined_query
    #                     )
                        
    #                     # Basic response validation
    #                     cleaned_response = response.strip()
    #                     if cleaned_response and len(cleaned_response.split()) > 3:
    #                         responses.append(cleaned_response)
                
    #             except Exception as e:
    #                 logging.warning(f"Chunk processing error: {e}")
            
    #         # Merge responses with strict deduplication
    #         final_response = self._merge_responses(responses)
            
    #         # Final cleaning
    #         return '\n'.join(line.strip() for line in final_response.split('\n') if line.strip())
    
    #     except Exception as e:
    #         logging.error(f"Query processing error: {e}")
    #         return f"Error processing query: {str(e)}"
        


    def analyze_tender(self, file_path: str) -> List[Dict[str, str]]:
        """Main analysis function"""
        try:
            chunks = self.process_document(file_path)
            if not chunks:
                return [{"title": title, "response": "Error: Failed to process document"} 
                        for title in self.queries.values()]
            
            results = []
            with ThreadPoolExecutor(max_workers=len(self.queries)) as executor:
                future_to_query = {
                    executor.submit(self.process_query, query, chunks): title
                    for query, title in self.queries.items()
                }
                
                for future in as_completed(future_to_query):
                    title = future_to_query[future]
                    try:
                        response = future.result()
                        results.append({
                            "title": title,
                            "response": response
                        })
                    except Exception as e:
                        logging.error(f"Error processing {title}: {e}")
                        results.append({
                            "title": title,
                            "response": f"Error: {str(e)}"
                        })
            
            return results
        except Exception as e:
            logging.error(f"Overall analysis error: {e}")
            return [{"title": title, "response": f"Error: {str(e)}"} 
                    for title in self.queries.values()]

def process_folder(base_folder: str) -> Dict[str, Any]:
    """Process all text files in the given folder and its subfolders, and index results into OpenSearch."""
    analyzer = TenderAnalyzer()
    all_results = []

    # Set up OpenSearch client
    index_name = 'tprocanswers'
    try:
        opensearch_client = OpenSearch(
            hosts=['https://localhost:9200'],
            http_auth=("admin", "4Z*lwtz,,2T:0TGu"),
            use_ssl=True,
            verify_certs=False,
            ssl_show_warn=False
        )
    except Exception as e:
        logging.error(f"OpenSearch connection error: {e}")
        return {"results": [], "error": str(e)}

    with ThreadPoolExecutor(max_workers=8) as executor:
        future_to_file = {}
        
        for root, _, files in os.walk(base_folder):
            for file in files:
                if file.endswith('.txt'):
                    file_path = os.path.join(root, file)
                    future_to_file[executor.submit(analyzer.analyze_tender, file_path)] = file_path
        
        for future in as_completed(future_to_file):
            file_path = future_to_file[future]
            try:
                results = future.result()
                tcno = os.path.basename(os.path.dirname(file_path))
                result_entry = {
                    "tcno": tcno,
                    "results": results
                }
                all_results.append(result_entry)
                
                # Index results into OpenSearch
                try:
                    opensearch_client.index(index=index_name, id=tcno, body={"file_path": file_path, "results": results})
                    logging.info(f"Indexed results for {tcno} in OpenSearch.")
                except Exception as e:
                    logging.error(f"Failed to index results for {tcno}: {e}")
            
            except Exception as e:
                logging.error(f"Failed to process file {file_path}: {str(e)}")
                all_results.append({
                    "tcno": "Unknown",
                    "results": [{"title": "Error", "response": f"Failed to process file: {str(e)}"}]
                })
    
    return {"results": all_results}

def main():
    """Main execution function"""
    try:
        # Base folder path
        date_str = "22-11-24"  # You can modify this as needed
        folder_path = f"/data/txtfolder/dailydoc_test"
        
        # Process all documents in the folder
        results = process_folder(folder_path)
        
        # Print results (optional)
        print(json.dumps(results, indent=4))
        
        return results
    except Exception as e:
        logging.error(f"Main function error: {e}")
        return {"error": str(e)}

if __name__ == "__main__":
    main()

2024-11-26 13:25:41,499 - INFO: HTTP Request: POST http://localhost:8000/v1/chat/completions "HTTP/1.1 400 Bad Request"
2024-11-26 13:25:41,638 - INFO: HTTP Request: POST http://localhost:8000/v1/chat/completions "HTTP/1.1 400 Bad Request"
2024-11-26 13:25:41,770 - INFO: HTTP Request: POST http://localhost:8000/v1/chat/completions "HTTP/1.1 400 Bad Request"
2024-11-26 13:25:41,906 - INFO: HTTP Request: POST http://localhost:8000/v1/chat/completions "HTTP/1.1 400 Bad Request"
2024-11-26 13:25:42,045 - INFO: HTTP Request: POST http://localhost:8000/v1/chat/completions "HTTP/1.1 400 Bad Request"
2024-11-26 13:25:42,098 - INFO: PUT https://localhost:9200/tprocanswers/_doc/78349785 [status:200 request:0.050s]
2024-11-26 13:25:42,098 - INFO: Indexed results for 78349785 in OpenSearch.
2024-11-26 13:25:47,541 - INFO: HTTP Request: POST http://localhost:8000/v1/chat/completions "HTTP/1.1 400 Bad Request"
2024-11-26 13:25:47,899 - INFO: HTTP Request: POST http://localhost:8000/v1/chat/completio

{
    "results": [
        {
            "tcno": "78349785",
            "results": [
                {
                    "title": "Prequalification Criteria",
                    "response": ""
                },
                {
                    "title": "Scope of Work",
                    "response": ""
                },
                {
                    "title": "Important Dates",
                    "response": ""
                },
                {
                    "title": "Supporting Documents",
                    "response": ""
                },
                {
                    "title": "Contact Details",
                    "response": ""
                }
            ]
        },
        {
            "tcno": "70398187",
            "results": [
                {
                    "title": "Scope of Work",
                    "response": ""
                },
                {
                    "title": "Important Dates",
                    "respo

In [1]:
import shutil
import os

# Define paths
main_folder =  "/data/txtfolder/dailydocument_23-11-24_txt"
# main_folder = 
subfolder_name = '78349785'
destination_folder = "/data/txtfolder/dailydoc_test"

# Build paths
source_path = os.path.join(main_folder, subfolder_name)
destination_path = os.path.join(destination_folder, subfolder_name)

# Copy subfolder
shutil.copytree(source_path, destination_path)

print(f"Subfolder {subfolder_name} copied to {destination_folder}")


Subfolder 78349785 copied to /data/txtfolder/dailydoc_test
