In [None]:
import os
import re
import warnings
import json
import pandas as pd
import numpy as np
from typing import Dict, List, Any, Tuple
from dataclasses import dataclass
from sentence_transformers import SentenceTransformer
from langchain.schema import Document
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks import get_openai_callback
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import defaultdict

# Suppress warnings
warnings.filterwarnings("ignore")

# Environment setup
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ['TORCH_USE_CUDA_DSA'] = "1"

@dataclass
class TenderQuery:
    """Data class to store query information"""
    query: str
    title: str

class TenderAnalyzer:
    """Main class for analyzing tender documents"""
    
    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)
        self.llm = ChatOpenAI(
            model_name="meta-llama/Llama-3.1-8B-Instruct",
            openai_api_base="http://localhost:8000/v1",
            openai_api_key="FAKE",
            max_tokens=1024,
            temperature=0.1
        )
        self.chain = load_qa_chain(self.llm, verbose=True, chain_type='stuff')
        self.queries = {
            "What are the functional requirements, also known as the scope of work, mentioned in the document?": "Scope of Work",
            # "Extract clauses that specify Pre-Qualification Criteria or eligibility criteria.": "Prequalification Criteria",
            "Extract all point of Pre-Qualification Criteria or eligibility criteria.": "Prequalification Criteria",
            # "List all mandatory qualification criteria, including blacklisting status and required certifications.": "Mandatory Qualification Criteria",
            "List all supporting documents required for this tender.": "Supporting Documents",
            "List of all the dates with its time mentioned in the tender document which should include Bid submission end date or due date of tender, Bid validity, Opening date, closing date, pre bid meeting date, EMD date":"Importants Date",
            # "Extract a comprehensive list of all dates, times, and monetary values, along with their specific labels or descriptions as mentioned in the document.": "Important Dates",
            "Extract the contact details of the officer from this document, including their name, email ID, and contact number.": "Contact Details"
        }

    def process_document(self, file_path: str) -> List[str]:
        """Process document and split into chunks"""
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        
        # Split into sentences and create chunks
        sentences = self._split_into_sentences(text)
        chunks = self._create_chunks(sentences)
        return self._chunk_by_tokens(chunks)

    def _split_into_sentences(self, text: str) -> List[Dict[str, Any]]:
        """Split text into sentences with metadata"""
        sentences = [{'sentence': s, 'index': i} 
                    for i, s in enumerate(re.split(r'(?<=[.?!])\s+', text))]
        return self._combine_sentences(sentences)

    def _combine_sentences(self, sentences: List[Dict[str, Any]], buffer_size: int = 1) -> List[Dict[str, Any]]:
        """Combine sentences with context"""
        combined = []
        for i, sent in enumerate(sentences):
            context = []
            # Add previous sentences
            for j in range(max(0, i - buffer_size), i):
                context.append(sentences[j]['sentence'])
            # Add current and next sentences
            context.append(sent['sentence'])
            for j in range(i + 1, min(len(sentences), i + buffer_size + 1)):
                context.append(sentences[j]['sentence'])
            sent['combined_sentence'] = ' '.join(context)
            combined.append(sent)
        return combined

    def _create_chunks(self, sentences: List[Dict[str, Any]]) -> List[str]:
        """Create document chunks based on semantic similarity"""
        # Create embeddings
        embeddings = self.model.encode([s['combined_sentence'] for s in sentences])
        
        # Calculate distances
        distances = []
        for i in range(len(embeddings) - 1):
            similarity = np.dot(embeddings[i], embeddings[i + 1]) / (
                np.linalg.norm(embeddings[i]) * np.linalg.norm(embeddings[i + 1]))
            distances.append(1 - similarity)
        
        # Split into chunks
        threshold = np.percentile(distances, 95)
        chunks = []
        start_idx = 0
        
        for i, distance in enumerate(distances):
            if distance > threshold:
                chunk = ' '.join([s['sentence'] for s in sentences[start_idx:i + 1]])
                chunks.append(chunk)
                start_idx = i + 1
        
        if start_idx < len(sentences):
            chunk = ' '.join([s['sentence'] for s in sentences[start_idx:]])
            chunks.append(chunk)
        
        return chunks

    def _chunk_by_tokens(self, texts: List[str], max_tokens: int = 3500) -> List[str]:
        """Split texts into smaller chunks based on token count"""
        max_chars = max_tokens * 2
        chunks = []
        for text in texts:
            text_chunks = [text[i:i + max_chars] 
                         for i in range(0, len(text), max_chars)]
            chunks.extend(text_chunks)
        return chunks

    def process_query(self, query: str, text: str) -> str:
        """Process a single query against the text"""
        try:
            with get_openai_callback() as cb:
                response = self.chain.run(
                    input_documents=[Document(page_content=text)],
                    question=query
                )
            return response.strip()
        except Exception as e:
            print(f"Error processing query: {e}")
            return f"Error: {str(e)}"

    def analyze_tender(self, file_path: str, output_path: str) -> None:
        """Main analysis function"""
        # Process document
        chunks = self.process_document(file_path)
        combined_text = " ".join(chunks)
        
        # Process queries in parallel
        results = defaultdict(str)
        with ThreadPoolExecutor(max_workers=len(self.queries)) as executor:
            future_to_query = {
                executor.submit(self.process_query, query, combined_text): title
                for query, title in self.queries.items()
            }
            
            for future in as_completed(future_to_query):
                title = future_to_query[future]
                try:
                    response = future.result()
                    results[title] = response
                except Exception as e:
                    results[title] = f"Error: {str(e)}"
        
        # Create DataFrame and save results
        df = pd.DataFrame({
            'Title': list(self.queries.values()),
            'Response': [results[title] for title in self.queries.values()]
        })
        
        # Save to Excel
        df.to_excel(output_path, index=False)
        
        # Also save as JSON for backup
        json_path = output_path.rsplit('.', 1)[0] + '.json'
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(dict(results), f, indent=4, ensure_ascii=False)

def main():
    """Main execution function"""
    # Initialize analyzer
    analyzer = TenderAnalyzer()
    
    # Process tender document
    input_file = '/data/Pqmatch/testing/78216093/78216093.txt'
    output_file = '78216093.xlsx'
    
    analyzer.analyze_tender(input_file, output_file)
    print(f"Analysis completed. Results saved to {output_file}")

if __name__ == "__main__":
    main()

## return result in json

In [None]:
# import os
# import re
# import json
# import warnings
# import numpy as np
# from typing import Dict, List, Any
# from sentence_transformers import SentenceTransformer
# from langchain.schema import Document
# from langchain.chat_models import ChatOpenAI
# from langchain.chains.question_answering import load_qa_chain
# from langchain.callbacks import get_openai_callback
# from concurrent.futures import ThreadPoolExecutor, as_completed
# from collections import defaultdict

# # Suppress warnings
# warnings.filterwarnings("ignore")

# # Environment setup
# os.environ["CUDA_VISIBLE_DEVICES"] = "1"
# os.environ['TORCH_USE_CUDA_DSA'] = "1"

# class TenderAnalyzer:
#     """Main class for analyzing tender documents"""
    
#     def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
#         self.model = SentenceTransformer(model_name)
#         self.llm = ChatOpenAI(
#              model_name="meta-llama/Llama-3.1-8B-Instruct",
#             openai_api_base="http://localhost:8000/v1",
#             openai_api_key="FAKE",
#             max_tokens=500,
#             temperature=0.1
#         )
#         self.chain = load_qa_chain(self.llm, verbose=True, chain_type='stuff')
#         self.queries = {
#             "Identify the functional requirements, also referred to as the scope of work, specified in the document.": "Scope of Work",
#             "Extract clauses that specify Pre-Qualification Criteria or eligibility criteria.": "Prequalification Criteria",
#             "List all mandatory qualification criteria, including blacklisting status and required certifications.": "Mandatory Qualification Criteria",
#             "Summarize the work specifications that bidders must meet to fulfill the tender requirements.": "Specifications",
#             "List all supporting documents required for this tender.": "Supporting Documents",
#             "Extract a comprehensive list of all dates, times, and monetary values, along with their specific labels or descriptions as mentioned in the document.": "Important Dates",
#             "Extract the contact details of the officer from this document, including their name, email ID, and contact number.": "Contact Details"
#         }

#     def process_document(self, file_path: str) -> List[str]:
#         """Process document and split into chunks"""
#         with open(file_path, 'r', encoding='utf-8') as f:
#             text = f.read()
        
#         # Split into sentences and create chunks
#         sentences = self._split_into_sentences(text)
#         chunks = self._create_chunks(sentences)
#         return self._chunk_by_tokens(chunks)

#     def _split_into_sentences(self, text: str) -> List[Dict[str, Any]]:
#         """Split text into sentences with metadata"""
#         sentences = [{'sentence': s, 'index': i} 
#                     for i, s in enumerate(re.split(r'(?<=[.?!])\s+', text))]
#         return self._combine_sentences(sentences)

#     def _combine_sentences(self, sentences: List[Dict[str, Any]], buffer_size: int = 1) -> List[Dict[str, Any]]:
#         """Combine sentences with context"""
#         combined = []
#         for i, sent in enumerate(sentences):
#             context = []
#             # Add previous sentences
#             for j in range(max(0, i - buffer_size), i):
#                 context.append(sentences[j]['sentence'])
#             # Add current and next sentences
#             context.append(sent['sentence'])
#             for j in range(i + 1, min(len(sentences), i + buffer_size + 1)):
#                 context.append(sentences[j]['sentence'])
#             sent['combined_sentence'] = ' '.join(context)
#             combined.append(sent)
#         return combined

#     def _create_chunks(self, sentences: List[Dict[str, Any]]) -> List[str]:
#         """Create document chunks based on semantic similarity"""
#         # Create embeddings
#         embeddings = self.model.encode([s['combined_sentence'] for s in sentences])
        
#         # Calculate distances
#         distances = []
#         for i in range(len(embeddings) - 1):
#             similarity = np.dot(embeddings[i], embeddings[i + 1]) / (
#                 np.linalg.norm(embeddings[i]) * np.linalg.norm(embeddings[i + 1]))
#             distances.append(1 - similarity)
        
#         # Split into chunks
#         threshold = np.percentile(distances, 95)
#         chunks = []
#         start_idx = 0
        
#         for i, distance in enumerate(distances):
#             if distance > threshold:
#                 chunk = ' '.join([s['sentence'] for s in sentences[start_idx:i + 1]])
#                 chunks.append(chunk)
#                 start_idx = i + 1
        
#         if start_idx < len(sentences):
#             chunk = ' '.join([s['sentence'] for s in sentences[start_idx:]])
#             chunks.append(chunk)
        
#         return chunks

#     def _chunk_by_tokens(self, texts: List[str], max_tokens: int = 3500) -> List[str]:
#         """Split texts into smaller chunks based on token count"""
#         max_chars = max_tokens * 2
#         chunks = []
#         for text in texts:
#             text_chunks = [text[i:i + max_chars] 
#                          for i in range(0, len(text), max_chars)]
#             chunks.extend(text_chunks)
#         return chunks

#     def process_query(self, query: str, text: str) -> str:
#         """Process a single query against the text"""
#         try:
#             with get_openai_callback() as cb:
#                 response = self.chain.run(
#                     input_documents=[Document(page_content=text)],
#                     question=query
#                 )
#             return response.strip()
#         except Exception as e:
#             print(f"Error processing query: {e}")
#             return f"Error: {str(e)}"

#     def analyze_tender(self, file_path: str) -> Dict[str, str]:
#         """Main analysis function"""
#         # Process document
#         chunks = self.process_document(file_path)
#         combined_text = " ".join(chunks)
        
#         # Process queries in parallel
#         results = {}
#         with ThreadPoolExecutor(max_workers=len(self.queries)) as executor:
#             future_to_query = {
#                 executor.submit(self.process_query, query, combined_text): title
#                 for query, title in self.queries.items()
#             }
            
#             for future in as_completed(future_to_query):
#                 title = future_to_query[future]
#                 try:
#                     response = future.result()
#                     results[title] = response
#                 except Exception as e:
#                     results[title] = f"Error: {str(e)}"
        
#         return results

# def main():
#     """Main execution function"""
#     # Initialize analyzer
#     analyzer = TenderAnalyzer()
    
#     # Process tender document
#     input_file = '/data/Pqmatch/testing/78216093/78216093.txt'
    
#     # Analyze and get results
#     results = analyzer.analyze_tender(input_file)
    
#     # Output path for JSON
#     output_file = '/data/QAAPI/stored_files/77774640_analysis.json'
    
#     # Ensure output directory exists
#     os.makedirs(os.path.dirname(output_file), exist_ok=True)
    
#     # Save results as JSON
#     with open(output_file, 'w', encoding='utf-8') as f:
#         json.dump(results, f, indent=4, ensure_ascii=False)
    
#     print(f"Analysis completed. Results saved to {output_file}")
    
#     # Return the results for potential further processing
#     return results

# if __name__ == "__main__":
#     main()

## return response in json

In [19]:
import os
import re
import warnings
import numpy as np
from typing import Dict, List, Any
from sentence_transformers import SentenceTransformer
from langchain.schema import Document
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks import get_openai_callback
from concurrent.futures import ThreadPoolExecutor, as_completed

# Suppress warnings
warnings.filterwarnings("ignore")

# Environment setup
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ['TORCH_USE_CUDA_DSA'] = "1"

class TenderAnalyzer:
    """Main class for analyzing tender documents"""
    
    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)
        self.llm = ChatOpenAI(
             model_name="meta-llama/Llama-3.1-8B-Instruct",
            openai_api_base="http://localhost:8000/v1",
            openai_api_key="FAKE",
            max_tokens=500,
            temperature=0.1
        )
        self.chain = load_qa_chain(self.llm, chain_type='stuff')
        self.queries = {
            "Identify the functional requirements, also referred to as the scope of work, specified in the document.": "Scope of Work",
            "Extract clauses that specify Pre-Qualification Criteria or eligibility criteria.": "Prequalification Criteria",
            "List all mandatory qualification criteria, including blacklisting status and required certifications.": "Mandatory Qualification Criteria",
            "Summarize the work specifications that bidders must meet to fulfill the tender requirements.": "Specifications",
            "List all supporting documents required for this tender.": "Supporting Documents",
            "Extract a comprehensive list of all dates, times, and monetary values, along with their specific labels or descriptions as mentioned in the document.": "Important Dates",
            "Extract the contact details of the officer from this document, including their name, email ID, and contact number.": "Contact Details"
        }

    def process_document(self, file_path: str) -> List[str]:
        """Process document and split into chunks"""
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        
        # Split into sentences and create chunks
        sentences = self._split_into_sentences(text)
        chunks = self._create_chunks(sentences)
        return self._chunk_by_tokens(chunks)

    def _split_into_sentences(self, text: str) -> List[Dict[str, Any]]:
        """Split text into sentences with metadata"""
        sentences = [{'sentence': s, 'index': i} 
                    for i, s in enumerate(re.split(r'(?<=[.?!])\s+', text))]
        return self._combine_sentences(sentences)

    def _combine_sentences(self, sentences: List[Dict[str, Any]], buffer_size: int = 1) -> List[Dict[str, Any]]:
        """Combine sentences with context"""
        combined = []
        for i, sent in enumerate(sentences):
            context = []
            # Add previous sentences
            for j in range(max(0, i - buffer_size), i):
                context.append(sentences[j]['sentence'])
            # Add current and next sentences
            context.append(sent['sentence'])
            for j in range(i + 1, min(len(sentences), i + buffer_size + 1)):
                context.append(sentences[j]['sentence'])
            sent['combined_sentence'] = ' '.join(context)
            combined.append(sent)
        return combined

    def _create_chunks(self, sentences: List[Dict[str, Any]]) -> List[str]:
        """Create document chunks based on semantic similarity"""
        # Create embeddings
        embeddings = self.model.encode([s['combined_sentence'] for s in sentences])
        
        # Calculate distances
        distances = []
        for i in range(len(embeddings) - 1):
            similarity = np.dot(embeddings[i], embeddings[i + 1]) / (
                np.linalg.norm(embeddings[i]) * np.linalg.norm(embeddings[i + 1]))
            distances.append(1 - similarity)
        
        # Split into chunks
        threshold = np.percentile(distances, 95)
        chunks = []
        start_idx = 0
        
        for i, distance in enumerate(distances):
            if distance > threshold:
                chunk = ' '.join([s['sentence'] for s in sentences[start_idx:i + 1]])
                chunks.append(chunk)
                start_idx = i + 1
        
        if start_idx < len(sentences):
            chunk = ' '.join([s['sentence'] for s in sentences[start_idx:]])
            chunks.append(chunk)
        
        return chunks

    def _chunk_by_tokens(self, texts: List[str], max_tokens: int = 3500) -> List[str]:
        """Split texts into smaller chunks based on token count"""
        max_chars = max_tokens * 2
        chunks = []
        for text in texts:
            text_chunks = [text[i:i + max_chars] 
                         for i in range(0, len(text), max_chars)]
            chunks.extend(text_chunks)
        return chunks

    def process_query(self, query: str, text: str) -> str:
        """Process a single query against the text"""
        try:
            with get_openai_callback() as cb:
                response = self.chain.run(
                    input_documents=[Document(page_content=text)],
                    question=query
                )
            return response.strip()
        except Exception as e:
            print(f"Error processing query: {e}")
            return f"Error: {str(e)}"

    def analyze_tender(self, file_path: str) -> Dict[str, str]:
        """Main analysis function"""
        # Process document
        chunks = self.process_document(file_path)
        combined_text = " ".join(chunks)
        
        # Process queries in parallel
        results = {}
        with ThreadPoolExecutor(max_workers=len(self.queries)) as executor:
            future_to_query = {
                executor.submit(self.process_query, query, combined_text): title
                for query, title in self.queries.items()
            }
            
            for future in as_completed(future_to_query):
                title = future_to_query[future]
                try:
                    response = future.result()
                    results[title] = response
                except Exception as e:
                    results[title] = f"Error: {str(e)}"
        
        return results

def analyze_tender_document(file_path: str) -> Dict[str, str]:
    """
    Top-level function to analyze a tender document
    
    Args:
        file_path (str): Path to the tender document
    
    Returns:
        Dict[str, str]: Dictionary of analysis results
    """
    analyzer = TenderAnalyzer()
    return analyzer.analyze_tender(file_path)

def main():
    """Main execution function"""
    # Process tender document
    input_file = '/data/Pqmatch/testing/78216093/78216093.txt'
    
    # Analyze and get results
    results = analyze_tender_document(input_file)
    
    # Print results (optional)
    import json
    print(json.dumps(results, indent=4))
    
    return results

if __name__ == "__main__":
    main()

{
    "Prequalification Criteria": "Based on the provided text data, the following clauses specify pre-qualification criteria or eligibility criteria:\n\n1. Clause 1.3.1 of the SBD (Standard Bidding Document):\n\n\"A Bidder shall be a registered contractor in Kerala Public Works Department or from any State or Central Government Engineering Departments which are having similar functionalities like Kerala PWD in the required category as specified in the NIT.\"\n\n2. Clause 1.3.2 of the SBD:\n\n\"Only those bidders having a valid and active registration, on the date of bid submission, shall submit bids online on the e-GP website.\"\n\n3. Clause 1.3.4 of the SBD:\n\n\"All Bidders are required to register in the e-procurement portal. The Bidder intending to participate in the bid is required to register in the e-tenders portal using his Login ID and attach his valid Digital Signature Certificate (DSC) to his unique Login ID.\"\n\n4. Clause 1.3.5 of the SBD:\n\n\"A firm/bidder shall submit 

In [None]:
import os
import re
import warnings
import numpy as np
from typing import Dict, List, Any
from sentence_transformers import SentenceTransformer
from langchain.schema import Document
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks import get_openai_callback
from concurrent.futures import ThreadPoolExecutor, as_completed
import multiprocessing

# Suppress warnings
warnings.filterwarnings("ignore")

# Environment setup
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ['TORCH_USE_CUDA_DSA'] = "1"


class TenderAnalyzer:
    """Main class for analyzing tender documents"""

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)
        self.llm = ChatOpenAI(
            model_name="meta-llama/Llama-3.1-8B-Instruct",
            openai_api_base="http://localhost:8000/v1",
            openai_api_key="FAKE",
            max_tokens=500,
            temperature=0.1
        )
        self.chain = load_qa_chain(self.llm, chain_type='stuff')
        self.queries = {
            "Identify the functional requirements, also referred to as the scope of work, specified in the document.": "Scope of Work",
            "Extract clauses that specify Pre-Qualification Criteria or eligibility criteria.": "Prequalification Criteria",
            # "List all mandatory qualification criteria, including blacklisting status and required certifications.": "Mandatory Qualification Criteria",
            # "Summarize the work specifications that bidders must meet to fulfill the tender requirements.": "Specifications",
            "List all supporting documents required for this tender.": "Supporting Documents",
            "Extract a comprehensive list of all dates, times, and monetary values, along with their specific labels or descriptions as mentioned in the document.": "Important Dates",
            "Extract the contact details of the officer from this document, including their name, email ID, and contact number.": "Contact Details"
        }

    def process_document(self, file_path: str) -> List[str]:
        """Process document and split into chunks"""
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        # Split into sentences and create chunks
        sentences = self._split_into_sentences(text)
        chunks = self._create_chunks(sentences)
        return self._chunk_by_tokens(chunks)

    def _split_into_sentences(self, text: str) -> List[Dict[str, Any]]:
        """Split text into sentences with metadata"""
        sentences = [{'sentence': s, 'index': i}
                     for i, s in enumerate(re.split(r'(?<=[.?!])\s+', text))]
        return self._combine_sentences(sentences)

    def _combine_sentences(self, sentences: List[Dict[str, Any]], buffer_size: int = 1) -> List[Dict[str, Any]]:
        """Combine sentences with context"""
        combined = []
        for i, sent in enumerate(sentences):
            context = []
            # Add previous sentences
            for j in range(max(0, i - buffer_size), i):
                context.append(sentences[j]['sentence'])
            # Add current and next sentences
            context.append(sent['sentence'])
            for j in range(i + 1, min(len(sentences), i + buffer_size + 1)):
                context.append(sentences[j]['sentence'])
            sent['combined_sentence'] = ' '.join(context)
            combined.append(sent)
        return combined

    def _create_chunks(self, sentences: List[Dict[str, Any]]) -> List[str]:
        """Create document chunks based on semantic similarity"""
        # Create embeddings
        embeddings = self.model.encode([s['combined_sentence'] for s in sentences])

        # Calculate distances
        distances = []
        for i in range(len(embeddings) - 1):
            similarity = np.dot(embeddings[i], embeddings[i + 1]) / (
                np.linalg.norm(embeddings[i]) * np.linalg.norm(embeddings[i + 1]))
            distances.append(1 - similarity)

        # Split into chunks
        threshold = np.percentile(distances, 95)
        chunks = []
        start_idx = 0

        for i, distance in enumerate(distances):
            if distance > threshold:
                chunk = ' '.join([s['sentence'] for s in sentences[start_idx:i + 1]])
                chunks.append(chunk)
                start_idx = i + 1

        if start_idx < len(sentences):
            chunk = ' '.join([s['sentence'] for s in sentences[start_idx:]])
            chunks.append(chunk)

        return chunks

    def _chunk_by_tokens(self, texts: List[str], max_tokens: int = 3500) -> List[str]:
        """Split texts into smaller chunks based on token count"""
        max_chars = max_tokens * 2
        chunks = []
        for text in texts:
            text_chunks = [text[i:i + max_chars]
                           for i in range(0, len(text), max_chars)]
            chunks.extend(text_chunks)
        return chunks

    def process_query(self, query: str, text: str) -> str:
        """Process a single query against the text"""
        try:
            with get_openai_callback() as cb:
                response = self.chain.run(
                    input_documents=[Document(page_content=text)],
                    question=query
                )
            return response.strip()
        except Exception as e:
            print(f"Error processing query: {e}")
            return f"Error: {str(e)}"

    def analyze_tender(self, file_path: str) -> Dict[str, str]:
        """Main analysis function"""
        # Process document into chunks
        chunks = self.process_document(file_path)

        # Results storage
        results = {title: "" for title in self.queries.values()}
        
        # Map futures to queries and titles
        future_to_query = {}

        # Limit workers to prevent overloading
        max_workers = min(multiprocessing.cpu_count(), len(self.queries))

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit tasks for each chunk and query pair
            for chunk in chunks:
                for query, title in self.queries.items():
                    future = executor.submit(self.process_query, query, chunk)
                    future_to_query[future] = title

            # Gather results
            for future in as_completed(future_to_query):
                title = future_to_query[future]
                try:
                    response = future.result()
                    results[title] += f"{response}\n"
                except Exception as e:
                    print(f"Error processing query '{title}': {e}")
                    results[title] += f"Error: {str(e)}\n"

        return results


def analyze_tender_document(file_path: str) -> Dict[str, str]:
    """
    Top-level function to analyze a tender document

    Args:
        file_path (str): Path to the tender document

    Returns:
        Dict[str, str]: Dictionary of analysis results
    """
    analyzer = TenderAnalyzer()
    return analyzer.analyze_tender(file_path)


def main():
    """Main execution function"""
    # Process tender document
    input_file = '/data/Pqmatch/testing/78216093/78216093.txt'

    # Analyze and get results
    results = analyze_tender_document(input_file)

    # Print results (optional)
    import json
    print(json.dumps(results, indent=4))

    return results


if __name__ == "__main__":
    main()


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
import os
import re
import numpy as np
import pandas as pd
import requests
from typing import List, Dict
import torch
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import FAISS
from langchain.embeddings.base import Embeddings
from langchain.document_loaders import TextLoader
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from sklearn.metrics.pairwise import cosine_similarity
import tiktoken

# Initialize tokenizer
tokenizer = tiktoken.get_encoding("cl100k_base")

def truncate_text(text, max_tokens=7000):
    """
    Truncate text to specified max tokens
    """
    tokens = tokenizer.encode(text)
    truncated_tokens = tokens[:max_tokens]
    return tokenizer.decode(truncated_tokens)

def count_tokens(text):
    """
    Count tokens in a given text
    """
    return len(tokenizer.encode(text))

class CustomEmbeddings(Embeddings):
    def __init__(self, model_name='all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        # Truncate each text to manage token count
        truncated_texts = [truncate_text(text) for text in texts]
        return self.model.encode(truncated_texts).tolist()

    def embed_query(self, text: str) -> List[float]:
        # Truncate query text
        truncated_text = truncate_text(text)
        return self.model.encode(truncated_text).tolist()

def load_text_files_from_directory(folder_path):
    all_text = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            file_path = os.path.join(folder_path, filename)
            loader = TextLoader(file_path)
            docs = loader.load()
            # Extract text content from Document objects
            all_text.extend(doc.page_content for doc in docs if hasattr(doc, 'page_content'))
    return all_text

def chunk_text_with_token_limit(text, max_tokens=7000):
    """
    Chunk text while respecting token limits
    """
    tokens = tokenizer.encode(text)
    chunks = []
    
    # Split tokens into chunks
    for i in range(0, len(tokens), max_tokens):
        chunk_tokens = tokens[i:i+max_tokens]
        chunk = tokenizer.decode(chunk_tokens)
        chunks.append(chunk)
    
    return chunks

def process_text(texts, labels=None):
    # Default labels if not provided
    if labels is None:
        labels = [
            "Important Dates", 
            "Eligibility Criteria", 
            "Scope of Work", 
            "Contact Details", 
            "Tender Overview", 
            "Submission Guidelines"
        ]
    
    # Join all document texts into a single string
    combined_text = "\n".join(texts)
    
    # Truncate combined text to manage token count
    combined_text = truncate_text(combined_text)
    
    # Split sentences
    single_sentences_list = re.split(r'(?<=[.?!])\s+', combined_text)
    
    # Create sentence objects with index
    sentences = [{'sentence': x, 'index': i} for i, x in enumerate(single_sentences_list)]
    
    # Function to combine sentences into chunks
    def combine_sentences(sentences, buffer_size=1):
        for i in range(len(sentences)):
            combined_sentence = ''
            for j in range(i - buffer_size, i):
                if j >= 0:
                    combined_sentence += sentences[j]['sentence'] + ' '
            
            combined_sentence += sentences[i]['sentence']
            
            for j in range(i + 1, i + 1 + buffer_size):
                if j < len(sentences):
                    combined_sentence += ' ' + sentences[j]['sentence']
            sentences[i]['combined_sentence'] = combined_sentence
        return sentences
    
    # Initialize Sentence Transformer model
    model = SentenceTransformer('all-MiniLM-L6-v2')
    
    # Combine sentences
    sentences = combine_sentences(sentences)
    
    # Create embeddings
    try:
        embeddings = model.encode([x['combined_sentence'] for x in sentences])
    except Exception as e:
        print(f"Embedding error: {e}")
        embeddings = model.encode([x['combined_sentence'][:1000] for x in sentences])
    
    # Prepare labeled chunks
    labeled_chunks = {label: [] for label in labels + ['Uncategorized']}
    final_chunks = []
    
    for i, sentence in enumerate(sentences):
        # Try to assign a label based on predefined labels
        label_match = 'Uncategorized'
        for label in labels:
            if label.lower() in sentence['combined_sentence'].lower():
                label_match = label
                break
        
        # Limit chunk size
        chunk = truncate_text(sentence['combined_sentence'], max_tokens=1000)
        
        # Add chunk with label prefix
        labeled_chunk = f"[{label_match}] {chunk}"
        final_chunks.append(labeled_chunk)
        labeled_chunks[label_match].append(chunk)
    
    # Create embeddings for the knowledge base
    try:
        embeddings = CustomEmbeddings()
        knowledge_base = FAISS.from_texts(final_chunks, embedding=embeddings)
    except Exception as e:
        print(f"Knowledge base creation error: {e}")
        # Fallback: create knowledge base with truncated chunks
        fallback_chunks = [truncate_text(chunk, max_tokens=500) for chunk in final_chunks]
        knowledge_base = FAISS.from_texts(fallback_chunks, embedding=embeddings)
    
    return knowledge_base, labeled_chunks

def process_query(knowledge_base, query, title):
    # Truncate query to manage token count
    query = truncate_text(query, max_tokens=3500)
    
    # Retrieve relevant documents
    docs = knowledge_base.similarity_search(query, k=3)  # Limit to 3 most relevant docs
    
    # Prepare docs to fit within token limit
    context_docs = []
    current_tokens = 0
    for doc in docs:
        doc_tokens = count_tokens(doc.page_content)
        if current_tokens + doc_tokens <= 3500:
            context_docs.append(doc)
            current_tokens += doc_tokens
        else:
            break
    
    # Initialize the language model
    llm = ChatOpenAI(
     model_name="meta-llama/Llama-3.1-8B-Instruct",
        openai_api_base="http://localhost:8000/v1",
        openai_api_key="FAKE",
        max_tokens=4096,
        temperature=0.1
    )
    
    # Load the QA chain
    chain = load_qa_chain(llm, verbose=False, chain_type='stuff')
    
    # Run the chain and capture the response
    try:
        response = chain.run(input_documents=context_docs, question=query)
        # Strip unnecessary context
        response = response.strip()
        
        # Truncate response if it's too long
        response = truncate_text(response, max_tokens=1000)
    except Exception as e:
        response = f"Error processing query: {e}"
    
    return title, response

def main(folder_path, queries, output_file, labels=None):
    # Load documents from the specified folder
    all_docs_text = load_text_files_from_directory(folder_path)
    
    # Process texts and create knowledge base with labels
    knowledge_base, labeled_chunks = process_text(all_docs_text, labels)
    
    # Initialize lists to store results
    titles = []
    responses = []
    
    # Use ThreadPoolExecutor to run queries in parallel
    with ThreadPoolExecutor(max_workers=len(queries)) as executor:
        future_to_query = {
            executor.submit(process_query, knowledge_base, query, title): title 
            for query, title in queries.items()
        }
        
        for future in as_completed(future_to_query):
            title = future_to_query[future]
            try:
                result_title, result_response = future.result()
                titles.append(result_title)
                responses.append((result_title, result_response))
            except Exception as e:
                print(f"Query processing failed for title {title}: {e}")
                titles.append(title)
                responses.append((title, f"Error: {e}"))
    
    # Create a DataFrame for responses
    df_responses = pd.DataFrame({
        'Title': [title for title in queries.values()],
        'Response': [
            next((resp for t, resp in responses if t == title), 'No response') 
            for title in queries.values()
        ]
    })
    
    # Create a DataFrame for labeled chunks
    df_labels = pd.DataFrame.from_dict(labeled_chunks, orient='index')
    df_labels.index.name = 'Label'
    df_labels.reset_index(inplace=True)
    
    # Save DataFrames to Excel
    with pd.ExcelWriter(output_file) as writer:
        df_responses.to_excel(writer, sheet_name='Responses', index=False)
        df_labels.to_excel(writer, sheet_name='Labeled Chunks', index=False)
    
    print(f"Responses and labeled chunks have been saved to '{output_file}'.")
    
    return knowledge_base, labeled_chunks

# Example usage
if __name__ == "__main__":
    folder_path = "/data/tendergpt/livetender_txt/71484890"
    
    # Predefined labels
    labels = [
        "Important Dates", 
        "Eligibility Criteria", 
        "Scope of Work", 
        "Contact Details", 
        "Tender Overview", 
        "Submission Guidelines"
    ]
    
    queries = {
        "extract the all important date  with time and amount mentioned in this document": "Contact Details"
    }
    
    output_file = '77326167.xlsx'
    
    # Run the main function
    knowledge_base, labeled_chunks = main(folder_path, queries, output_file, labels)

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ['TORCH_USE_CUDA_DSA'] = "0"
import warnings
warnings.filterwarnings("ignore")
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import ChatOpenAI
from langchain.callbacks import get_openai_callback
from langchain_community.vectorstores import FAISS
import os
from langchain_community.document_loaders import TextLoader

In [4]:
import os
import requests
from typing import List
from langchain.embeddings.base import Embeddings
from langchain.document_loaders import TextLoader
import json
from sentence_transformers import SentenceTransformer
import numpy as np
from collections import defaultdict
import faiss
import re
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain.callbacks import get_openai_callback
# from langchain.chains.qa import load_qa_chain
# from langchain.chat_models import ChatOpenAI


# Get embeddings from external API
def get_embedding(text: str) -> List[float]:
    response = requests.post(
        "http://0.0.0.0:5002/embeddings",
        json={"model": "BAAI/bge-small-en-v1.5", "input": [text]}
    )
    if response.status_code == 200:
        data = response.json()
        return data['data'][0]['embedding']
    else:
        raise Exception(f"API request failed with status code {response.status_code}")


class CustomEmbeddings(Embeddings):
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [get_embedding(text) for text in texts]

    def embed_query(self, text: str) -> List[float]:
        return get_embedding(text)


def load_text_files_from_directory(folder_path):
    """
    Loads text content from all `.txt` files in the specified folder.
    """
    all_text = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
                all_text.append(content)
    return all_text



# Sentence processing and chunk creation
def combine_sentences(sentences, buffer_size=1):
    for i in range(len(sentences)):
        combined_sentence = ''
        for j in range(i - buffer_size, i):
            if j >= 0:
                combined_sentence += sentences[j]['sentence'] + ' '

        combined_sentence += sentences[i]['sentence']

        for j in range(i + 1, i + 1 + buffer_size):
            if j < len(sentences):
                combined_sentence += ' ' + sentences[j]['sentence']
        sentences[i]['combined_sentence'] = combined_sentence
    return sentences


# Prepare sentence embeddings
def process_text(texts):
    combined_text = "\n".join(texts)
    single_sentences_list = re.split(r'(?<=[.?!])\s+', combined_text)
    sentences = [{'sentence': x, 'index': i} for i, x in enumerate(single_sentences_list)]
    
    sentences = combine_sentences(sentences)
    embeddings = model.encode([x['combined_sentence'] for x in sentences])
    
    for i, sentence in enumerate(sentences):
        sentence['combined_sentence_embedding'] = embeddings[i]
    
    return sentences


# Calculate cosine distances between consecutive sentences
def calculate_cosine_distances(sentences):
    distances = []
    for i in range(len(sentences) - 1):
        embedding_current = sentences[i]['combined_sentence_embedding']
        embedding_next = sentences[i + 1]['combined_sentence_embedding']
        similarity = cosine_similarity([embedding_current], [embedding_next])[0][0]
        distance = 1 - similarity
        distances.append(distance)
        sentences[i]['distance_to_next'] = distance
    return distances, sentences


# Custom similarity search using FAISS
def similarity_search(faiss_index, query_embedding, top_k=5):
    query_embedding = np.array(query_embedding).reshape(1, -1)  # Ensure correct shape
    distances, indices = faiss_index.search(query_embedding, top_k)
    return indices[0], distances[0]


# FAISS semantic clustering
def segment_text_with_faiss_label_assignment(semantic_chunks, threshold=-0.7):
    labeled_segments = defaultdict(list)
    for chunk in semantic_chunks:
        if chunk.strip():
            paragraph_embedding = model.encode(chunk).reshape(1, -1)
            distances, label_indices = faiss_index.search(paragraph_embedding, len(labels))
            similarities = 1 - distances
            assigned_labels = [labels[i] for i, sim in enumerate(similarities[0]) if sim >= threshold]

            if assigned_labels:
                for label in assigned_labels:
                    labeled_segments[label].append(chunk)
            else:
                labeled_segments["Other"].append(chunk)
    return labeled_segments


# Main processing
model = SentenceTransformer('all-MiniLM-L6-v2')
labels = ["Important Dates", "Eligibility or Prequalification Criteria", "Scope of Work", "Contact Details"]

folder_path = "/data/tendergpt/testing/77153810"
all_docs_text = load_text_files_from_directory(folder_path)
sentences = process_text(all_docs_text)

distances, sentences = calculate_cosine_distances(sentences)

breakpoint_percentile_threshold = 95
breakpoint_distance_threshold = np.percentile(distances, breakpoint_percentile_threshold)
indices_above_thresh = [i for i, x in enumerate(distances) if x > breakpoint_distance_threshold]

start_index = 0
chunks = []
for index in indices_above_thresh:
    end_index = index
    group = sentences[start_index:end_index + 1]
    combined_text = ' '.join([d['sentence'] for d in group])
    chunks.append(combined_text)
    start_index = index + 1

if start_index < len(sentences):
    combined_text = ' '.join([d['sentence'] for d in sentences[start_index:]])
    chunks.append(combined_text)


# Initialize FAISS knowledge base
label_embeddings = model.encode(labels)
dimension = label_embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(label_embeddings)

segmented_result = segment_text_with_faiss_label_assignment(chunks)

# Save result to JSON
out_file_path = r'/data/QAAPI/stored_files/out.json'
with open(out_file_path, 'w', encoding='utf-8') as out_file:
    json.dump(segmented_result, out_file, indent=4, ensure_ascii=False)


queries = {
    "Extract all the points of Eligibility criteria from the tender document": "PQ"
}


def process_query(query, title):
    query_embedding = model.encode(query)
    indices, distances = similarity_search(faiss_index, query_embedding, top_k=5)

    # Retrieve corresponding chunks based on indices
    docs = [chunks[i] for i in indices if i < len(chunks)]
    
    # Initialize the language model
    llm = ChatOpenAI(
        model_name="meta-llama/Llama-3.1-8B-Instruct",
        openai_api_base="http://localhost:8000/v1",
        openai_api_key="FAKE",  # Replace with your actual key if needed
        max_tokens=4096,
        temperature=0.1
    )
    
    # Load the QA chain
    chain = load_qa_chain(llm, verbose=True, chain_type='stuff')
    
    # Run the chain and capture the response
    try:
        with get_openai_callback() as cost:
            response = chain.run(input_documents=docs, question=query)
        response = response.strip()
    except Exception as e:
        response = f"Error processing query: {e}"
    
    return title, response


def aggregate_responses(responses):
    aggregated = {}
    for title, response in responses:
        if title not in aggregated:
            aggregated[title] = ""
        aggregated[title] += "\n" + response if response else ""
    return aggregated


titles = []
responses = []

with ThreadPoolExecutor(max_workers=len(queries)) as executor:
    future_to_query = {executor.submit(process_query, query, title): title for query, title in queries.items()}
    
    for future in as_completed(future_to_query):
        title = future_to_query[future]
        try:
            result_title, result_response = future.result()
            titles.append(result_title)
            responses.append((result_title, result_response))
        except Exception as e:
            print(f"Query processing failed for title {title}: {e}")
            titles.append(title)
            responses.append((title, f"Error: {e}"))

aggregated_responses = aggregate_responses(responses)

df = pd.DataFrame({
    'Title': [title for title in queries.values()],
    'Response': [aggregated_responses.get(title, 'No response') for title in queries.values()]
})

df.to_excel('output.xlsx', index=False)

print("Responses have been saved to 'output.xlsx'.")




[1m> Entering new StuffDocumentsChain chain...[0m
Responses have been saved to 'output.xlsx'.


In [12]:
import os
import json
import requests
import numpy as np
from typing import List
import re
from collections import defaultdict
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain.callbacks import get_openai_callback
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document
import faiss

# Custom embedding function and class
def get_embedding(text: str) -> List[float]:
    response = requests.post(
        "http://0.0.0.0:5002/embeddings",
        json={"model": "BAAI/bge-small-en-v1.5", "input": [text]}
    )
    if response.status_code == 200:
        data = response.json()
        return data['data'][0]['embedding']
    else:
        raise Exception(f"API request failed with status code {response.status_code}")

class CustomEmbeddings:
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [get_embedding(text) for text in texts]

    def embed_query(self, text: str) -> List[float]:
        return get_embedding(text)

def load_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.read()

def split_into_sentences(text):
    sentences = re.split(r'(?<=[.?!])\s+', text)
    return [{'sentence': x, 'index': i} for i, x in enumerate(sentences)]

def combine_sentences(sentences, buffer_size=1):
    for i in range(len(sentences)):
        combined_sentence = ''
        
        for j in range(i - buffer_size, i):
            if j >= 0:
                combined_sentence += sentences[j]['sentence'] + ' '
        
        combined_sentence += sentences[i]['sentence']
        
        for j in range(i + 1, i + 1 + buffer_size):
            if j < len(sentences):
                combined_sentence += ' ' + sentences[j]['sentence']
        
        sentences[i]['combined_sentence'] = combined_sentence
    
    return sentences

def calculate_distances(sentences, embeddings):
    distances = []
    for i in range(len(sentences) - 1):
        embedding_current = embeddings[i]
        embedding_next = embeddings[i + 1]
        
        similarity = np.dot(embedding_current, embedding_next) / (
            np.linalg.norm(embedding_current) * np.linalg.norm(embedding_next)
        )
        
        distance = 1 - similarity
        distances.append(distance)
        sentences[i]['distance_to_next'] = distance
    
    return distances, sentences

def create_knowledge_base(sentences, embedder):
    # Convert sentences to Documents format
    documents = []
    for sentence in sentences:
        doc = Document(
            page_content=sentence['combined_sentence'],
            metadata={'index': sentence['index']}
        )
        documents.append(doc)
    
    # Create FAISS vector store
    embeddings = [sentence['combined_sentence_embedding'] for sentence in sentences]
    texts = [doc.page_content for doc in documents]
    
    vector_store = FAISS.from_embeddings(
        text_embeddings=list(zip(texts, embeddings)),
        embedding=embedder,
        metadatas=[doc.metadata for doc in documents]
    )
    
    return vector_store

def process_query(query, title, knowledge_base):
    docs = knowledge_base.similarity_search(query, k=4)  # Retrieve top 4 relevant documents
    
    llm = ChatOpenAI(
        model_name="meta-llama/Llama-3.1-8B-Instruct",
        openai_api_base="http://localhost:8000/v1",
        openai_api_key="FAKE",
        max_tokens=4096,
        temperature=0.1
    )
    
    chain = load_qa_chain(llm, verbose=True, chain_type='stuff')
    
    try:
        with get_openai_callback() as cost:
            response = chain.run(input_documents=docs, question=query)
        response = response.strip()
    except Exception as e:
        response = f"Error processing query: {e}"
    
    return title, response

def aggregate_responses(responses):
    aggregated = {}
    for title, response in responses:
        if title not in aggregated:
            aggregated[title] = ""
        aggregated[title] += "\n" + response if response else ""
    return aggregated

def process_tender_document(file_path, output_file):
    # Process text
    text = load_text_file(file_path)
    print(f"Loaded text file: {file_path}")
    
    sentences = split_into_sentences(text)
    print(f"{len(sentences)} sentences found")
    
    sentences = combine_sentences(sentences)
    
    embedder = CustomEmbeddings()
    embeddings = embedder.embed_documents([x['combined_sentence'] for x in sentences])
    
    for i, sentence in enumerate(sentences):
        sentence['combined_sentence_embedding'] = embeddings[i]
    
    distances, sentences = calculate_distances(sentences, embeddings)
    
    # Create knowledge base
    knowledge_base = create_knowledge_base(sentences, embedder)
    print("Knowledge base created successfully")
    
    # Define queries for analysis
    queries = {
        "Extract all the points of Eligibility criteria from the tender document": "PQ"
    }
    
    # Process queries in parallel
    titles = []
    responses = []
    
    with ThreadPoolExecutor(max_workers=len(queries)) as executor:
        future_to_query = {
            executor.submit(process_query, query, title, knowledge_base): title 
            for query, title in queries.items()
        }
        
        for future in as_completed(future_to_query):
            title = future_to_query[future]
            try:
                result_title, result_response = future.result()
                titles.append(result_title)
                responses.append((result_title, result_response))
            except Exception as e:
                print(f"Query processing failed for title {title}: {e}")
                titles.append(title)
                responses.append((title, f"Error: {e}"))
    
    # Aggregate and save results
    aggregated_responses = aggregate_responses(responses)
    df = pd.DataFrame({
        'Title': [title for title in queries.values()],
        'Response': [aggregated_responses.get(title, 'No response') for title in queries.values()]
    })
    
    df.to_excel(output_file, index=False)
    print(f"Responses have been saved to '{output_file}'.")

if __name__ == "__main__":
    file_path = "/data/tendergpt/testing/77153810/77153810.txt"
    output_file = "77326167.xlsx"
    process_tender_document(file_path, output_file)

Loaded text file: /data/tendergpt/testing/77153810/77153810.txt
646 sentences found


`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


Knowledge base created successfully
Query processing failed for title PQ: 'CustomEmbeddings' object is not callable
Responses have been saved to '77326167.xlsx'.


In [None]:
import os
import json
import requests
import numpy as np
from typing import List
import re
from collections import defaultdict
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain.callbacks import get_openai_callback
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document
from langchain.embeddings.base import Embeddings
import faiss

# Custom embedding class implementing LangChain's Embeddings interface
class CustomEmbeddings(Embeddings):
    def __init__(self):
        self.api_url = "http://0.0.0.0:5002/embeddings"
        self.model = "BAAI/bge-small-en-v1.5"

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a list of documents."""
        try:
            response = requests.post(
                self.api_url,
                json={"model": self.model, "input": texts}
            )
            if response.status_code == 200:
                data = response.json()
                return [item['embedding'] for item in data['data']]
            else:
                raise Exception(f"API request failed with status code {response.status_code}")
        except Exception as e:
            raise Exception(f"Error in embed_documents: {str(e)}")

    def embed_query(self, text: str) -> List[float]:
        """Embed a query."""
        try:
            response = requests.post(
                self.api_url,
                json={"model": self.model, "input": [text]}
            )
            if response.status_code == 200:
                data = response.json()
                return data['data'][0]['embedding']
            else:
                raise Exception(f"API request failed with status code {response.status_code}")
        except Exception as e:
            raise Exception(f"Error in embed_query: {str(e)}")

def load_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.read()

def split_into_sentences(text):
    sentences = re.split(r'(?<=[.?!])\s+', text)
    return [{'sentence': x, 'index': i} for i, x in enumerate(sentences)]

def combine_sentences(sentences, buffer_size=1):
    combined_sentences = []
    for i in range(len(sentences)):
        combined_sentence = ''
        
        # Add previous sentences
        for j in range(i - buffer_size, i):
            if j >= 0:
                combined_sentence += sentences[j]['sentence'] + ' '
        
        # Add current sentence
        combined_sentence += sentences[i]['sentence']
        
        # Add following sentences
        for j in range(i + 1, i + 1 + buffer_size):
            if j < len(sentences):
                combined_sentence += ' ' + sentences[j]['sentence']
        
        combined_sentences.append({
            'index': sentences[i]['index'],
            'combined_sentence': combined_sentence.strip()
        })
    
    return combined_sentences

def create_knowledge_base(sentences, embedder):
    try:
        # Convert sentences to Documents format
        documents = [
            Document(
                page_content=sentence['combined_sentence'],
                metadata={'index': sentence['index']}
            )
            for sentence in sentences
        ]
        
        # Create FAISS vector store
        texts = [doc.page_content for doc in documents]
        vector_store = FAISS.from_texts(
            texts,
            embedder,
            metadatas=[doc.metadata for doc in documents]
        )
        
        return vector_store
    except Exception as e:
        raise Exception(f"Error creating knowledge base: {str(e)}")

def process_query(query, title, knowledge_base):
    try:
        docs = knowledge_base.similarity_search(query, k=4)
        
        llm = ChatOpenAI(
            model_name="meta-llama/Llama-3.1-8B-Instruct",
            openai_api_base="http://localhost:8000/v1",
            openai_api_key="FAKE",
            max_tokens=4096,
            temperature=0.1
        )
        
        chain = load_qa_chain(llm, verbose=True, chain_type='stuff')
        
        with get_openai_callback() as cost:
            response = chain.run(input_documents=docs, question=query)
        
        return title, response.strip()
    except Exception as e:
        print(f"Error in process_query: {str(e)}")
        return title, f"Error processing query: {str(e)}"

def process_tender_document(file_path, output_file):
    try:
        # Process text
        print(f"Loading text file: {file_path}")
        text = load_text_file(file_path)
        
        print("Splitting text into sentences...")
        sentences = split_into_sentences(text)
        print(f"{len(sentences)} sentences found")
        
        print("Combining sentences with context...")
        sentences = combine_sentences(sentences)
        
        print("Initializing embedding model...")
        embedder = CustomEmbeddings()
        
        print("Creating knowledge base...")
        knowledge_base = create_knowledge_base(sentences, embedder)
        print("Knowledge base created successfully")
        
        # Define queries for analysis
        queries = {
            "Extract all the points of Eligibility criteria from the tender document. List each criterion separately.": "PQ"
        }
        
        # Process queries in parallel
        titles = []
        responses = []
        
        print("Processing queries...")
        with ThreadPoolExecutor(max_workers=len(queries)) as executor:
            future_to_query = {
                executor.submit(process_query, query, title, knowledge_base): title 
                for query, title in queries.items()
            }
            
            for future in as_completed(future_to_query):
                title = future_to_query[future]
                try:
                    result_title, result_response = future.result()
                    titles.append(result_title)
                    responses.append((result_title, result_response))
                except Exception as e:
                    print(f"Query processing failed for title {title}: {e}")
                    titles.append(title)
                    responses.append((title, f"Error: {e}"))
        
        # Aggregate and save results
        print("Saving results...")
        aggregated_responses = {
            title: "\n".join(response for t, response in responses if t == title)
            for title in set(titles)
        }
        
        df = pd.DataFrame({
            'Title': [title for title in queries.values()],
            'Response': [aggregated_responses.get(title, 'No response') for title in queries.values()]
        })
        
        df.to_excel(output_file, index=False)
        print(f"Responses have been saved to '{output_file}'")
        
    except Exception as e:
        print(f"Error in process_tender_document: {str(e)}")

if __name__ == "__main__":
    file_path = "/data/tendergpt/testing/77153810/77153810.txt"
    output_file = "77326167.xlsx"
    process_tender_document(file_path, output_file)

## without visualizatiomn

In [17]:
import os
import json
import requests
from typing import List
import numpy as np
from collections import defaultdict
import re
import faiss

# Custom embedding function and class
def get_embedding(text: str) -> List[float]:
    response = requests.post(
        "http://0.0.0.0:5002/embeddings",
        json={"model": "BAAI/bge-small-en-v1.5", "input": [text]}
    )
    if response.status_code == 200:
        data = response.json()
        return data['data'][0]['embedding']
    else:
        raise Exception(f"API request failed with status code {response.status_code}")

class CustomEmbeddings:
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [get_embedding(text) for text in texts]

    def embed_query(self, text: str) -> List[float]:
        return get_embedding(text)

# Load and process text file
def load_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.read()

# Split text into sentences
def split_into_sentences(text):
    sentences = re.split(r'(?<=[.?!])\s+', text)
    return [{'sentence': x, 'index': i} for i, x in enumerate(sentences)]

# Combine sentences with buffer
def combine_sentences(sentences, buffer_size=1):
    for i in range(len(sentences)):
        combined_sentence = ''
        
        # Add previous sentences
        for j in range(i - buffer_size, i):
            if j >= 0:
                combined_sentence += sentences[j]['sentence'] + ' '
        
        # Add current sentence
        combined_sentence += sentences[i]['sentence']
        
        # Add following sentences
        for j in range(i + 1, i + 1 + buffer_size):
            if j < len(sentences):
                combined_sentence += ' ' + sentences[j]['sentence']
        
        sentences[i]['combined_sentence'] = combined_sentence
    
    return sentences

# Calculate distances between embeddings
def calculate_distances(sentences, embeddings):
    distances = []
    for i in range(len(sentences) - 1):
        embedding_current = embeddings[i]
        embedding_next = embeddings[i + 1]
        
        # Calculate cosine similarity
        similarity = np.dot(embedding_current, embedding_next) / (
            np.linalg.norm(embedding_current) * np.linalg.norm(embedding_next)
        )
        
        # Convert to distance
        distance = 1 - similarity
        distances.append(distance)
        sentences[i]['distance_to_next'] = distance
    
    return distances, sentences

# Semantic clustering and labeling functions
def initialize_label_index(labels, embedder):
    """Initialize FAISS index with label embeddings"""
    label_embeddings = embedder.embed_documents(labels)
    label_embeddings = np.array(label_embeddings).astype('float32')
    
    dimension = len(label_embeddings[0])
    faiss_index = faiss.IndexFlatL2(dimension)
    faiss_index.add(label_embeddings)
    
    return faiss_index, label_embeddings

def segment_text_with_labels(chunks, labels, embedder, threshold=-0.7):
    """Segment text chunks and assign labels based on semantic similarity"""
    labeled_segments = defaultdict(list)
    faiss_index, label_embeddings = initialize_label_index(labels, embedder)
    
    for chunk in chunks:
        if chunk.strip():
            chunk_embedding = np.array(embedder.embed_query(chunk)).reshape(1, -1).astype('float32')
            
            # Get similarity scores with all labels
            distances, label_indices = faiss_index.search(chunk_embedding, len(labels))
            similarities = 1 - distances / 2  # Convert L2 distances to approximate cosine similarities
            
            # Assign labels based on similarity threshold
            assigned_labels = [labels[i] for i, sim in enumerate(similarities[0]) if sim >= threshold]
            
            for label in assigned_labels:
                labeled_segments[label].append(chunk)
            if not assigned_labels:
                labeled_segments["Other"].append(chunk)
    
    return labeled_segments

# Main processing function
def process_text(file_path):
    # Load text
    text = load_text_file(file_path)
    
    # Split into sentences
    sentences = split_into_sentences(text)
    
    # Combine sentences with context
    sentences = combine_sentences(sentences)
    
    # Create embeddings
    embedder = CustomEmbeddings()
    embeddings = embedder.embed_documents([x['combined_sentence'] for x in sentences])
    
    # Store embeddings in sentences
    for i, sentence in enumerate(sentences):
        sentence['combined_sentence_embedding'] = embeddings[i]
    
    # Calculate distances
    distances, sentences = calculate_distances(sentences, embeddings)
    
    return sentences, distances, embeddings, embedder

# Usage example
if __name__ == "__main__":
    # File path and labels
    file_path = "/data/tendergpt/testing/77153810/77153810.txt"
    labels = ["Important Date", "Eligibility or Prequalification Criteria", 
              "scope of work", "Contact Details"]
    
    # Process text
    sentences, distances, embeddings, embedder = process_text(file_path)
    
    # Identify chunk breakpoints
    breakpoint_distance_threshold = np.percentile(distances, 95)
    indices_above_thresh = [i for i, x in enumerate(distances) if x > breakpoint_distance_threshold]
    
    # Create chunks
    chunks = []
    start_index = 0
    for index in indices_above_thresh:
        group = sentences[start_index:index + 1]
        combined_text = ' '.join([d['sentence'] for d in group])
        chunks.append(combined_text)
        start_index = index + 1
    
    # Add final chunk
    if start_index < len(sentences):
        combined_text = ' '.join([d['sentence'] for d in sentences[start_index:]])
        chunks.append(combined_text)
    
    # Segment and label chunks
    labeled_segments = segment_text_with_labels(chunks, labels, embedder)
    
    # Save results to JSON
    out_file_path = 'labeled_segments.json'
    with open(out_file_path, 'w', encoding='utf-8') as out_file:
        json.dump(labeled_segments, out_file, indent=4, ensure_ascii=False)


## working with semenation 

In [None]:
import os
import json
import requests
from typing import List
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import re
from typing import List
import faiss

# Custom embedding function and class
def get_embedding(text: str) -> List[float]:
    response = requests.post(
        "http://0.0.0.0:5002/embeddings",
        json={"model": "BAAI/bge-small-en-v1.5", "input": [text]}
    )
    if response.status_code == 200:
        data = response.json()
        return data['data'][0]['embedding']
    else:
        raise Exception(f"API request failed with status code {response.status_code}")

class CustomEmbeddings:
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [get_embedding(text) for text in texts]

    def embed_query(self, text: str) -> List[float]:
        return get_embedding(text)

# Load and process text file
def load_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.read()

# Split text into sentences
def split_into_sentences(text):
    sentences = re.split(r'(?<=[.?!])\s+', text)
    return [{'sentence': x, 'index': i} for i, x in enumerate(sentences)]

# Combine sentences with buffer
def combine_sentences(sentences, buffer_size=1):
    for i in range(len(sentences)):
        combined_sentence = ''
        
        # Add previous sentences
        for j in range(i - buffer_size, i):
            if j >= 0:
                combined_sentence += sentences[j]['sentence'] + ' '
        
        # Add current sentence
        combined_sentence += sentences[i]['sentence']
        
        # Add following sentences
        for j in range(i + 1, i + 1 + buffer_size):
            if j < len(sentences):
                combined_sentence += ' ' + sentences[j]['sentence']
        
        sentences[i]['combined_sentence'] = combined_sentence
    
    return sentences

# Calculate distances between embeddings
def calculate_distances(sentences, embeddings):
    distances = []
    for i in range(len(sentences) - 1):
        embedding_current = embeddings[i]
        embedding_next = embeddings[i + 1]
        
        # Calculate cosine similarity
        similarity = np.dot(embedding_current, embedding_next) / (
            np.linalg.norm(embedding_current) * np.linalg.norm(embedding_next)
        )
        
        # Convert to distance
        distance = 1 - similarity
        distances.append(distance)
        sentences[i]['distance_to_next'] = distance
    
    return distances, sentences

# Semantic clustering and labeling functions
def initialize_label_index(labels, embedder):
    """Initialize FAISS index with label embeddings"""
    label_embeddings = embedder.embed_documents(labels)
    label_embeddings = np.array(label_embeddings).astype('float32')
    
    dimension = len(label_embeddings[0])
    faiss_index = faiss.IndexFlatL2(dimension)
    faiss_index.add(label_embeddings)
    
    return faiss_index, label_embeddings

def segment_text_with_labels(chunks, labels, embedder, threshold=-0.7):
    """Segment text chunks and assign labels based on semantic similarity"""
    labeled_segments = defaultdict(list)
    faiss_index, label_embeddings = initialize_label_index(labels, embedder)
    
    for chunk in chunks:
        if chunk.strip():
            chunk_embedding = np.array(embedder.embed_query(chunk)).reshape(1, -1).astype('float32')
            
            # Get similarity scores with all labels
            distances, label_indices = faiss_index.search(chunk_embedding, len(labels))
            similarities = 1 - distances / 2  # Convert L2 distances to approximate cosine similarities
            
            # Assign labels based on similarity threshold
            assigned_labels = [labels[i] for i, sim in enumerate(similarities[0]) if sim >= threshold]
            
            if assigned_labels:
                print(f"Assigned labels: {assigned_labels}")
                for label in assigned_labels:
                    labeled_segments[label].append(chunk)
            else:
                labeled_segments["Other"].append(chunk)
    
    # Print statistics
    for label in labels + ["Other"]:
        print(f"{label}: {len(labeled_segments[label])} chunks")
    
    return labeled_segments

def visualize_clusters(labeled_segments, labels, embedder):
    """Visualize the clustering of text segments"""
    colors = ['red', 'blue', 'green', 'yellow', 'purple', 'orange']
    scatter_points = []
    labels_for_plot = []
    
    # Prepare data for visualization
    for label_index, label in enumerate(labels):
        paragraphs = labeled_segments[label]
        if paragraphs:
            paragraph_embeddings = embedder.embed_documents(paragraphs)
            scatter_points.append(np.array(paragraph_embeddings))
            labels_for_plot.extend([label] * len(paragraphs))
    
    # Concatenate all embeddings
    all_embeddings = np.vstack(scatter_points)
    
    # Cluster using KMeans
    n_clusters = len(labels)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = kmeans.fit_predict(all_embeddings)
    
    # Reduce dimensions for visualization
    pca = PCA(n_components=2)
    reduced_embeddings = pca.fit_transform(all_embeddings)
    
    # Create scatter plot
    plt.figure(figsize=(10, 6))
    for i, label in enumerate(labels):
        indices = np.where(cluster_labels == i)[0]
        if len(indices) > 0:
            plt.scatter(reduced_embeddings[indices, 0], 
                       reduced_embeddings[indices, 1],
                       label=label, 
                       color=colors[i % len(colors)], 
                       alpha=0.7)
    
    plt.title('Text Segmentation Clusters')
    plt.xlabel('PCA Component 1')
    plt.ylabel('PCA Component 2')
    plt.legend()
    plt.grid(True)
    plt.show()

# Main processing function
def process_text(file_path):
    # Load text
    text = load_text_file(file_path)
    print(f"Loaded text file: {file_path}")
    
    # Split into sentences
    sentences = split_into_sentences(text)
    print(f"{len(sentences)} sentences found")
    
    # Combine sentences with context
    sentences = combine_sentences(sentences)
    
    # Create embeddings
    embedder = CustomEmbeddings()
    embeddings = embedder.embed_documents([x['combined_sentence'] for x in sentences])
    
    # Store embeddings in sentences
    for i, sentence in enumerate(sentences):
        sentence['combined_sentence_embedding'] = embeddings[i]
    
    # Calculate distances
    distances, sentences = calculate_distances(sentences, embeddings)
    
    return sentences, distances, embeddings, embedder

# Visualization function for chunks
def visualize_chunks(distances, breakpoint_percentile=95, y_upper_bound=0.2):
    plt.figure(figsize=(12, 6))
    plt.plot(distances)
    plt.ylim(0, y_upper_bound)
    plt.xlim(0, len(distances))
    
    breakpoint_distance_threshold = np.percentile(distances, breakpoint_percentile)
    plt.axhline(y=breakpoint_distance_threshold, color='r', linestyle='-')
    
    indices_above_thresh = [i for i, x in enumerate(distances) if x > breakpoint_distance_threshold]
    num_chunks = len(indices_above_thresh) + 1
    
    plt.text(x=(len(distances)*0.01), y=y_upper_bound/50, s=f"{num_chunks} Chunks")
    
    colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
    for i, breakpoint_index in enumerate(indices_above_thresh):
        start_index = 0 if i == 0 else indices_above_thresh[i - 1]
        end_index = breakpoint_index
        
        plt.axvspan(start_index, end_index, facecolor=colors[i % len(colors)], alpha=0.25)
        plt.text(x=np.average([start_index, end_index]),
                y=breakpoint_distance_threshold + (y_upper_bound)/20,
                s=f"Chunk #{i}",
                horizontalalignment='center',
                rotation='vertical')
    
    if indices_above_thresh:
        last_breakpoint = indices_above_thresh[-1]
        if last_breakpoint < len(distances):
            plt.axvspan(last_breakpoint, len(distances),
                       facecolor=colors[len(indices_above_thresh) % len(colors)],
                       alpha=0.25)
            plt.text(x=np.average([last_breakpoint, len(distances)]),
                    y=breakpoint_distance_threshold + (y_upper_bound)/20,
                    s=f"Chunk #{len(indices_above_thresh)}",
                    rotation='vertical')
    
    plt.title("Text Chunks Based On Embedding Breakpoints")
    plt.xlabel("Sentence Position")
    plt.ylabel("Cosine distance between sequential sentences")
    plt.show()
    
    return indices_above_thresh

# Usage example
if __name__ == "__main__":
    # File path and labels
    file_path = "/data/tendergpt/testing/77153810/77153810.txt"
    labels = ["Important Date", "Eligibility or Prequalification Criteria", 
              "scope of work", "Contact Details"]
    
    # Process text
    sentences, distances, embeddings, embedder = process_text(file_path)
    
    # Visualize and get chunk breakpoints
    indices_above_thresh = visualize_chunks(distances)
    
    # Create chunks
    chunks = []
    start_index = 0
    for index in indices_above_thresh:
        group = sentences[start_index:index + 1]
        combined_text = ' '.join([d['sentence'] for d in group])
        chunks.append(combined_text)
        start_index = index + 1
    
    # Add final chunk
    if start_index < len(sentences):
        combined_text = ' '.join([d['sentence'] for d in sentences[start_index:]])
        chunks.append(combined_text)
    
    print(f"Created {len(chunks)} chunks")
    
    # Segment and label chunks
    labeled_segments = segment_text_with_labels(chunks, labels, embedder)
    
    # Visualize clusters
    visualize_clusters(labeled_segments, labels, embedder)
    
    # Save results to JSON
    out_file_path = 'labeled_segments.json'
    with open(out_file_path, 'w', encoding='utf-8') as out_file:
        json.dump(labeled_segments, out_file, indent=4, ensure_ascii=False)
    
    print(f"Results saved to {out_file_path}")

In [None]:
import os
import json
import requests
import pandas as pd
from typing import List
from langchain.chat_models import ChatOpenAI
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt
from concurrent.futures import ThreadPoolExecutor, as_completed
# from langchain.llms import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks import get_openai_callback


# New Query Processing Functions
def process_query(query, title, knowledge_base):
    """Process a single query using the knowledge base"""
    try:
        # Retrieve relevant documents
        docs = knowledge_base.similarity_search(query)
        
        # Initialize the language model
        llm = ChatOpenAI(
            model_name="meta-llama/Llama-3.1-8B-Instruct",
            openai_api_base="http://localhost:8000/v1",
            openai_api_key="FAKE",
            max_tokens=4096,
            temperature=0.1
        )
        
        # Create and run the QA chain
        chain = load_qa_chain(llm, verbose=True, chain_type='stuff')
        
        with get_openai_callback() as cost:
            response = chain.run(input_documents=docs, question=query)
        
        return title, response.strip()
    
    except Exception as e:
        print(f"Error processing query: {e}")
        return title, f"Error processing query: {e}"

def aggregate_responses(responses):
    """Aggregate responses by title"""
    aggregated = {}
    for title, response in responses:
        if title not in aggregated:
            aggregated[title] = ""
        aggregated[title] += "\n" + response if response else ""
    return aggregated

def process_queries(knowledge_base, output_file):
    """Process all queries and save results"""
    # Define queries
    queries = {
        "Extract all points related to bidder eligibility criteria, pre-qualification requirements, "
        "blacklisting criteria, and technical capability for bid submission as mentioned in this "
        "tender document. Include specific requirements such as: experience, financial criteria, "
        "certifications, technical qualifications, equipment or resource requirements, technical "
        "capability specifications, prior project experience, legal compliance, mandatory documents "
        "or certificates, and any conditions related to blacklisting or prior performance. Ensure "
        "all technical qualifications, minimum standards, and capability-related conditions are "
        "accurately extracted without including irrelevant information.": "PQ"
    }
    
    # Initialize results storage
    titles = []
    responses = []
    
    # Process queries in parallel
    with ThreadPoolExecutor(max_workers=len(queries)) as executor:
        future_to_query = {
            executor.submit(process_query, query, title, knowledge_base): title 
            for query, title in queries.items()
        }
        
        for future in as_completed(future_to_query):
            title = future_to_query[future]
            try:
                result_title, result_response = future.result()
                titles.append(result_title)
                responses.append((result_title, result_response))
            except Exception as e:
                print(f"Query processing failed for title {title}: {e}")
                titles.append(title)
                responses.append((title, f"Error: {e}"))
    
    # Aggregate responses
    aggregated_responses = aggregate_responses(responses)
    
    # Create DataFrame
    df = pd.DataFrame({
        'Title': [title for title in queries.values()],
        'Response': [aggregated_responses.get(title, 'No response') for title in queries.values()]
    })
    
    # Save to Excel
    df.to_excel(output_file, index=False)
    print(f"Responses have been saved to '{output_file}'")
    
    return df

# Modified main execution
if __name__ == "__main__":
    # File paths
    input_file_path = "/data/tendergpt/testing/77153810/77153810.txt"
    output_file_path = "77153810.xlsx"
    
    # Labels for segmentation
    labels = [
        "Important Date",
        "Eligibility or Prequalification Criteria",
        "Technical Requirements",
        "Contact Details"
    ]
    
    # Process text and create knowledge base
    print("Processing text and creating knowledge base...")
    sentences, distances, embeddings, embedder = process_text(input_file_path)
    
    # Visualize and get chunk breakpoints
    indices_above_thresh = visualize_chunks(distances)
    
    # Create chunks
    chunks = []
    start_index = 0
    for index in indices_above_thresh:
        group = sentences[start_index:index + 1]
        combined_text = ' '.join([d['sentence'] for d in group])
        chunks.append(combined_text)
        start_index = index + 1
    
    # Add final chunk
    if start_index < len(sentences):
        combined_text = ' '.join([d['sentence'] for d in sentences[start_index:]])
        chunks.append(combined_text)
    
    print(f"Created {len(chunks)} chunks")
    
    # Segment and label chunks
    labeled_segments = segment_text_with_labels(chunks, labels, embedder)
    
    # Visualize clusters
    visualize_clusters(labeled_segments, labels, embedder)
    
    # Save labeled segments
    segments_file = 'labeled_segments.json'
    with open(segments_file, 'w', encoding='utf-8') as out_file:
        json.dump(labeled_segments, out_file, indent=4, ensure_ascii=False)
    
    print(f"Labeled segments saved to {segments_file}")
    
    # Process queries and generate responses
    print("Processing queries and generating responses...")
    results_df = process_queries(knowledge_base, output_file_path)
    
    print("Processing complete!")
    print(f"Results saved to {output_file_path}")

In [2]:
# import requests
# from typing import List
# from langchain.vectorstores import FAISS
# from langchain.text_splitter import CharacterTextSplitter
# from langchain.embeddings.base import Embeddings
# from langchain.text_splitter import RecursiveCharacterTextSplitter

# def get_embedding(text: str) -> List[float]:
#     response = requests.post("http://0.0.0.0:5002/embeddings",
#         json={"model": "BAAI/bge-small-en-v1.5", "input": [text]})
#     if response.status_code == 200:
#         data = response.json()
#         return data['data'][0]['embedding']
#     else:
#         raise Exception(f"API request failed with status code {response.status_code}")

# class CustomEmbeddings(Embeddings):
#     def embed_documents(self, texts: List[str]) -> List[List[float]]:
#         return [get_embedding(text) for text in texts]

#     def embed_query(self, text: str) -> List[float]:
#         return get_embedding(text)


# def process_text(text):
#     text = "\n".join([doc.page_content for doc in text])
    
#     # Semantic-based text splitting
#     text_splitter = RecursiveCharacterTextSplitter(
#         chunk_size=1024,        # Adjust chunk size as needed
#         chunk_overlap=512,      # Overlap for maintaining context
#         length_function=len,
#         separators=["\n\n", "\n", ".", " "]  # Split by paragraphs and sentences
#     )
    
#     chunks = text_splitter.split_text(text)
    
#     # Create embeddings for the knowledge base
#     embeddings = CustomEmbeddings()
#     knowledgeBase = FAISS.from_texts(chunks, embedding=embeddings)
#     print(knowledgeBase)
#     return knowledgeBase

##new changes 

In [3]:
import os
import requests
from typing import List
from langchain.vectorstores import FAISS
from langchain.embeddings.base import Embeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader


def get_embedding(text: str) -> List[float]:
    response = requests.post(
        "http://0.0.0.0:5002/embeddings",
        json={"model": "BAAI/bge-small-en-v1.5", "input": [text]}
    )
    if response.status_code == 200:
        data = response.json()
        return data['data'][0]['embedding']
    else:
        raise Exception(f"API request failed with status code {response.status_code}")


class CustomEmbeddings(Embeddings):
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [get_embedding(text) for text in texts]

    def embed_query(self, text: str) -> List[float]:
        return get_embedding(text)


def load_text_files_from_directory(folder_path):
    all_text = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            file_path = os.path.join(folder_path, filename)
            loader = TextLoader(file_path)
            docs = loader.load()
            all_text.extend(doc.page_content for doc in docs if hasattr(doc, 'page_content'))
    return all_text


def process_text(texts):
    # Join all document texts into a single string to preserve context
    combined_text = "\n".join(texts)
    
    # Text splitting based on semantic boundaries to keep context intact
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=4096,
        chunk_overlap=200,
        length_function=len,
        separators=["\n\n", "\n", ".", " "]
    )
    
    # Split text into chunks
    chunks = text_splitter.split_text(combined_text)
    
    # Create embeddings for the knowledge base
    embeddings = CustomEmbeddings()
    knowledge_base = FAISS.from_texts(chunks, embedding=embeddings)
    print(knowledge_base)
    return knowledge_base


# Specify folder path and load documents
# folder_path =f"/data/tendergpt/livetender_txt/77774640"
folder_path = f"/data/tendergpt/testing/77153810"
all_docs_text = load_text_files_from_directory(folder_path)

# Process texts and create knowledge base
knowledge_base = process_text(all_docs_text)


<langchain_community.vectorstores.faiss.FAISS object at 0x7efd65f80a90>


In [None]:
import os
import requests
from typing import List
from langchain.vectorstores import FAISS
from langchain.embeddings.base import Embeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader


def get_embedding(text: str) -> List[float]:
    response = requests.post(
        "http://0.0.0.0:5002/embeddings",
        json={"model": "BAAI/bge-small-en-v1.5", "input": [text]}
    )
    if response.status_code == 200:
        data = response.json()
        return data['data'][0]['embedding']
    else:
        raise Exception(f"API request failed with status code {response.status_code}")


class CustomEmbeddings(Embeddings):
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [get_embedding(text) for text in texts]

    def embed_query(self, text: str) -> List[float]:
        return get_embedding(text)


def load_text_files_from_directory(folder_path):
    all_text = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            file_path = os.path.join(folder_path, filename)
            loader = TextLoader(file_path)
            docs = loader.load()
            all_text.extend(doc.page_content for doc in docs if hasattr(doc, 'page_content'))
    return all_text


def process_text(texts):
    # Join all document texts into a single string to preserve context
    combined_text = "\n".join(texts)
    
    # Text splitting based on semantic boundaries to keep context intact
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=4096,
        chunk_overlap=200,
        length_function=len,
        separators=["\n\n", "\n", ".", " "]
    )
    
    # Split text into chunks
    chunks = text_splitter.split_text(combined_text)
    
    # Create embeddings for the knowledge base
    embeddings = CustomEmbeddings()
    knowledge_base = FAISS.from_texts(chunks, embedding=embeddings)
    return knowledge_base


# Specify folder path and load documents
# folder_path =f"/data/tendergpt/livetender_txt/77774640"
folder_path = f"/data/tendergpt/testing/77153810"
all_docs_text = load_text_files_from_directory(folder_path)

# Process texts and create knowledge base
knowledge_base = process_text(all_docs_text)

import json
from sentence_transformers import SentenceTransformer
import numpy as np
from collections import defaultdict
import faiss
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import numpy as np
import re

model = SentenceTransformer('all-MiniLM-L6-v2')
labels = ["Important Dates", "Eligibility or Prequalification Criteria", "Scope of Work", "Contact Details"]
outFile = r'G:\Hetvi\data_segregation\PDF&TXT\ZPPA-PU-ORD-001-14.txt'
with open(outFile, 'r', encoding='utf-8') as f:
    essay = f.read()

single_sentences_list = re.split(r'(?<=[.?!])\s+', essay)
print(f"{len(single_sentences_list)} sentences were found")

sentences = [{'sentence': x, 'index': i} for i, x in enumerate(single_sentences_list)]


def combine_sentences(sentences, buffer_size=1):
    for i in range(len(sentences)):
        combined_sentence = ''
        for j in range(i - buffer_size, i):
            if j >= 0:
                combined_sentence += sentences[j]['sentence'] + ' '

        combined_sentence += sentences[i]['sentence']

        for j in range(i + 1, i + 1 + buffer_size):
            if j < len(sentences):
                combined_sentence += ' ' + sentences[j]['sentence']
        sentences[i]['combined_sentence'] = combined_sentence
    return sentences


sentences = combine_sentences(sentences)
embeddings = model.encode([x['combined_sentence'] for x in sentences])

for i, sentence in enumerate(sentences):
    sentence['combined_sentence_embedding'] = embeddings[i]


def calculate_cosine_distances(sentences):
    distances = []
    for i in range(len(sentences) - 1):
        embedding_current = sentences[i]['combined_sentence_embedding']
        embedding_next = sentences[i + 1]['combined_sentence_embedding']
        similarity = cosine_similarity([embedding_current], [embedding_next])[0][0]
        distance = 1 - similarity
        distances.append(distance)
        sentences[i]['distance_to_next'] = distance
    return distances, sentences


distances, sentences = calculate_cosine_distances(sentences)

y_upper_bound = 0.2

breakpoint_percentile_threshold = 95  # 95
breakpoint_distance_threshold = np.percentile(distances, breakpoint_percentile_threshold)  # If you want more chunks, lower the percentile cutoff
indices_above_thresh = [i for i, x in enumerate(distances) if x > breakpoint_distance_threshold]  # The indices of those breakpoints on your list

start_index = 0

chunks = []
for index in indices_above_thresh:
    end_index = index
    group = sentences[start_index:end_index + 1]
    combined_text = ' '.join([d['sentence'] for d in group])
    chunks.append(combined_text)
    start_index = index + 1

if start_index < len(sentences):
    combined_text = ' '.join([d['sentence'] for d in sentences[start_index:]])
    chunks.append(combined_text)

# Semantic Clustering
label_embeddings = model.encode(labels)  # Get embeddings for labels

# Initialize FAISS index with label embeddings
dimension = label_embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(label_embeddings)


def segment_text_with_faiss_label_assignment(semantic_chunks):
    labeled_segments = defaultdict(list)
    for chunk in semantic_chunks:
        if chunk.strip():  
            paragraph_embedding = model.encode(chunk).reshape(1, -1)
            _, closest_label_index = faiss_index.search(paragraph_embedding, 1)
            closest_label = labels[closest_label_index[0][0]]
            labeled_segments[closest_label].append(chunk)
    return labeled_segments


RELEVANCE_THRESHOLD = -0.7  # Adjust based on experimentation
def segment_text_with_faiss_label_assignment(semantic_chunks, threshold=RELEVANCE_THRESHOLD):
    labeled_segments = defaultdict(list)
    for chunk in semantic_chunks:
        if chunk.strip():  
            paragraph_embedding = model.encode(chunk).reshape(1, -1)
            distances, label_indices = faiss_index.search(paragraph_embedding, len(labels))
            similarities = 1 - distances  
            assigned_labels = [labels[i] for i, sim in enumerate(similarities[0]) if sim >= threshold]

            if assigned_labels:
                print("Assigned labels : ")
                print(assigned_labels)
                for label in assigned_labels:
                    labeled_segments[label].append(chunk)
            else:
                labeled_segments["Other"].append(chunk)
    
    for label in labels:
        print(label, len(labeled_segments[label]))
    return labeled_segments


segmented_result = segment_text_with_faiss_label_assignment(chunks)
out_file_path = r'C:\Users\hetvi.solanki\Desktop\AIProjects\ragllm\ragTechniques\RAG_Techniques\data\out.json'
with open(out_file_path, 'w', encoding='utf-8') as out_file:
    json.dump(segmented_result, out_file, indent=4, ensure_ascii=False)


In [1]:
import os
import requests
from typing import List
from langchain.vectorstores import FAISS
from langchain.embeddings.base import Embeddings
from langchain.document_loaders import TextLoader
import json
from sentence_transformers import SentenceTransformer
import numpy as np
from collections import defaultdict
import faiss
import re
from sklearn.metrics.pairwise import cosine_similarity


# Get embeddings from external API
def get_embedding(text: str) -> List[float]:
    response = requests.post(
        "http://0.0.0.0:5002/embeddings",
        json={"model": "BAAI/bge-small-en-v1.5", "input": [text]}
    )
    if response.status_code == 200:
        data = response.json()
        return data['data'][0]['embedding']
    else:
        raise Exception(f"API request failed with status code {response.status_code}")


class CustomEmbeddings(Embeddings):
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [get_embedding(text) for text in texts]

    def embed_query(self, text: str) -> List[float]:
        return get_embedding(text)


# Load text from files
def load_text_files_from_directory(folder_path):
    all_text = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            file_path = os.path.join(folder_path, filename)
            loader = TextLoader(file_path)
            docs = loader.load()
            all_text.extend(doc.page_content for doc in docs if hasattr(doc, 'page_content'))
    return all_text


# Sentence processing and chunk creation
def combine_sentences(sentences, buffer_size=1):
    for i in range(len(sentences)):
        combined_sentence = ''
        for j in range(i - buffer_size, i):
            if j >= 0:
                combined_sentence += sentences[j]['sentence'] + ' '

        combined_sentence += sentences[i]['sentence']

        for j in range(i + 1, i + 1 + buffer_size):
            if j < len(sentences):
                combined_sentence += ' ' + sentences[j]['sentence']
        sentences[i]['combined_sentence'] = combined_sentence
    return sentences


# Prepare sentence embeddings
def process_text(texts):
    combined_text = "\n".join(texts)
    single_sentences_list = re.split(r'(?<=[.?!])\s+', combined_text)
    sentences = [{'sentence': x, 'index': i} for i, x in enumerate(single_sentences_list)]
    
    sentences = combine_sentences(sentences)
    embeddings = model.encode([x['combined_sentence'] for x in sentences])
    
    for i, sentence in enumerate(sentences):
        sentence['combined_sentence_embedding'] = embeddings[i]
    
    return sentences


# Calculate cosine distances between consecutive sentences
def calculate_cosine_distances(sentences):
    distances = []
    for i in range(len(sentences) - 1):
        embedding_current = sentences[i]['combined_sentence_embedding']
        embedding_next = sentences[i + 1]['combined_sentence_embedding']
        similarity = cosine_similarity([embedding_current], [embedding_next])[0][0]
        distance = 1 - similarity
        distances.append(distance)
        sentences[i]['distance_to_next'] = distance
    return distances, sentences


# FAISS semantic clustering
def segment_text_with_faiss_label_assignment(semantic_chunks, threshold=-0.7):
    labeled_segments = defaultdict(list)
    for chunk in semantic_chunks:
        if chunk.strip():
            paragraph_embedding = model.encode(chunk).reshape(1, -1)
            distances, label_indices = faiss_index.search(paragraph_embedding, len(labels))
            similarities = 1 - distances
            assigned_labels = [labels[i] for i, sim in enumerate(similarities[0]) if sim >= threshold]

            if assigned_labels:
                for label in assigned_labels:
                    labeled_segments[label].append(chunk)
            else:
                labeled_segments["Other"].append(chunk)
    return labeled_segments


# Main processing
model = SentenceTransformer('all-MiniLM-L6-v2')
labels = ["Important Dates", "Eligibility or Prequalification Criteria", "Scope of Work", "Contact Details"]

folder_path = "/data/tendergpt/testing/77153810"
all_docs_text = load_text_files_from_directory(folder_path)
sentences = process_text(all_docs_text)

distances, sentences = calculate_cosine_distances(sentences)

y_upper_bound = 0.2
breakpoint_percentile_threshold = 95
breakpoint_distance_threshold = np.percentile(distances, breakpoint_percentile_threshold)
indices_above_thresh = [i for i, x in enumerate(distances) if x > breakpoint_distance_threshold]

start_index = 0
chunks = []
for index in indices_above_thresh:
    end_index = index
    group = sentences[start_index:end_index + 1]
    combined_text = ' '.join([d['sentence'] for d in group])
    chunks.append(combined_text)
    start_index = index + 1

if start_index < len(sentences):
    combined_text = ' '.join([d['sentence'] for d in sentences[start_index:]])
    chunks.append(combined_text)



# Initialize FAISS knowledge base
knowledge_base = FAISS.from_texts(chunks, embedding=CustomEmbeddings())

# FAISS Label Assignment
label_embeddings = model.encode(labels)
dimension = label_embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)
print(faiss_index)
faiss_index.add(label_embeddings)

segmented_result = segment_text_with_faiss_label_assignment(chunks)

# Save result to JSON
out_file_path = r'/data/QAAPI/stored_files/out.json'
with open(out_file_path, 'w', encoding='utf-8') as out_file:
    json.dump(segmented_result, out_file, indent=4, ensure_ascii=False)

# # FAISS Label Assignment
# label_embeddings = model.encode(labels)
# dimension = label_embeddings.shape[1]
# faiss_index = faiss.IndexFlatL2(dimension)
# faiss_index.add(label_embeddings)

# segmented_result = segment_text_with_faiss_label_assignment(chunks)

# # Save result to JSON
# out_file_path = r'/data/QAAPI/stored_files\out.json'
# with open(out_file_path, 'w', encoding='utf-8') as out_file:
#     json.dump(segmented_result, out_file, indent=4, ensure_ascii=False)


  from tqdm.autonotebook import tqdm, trange


<faiss.swigfaiss_avx512.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x7ff2715b0cc0> >


In [None]:
<faiss.swigfaiss_avx512.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x7ff0386e0cc0> >

In [2]:
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain.callbacks import get_openai_callback

queries = {

       "Extract all the point of Eligibilty criteria from the tender document":"PQ"    
}


def process_query(query, title):
    # Simulate retrieving documents from the knowledge base
    docs = faiss_index.similarity_search(query)
    
    # Initialize the language model
    llm = ChatOpenAI(
        # model_name="meta-llama/Meta-Llama-3-8B-Instruct",
        model_name="meta-llama/Llama-3.1-8B-Instruct",
        openai_api_base="http://localhost:8000/v1",
        openai_api_key="FAKE",  # Replace with your actual key if needed
        max_tokens=4096,
        temperature=0.1
    )
    
    # Load the QA chain
    chain = load_qa_chain(llm,verbose=True, chain_type='stuff')
    
    # Run the chain and capture the response
    try:
        with get_openai_callback() as cost:
            response = chain.run(input_documents=docs, question=query)
        # Strip unnecessary context or text
        response = response.strip()
    except Exception as e:
        response = f"Error processing query: {e}"
    
    return title, response

def aggregate_responses(responses):
    aggregated = {}
    for title, response in responses:
        # Ensure each title entry starts with an empty string if not already in aggregated
        if title not in aggregated:
            aggregated[title] = ""
        # Concatenate the response to the existing entry
        aggregated[title] += "\n" + response if response else ""
    return aggregated



# Initialize lists to store results
titles = []
responses = []

# Use ThreadPoolExecutor to run queries in parallel
with ThreadPoolExecutor(max_workers=len(queries)) as executor:
    future_to_query = {executor.submit(process_query, query, title): title for query, title in queries.items()}
    
    for future in as_completed(future_to_query):
        title = future_to_query[future]
        try:
            result_title, result_response = future.result()
            titles.append(result_title)
            responses.append((result_title, result_response))
        except Exception as e:
            print(f"Query processing failed for title {title}: {e}")
            titles.append(title)
            responses.append((title, f"Error: {e}"))

# Aggregate responses for each point
aggregated_responses = aggregate_responses(responses)

# Create a DataFrame
df = pd.DataFrame({
    'Title': [title for title in queries.values()],
    'Response': [aggregated_responses.get(title, 'No response') for title in queries.values()]
})

# Save the DataFrame to an Excel file
df.to_excel('77326167.xlsx', index=False)

print("Responses have been saved to '75927775.xlsx'.")

Query processing failed for title PQ: 'IndexFlatL2' object has no attribute 'similarity_search'
Responses have been saved to '75927775.xlsx'.


In [None]:
import os
import re
import json
from typing import List
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from collections import defaultdict
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load Sentence Transformer model for embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

# Labels and FAISS index setup
labels = ["Important Date", "Eligibility or Prequalification Criteria", "scope of work", "Contact Details"]
label_embeddings = model.encode(labels)
dimension = label_embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(label_embeddings)

# Step 1: Load text files from directory
def load_text_files_from_directory(folder_path):
    all_text = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, "r", encoding="utf-8") as file:
                all_text.append(file.read())
    return all_text

# Step 2: Split text into semantic chunks
def split_text_into_chunks(texts: List[str], chunk_size=4096, chunk_overlap=200):
    combined_text = "\n".join(texts)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", ".", " "]
    )
    return text_splitter.split_text(combined_text)

# Step 3: Generate embeddings and label assignment
RELEVANCE_THRESHOLD = -0.7

def assign_labels_to_chunks(semantic_chunks):
    labeled_segments = defaultdict(list)
    for chunk in semantic_chunks:
        if chunk.strip():
            paragraph_embedding = model.encode(chunk).reshape(1, -1)
            distances, label_indices = faiss_index.search(paragraph_embedding, len(labels))
            similarities = 1 - distances  # Convert distances to cosine similarity
            assigned_labels = [labels[i] for i, sim in enumerate(similarities[0]) if sim >= RELEVANCE_THRESHOLD]
            
            if assigned_labels:
                for label in assigned_labels:
                    labeled_segments[label].append(chunk)
            else:
                labeled_segments["Other"].append(chunk)
    return labeled_segments

# Step 4: Visualize labeled chunks
def visualize_clusters(segmented_result):
    scatter_points = []
    labels_for_plot = []
    colors = ['red', 'blue', 'green', 'yellow']

    for label_index, label in enumerate(labels):
        paragraphs = segmented_result[label]
        paragraph_embeddings = model.encode(paragraphs)
        scatter_points.append(paragraph_embeddings)
        labels_for_plot.extend([label] * len(paragraphs))

    all_embeddings = np.vstack(scatter_points)
    kmeans = KMeans(n_clusters=len(labels), random_state=42)
    kmeans.fit(all_embeddings)
    cluster_labels = kmeans.labels_

    reduced_embeddings = PCA(n_components=2).fit_transform(all_embeddings)

    plt.figure(figsize=(10, 6))
    for i, label in enumerate(labels):
        indices = np.where(cluster_labels == i)[0]
        plt.scatter(reduced_embeddings[indices, 0], reduced_embeddings[indices, 1],
                    label=label, color=colors[i], alpha=0.7)
    plt.title('Text Segmentation Clusters with K-Means')
    plt.xlabel('PCA Component 1')
    plt.ylabel('PCA Component 2')
    plt.legend()
    plt.grid()
    plt.show()

# Step 5: Save results to JSON
def save_segmented_results(segmented_result, output_path):
    with open(output_path, 'w', encoding='utf-8') as out_file:
        json.dump(segmented_result, out_file, indent=4, ensure_ascii=False)

# Main workflow
folder_path = "/data/tendergpt/testing/77153810"  # Update to your folder path
all_text = load_text_files_from_directory(folder_path)
chunks = split_text_into_chunks(all_text)
segmented_result = assign_labels_to_chunks(chunks)

# Save and visualize results
output_path = "segmented_results.json"
save_segmented_results(segmented_result, output_path)
visualize_clusters(segmented_result)

# Optional: Search for a specific labeled chunk
eligibility_criteria_chunk = segmented_result["Eligibility or Prequalification Criteria"]


In [5]:
import re
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain.callbacks import get_openai_callback
from sentence_transformers import SentenceTransformer
import faiss
from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import ChatOpenAI

# Define queries
queries = {
    "Extract all points related to bidder eligibility criteria, pre-qualification requirements, blacklisting criteria, "
    "and technical capability for bid submission as mentioned in this tender document. Include specific requirements such as: "
    "experience, financial criteria, certifications, technical qualifications, equipment or resource requirements, technical "
    "capability specifications, prior project experience, legal compliance, mandatory documents or certificates, and any conditions "
    "related to blacklisting or prior performance. Ensure all technical qualifications, minimum standards, and capability-related "
    "conditions are accurately extracted without including irrelevant information.": "PQ"
}

# Initialize Sentence Transformer and FAISS index
model = SentenceTransformer('all-MiniLM-L6-v2')
dimension = model.get_sentence_embedding_dimension()
faiss_index = faiss.IndexFlatL2(dimension)

# Simulate document retrieval for the knowledge base
knowledge_base = ...  # Replace with your document retrieval or embedding logic

# Function to preprocess text and create embeddings
def preprocess_and_embed(text):
    # Split text into sentences
    sentences = re.split(r'(?<=[.?!])\s+', text)
    # Chunk sentences
    buffer_size = 2
    chunks = []
    for i in range(len(sentences)):
        chunk = ' '.join(sentences[max(0, i - buffer_size): min(len(sentences), i + buffer_size + 1)])
        chunks.append(chunk)

    # Create embeddings
    chunk_embeddings = model.encode(chunks)
    return chunks, chunk_embeddings

# Function to add chunks to FAISS index
def add_chunks_to_index(chunks, embeddings):
    for i, chunk in enumerate(chunks):
        faiss_index.add(embeddings[i].reshape(1, -1))

# Function to process a query
def process_query(query, title):
    try:
        # Search similar chunks in FAISS
        query_embedding = model.encode([query])
        distances, indices = faiss_index.search(query_embedding, k=5)  # Top 5 results

        # Retrieve documents based on FAISS results
        docs = [knowledge_base[idx] for idx in indices[0] if idx < len(knowledge_base)]  # Replace with actual retrieval logic

        # Initialize LLM and QA chain
        llm = ChatOpenAI(
            model_name="meta-llama/Llama-3.1-8B-Instruct",
            openai_api_base="http://localhost:8000/v1",
            openai_api_key="FAKE",  # Replace with actual key
            max_tokens=4096,
            temperature=0.1
        )
        chain = load_qa_chain(llm, verbose=True, chain_type='stuff')

        # Run the chain
        with get_openai_callback() as cost:
            response = chain.run(input_documents=docs, question=query)
        return title, response.strip()
    except Exception as e:
        return title, f"Error processing query: {e}"

# Function to aggregate responses
def aggregate_responses(responses):
    aggregated = {}
    for title, response in responses:
        aggregated[title] = aggregated.get(title, "") + "\n" + response
    return aggregated

# Process queries in parallel
responses = []
with ThreadPoolExecutor(max_workers=len(queries)) as executor:
    future_to_query = {executor.submit(process_query, query, title): title for query, title in queries.items()}
    for future in as_completed(future_to_query):
        title = future_to_query[future]
        try:
            result = future.result()
            responses.append(result)
        except Exception as e:
            responses.append((title, f"Error: {e}"))

# Aggregate responses
aggregated_responses = aggregate_responses(responses)

# Save results to Excel
df = pd.DataFrame({
    'Title': [title for title in queries.values()],
    'Response': [aggregated_responses.get(title, 'No response') for title in queries.values()]
})
df.to_excel('responses.xlsx', index=False)

print("Responses have been saved to 'responses.xlsx'.")


Responses have been saved to 'responses.xlsx'.


In [None]:
import os
import requests
from typing import List, Dict
from langchain.vectorstores import FAISS
from langchain.embeddings.base import Embeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain.callbacks import get_openai_callback

class CustomEmbeddings(Embeddings):
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [self.get_embedding(text) for text in texts]

    def embed_query(self, text: str) -> List[float]:
        return self.get_embedding(text)
    
    @staticmethod
    def get_embedding(text: str) -> List[float]:
        response = requests.post(
            "http://0.0.0.0:5002/embeddings",
            json={"model": "BAAI/bge-small-en-v1.5", "input": [text]}
        )
        if response.status_code == 200:
            data = response.json()
            return data['data'][0]['embedding']
        else:
            raise Exception(f"API request failed with status code {response.status_code}")

class TenderDocumentProcessor:
    def __init__(self, folder_path: str):
        self.folder_path = folder_path
        self.knowledge_base = None
        self.queries = {
            "eligibility": """
            Extract comprehensive eligibility criteria for bidders including:
            1. Basic qualification requirements
            2. Financial criteria (turnover, net worth, etc.)
            3. Technical qualifications and certifications
            4. Past experience requirements
            5. Legal compliance requirements
            6. Mandatory documents/certificates
            7. Blacklisting conditions
            8. Technical Capability
            Please provide specific details and numbers where mentioned.
            """,
            
            "dates": """
            Extract all important dates and deadlines including:
            1. Tender publication date
            2. Pre-bid meeting date and venue
            3. Bid submission start and end dates
            4. Technical bid opening date
            5. Financial bid opening date
            6. Project timeline/completion period
            List all dates in DD/MM/YYYY format where possible.
            """,
            
            "amounts": """
            Extract all financial details including:
            1. Estimated project cost
            2. EMD/Bid security amount
            3. Performance security amount
            4. Tender fee
            5. Minimum turnover requirement
            6. Any other significant financial figures
            Please specify the currency and provide exact figures.
            """,
            
            "scope": """
            Extract detailed scope of work including:
            1. Project overview and objectives
            2. Detailed deliverables
            3. Technical specifications
            4. Quality requirements
            5. Location details
            6. Timeline requirements
            7. Any specific conditions or constraints
            Please provide comprehensive details without omitting critical information.
            """,
            
            "contact": """
            Extract all contact information including:
            1. Tender inviting authority details
            2. Contact person name and designation
            3. Office address
            4. Phone numbers
            5. Email addresses
            6. Website details
            7. Helpdesk information
            Please include complete contact details for all mentioned points of contact.
            """
        }

    def load_documents(self) -> List[str]:
        all_text = []
        for filename in os.listdir(self.folder_path):
            if filename.endswith(".txt"):
                file_path = os.path.join(self.folder_path, filename)
                try:
                    loader = TextLoader(file_path)
                    docs = loader.load()
                    all_text.extend(doc.page_content for doc in docs if hasattr(doc, 'page_content'))
                except Exception as e:
                    print(f"Error loading file {filename}: {e}")
        return all_text

    def process_text(self, texts: List[str]):
        combined_text = "\n".join(texts)
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=4096,
            chunk_overlap=200,  # Increased overlap for better context
            length_function=len,
            separators=["\n\n", "\n", ".", " "]
        )
        chunks = text_splitter.split_text(combined_text)
        embeddings = CustomEmbeddings()
        self.knowledge_base = FAISS.from_texts(chunks, embedding=embeddings)

    def process_query(self, query: str, title: str) -> tuple:
        try:
            docs = self.knowledge_base.similarity_search(query, k=4)  # Increased k for better context
            
            llm = ChatOpenAI(
                model_name="meta-llama/Llama-3.1-8B-Instruct",
                openai_api_base="http://localhost:8000/v1",
                openai_api_key="FAKE",
                max_tokens=4096,
                temperature=0.1
            )
            
            chain = load_qa_chain(llm, verbose=True, chain_type='stuff')
            
            with get_openai_callback() as cost:
                response = chain.run(input_documents=docs, question=query)
            
            return title, response.strip()
        except Exception as e:
            return title, f"Error processing query: {e}"

    def process_tender_document(self, output_filename: str):
        # Load and process documents
        all_docs_text = self.load_documents()
        self.process_text(all_docs_text)
        
        # Process queries in parallel
        responses = []
        with ThreadPoolExecutor(max_workers=len(self.queries)) as executor:
            future_to_query = {
                executor.submit(self.process_query, query, title): title 
                for title, query in self.queries.items()
            }
            
            for future in as_completed(future_to_query):
                try:
                    title, response = future.result()
                    responses.append((title, response))
                except Exception as e:
                    print(f"Error in processing: {e}")
        
        # Create DataFrame and save to Excel
        df = pd.DataFrame(responses, columns=['Category', 'Information'])
        df.to_excel(output_filename, index=False)
        print(f"Results saved to {output_filename}")

# Usage example
if __name__ == "__main__":
    folder_path = "/data/tendergpt/testing/77774640"
    processor = TenderDocumentProcessor(folder_path)
    processor.process_tender_document("tender_analysis.xlsx")

In [2]:
import requests
from typing import List
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.base import Embeddings

def get_embedding(text: str) -> List[float]:
    response = requests.post("http://0.0.0.0:5002/embeddings",
        json={"model": "BAAI/bge-small-en-v1.5", "input": [text]})
    if response.status_code == 200:
        data = response.json()
        return data['data'][0]['embedding']
    else:
        raise Exception(f"API request failed with status code {response.status_code}")

class CustomEmbeddings(Embeddings):
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [get_embedding(text) for text in texts]

    def embed_query(self, text: str) -> List[float]:
        return get_embedding(text)

def process_text(text):
    text = "\n".join([doc.page_content for doc in text])
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=2048,
        chunk_overlap=32,
        length_function=len
    )
    chunks = text_splitter.split_text(text)
    
    embeddings = CustomEmbeddings()
    
    knowledgeBase = FAISS.from_texts(chunks, embedding=embeddings)
    return knowledgeBase

In [18]:
tcno  = 77562718 
def load_text_files_from_directory(folder_path):
    all_docs = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            file_path = os.path.join(folder_path, filename)
            loader = TextLoader(file_path)
            all_docs.extend(loader.load())
    return all_docs

In [19]:

# folder_path=f"/data/tendergpt/testing/77302344"
folder_path = f"/data/tendergpt/livetender_txt/77562718"
all_docs = load_text_files_from_directory(folder_path)
knowledge_base = process_text(all_docs)


In [None]:
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain.callbacks import get_openai_callback
from langchain import PromptTemplate
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI

# Define a more open-ended query to fetch all relevant information
queries = {
           """ Locate and extract the contact information for the officer associated with this tender. This includes:

The officer’s full name, precisely as listed.
The contact phone number in any format.
The official email address.
Each of these details is already present in the tender document. Search thoroughly across all sections to capture this information accurately. For any detail that cannot be found, return 'None.' """:"Contact details"
}



prompt_template = """
You are a  Question Answering assistant. Your primary task is to answer questions based STRICTLY on the provided context. 

RULES:
- ONLY answer if the question relates directly to the provided context.
- Do NOT provide information that is not explicitly mentioned in the context. Avoid speculating or adding details from outside the context.
- If the question does NOT directly match with the context, respond with  I don't know.
- If no context is provided, always respond with I don't know.
- Always use more text to elaborate the answer. However, ensure the elaboration is strictly based on the context.

Remember: Stick to the context. If uncertain, respond with I don't know.

Documents: {context}

Query: {query}
"""


# Function to create and run the chain with the given query and documents
def process_query(query, title):
    # Simulate retrieving documents from the FAISS knowledge base
    docs = knowledge_base.similarity_search(query)  # List of document objects
    
    # Initialize the language model (Llama 3)
    llm = ChatOpenAI(
        model_name="meta-llama/Meta-Llama-3-8B-Instruct",
        openai_api_base="http://10.0.0.19:8000/v1",
        openai_api_key="FAKE",  # Replace with your actual key if needed
        max_tokens=4096,
        temperature=0.1
     
    )
    
    # Define the template using `PromptTemplate`
    prompt = PromptTemplate(
        template=prompt_template,
        input_variables=["query", "context"]  # Define input variables
    )
    
    # Create the LLM chain using the prompt and the Llama 3 model
    chain = LLMChain(llm=llm, verbose=True,prompt=prompt)
    
    # Prepare the documents to be passed to the model
    context = "\n\n".join([doc.page_content for doc in docs])
    
    # Run the chain and capture the response
    try:
        with get_openai_callback() as cost:
            response = chain.run(query=query, context=context)  # Pass inputs directly
        # Strip unnecessary context or text
        response = response.strip()
    except Exception as e:
        response = f"Error processing query: {e}"
    
    return title, response

# Aggregate responses function
def aggregate_responses(responses):
    aggregated = {}
    for title, response in responses:
        if title not in aggregated:
            aggregated[title] = response
        else:
            aggregated[title] += "\n" + response
    return aggregated

# Initialize lists to store results
titles = []
responses = []

# Use ThreadPoolExecutor to run queries in parallel
with ThreadPoolExecutor(max_workers=len(queries)) as executor:
    future_to_query = {executor.submit(process_query, query, title): title for query, title in queries.items()}
    
    for future in as_completed(future_to_query):
        title = future_to_query[future]
        try:
            result_title, result_response = future.result()
            titles.append(result_title)
            responses.append((result_title, result_response))
        except Exception as e:
            print(f"Query processing failed for title {title}: {e}")
            titles.append(title)
            responses.append((title, f"Error: {e}"))

# Aggregate responses for each point
aggregated_responses = aggregate_responses(responses)

# Create a DataFrame
df = pd.DataFrame({
    'Title': [title for title in queries.values()],
    'Response': [aggregated_responses.get(title, 'No response') for title in queries.values()]
})

# Save the DataFrame to an Excel file
df.to_excel('77326167.xlsx', index=False)

print("Responses have been saved to '77326167.xlsx'.")


In [4]:
import os
import requests
from typing import List
from langchain.vectorstores import FAISS
from langchain.embeddings.base import Embeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader


def get_embedding(text: str) -> List[float]:
    response = requests.post(
        "http://0.0.0.0:5002/embeddings",
        json={"model": "BAAI/bge-small-en-v1.5", "input": [text]}
    )
    if response.status_code == 200:
        data = response.json()
        return data['data'][0]['embedding']
    else:
        raise Exception(f"API request failed with status code {response.status_code}")


class CustomEmbeddings(Embeddings):
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [get_embedding(text) for text in texts]

    def embed_query(self, text: str) -> List[float]:
        return get_embedding(text)


def load_text_files_from_directory(folder_path):
    all_text = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            file_path = os.path.join(folder_path, filename)
            loader = TextLoader(file_path)
            docs = loader.load()
            # Extract text content from Document objects
            all_text.extend(doc.page_content for doc in docs if hasattr(doc, 'page_content'))
    return all_text


def process_text(texts):
    # Join all document texts into a single string to preserve context
    combined_text = "\n".join(texts)
    
    # Text splitting based on semantic boundaries to keep context intact
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2048,
        chunk_overlap=32,
        length_function=len,
        separators=["\n\n", "\n", ".", " "]
    )
    
    # Split text into chunks
    chunks = text_splitter.split_text(combined_text)
    
    # Create embeddings for the knowledge base
    embeddings = CustomEmbeddings()
    knowledge_base = FAISS.from_texts(chunks, embedding=embeddings)
    return knowledge_base


# Specify folder path and load documents
folder_path =f"/data/tendergpt/livetender_txt/71484890"
all_docs_text = load_text_files_from_directory(folder_path)

# Process texts and create knowledge base
knowledge_base = process_text(all_docs_text)

import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain.callbacks import get_openai_callback


queries = {

    "Extract the contact details of the officer from this document, including their name, email ID, and contact number. Search thoroughly across relevant sections, such as 'Contact Information,' 'Officer Details,' 'Authorized Contact,' or similar headings. If any detail is not found, return 'None' for that field.": "Contact Details"
}




def process_query(query, title):
    # Simulate retrieving documents from the knowledge base
    docs = knowledge_base.similarity_search(query)
    
    # Initialize the language model
    llm = ChatOpenAI(
        model_name="meta-llama/Meta-Llama-3-8B-Instruct",
        openai_api_base="http://localhost:8000/v1",
        openai_api_key="FAKE",  # Replace with your actual key if needed
        max_tokens=4096,
        temperature=0.1
    )
    
    # Load the QA chain
    chain = load_qa_chain(llm,verbose=True, chain_type='stuff')
    
    # Run the chain and capture the response
    try:
        with get_openai_callback() as cost:
            response = chain.run(input_documents=docs, question=query)
        # Strip unnecessary context or text
        response = response.strip()
    except Exception as e:
        response = f"Error processing query: {e}"
    
    return title, response

def aggregate_responses(responses):
    aggregated = {}
    for title, response in responses:
        # Ensure each title entry starts with an empty string if not already in aggregated
        if title not in aggregated:
            aggregated[title] = ""
        # Concatenate the response to the existing entry
        aggregated[title] += "\n" + response if response else ""
    return aggregated



# Initialize lists to store results
titles = []
responses = []

# Use ThreadPoolExecutor to run queries in parallel
with ThreadPoolExecutor(max_workers=len(queries)) as executor:
    future_to_query = {executor.submit(process_query, query, title): title for query, title in queries.items()}
    
    for future in as_completed(future_to_query):
        title = future_to_query[future]
        try:
            result_title, result_response = future.result()
            titles.append(result_title)
            responses.append((result_title, result_response))
        except Exception as e:
            print(f"Query processing failed for title {title}: {e}")
            titles.append(title)
            responses.append((title, f"Error: {e}"))

# Aggregate responses for each point
aggregated_responses = aggregate_responses(responses)

# Create a DataFrame
df = pd.DataFrame({
    'Title': [title for title in queries.values()],
    'Response': [aggregated_responses.get(title, 'No response') for title in queries.values()]
})

# Save the DataFrame to an Excel file
df.to_excel('77326167.xlsx', index=False)

print("Responses have been saved to '75927775.xlsx'.")


In [None]:
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain.callbacks import get_openai_callback


queries = {

    "Extract the contact details of the officer from this document, including their name, email ID, and contact number. Search thoroughly across relevant sections, such as 'Contact Information,' 'Officer Details,' 'Authorized Contact,' or similar headings. If any detail is not found, return 'None' for that field.": "Contact Details"
}




def process_query(query, title):
    # Simulate retrieving documents from the knowledge base
    docs = knowledge_base.similarity_search(query)
    
    # Initialize the language model
    llm = ChatOpenAI(
        model_name="meta-llama/Meta-Llama-3-8B-Instruct",
        openai_api_base="http://localhost:8000/v1",
        openai_api_key="FAKE",  # Replace with your actual key if needed
        max_tokens=4096,
        temperature=0.1
    )
    
    # Load the QA chain
    chain = load_qa_chain(llm,verbose=True, chain_type='stuff')
    
    # Run the chain and capture the response
    try:
        with get_openai_callback() as cost:
            response = chain.run(input_documents=docs, question=query)
        # Strip unnecessary context or text
        response = response.strip()
    except Exception as e:
        response = f"Error processing query: {e}"
    
    return title, response

def aggregate_responses(responses):
    aggregated = {}
    for title, response in responses:
        # Ensure each title entry starts with an empty string if not already in aggregated
        if title not in aggregated:
            aggregated[title] = ""
        # Concatenate the response to the existing entry
        aggregated[title] += "\n" + response if response else ""
    return aggregated



# Initialize lists to store results
titles = []
responses = []

# Use ThreadPoolExecutor to run queries in parallel
with ThreadPoolExecutor(max_workers=len(queries)) as executor:
    future_to_query = {executor.submit(process_query, query, title): title for query, title in queries.items()}
    
    for future in as_completed(future_to_query):
        title = future_to_query[future]
        try:
            result_title, result_response = future.result()
            titles.append(result_title)
            responses.append((result_title, result_response))
        except Exception as e:
            print(f"Query processing failed for title {title}: {e}")
            titles.append(title)
            responses.append((title, f"Error: {e}"))

# Aggregate responses for each point
aggregated_responses = aggregate_responses(responses)

# Create a DataFrame
df = pd.DataFrame({
    'Title': [title for title in queries.values()],
    'Response': [aggregated_responses.get(title, 'No response') for title in queries.values()]
})

# Save the DataFrame to an Excel file
df.to_excel('77326167.xlsx', index=False)

print("Responses have been saved to '75927775.xlsx'.")

In [None]:
import os
import warnings
import requests
from typing import List
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.base import Embeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader
from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import ChatOpenAI
from langchain.callbacks import get_openai_callback

# Set environment variables
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ['TORCH_USE_CUDA_DSA'] = "0"
warnings.filterwarnings("ignore")

# Custom embedding class for connecting to external embedding API
class CustomEmbeddings(Embeddings):
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [get_embedding(text) for text in texts]

    def embed_query(self, text: str) -> List[float]:
        return get_embedding(text)

def get_embedding(text: str) -> List[float]:
    response = requests.post(
        "http://0.0.0.0:5002/embeddings",
        json={"model": "BAAI/bge-small-en-v1.5", "input": [text]}
    )
    if response.status_code == 200:
        return response.json()['data'][0]['embedding']
    else:
        raise Exception(f"API request failed with status code {response.status_code}")

# Load text files from directory
def load_text_files_from_directory(folder_path):
    all_text = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            file_path = os.path.join(folder_path, filename)
            loader = TextLoader(file_path)
            docs = loader.load()
            all_text.extend(doc.page_content for doc in docs if hasattr(doc, 'page_content'))
    return all_text

# Process text into hierarchical knowledge base
def process_text(texts):
    combined_text = "\n".join(texts)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2048, chunk_overlap=32, separators=["\n\n", "\n", ".", " "])
    chunks = text_splitter.split_text(combined_text)
    
    embeddings = CustomEmbeddings()
    knowledge_base = FAISS.from_texts(chunks, embedding=embeddings)
    return knowledge_base


queries = {
    "What are the functional requirements, also known as the scope of work, mentioned in the document?": "Scope of Work",
    "Clauses specifying  Pre-Qualification Criteria  or eligibility criteria": "Prequalification Criteria",
    "List all mandatory qualification criteria, including Blacklisting and required certifications": "Mandatory Qualification Criteria",
     "Performance criteria including work experience,experience and past performance criteria, emphasizing the need for prior similar project experience, references, and the successful completion of similar contracts": "Performance Criteria",
    "Financial criteria including turnover, Networth": "Financial Criteria",
    "Technical requirements": "Technical Requirements",
     "Work Specifications that bidders must meet to deliver tender requirements": "Specifications",
     "Supporting documents": "Supporting Documents",
    #  "List of all the dates mentioned in the tender document which should include Bid submission end date or due date of tender, Bid validity, Opening date, closing date, pre bid meeting date, EMD date":"Importants Date",
    "Extract a comprehensive list of all dates, times, and monetary values, along with their specific labels or descriptions as mentioned in the document. This includes but is not limited to the following fields: bid submission end date, tender due date, bid validity, opening date, closing date, pre-bid meeting date, EMD date, tender value, and tender fee. Group all extracted items under the label 'Important Dates and Amounts,' clearly specifying each date, time, or amount and its description as stated in the document.":"Important date",
    # "Extract all key important dates, times, and amounts mentioned in this document. Do not extract any irrelevant information.":"Importants Date",
    "Extract the contact details, including phone number, email address, and officer name. If the details are unavailable, return 'None' for the missing fields.":"Contact details"
  
       
}

def hierarchical_retrieve_and_process(query, title):
    # First-level retrieval to get relevant chunks
    initial_docs = knowledge_base.similarity_search(query, top_k=10)
    
    # Concatenate initial docs for second-level retrieval
    initial_text = "\n".join(doc.page_content for doc in initial_docs)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2048, chunk_overlap=32)
    fine_grained_chunks = text_splitter.split_text(initial_text)
    
    # Create sub-knowledge base for second-level retrieval
    sub_knowledge_base = FAISS.from_texts(fine_grained_chunks, embedding=CustomEmbeddings())
    final_docs = sub_knowledge_base.similarity_search(query, top_k=5)
    
    # Initialize LLM and QA chain for answering
    llm = ChatOpenAI(
        model_name="meta-llama/Meta-Llama-3-8B-Instruct",
        openai_api_base="http://localhost:8000/v1",
        openai_api_key="FAKE",
        max_tokens=4096,
        temperature=0.1
    )
    chain = load_qa_chain(llm, verbose=True, chain_type='stuff')
    
    # Generate response with hierarchical context
    with get_openai_callback() as cost:
        response = chain.run(input_documents=final_docs, question=query)
    
    return title, response.strip()

# Run queries in parallel
def aggregate_responses(responses):
    aggregated = {}
    for title, response in responses:
        if title not in aggregated:
            aggregated[title] = ""
        aggregated[title] += "\n" + response if response else ""
    return aggregated

# Execute hierarchical RAG and save to Excel
# folder_path = f"/data/tendergpt/livetender_txt/76542577"
folder_path = "/data/QAAPI/doc111_txt/74512478"
all_docs_text = load_text_files_from_directory(folder_path)
knowledge_base = process_text(all_docs_text)

titles, responses = [], []
with ThreadPoolExecutor(max_workers=len(queries)) as executor:
    future_to_query = {executor.submit(hierarchical_retrieve_and_process, query, title): title for query, title in queries.items()}
    
    for future in as_completed(future_to_query):
        title = future_to_query[future]
        try:
            result_title, result_response = future.result()
            titles.append(result_title)
            responses.append((result_title, result_response))
        except Exception as e:
            print(f"Query processing failed for title {title}: {e}")
            titles.append(title)
            responses.append((title, f"Error: {e}"))

# Aggregate responses and save to Excel
aggregated_responses = aggregate_responses(responses)
df = pd.DataFrame({'Title': [title for title in queries.values()],
                   'Response': [aggregated_responses.get(title, 'No response') for title in queries.values()]})
df.to_excel('77326167.xlsx', index=False)

print("Responses have been saved to '77326167.xlsx'.")


In [None]:
import os
import warnings
import requests
from typing import List
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.base import Embeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader
from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import ChatOpenAI
from langchain.callbacks import get_openai_callback

# Set environment variables
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ['TORCH_USE_CUDA_DSA'] = "0"
warnings.filterwarnings("ignore")

# Custom embedding with topic-specific handling
class CustomEmbeddings(Embeddings):
    def embed_documents(self, texts: List[str], topic: str = None) -> List[List[float]]:
        modified_texts = [f"{topic}: {text}" for text in texts] if topic else texts
        return [get_embedding(text) for text in modified_texts]

    def embed_query(self, text: str) -> List[float]:
        return get_embedding(text)

def get_embedding(text: str) -> List[float]:
    response = requests.post(
        "http://0.0.0.0:5002/embeddings",
        json={"model": "BAAI/bge-small-en-v1.5", "input": [text]}
    )
    if response.status_code == 200:
        return response.json()['data'][0]['embedding']
    else:
        raise Exception(f"API request failed with status code {response.status_code}")

# Load text files from directory
def load_text_files_from_directory(folder_path):
    all_text = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            file_path = os.path.join(folder_path, filename)
            loader = TextLoader(file_path)
            docs = loader.load()
            all_text.extend(doc.page_content for doc in docs if hasattr(doc, 'page_content'))
    return all_text

# Process text with hierarchical RAG and topic weighting
def process_text(texts, important_topics):
    combined_text = "\n".join(texts)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2048, chunk_overlap=32, separators=["\n\n", "\n", ".", " "])
    chunks = text_splitter.split_text(combined_text)
    
    # Apply weighted embeddings for key topics
    embeddings = CustomEmbeddings()
    weighted_chunks = []
    for topic in important_topics:
        weighted_chunks += [f"{topic}: {chunk}" for chunk in chunks]
    
    knowledge_base = FAISS.from_texts(weighted_chunks, embedding=embeddings)
    return knowledge_base

# Define queries with higher emphasis for key topics
queries = {
    "What are the functional requirements, also known as the scope of work, mentioned in the document?": "Scope of Work",
    "Extract clauses that specify Pre-Qualification Criteria or eligibility criteria.": "Prequalification Criteria",
    "Provide a detailed list of all dates, times, and monetary values in the document, including bid submission end date, opening date, pre-bid meeting date, and other specified dates. Label this list 'Important Dates.'": "Important Dates",
    "Extract contact details of the officer, including name, phone number, and email ID. For any details not available, return 'None' for missing fields.": "Contact Details"
}

important_topics = ["eligibility criteria", "scope of work", "important dates", "contact details"]

def hierarchical_retrieve_and_process(query, title):
    # Initial retrieval step with topic emphasis
    initial_docs = knowledge_base.similarity_search(f"{title}: {query}", top_k=10)
    initial_text = "\n".join(doc.page_content for doc in initial_docs)
    
    # Second-level retrieval with focused chunks for high-importance topics
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=32)
    fine_grained_chunks = text_splitter.split_text(initial_text)
    
    sub_knowledge_base = FAISS.from_texts(fine_grained_chunks, embedding=CustomEmbeddings())
    final_docs = sub_knowledge_base.similarity_search(query, top_k=5)
    
    # Initialize LLM and QA chain
    llm = ChatOpenAI(
        model_name="meta-llama/Meta-Llama-3-8B-Instruct",
        openai_api_base="http://localhost:8000/v1",
        openai_api_key="FAKE",
        max_tokens=4096,
        temperature=0.1
    )
    chain = load_qa_chain(llm, verbose=True, chain_type='stuff')
    
    # Run QA chain and retrieve response
    with get_openai_callback() as cost:
        response = chain.run(input_documents=final_docs, question=query)
    
    return title, response.strip()

# Aggregate responses
def aggregate_responses(responses):
    aggregated = {}
    for title, response in responses:
        if title not in aggregated:
            aggregated[title] = ""
        aggregated[title] += "\n" + response if response else ""
    return aggregated

# Execution
folder_path = "/data/tendergpt/testing/77326167"
all_docs_text = load_text_files_from_directory(folder_path)
knowledge_base = process_text(all_docs_text, important_topics)

titles, responses = [], []
with ThreadPoolExecutor(max_workers=len(queries)) as executor:
    future_to_query = {executor.submit(hierarchical_retrieve_and_process, query, title): title for query, title in queries.items()}
    
    for future in as_completed(future_to_query):
        title = future_to_query[future]
        try:
            result_title, result_response = future.result()
            titles.append(result_title)
            responses.append((result_title, result_response))
        except Exception as e:
            print(f"Query processing failed for title {title}: {e}")
            titles.append(title)
            responses.append((title, f"Error: {e}"))

# Aggregate responses and save to Excel
aggregated_responses = aggregate_responses(responses)
df = pd.DataFrame({'Title': [title for title in queries.values()],
                   'Response': [aggregated_responses.get(title, 'No response') for title in queries.values()]})
df.to_excel('77326167.xlsx', index=False)

print("Responses have been saved to '77326167.xlsx'.")


In [None]:
import os
import requests
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import ChatOpenAI
from langchain.callbacks import get_openai_callback
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader
from langchain.embeddings.base import Embeddings

# Custom embeddings setup
def get_embedding(text: str) -> List[float]:
    response = requests.post(
        "http://0.0.0.0:5002/embeddings",
        json={"model": "BAAI/bge-small-en-v1.5", "input": [text]}
    )
    if response.status_code == 200:
        data = response.json()
        return data['data'][0]['embedding']
    else:
        raise Exception(f"API request failed with status code {response.status_code}")

class CustomEmbeddings(Embeddings):
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [get_embedding(text) for text in texts]

    def embed_query(self, text: str) -> List[float]:
        return get_embedding(text)

# Text loading and processing
def load_text_files_from_directory(folder_path):
    all_text = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            file_path = os.path.join(folder_path, filename)
            loader = TextLoader(file_path)
            docs = loader.load()
            all_text.extend(doc.page_content for doc in docs if hasattr(doc, 'page_content'))
    return all_text

def process_text(texts):
    combined_text = "\n".join(texts)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1024,  # Smaller chunks to retain topic accuracy
        chunk_overlap=128,
        length_function=len,
        separators=["\n\n", "\n", ".", " "]
    )
    chunks = text_splitter.split_text(combined_text)
    embeddings = CustomEmbeddings()
    knowledge_base = FAISS.from_texts(chunks, embedding=embeddings)
    return knowledge_base

# Specify folder path and load documents
folder_path = "/data/QAAPI/doc111_txt/74512478"
all_docs_text = load_text_files_from_directory(folder_path)
knowledge_base = process_text(all_docs_text)

# Priority queries with detailed and specific prompts
queries = {
    "Detailed eligibility criteria for this tender, including pre-qualification conditions and any specific requirements": "Eligibility Criteria",
    "Full scope of work and functional requirements specified for bidders": "Scope of Work",
    "All important dates and times, including bid submission end date, opening date, bid validity, and pre-bid meeting date": "Important Dates",
    "Contact details of the officer, including officer’s name, email ID, and phone number": "Contact Details"
}

# Process query with hierarchical response generation
def process_query(query, title):
    docs = knowledge_base.similarity_search(query, k=5)  # Retrieve more relevant documents
    llm = ChatOpenAI(
        model_name="meta-llama/Meta-Llama-3-8B-Instruct",
        openai_api_base="http://localhost:8000/v1",
        openai_api_key="FAKE",
        max_tokens=2048,
        temperature=0.1
    )
    chain = load_qa_chain(llm, chain_type="map_reduce", verbose=True)  # Use map_reduce for hierarchical processing
    try:
        with get_openai_callback() as cost:
            response = chain.run(input_documents=docs, question=query)
        response = response.strip()
    except Exception as e:
        response = f"Error processing query: {e}"
    return title, response

def aggregate_responses(responses):
    aggregated = {}
    for title, response in responses:
        if title not in aggregated:
            aggregated[title] = ""
        aggregated[title] += "\n" + response if response else ""
    return aggregated

# Retrieve and process all queries
titles = []
responses = []
with ThreadPoolExecutor(max_workers=len(queries)) as executor:
    future_to_query = {executor.submit(process_query, query, title): title for query, title in queries.items()}
    for future in as_completed(future_to_query):
        title = future_to_query[future]
        try:
            result_title, result_response = future.result()
            titles.append(result_title)
            responses.append((result_title, result_response))
        except Exception as e:
            print(f"Query processing failed for title {title}: {e}")
            titles.append(title)
            responses.append((title, f"Error: {e}"))

# Aggregate and store responses
aggregated_responses = aggregate_responses(responses)
df = pd.DataFrame({
    'Title': [title for title in queries.values()],
    'Response': [aggregated_responses.get(title, 'No response') for title in queries.values()]
})
df.to_excel('77326167.xlsx', index=False)

print("Responses saved to '77326167.xlsx'.")


In [1]:
import os
import numpy as np
import pandas as pd
from typing import List, Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain.vectorstores import FAISS
from langchain.embeddings.base import Embeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain.callbacks import get_openai_callback
from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import ChatOpenAI
import requests

# Define Custom Embeddings to get embeddings from external API
def get_embedding(text: str) -> List[float]:
    response = requests.post(
        "http://0.0.0.0:5002/embeddings",
        json={"model": "BAAI/bge-small-en-v1.5", "input": [text]}
    )
    if response.status_code == 200:
        data = response.json()
        return data['data'][0]['embedding']
    else:
        raise Exception(f"API request failed with status code {response.status_code}")

class CustomEmbeddings(Embeddings):
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [get_embedding(text) for text in texts]

    def embed_query(self, text: str) -> List[float]:
        return get_embedding(text)

# Function to assign weights based on chunk content
def calculate_chunk_weight(chunk: str) -> float:
    keywords = ["scope of work", "eligibility", "performance criteria", "important dates", "contact details"]
    weight = 1.0  # Default weight
    if any(keyword in chunk.lower() for keyword in keywords):
        weight = 1.5  # Higher weight for specific keywords
    return weight

# Function to load text files from a directory
def load_text_files_from_directory(folder_path):
    all_text = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            file_path = os.path.join(folder_path, filename)
            loader = TextLoader(file_path)
            docs = loader.load()
            all_text.extend(doc.page_content for doc in docs if hasattr(doc, 'page_content'))
    return all_text

# Process text and create weighted embeddings for the knowledge base
def process_text(texts):
    combined_text = "\n".join(texts)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2048,
        chunk_overlap=32,
        length_function=len,
        separators=["\n\n", "\n", ".", " "]
    )
    chunks = text_splitter.split_text(combined_text)
    
    chunk_weights = [calculate_chunk_weight(chunk) for chunk in chunks]
    embeddings = CustomEmbeddings()
    weighted_embeddings = []
    
    for chunk, weight in zip(chunks, chunk_weights):
        embedding = np.array(embeddings.embed_documents([chunk])[0])
        weighted_embedding = embedding * weight
        weighted_embeddings.append(weighted_embedding.tolist())
    
    knowledge_base = FAISS.from_embeddings(weighted_embeddings, embedding=embeddings)
    return knowledge_base

# Initialize folder path and load documents
folder_path = "/data/QAAPI/doc111_txt/74512478"
all_docs_text = load_text_files_from_directory(folder_path)

# Process texts to create knowledge base with weighted embeddings
knowledge_base = process_text(all_docs_text)

# Define queries
queries = {
    "Identify the functional requirements, also referred to as the scope of work, specified in the document.": "Scope of Work",
    "Extract clauses that specify Pre-Qualification Criteria or eligibility criteria.": "Prequalification Criteria",
    "List all mandatory qualification criteria, including blacklisting status and required certifications.": "Mandatory Qualification Criteria",
    "Detail performance criteria, such as work experience, experience, and past performance requirements, focusing on similar project experience, references, and successful completion of similar contracts.": "Performance Criteria",
    "Provide the financial criteria outlined in the document, including turnover and net worth requirements.": "Financial Criteria",
    "Outline the technical requirements mentioned in the document.": "Technical Requirements",
    "Summarize the work specifications that bidders must meet to fulfill the tender requirements.": "Specifications",
    "List all supporting documents required for this tender.": "Supporting Documents",
    "Extract a comprehensive list of all dates, times, and monetary values, along with their specific labels or descriptions as mentioned in the document.": "Important Dates",
    "Extract the contact details of the officer, including name, email, and phone number, from the document.": "Contact Details"
}
import os
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain.callbacks import get_openai_callback
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain

# (Assume all imports and other function definitions above this point are correct)

# Adjusted process_query function
def process_query(query, title):
    docs = knowledge_base.similarity_search(query)
    llm = ChatOpenAI(
        model_name="meta-llama/Meta-Llama-3-8B-Instruct",
        openai_api_base="http://localhost:8000/v1",
        openai_api_key="FAKE",  # Replace with your actual key if needed
        max_tokens=4096,
        temperature=0.1
    )
    chain = load_qa_chain(llm, verbose=True, chain_type='stuff')
    
    try:
        with get_openai_callback() as cost:
            response = chain.run(input_documents=docs, question=query)
        response = response.strip()
    except Exception as e:
        response = f"Error processing query: {e}"
    
    return title, response  # Ensure both title and response are returned as a tuple

# Parallel query processing with debugging
responses = []
with ThreadPoolExecutor(max_workers=len(queries)) as executor:
    future_to_query = {executor.submit(process_query, query, title): title for query, title in queries.items()}
    
    for future in as_completed(future_to_query):
        title = future_to_query[future]
        try:
            result = future.result()
            print(f"Debug: Future result = {result}")  # Debug print to check the result structure
            if isinstance(result, tuple) and len(result) == 2:
                responses.append(result)
            else:
                print(f"Unexpected result format for title {title}: {result}")
                responses.append((title, "Unexpected result format"))
        except Exception as e:
            print(f"Query processing failed for title {title}: {e}")
            responses.append((title, f"Error: {e}"))

# Aggregate responses and save to Excel
def aggregate_responses(responses):
    aggregated = {}
    for title, response in responses:
        if title not in aggregated:
            aggregated[title] = ""
        aggregated[title] += "\n" + response if response else ""
    return aggregated

# Aggregate and write to DataFrame
aggregated_responses = aggregate_responses(responses)
df = pd.DataFrame({
    'Title': [title for title in queries.values()],
    'Response': [aggregated_responses.get(title, 'No response') for title in queries.values()]
})
df.to_excel('77326167.xlsx', index=False)
print("Responses have been saved to '77326167.xlsx'.")


ValueError: too many values to unpack (expected 2)

In [5]:
    # Initialize the language model (Llama 3)
    llm = ChatOpenAI(
        model_name="meta-llama/Meta-Llama-3-8B-Instruct",
        openai_api_base="http://10.0.0.19:8000/v1",
        openai_api_key="FAKE",  # Replace with your actual key if needed
        max_tokens=4096,
        temperature=0.1
    )

In [21]:
from langchain.chains import RetrievalQA
qa_chain =RetrievalQA.from_chain_type(
llm,
retriever =knowledge_base.as_retriever())

In [15]:
from langchain.chains import RetrievalQA
qa_chain =RetrievalQA.from_chain_type(
llm,chain_type="stuff",return_source_documents=True,
retriever =knowledge_base.as_retriever())

In [25]:
query = "extract the entire pre qualification criteria mentioned in this document"
response =qa_chain.invoke({"query":query})


In [26]:
response

{'query': 'extract the entire pre qualification criteria mentioned in this document',
 'result': "Here is the pre-qualification criteria mentioned in the document:\n\n1. Individual Bidders:\n\t* 1.1 Constitution or legal status of Bidder (attach copy)\n\t* 1.2 Total annual volume of civil engineering construction work executed and payments received in the last five years preceding the year in which bids are invited (attach certificate from Chartered Accountant)\n\t* 1.5 Qualifications of technical personnel proposed for the Contract (refer also to Clause 4.2(e) of the Instructions to Bidders and Clause 9.1 of Part-1 General Conditions of Contract)\n\t* 1.6 Financial reports for the last five years: balance sheets, profit and loss statements, auditors' reports, etc. (list below and attach copies)\n\t* 1.7 Evidence of access to financial resources to meet the qualification requirements: cash in hand, lines of credit, etc. (list below and attach copies of support documents)\n1.8 Name, add

In [29]:
# from langchain.chains import RetrievalQA

# qa_chain = RetrievalQA.from_chain_type(
#     llm,
#     chain_type="stuff",
#     return_source_documents=True,
#     retriever=knowledge_base.as_retriever()
# )

# query = (
#     "Locate the 'Pre-Qualification Criteria' section in this document and extract each criterion listed. "
#     "If the criteria are in a bulleted or numbered format, extract each item separately. Include any criteria "
#     "under alternative section titles, such as 'Qualification Requirements' or 'Eligibility Criteria'."
# )
# ponse = qa_chain.invoke({"query": query})


In [30]:
response 

{'query': "Please extract all details from the 'Pre-Qualification Criteria' section of this document. Include each criterion in a list or structured format, and ensure that no points are omitted.",
 'result': 'Here are the pre-qualification criteria extracted from the document:\n\n**Pre-Qualification Criteria**\n\n**Individual Bidders**\n\n1. **Constitution or legal status of Bidder**\n\t* Attach copy\n\t* Place of registration: ________________________\n\t* Principal place of business: ________________________\n\t* Power of attorney of signatory ______________________ of Bid [Attach]\n2. **Total annual volume of civil engineering construction work executed and payments received in the last five years preceding the year in which bids are invited**\n\t* (Attach certificate from Chartered Accountant)\n\t* Rs. In lakhs: ______________________________________\n3. **Evidence of access to financial resources to meet the qualification requirements**\n\t* Cash in hand, lines of credit, etc.\n\

In [None]:
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain.callbacks import get_openai_callback
from langchain import PromptTemplate
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI

# Define a more open-ended query to fetch all relevant information
queries = {
           """ Locate and extract the contact information for the officer associated with this tender. This includes:

The officer’s full name, precisely as listed.
The contact phone number in any format.
The official email address.
Each of these details is already present in the tender document. Search thoroughly across all sections to capture this information accurately. For any detail that cannot be found, return 'None.' """:"Contact details"       
                                        
                                        

}



prompt_template = """
You are a  Question Answering assistant. Your primary task is to answer questions based STRICTLY on the provided context. 

RULES:
- ONLY answer if the question relates directly to the provided context.
- Do NOT provide information that is not explicitly mentioned in the context. Avoid speculating or adding details from outside the context.
- If the question does NOT directly match with the context, respond with  I don't know.
- If no context is provided, always respond with I don't know.
- Always use more text to elaborate the answer. However, ensure the elaboration is strictly based on the context.

Remember: Stick to the context. If uncertain, respond with I don't know.

Documents: {context}

Query: {query}
"""


# Function to create and run the chain with the given query and documents
def process_query(query, title):
    # Simulate retrieving documents from the FAISS knowledge base
    docs = knowledge_base.similarity_search(query)  # List of document objects
    
    # Initialize the language model (Llama 3)
    llm = ChatOpenAI(
        model_name="meta-llama/Meta-Llama-3-8B-Instruct",
        openai_api_base="http://10.0.0.19:8000/v1",
        openai_api_key="FAKE",  # Replace with your actual key if needed
        max_tokens=4096,
        temperature=0.1
     
    )
    
    # Define the template using `PromptTemplate`
    prompt = PromptTemplate(
        template=prompt_template,
        input_variables=["query", "context"]  # Define input variables
    )
    
    # Create the LLM chain using the prompt and the Llama 3 model
    chain = LLMChain(llm=llm, verbose=True,prompt=prompt)
    
    # Prepare the documents to be passed to the model
    context = "\n\n".join([doc.page_content for doc in docs])
    
    # Run the chain and capture the response
    try:
        with get_openai_callback() as cost:
            response = chain.run(query=query, context=context)  # Pass inputs directly
        # Strip unnecessary context or text
        response = response.strip()
    except Exception as e:
        response = f"Error processing query: {e}"
    
    return title, response

# Aggregate responses function
def aggregate_responses(responses):
    aggregated = {}
    for title, response in responses:
        if title not in aggregated:
            aggregated[title] = response
        else:
            aggregated[title] += "\n" + response
    return aggregated

# Initialize lists to store results
titles = []
responses = []

# Use ThreadPoolExecutor to run queries in parallel
with ThreadPoolExecutor(max_workers=len(queries)) as executor:
    future_to_query = {executor.submit(process_query, query, title): title for query, title in queries.items()}
    
    for future in as_completed(future_to_query):
        title = future_to_query[future]
        try:
            result_title, result_response = future.result()
            titles.append(result_title)
            responses.append((result_title, result_response))
        except Exception as e:
            print(f"Query processing failed for title {title}: {e}")
            titles.append(title)
            responses.append((title, f"Error: {e}"))

# Aggregate responses for each point
aggregated_responses = aggregate_responses(responses)

# Create a DataFrame
df = pd.DataFrame({
    'Title': [title for title in queries.values()],
    'Response': [aggregated_responses.get(title, 'No response') for title in queries.values()]
})

# Save the DataFrame to an Excel file
df.to_excel('77326167.xlsx', index=False)

print("Responses have been saved to '77326167.xlsx'.")


In [None]:
# import pandas as pd
# from concurrent.futures import ThreadPoolExecutor, as_completed
# from langchain.callbacks import get_openai_callback

# # Define queries with more context for accuracy
# queries = {
#     "Identify the functional requirements, also referred to as the scope of work, specified in the document.": "Scope of Work",
#     "Extract clauses that specify Pre-Qualification Criteria or eligibility criteria.": "Prequalification Criteria",
#     "List all mandatory qualification criteria, including blacklisting status and required certifications.": "Mandatory Qualification Criteria",
#     "Detail performance criteria, such as work experience, experience, and past performance requirements, focusing on similar project experience, references, and successful completion of similar contracts.": "Performance Criteria",
#     "Provide the financial criteria outlined in the document, including turnover and net worth requirements.": "Financial Criteria",
#     "Outline the technical requirements mentioned in the document.": "Technical Requirements",
#     "Summarize the work specifications that bidders must meet to fulfill the tender requirements.": "Specifications",
#     "List all supporting documents required for this tender.": "Supporting Documents",
#     "Identify and extract all dates mentioned throughout this tender document, regardless of context. List each date along with any associated description or event if provided (e.g., Bid submission end date, Opening date, Bid validity, pre-bid meeting date, etc.). Label this list as 'All Dates in Document.'": "Important Dates",
#     "Extract the contact details of the officer from this document, including their name, email ID, and contact number. Search thoroughly across relevant sections, such as 'Contact Information,' 'Officer Details,' 'Authorized Contact,' or similar headings. If any detail is not found, return 'None' for that field.": "Contact Details"
# }


# # Retrieve main context or summary from the document
# main_context = " ".join(doc.page_content for doc in knowledge_base.similarity_search("tender document summary"))


# def process_query(query, title, main_context):
#     # Incorporate main context into each query for clarity
#     query_with_context = f"{main_context}\n\n{query}"
    
#     # Initialize the language model
#     llm = ChatOpenAI(
#         model_name="meta-llama/Meta-Llama-3-8B-Instruct",
#         openai_api_base="http://localhost:8000/v1",
#         openai_api_key="FAKE",  # Replace with your actual key if needed
#         max_tokens=4096,
#         temperature=0.1
#     )
    
#     # Load the QA chain
#     chain = load_qa_chain(llm, verbose=True, chain_type='stuff')
    
#     # Run the chain and capture the response
#     try:
#         with get_openai_callback() as cost:
#             response = chain.run(input_documents=knowledge_base.similarity_search(query_with_context), question=query)
#         # Strip unnecessary context or text
#         response = response.strip()
#     except Exception as e:
#         response = f"Error processing query: {e}"
    
#     return title, response

# def aggregate_responses(responses):
#     aggregated = {}
#     for title, response in responses:
#         if title not in aggregated:
#             aggregated[title] = ""
#         aggregated[title] += "\n" + response if response else ""
#     return aggregated

# # Initialize lists to store results
# titles = []
# responses = []

# # Use ThreadPoolExecutor to run queries in parallel
# with ThreadPoolExecutor(max_workers=len(queries)) as executor:
#     future_to_query = {executor.submit(process_query, query, title, main_context): title for query, title in queries.items()}
    
#     for future in as_completed(future_to_query):
#         title = future_to_query[future]
#         try:
#             result_title, result_response = future.result()
#             titles.append(result_title)
#             responses.append((result_title, result_response))
#         except Exception as e:
#             print(f"Query processing failed for title {title}: {e}")
#             titles.append(title)
#             responses.append((title, f"Error: {e}"))

# # Aggregate responses for each point
# aggregated_responses = aggregate_responses(responses)

# # Create a DataFrame
# df = pd.DataFrame({
#     'Title': [title for title in queries.values()],
#     'Response': [aggregated_responses.get(title, 'No response') for title in queries.values()]
# })

# # Save the DataFrame to an Excel file
# df.to_excel('responses.xlsx', index=False)

# print("Responses have been saved to 'responses.xlsx'.")


In [None]:
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain.callbacks import get_openai_callback

queries = {
    "Identify the functional requirements, also referred to as the scope of work, specified in the document.": "Scope of Work",
    "Extract clauses that specify Pre-Qualification Criteria or eligibility criteria.": "Prequalification Criteria",
    "List all mandatory qualification criteria, including blacklisting status and required certifications.": "Mandatory Qualification Criteria",,
    "Summarize the work specifications that bidders must meet to fulfill the tender requirements.": "Specifications",
    "List all supporting documents required for this tender.": "Supporting Documents",
    "Extract a comprehensive list of all dates, times, and monetary values, along with their specific labels or descriptions as mentioned in the document. This includes but is not limited to the following fields: bid submission end date, tender due date, bid validity, opening date, closing date, pre-bid meeting date, EMD date, tender value, and tender fee. Group all extracted items under the label 'Important Dates and Amounts,' clearly specifying each date, time, or amount and its description as stated in the document.":"Important date",
    "Extract the contact details of the officer from this document, including their name, email ID, and contact number. Search thoroughly across relevant sections, such as 'Contact Information,' 'Officer Details,' 'Authorized Contact,' or similar headings. If any detail is not found, return 'None' for that field.": "Contact Details"
}



def process_query(query, title):
    # Simulate retrieving documents from the knowledge base
    docs = knowledge_base.similarity_search(query)
    
    # Initialize the language model
    llm = ChatOpenAI(
        model_name="meta-llama/Meta-Llama-3-8B-Instruct",
        openai_api_base="http://localhost:8000/v1",
        openai_api_key="FAKE",  # Replace with your actual key if needed
        max_tokens=4096,
        temperature=0.1
    )
    
    # Load the QA chain
    chain = load_qa_chain(llm,verbose=True, chain_type='stuff')
    
    # Run the chain and capture the response
    try:
        with get_openai_callback() as cost:
            response = chain.run(input_documents=docs, question=query)
        # Strip unnecessary context or text
        response = response.strip()
    except Exception as e:
        response = f"Error processing query: {e}"
    
    return title, response

def aggregate_responses(responses):
    aggregated = {}
    for title, response in responses:
        # Ensure each title entry starts with an empty string if not already in aggregated
        if title not in aggregated:
            aggregated[title] = ""
        # Concatenate the response to the existing entry
        aggregated[title] += "\n" + response if response else ""
    return aggregated



# Initialize lists to store results
titles = []
responses = []

# Use ThreadPoolExecutor to run queries in parallel
with ThreadPoolExecutor(max_workers=len(queries)) as executor:
    future_to_query = {executor.submit(process_query, query, title): title for query, title in queries.items()}
    
    for future in as_completed(future_to_query):
        title = future_to_query[future]
        try:
            result_title, result_response = future.result()
            titles.append(result_title)
            responses.append((result_title, result_response))
        except Exception as e:
            print(f"Query processing failed for title {title}: {e}")
            titles.append(title)
            responses.append((title, f"Error: {e}"))

# Aggregate responses for each point
aggregated_responses = aggregate_responses(responses)

# Create a DataFrame
df = pd.DataFrame({
    'Title': [title for title in queries.values()],
    'Response': [aggregated_responses.get(title, 'No response') for title in queries.values()]
})

# Save the DataFrame to an Excel file
df.to_excel('77326167.xlsx', index=False)

print("Responses have been saved to '75927775.xlsx'.")

In [None]:
import os
import fitz  # PyMuPDF
import shutil

def remove_all_files_in_folder(folder_path):
    if os.path.exists(folder_path):
        for filename in os.listdir(folder_path):
            file_path = os.path.join(folder_path, filename)
            try:
                if os.path.isfile(file_path) or os.path.islink(file_path):
                    os.remove(file_path)  # Remove the file
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)  # Remove the directory and its contents
            except Exception as e:
                print(f'Failed to delete {file_path}. Reason: {e}')
        print(f"All files and subfolders have been removed from {folder_path}.")
    else:
        print(f"The folder {folder_path} does not exist.")

def extract_images_and_text_from_pdf(pdf_path, output_folder):
    pdf_document = fitz.open(pdf_path)
    filename = os.path.basename(pdf_path).split('.')[0]
    output_folder = os.path.join(output_folder, filename)
    os.makedirs(output_folder, exist_ok=True)

    # Variable to store extracted text
    pdf_text = ""

    for page_number in range(len(pdf_document)):
        page = pdf_document.load_page(page_number)

        # Extract text from the page
        page_text = page.get_text("text")
        pdf_text += page_text  # Append text from each page

        images = page.get_images(full=True)
        for image_index, image in enumerate(images):
            xref = image[0]
            base_image = pdf_document.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            image_filename = f"{filename}_page_{page_number + 1}_image_{image_index + 1}.{image_ext}"
            image_filepath = os.path.join(output_folder, image_filename)
            with open(image_filepath, "wb") as image_file:
                image_file.write(image_bytes)
            print(f"Saved image {image_filename}")
    
    print(f"Extraction complete. Images saved to {output_folder}")
    
    return pdf_text  # Return the extracted text

def process_pdfs_in_folder(input_folder_path, image_folder_path):
    all_pdf_text = ""  # Variable to store text from all PDFs

    for filename in os.listdir(input_folder_path):
        if filename.endswith('.pdf'):
            file_path = os.path.join(input_folder_path, filename)
            # Extract images and text from each PDF
            pdf_text = extract_images_and_text_from_pdf(file_path, image_folder_path)
            all_pdf_text += pdf_text  # Append text from each PDF

    # Return the combined text from all PDFs
    return all_pdf_text

# Example usage

input_folder_path = r'/data/QAAPI/extract_hetvi11'  # Folder where PDFs are stored
image_folder_path = r'/data/QAAPI/doc111F'  # Folder to save images

# Process PDFs and get the combined text
combined_pdf_text = process_pdfs_in_folder(input_folder_path, image_folder_path)

# Now `combined_pdf_text` contains the extracted text from all PDFs in the folder
print(combined_pdf_text)


In [3]:
import os
import fitz  # PyMuPDF
import shutil

def remove_all_files_in_folder(folder_path):
    if os.path.exists(folder_path):
        for filename in os.listdir(folder_path):
            file_path = os.path.join(folder_path, filename)
            try:
                if os.path.isfile(file_path) or os.path.islink(file_path):
                    os.remove(file_path)  # Remove the file
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)  # Remove the directory and its contents
            except Exception as e:
                print(f'Failed to delete {file_path}. Reason: {e}')
        print(f"All files and subfolders have been removed from {folder_path}.")
    else:
        print(f"The folder {folder_path} does not exist.")

def extract_images_and_text_from_pdf(pdf_path, output_folder):
    pdf_document = fitz.open(pdf_path)
    filename = os.path.basename(pdf_path).split('.')[0]
    output_folder = os.path.join(output_folder, filename)
    os.makedirs(output_folder, exist_ok=True)

    # Variable to store extracted text
    pdf_text = ""

    for page_number in range(len(pdf_document)):
        page = pdf_document.load_page(page_number)

        # Extract text from the page
        page_text = page.get_text("text")
        pdf_text += page_text  # Append text from each page

        images = page.get_images(full=True)
        for image_index, image in enumerate(images):
            xref = image[0]
            base_image = pdf_document.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            image_filename = f"{filename}_page_{page_number + 1}_image_{image_index + 1}.{image_ext}"
            image_filepath = os.path.join(output_folder, image_filename)
            with open(image_filepath, "wb") as image_file:
                image_file.write(image_bytes)
            print(f"Saved image {image_filename}")
    
    print(f"Extraction complete. Images saved to {output_folder}")
    
    return pdf_text  # Return the extracted text

# Function to process a single PDF file by passing its path
def process_single_pdf(pdf_file_path, image_folder_path):
    # Extract images and text from the single PDF
    pdf_text = extract_images_and_text_from_pdf(pdf_file_path, image_folder_path)

    # Return the extracted text from the PDF
    return pdf_text

# Example usage
pdf_file_path = r'/data/QAAPI/extract_hetvi11/Certificate of Incorporation .pdf'  # Path to a specific PDF
image_folder_path = r'/data/QAAPI/doc111'  # Folder to save images

# Process the specific PDF and get the extracted text
extracted_pdf_text = process_single_pdf(pdf_file_path, image_folder_path)

# Now


Saved image Certificate of Incorporation _page_1_image_1.jpeg
Saved image Certificate of Incorporation _page_2_image_1.jpeg
Extraction complete. Images saved to /data/QAAPI/doc111/Certificate of Incorporation 


In [4]:
extracted_pdf_text

''

In [1]:
from google.oauth2 import service_account
from googleapiclient.discovery import build
from apiclient.http import MediaFileUpload, MediaIoBaseDownload
from deep_translator import GoogleTranslator
from langdetect import detect
import io
import concurrent.futures
import os
import time
import json
import hashlib
import pdfplumber
import pandas as pd
from docx import Document
from tabulate import tabulate
from multiprocessing import Pool
import xlrd
from openpyxl import load_workbook
import re
import logging
import random
import string
import subprocess
from bs4 import BeautifulSoup
from docx import Document
from datetime import datetime,timedelta

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


In [None]:
import os
from pdf2image import convert_from_path
from google.oauth2 import service_account
from googleapiclient.discovery import build
from apiclient.http import MediaFileUpload, MediaIoBaseDownload
import io
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def googleOcr(file_path):
    try:
        creds = service_account.Credentials.from_service_account_file(
            r"/data/imageExtraction/GoogleAPICred/projectoct-436907-a6e51afb9d49.json",
            scopes=['https://www.googleapis.com/auth/drive']
        )
        service = build('drive', 'v3', credentials=creds)
        
        # Handle PDF files by converting them to images first
        if file_path.lower().endswith('.pdf'):
            images = convert_from_path(file_path)
            extracted_text = ""
            for i, img in enumerate(images):
                img_path = f"page_{i}.png"
                print("image:::::",img_path)
                img.save(img_path, 'PNG')  # Save each page as an image
                page_text = googleOcr(img_path)  # Perform OCR on each page
                extracted_text += (page_text or "") + "\n"  # Handle None return case
            return extracted_text.strip()

        mime = 'application/vnd.google-apps.document'
        res = service.files().create(
            body={'name': os.path.basename(file_path), 'mimeType': mime},
            media_body=MediaFileUpload(file_path, mimetype=mime, resumable=True)
        ).execute()

        text_output = io.BytesIO()
        downloader = MediaIoBaseDownload(text_output, service.files().export_media(fileId=res['id'], mimeType="text/plain"))
        
        done = False
        while not done:
            status, done = downloader.next_chunk()

        text_output.seek(0)
        extracted_text = text_output.read().decode('utf-8')
        service.files().delete(fileId=res['id']).execute()
        
        return extracted_text

    except Exception as e:
        logger.info(f"Error in OCR: {e}")
        return None


def process_folder(folder_path):
    extracted_text = ""
    
    # Walk through the folder and process each file
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(root, file)
            
            # Only process supported file types
            if file.lower().endswith(('.pdf', '.png', '.jpg', '.jpeg')):
                logger.info(f"Processing file: {file_path}")
                file_text = googleOcr(file_path)
                extracted_text += f"Text from {file}:\n" + (file_text or "No text found") + "\n\n"
    
    return extracted_text


# Example usage
folder_path = r"/data/QAAPI/extract_hetvi/11111113"
result = process_folder(folder_path)
print(result)


In [None]:
import pandas as pd
import os 
# # Set environment variables
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ['TORCH_USE_CUDA_DSA'] = "1"
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

# Step 1: Load the Data
file_path = '/data/QAAPI/PQ.xlsx'
data = pd.read_excel(file_path)

# Step 2: Encode the Labels
label_encoder = LabelEncoder()
data['label_encoded'] = label_encoder.fit_transform(data['label_tag'])

# Step 3: Train-Test Split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['pq'], data['label_encoded'], test_size=0.2, random_state=42
)

# Step 4: Tokenization
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(),
            "attention_mask": encoding["attention_mask"].squeeze(),
            "labels": torch.tensor(label, dtype=torch.long),
        }

train_dataset = TextDataset(train_texts.tolist(), train_labels.tolist(), tokenizer)
val_dataset = TextDataset(val_texts.tolist(), val_labels.tolist(), tokenizer)

# Step 5: Model Initialization
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=len(label_encoder.classes_)
)

# Step 6: Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
)

# Step 7: Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Step 8: Train the Model
trainer.train()

# Step 9: Save the Model and Label Encoder
model.save_pretrained("./model")
tokenizer.save_pretrained("./model")
with open("label_encoder.pkl", "wb") as f:
    import pickle
    pickle.dump(label_encoder, f)

# Step 10: Inference Function
def predict_label(text):
    inputs = tokenizer(
        text,
        return_tensors="pt",
        max_length=128,
        padding="max_length",
        truncation=True,
    )
    outputs = model(**inputs)
    pred_label = torch.argmax(outputs.logits, dim=1).item()
    return label_encoder.inverse_transform([pred_label])[0]

# Example Usage
example_text = "The bidder must have a valid CMMI Level 3 certification."
predicted_label = predict_label(example_text)
print(f"Predicted Label: {predicted_label}")


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf

# Step 1: Load the Data
file_path = '/data/QAAPI/PQ.xlsx'
data = pd.read_excel(file_path)

# Step 2: Encode the Labels
label_encoder = LabelEncoder()
data['label_encoded'] = label_encoder.fit_transform(data['label_tag'])

# Step 3: Train-Test Split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['pq'], data['label_encoded'], test_size=0.2, random_state=42
)

# Step 4: Tokenization
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def encode_texts(texts, tokenizer, max_len=128):
    return tokenizer(
        list(texts),
        max_length=max_len,
        truncation=True,
        padding="max_length",
        return_tensors="tf",
    )

train_encodings = encode_texts(train_texts, tokenizer)
val_encodings = encode_texts(val_texts, tokenizer)

# Step 5: Prepare the Data for TensorFlow
train_dataset = tf.data.Dataset.from_tensor_slices((
    {
        "input_ids": train_encodings["input_ids"],
        "attention_mask": train_encodings["attention_mask"],
    },
    train_labels
)).batch(8)

val_dataset = tf.data.Dataset.from_tensor_slices((
    {
        "input_ids": val_encodings["input_ids"],
        "attention_mask": val_encodings["attention_mask"],
    },
    val_labels
)).batch(8)

# Step 6: Model Initialization
model = TFBertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=len(label_encoder.classes_)
)

# Step 7: Compile the Model
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# Step 8: Train the Model
model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=3
)

# Step 9: Save the Model and Label Encoder
model.save_pretrained("./model")
tokenizer.save_pretrained("./model")

with open("label_encoder.pkl", "wb") as f:
    import pickle
    pickle.dump(label_encoder, f)

# Step 10: Inference Function
def predict_label(text):
    inputs = tokenizer(
        text,
        return_tensors="tf",
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    outputs = model(inputs)
    pred_label = tf.argmax(outputs.logits, axis=1).numpy()[0]
    return label_encoder.inverse_transform([pred_label])[0]



All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Epoch 2/3
Epoch 3/3
Predicted Label: Certificates 


In [7]:
# Example Usage
example_text = """

The software being given to MMTC should have been used for processing a total no. of at least 500 e-Tenders and 200 e-Auction during the last three (3) financial years ending on 31.3.2021.
"""
predicted_label = predict_label(example_text)
print(f"Predicted Label: {predicted_label}")


Predicted Label: Client Ref. Letter 


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.svm import LinearSVC

# Load data
df = pd.read_excel('/data/QAAPI/PQ.xlsx')

# Prepare data
X = df['pq']
y = df['label_tag']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create pipeline with TF-IDF and classifier
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('classifier', LinearSVC())  # You can replace with MultinomialNB()
])

# Train model
pipeline.fit(X_train, y_train)

# Predict and evaluate
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

# Function for new predictions
def predict_label(text):
    return pipeline.predict([text])[0]

# Example usage
new_text = """

Bidders should have successfully managed and executed e-Auctioning of immovable properties of and and building of worth Rs. 300.00 crores or more in last three (3) financial years. (2018-19, 2019-20, 2020-21)


"""
print(f"Predicted Label: {predict_label(new_text)}")

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# Load and preprocess data
df = pd.read_excel('/data/QAAPI/PQ.xlsx')
df = df.dropna(subset=['pq', 'label_tag'])  # Remove rows with missing values

# Text preprocessing
def preprocess_text(text):
    text = str(text).lower()
    text = text.replace('\n', ' ')
    text = ' '.join(text.split())
    return text

df['processed_text'] = df['pq'].apply(preprocess_text)

# Encode labels
le = LabelEncoder()
df['encoded_label'] = le.fit_transform(df['label_tag'])

# Split data
X = df['processed_text']
y = df['encoded_label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Advanced pipeline with hyperparameter tuning
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        stop_words='english', 
        ngram_range=(1, 2),  # Use unigrams and bigrams
        max_features=5000
    )),
    ('classifier', LinearSVC())
])

# Hyperparameter grid
param_grid = {
    'tfidf__max_features': [3000, 5000, 7000],
    'classifier__C': [0.1, 1, 10],
    'classifier__max_iter': [5000]
}

# Grid search with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_macro')
grid_search.fit(X_train, y_train)

# Best model
best_model = grid_search.best_estimator_

# Predictions
y_pred = best_model.predict(X_test)

# Detailed evaluation
print("Best Parameters:", grid_search.best_params_)
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))

# Prediction function
def predict_label(text):
    processed_text = preprocess_text(text)
    pred_encoded = best_model.predict([processed_text])[0]
    return le.inverse_transform([pred_encoded])[0]

# Example
test_texts = [
    "E-procurement solution for government tender",
    "Annual financial statements for the company",
    "ISO certification details"
]

for text in test_texts:
    print(f"Text: {text}\nPredicted Label: {predict_label(text)}\n")