In [1]:
import os,sys
# Add a machine-specific site-packages directory to the import search path so
# the packages imported in the following cells can be resolved from it.
sys.path.append(r"/data/QAAPI/qa/lib/python3.10/site-packages/")

In [2]:
import os
import re
import json
import warnings
import numpy as np
from typing import Dict, List, Any
from sentence_transformers import SentenceTransformer
from langchain.schema import Document
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks import get_openai_callback
from concurrent.futures import ThreadPoolExecutor, as_completed

  from tqdm.autonotebook import tqdm, trange


In [3]:
# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
# Restrict CUDA-aware libraries in this process to the GPU with index 1.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
# NOTE(review): presumably intended to enable PyTorch device-side assertions —
# confirm that setting this variable at runtime has the intended effect.
os.environ['TORCH_USE_CUDA_DSA'] = "1"

In [None]:
class TenderAnalyzer:
    """Analyze a tender document by chunking its text and querying an LLM.

    The document is split into sentences, grouped into chunks at points where
    consecutive sentence embeddings diverge, size-limited, and then every
    query in ``self.queries`` is answered against the re-joined text through
    a LangChain "stuff" question-answering chain.
    """

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """Load the embedding model, configure the chat LLM and the queries.

        Args:
            model_name: SentenceTransformer model used to embed sentences.
        """
        self.model = SentenceTransformer(model_name)
        self.llm = ChatOpenAI(
            model_name="meta-llama/Llama-3.1-8B-Instruct",
            openai_api_base="http://localhost:8000/v1",
            openai_api_key="FAKE",
            max_tokens=1024,
            temperature=0.1
        )
        self.chain = load_qa_chain(self.llm, chain_type='stuff')
        # Maps the question sent to the LLM -> the key used in the result dict.
        self.queries = {
            # "What are the functional requirements, also known as the scope of work, mentioned in the document?": "Scope of Work"
            "Extract clauses that specify Pre-Qualification Criteria or eligibility criteria.": "Prequalification Criteria"
            # "List all supporting documents required for this tender.": "Supporting Documents",
            # "List of all the dates mentioned in the tender document which should include Bid submission end date or due date of tender, Bid validity, Opening date, closing date, pre bid meeting date, EMD amount,tender fee, tender value": "Important Dates",
            # "Extract the contact details of the officer from this document, including their name, email ID, and contact number.": "Contact Details"
        }
        # Number of LLM requests issued so far (used only for log output).
        self.request_count = 0

    def process_document(self, file_path: str) -> List[str]:
        """Read a UTF-8 text file and return it as size-limited chunks.

        Args:
            file_path: Path of the text file to read.

        Returns:
            The chunks produced by sentence splitting, semantic grouping and
            the final size limit, in document order.
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        sentences = self._split_into_sentences(text)
        chunks = self._create_chunks(sentences)
        return self._chunk_by_tokens(chunks)

    def _split_into_sentences(self, text: str) -> List[Dict[str, Any]]:
        """Split text after '.', '?' or '!' and attach neighbour context.

        Returns:
            One dict per sentence with keys ``sentence``, ``index`` and
            ``combined_sentence`` (added by ``_combine_sentences``).
        """
        sentences = [{'sentence': s, 'index': i}
                     for i, s in enumerate(re.split(r'(?<=[.?!])\s+', text))]
        return self._combine_sentences(sentences)

    def _combine_sentences(self, sentences: List[Dict[str, Any]], buffer_size: int = 1) -> List[Dict[str, Any]]:
        """Add a ``combined_sentence`` key holding each sentence plus context.

        The context is up to ``buffer_size`` sentences before and after the
        current one, joined with single spaces. The input dicts are updated
        in place and returned in a new list.
        """
        combined = []
        for i, sent in enumerate(sentences):
            context = []
            # Preceding sentences (clamped at the start of the document).
            for j in range(max(0, i - buffer_size), i):
                context.append(sentences[j]['sentence'])
            # Current sentence followed by the next ones (clamped at the end).
            context.append(sent['sentence'])
            for j in range(i + 1, min(len(sentences), i + buffer_size + 1)):
                context.append(sentences[j]['sentence'])
            sent['combined_sentence'] = ' '.join(context)
            combined.append(sent)
        return combined

    def _create_chunks(self, sentences: List[Dict[str, Any]]) -> List[str]:
        """Group sentences into chunks, breaking where neighbours diverge.

        A break is placed after sentence ``i`` when the cosine distance
        between the embeddings of ``combined_sentence`` ``i`` and ``i + 1``
        exceeds the 95th percentile of all such distances.

        Args:
            sentences: Dicts carrying ``sentence`` and ``combined_sentence``.

        Returns:
            Chunk strings, each the space-joined sentences of one group.
        """
        # With fewer than two sentences there are no neighbour distances, and
        # taking a percentile of an empty list is not meaningful.
        if not sentences:
            return []
        if len(sentences) == 1:
            return [sentences[0]['sentence']]

        embeddings = np.asarray(
            self.model.encode([s['combined_sentence'] for s in sentences]))

        # Cosine distance between each embedding and its successor.
        distances = []
        for current, following in zip(embeddings[:-1], embeddings[1:]):
            norm_product = np.linalg.norm(current) * np.linalg.norm(following)
            # A zero-length vector has no direction; treat it as dissimilar
            # instead of dividing by zero and propagating NaN.
            similarity = np.dot(current, following) / norm_product if norm_product else 0.0
            distances.append(1 - similarity)

        threshold = np.percentile(distances, 95)
        chunks = []
        start_idx = 0

        for i, distance in enumerate(distances):
            if distance > threshold:
                chunk = ' '.join([s['sentence'] for s in sentences[start_idx:i + 1]])
                chunks.append(chunk)
                start_idx = i + 1

        # Whatever follows the last breakpoint forms the final chunk.
        if start_idx < len(sentences):
            chunk = ' '.join([s['sentence'] for s in sentences[start_idx:]])
            chunks.append(chunk)

        return chunks

    def _chunk_by_tokens(self, texts: List[str], max_tokens: int = 1000) -> List[str]:
        """Split texts into pieces of at most ``max_tokens * 2`` characters.

        The token budget is approximated as two characters per token. Long
        texts are cut at the last whitespace that fits so that words are not
        split (the caller re-joins pieces with spaces, which would otherwise
        insert a space in the middle of a word). A text with no whitespace
        inside the window is cut at the hard limit.

        Args:
            texts: Strings to split.
            max_tokens: Positive token budget per piece.

        Returns:
            The pieces in order; texts already within the limit are returned
            unchanged and empty texts contribute nothing.

        Raises:
            ValueError: If ``max_tokens`` is smaller than 1.
        """
        if max_tokens < 1:
            raise ValueError("max_tokens must be a positive integer")
        max_chars = max_tokens * 2
        chunks = []
        for text in texts:
            remaining = text
            while len(remaining) > max_chars:
                cut = max_chars
                # Look backwards from the limit for a whitespace boundary.
                for pos in range(max_chars, 0, -1):
                    if remaining[pos].isspace():
                        cut = pos
                        break
                piece = remaining[:cut].rstrip()
                if piece:
                    chunks.append(piece)
                remaining = remaining[cut:].lstrip()
            if remaining:
                chunks.append(remaining)
        return chunks

    def process_query(self, query: str, text: str) -> str:
        """Ask the QA chain one question about ``text``.

        Returns:
            The stripped model answer, or ``"Error: <message>"`` if the call
            raised (errors are reported as a result, not propagated).
        """
        try:
            # NOTE(review): this counter is updated from worker threads
            # without synchronisation; it only feeds the log line below.
            self.request_count += 1

            print(f"Request {self.request_count}:")
            print(f"Query: {query}")

            with get_openai_callback():
                response = self.chain.run(
                    input_documents=[Document(page_content=text)],
                    question=query
                )
            return response.strip()
        except Exception as e:
            print(f"Error processing query: {e}")
            return f"Error: {str(e)}"

    def analyze_tender(self, file_path: str) -> Dict[str, str]:
        """Run every configured query against the document.

        Args:
            file_path: Path of the tender text file.

        Returns:
            Mapping of each query's title to the model answer (or to an
            ``"Error: ..."`` string). Empty when no queries are configured.
        """
        # A thread pool cannot be created with zero workers.
        if not self.queries:
            return {}

        chunks = self.process_document(file_path)
        # NOTE(review): the whole document is sent as a single prompt; confirm
        # it fits the served model's context window.
        combined_text = " ".join(chunks)

        # One worker per query so all queries run concurrently.
        results = {}
        with ThreadPoolExecutor(max_workers=len(self.queries)) as executor:
            future_to_query = {
                executor.submit(self.process_query, query, combined_text): title
                for query, title in self.queries.items()
            }

            for future in as_completed(future_to_query):
                title = future_to_query[future]
                try:
                    response = future.result()
                    results[title] = response
                except Exception as e:
                    results[title] = f"Error: {str(e)}"

        return results

In [14]:
def analyze_tender_document(file_path: str) -> Dict[str, str]:
    """Analyze one tender file with a freshly constructed ``TenderAnalyzer``.

    Args:
        file_path: Path to the tender text document.

    Returns:
        Whatever ``TenderAnalyzer.analyze_tender`` returns for that file.
    """
    return TenderAnalyzer().analyze_tender(file_path)

In [None]:
def main(input_file: str = "/data/Pqmatch/testing/78804029/78804029.txt"):
    """Analyze a tender document and print the results as indented JSON.

    Args:
        input_file: Path of the tender text file. Defaults to the sample
            document this notebook was developed against, so calling
            ``main()`` without arguments behaves as before.

    Returns:
        The result dictionary produced by ``analyze_tender_document``.
    """
    results = analyze_tender_document(input_file)
    print(json.dumps(results, indent=4))

    return results

In [15]:
# Run the analysis only when this code executes as the main program.
if __name__ == "__main__":
    main()

Request 1:
Query: Extract clauses that specify Pre-Qualification Criteria or eligibility criteria.
{
    "Prequalification Criteria": "Here are the clauses that specify Pre-Qualification Criteria or eligibility criteria:\n\n**Section 3 - Evaluation and Qualification Criteria**\n\n1. **Eligibility** (Criteria Compliance Requirements Documents)\n\t* 2.1.1 Nationality: The Bidder must meet the requirement of having nationality of India.\n\t* 2.1.2 Conflict of Interest: The Bidder must not have any conflict of interest in accordance with ITB Sub-Clause 4.3.\n\t* 2.1.3 Government-owned Entity: The Bidder must not be a government-owned entity unless it meets the requirements of ITB 4.5.\n\t* 2.1.4 Government-owned Entity: The Bidder must not be a government-owned entity unless it meets the requirements of ITB 4.5.\n\t* 2.1.5 UN Eligibility: The Bidder must not be ineligible under the provisions of ITB 18.2 (Bid Securing Declaration)\n2. **Pending Litigation** (Criteria Compliance Requirement

In [16]:
import json
from sentence_transformers import SentenceTransformer
import numpy as np
from collections import defaultdict
import faiss
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import numpy as np
import re
# Sentence-embedding model used by the following cells through model.encode(...).
model = SentenceTransformer('all-MiniLM-L6-v2')

In [17]:
# Section names that document chunks are matched against later in the notebook.
labels = ["Important Dates", "Eligibility or Prequalification Criteria", "Scope of Work", "Contact Details"]
# Despite the name, this is the INPUT text file that is read below.
outFile=r'/data/docQA/docs/ZPPA-PU-ORD-001-14.txt'
# Load the whole document into memory as one string.
with open(outFile, 'r', encoding='utf-8') as f:
    essay = f.read()

In [18]:
# Display the loaded document text; the entire string is rendered as cell output.
essay

'THE GOVERNMENT OF THE REPUBLIC OF ZAMBIA\nZAMBIA PUBLIC PROCUREMENT AUTHORITY\nBIDDING DOCUMENT (SINGLE-STAGE)\nIssued on: May 7, 2014\nProcurement of\nImplementation of the e-Government Procurement System, Supply, \nDelivery, Installation and Commissioning for ICT Infrastructure for Data \nCentre and Enhancement of the ZPPA Existing Website\nITB No: ZPPA/PU/ORD/001/14\nProject: Public Financial Management Reform Programme (PFMRP)\nProject No: I P147343\nPurchaser: Zambia Public Procurement Authority (ZPPA)\n\niii\nCONTENTS\nSection I. Instructions to Bidders (ITB)................................................................................7\nTable of Clauses...................................................................................................................8\nSection II. Bid Data Sheet (BDS).........................................................................................43\nSection III. Eligible Countries for the Provision of Goods, Works, and Services in \nB

In [None]:

# Split wherever whitespace follows sentence-ending punctuation ('.', '?', '!').
single_sentences_list = re.split(r'(?<=[.?!])\s+', essay)
print(f"{len(single_sentences_list)} sentences were found")

# Wrap every sentence in a dict so later steps can attach context, embeddings
# and distances to it.
sentences = [{'sentence': x, 'index' : i} for i, x in enumerate(single_sentences_list)]
def combine_sentences(sentences, buffer_size=1):
    """Attach a context window to every sentence dict, in place.

    For each entry, ``combined_sentence`` is set to the entry's own
    ``sentence`` surrounded by up to ``buffer_size`` neighbouring sentences on
    each side, joined with single spaces.

    Args:
        sentences: List of dicts that each contain a ``sentence`` string.
        buffer_size: Number of neighbours to include before and after.

    Returns:
        The same list object that was passed in.
    """
    # A non-positive buffer contributes no neighbours at all.
    reach = max(buffer_size, 0)
    for position, entry in enumerate(sentences):
        leading = sentences[max(position - reach, 0):position]
        trailing = sentences[position + 1:position + 1 + reach]
        window = leading + [entry] + trailing
        entry['combined_sentence'] = ' '.join(item['sentence'] for item in window)
    return sentences

# Add the neighbour-context text to every sentence, then embed that text in
# one batch call.
sentences = combine_sentences(sentences)
embeddings = model.encode([x['combined_sentence'] for x in sentences])

# Store each embedding next to the sentence it was computed from.
for i, sentence in enumerate(sentences):
    sentence['combined_sentence_embedding'] = embeddings[i]

def calculate_cosine_distances(sentences):
    """Measure how far each sentence's embedding is from its successor's.

    For every adjacent pair the cosine distance (``1 - cosine_similarity``)
    of their ``combined_sentence_embedding`` values is computed and also
    stored on the first sentence of the pair as ``distance_to_next``. The
    last sentence has no successor and is left untouched.

    Args:
        sentences: List of dicts carrying ``combined_sentence_embedding``.

    Returns:
        A tuple ``(distances, sentences)``: the list of pairwise distances
        (one fewer than the number of sentences) and the same, updated list.
    """
    distances = []
    for current, following in zip(sentences, sentences[1:]):
        pair_similarity = cosine_similarity(
            [current['combined_sentence_embedding']],
            [following['combined_sentence_embedding']],
        )[0][0]
        gap = 1 - pair_similarity
        distances.append(gap)
        current['distance_to_next'] = gap
    return distances, sentences

# Cosine distance between every sentence and the one after it.
distances, sentences = calculate_cosine_distances(sentences)

# NOTE(review): y_upper_bound is not referenced anywhere else in this cell —
# presumably left over from a plot; confirm before removing.
y_upper_bound = 0.2

breakpoint_percentile_threshold = 95 #95
# NOTE(review): `distances` is empty when the document has a single sentence;
# confirm np.percentile behaves acceptably for that input.
breakpoint_distance_threshold = np.percentile(distances, breakpoint_percentile_threshold) # If you want more chunks, lower the percentile cutoff
indices_above_thresh = [i for i, x in enumerate(distances) if x > breakpoint_distance_threshold] # The indices of those breakpoints on your list

# Index of the first sentence of the chunk currently being built.
start_index = 0

chunks = []
for index in indices_above_thresh:
    # The sentence at the breakpoint index closes the current chunk.
    end_index = index
    group = sentences[start_index:end_index + 1]
    combined_text = ' '.join([d['sentence'] for d in group])
    chunks.append(combined_text)
    start_index = index + 1

# Sentences after the last breakpoint form the final chunk.
if start_index < len(sentences):    
    combined_text = ' '.join([d['sentence'] for d in sentences[start_index:]])
    chunks.append(combined_text)

# Semantic Clustering
label_embeddings = model.encode(labels)  # Get embeddings for labels

# Initialize FAISS index with label embeddings
# Row i of the index corresponds to labels[i], because the embeddings are
# added in the same order as `labels`.
dimension = label_embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)  
faiss_index.add(label_embeddings) 

def segment_text_with_faiss_label_assignment(semantic_chunks):
    """Group chunks under the single label nearest to each of them.

    Every chunk that is not blank is embedded with ``model`` and looked up in
    ``faiss_index`` for its one closest entry; the chunk is filed under the
    matching name in ``labels``.

    NOTE: a function with this same name is defined again right after this
    one in the same cell, and that later definition is the one in effect
    once the cell has run.

    Args:
        semantic_chunks: Iterable of chunk strings.

    Returns:
        ``defaultdict(list)`` mapping label name -> list of chunks.
    """
    grouped = defaultdict(list)
    for piece in semantic_chunks:
        # Blank or whitespace-only chunks carry nothing to classify.
        if not piece.strip():
            continue
        query_vector = model.encode(piece).reshape(1, -1)
        nearest = faiss_index.search(query_vector, 1)[1]
        grouped[labels[nearest[0][0]]].append(piece)
    return grouped

RELEVANCE_THRESHOLD = -0.7  # Adjust based on experimentation
def segment_text_with_faiss_label_assignment(semantic_chunks, threshold=RELEVANCE_THRESHOLD):
    """File each chunk under every label whose score reaches ``threshold``.

    Each non-blank chunk is embedded with ``model`` and compared against all
    label embeddings in ``faiss_index``. A label's score is
    ``1 - distance``, where ``distance`` is the value returned by the index
    search. Chunks that match no label are filed under ``"Other"``.

    NOTE(review): ``IndexFlatL2`` reports squared Euclidean distances, so the
    score is cosine-like only if the embeddings are unit-normalised — confirm
    that holds for the model in use before re-tuning ``threshold``.

    Args:
        semantic_chunks: Iterable of chunk strings.
        threshold: Minimum score for a label to be assigned.

    Returns:
        ``defaultdict(list)`` mapping label name (or ``"Other"``) to the list
        of chunks assigned to it. Every name in ``labels`` is present as a
        key after the call, possibly with an empty list.
    """
    labeled_segments = defaultdict(list)
    for chunk in semantic_chunks:
        if not chunk.strip():
            continue
        paragraph_embedding = model.encode(chunk).reshape(1, -1)
        distances, label_indices = faiss_index.search(paragraph_embedding, len(labels))
        similarities = 1 - distances[0]
        # Search results are ordered nearest-first, so the position in the
        # result row is a rank, not a label number. The label a score belongs
        # to is given by the matching entry of label_indices.
        passing = {
            int(label_idx)
            for label_idx, sim in zip(label_indices[0], similarities)
            if label_idx >= 0 and sim >= threshold
        }
        # Report matches in the order of `labels` for stable output.
        assigned_labels = [label for position, label in enumerate(labels) if position in passing]

        if assigned_labels:
            print("Assigned labels : ")
            print(assigned_labels)
            for label in assigned_labels:
                labeled_segments[label].append(chunk)
        else:
            labeled_segments["Other"].append(chunk)

    # Reading each label from the defaultdict also creates its key, so every
    # label appears in the result even when nothing was assigned to it.
    for label in labels:
        print(label , len(labeled_segments[label]))
    return labeled_segments
 
# Assign every semantic chunk to its matching label(s).
segmented_result = segment_text_with_faiss_label_assignment(chunks)
# NOTE(review): this is a Windows user-profile path, whereas the input file
# read earlier in the notebook uses a POSIX path — confirm the intended
# output location for the machine this runs on.
out_file_path = r'C:\Users\hetvi.solanki\Desktop\AIProjects\ragllm\ragTechniques\RAG_Techniques\data\out.json'
# Persist the label -> chunks mapping as human-readable UTF-8 JSON.
with open(out_file_path, 'w', encoding='utf-8') as out_file:
    json.dump(segmented_result, out_file, indent=4, ensure_ascii=False)


In [None]:

# LLM 
from openai import OpenAI
import concurrent.futures
# NOTE(review): endpoint address and API key are hard-coded; consider reading
# them from configuration or the environment.
client = OpenAI(
    base_url="http://129.154.248.128:8000/v1",
    api_key="token-abc123",
)
# Use the first model the server lists. It is kept under its own name because
# `model` already refers to the SentenceTransformer that the chunking and
# labelling code calls via model.encode(); rebinding `model` to this string
# would break those cells on re-run.
llm_model_name = client.models.list().data[0].id

def _request_extraction(prompt):
    """Send one extraction prompt and return the stripped reply.

    Any exception is printed and replaced by the marker text
    ``"Token limit exceeded or error"`` so that one failed request does not
    abort the remaining ones.
    """
    try:
        response = client.chat.completions.create(
            model=llm_model_name,
            messages=[{"role": "system", "content": "You are an Agent who extracts necessary information from given text."},
                      {"role": "user", "content": prompt}],
            max_tokens=1000,
            temperature=0.1
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print("error:", e)
        return "Token limit exceeded or error"

def paramExtractionUsingLLM(chunks,label=""):
    """Ask the LLM to extract the details of ``label`` from the given chunks.

    The chunks are joined into one text. If the text exceeds the character
    budget it is cut into consecutive slices of at most ``max_chars``
    characters, each slice is sent separately and the replies are joined with
    single spaces. If there is no text at all, no request is made.

    Args:
        chunks: List of text chunks belonging to ``label``.
        label: Name of the section whose details should be extracted.

    Returns:
        A list with exactly one string: the (combined) LLM reply, or an empty
        string when ``chunks`` contains no text.
    """
    print(" -- inside llm -- ")
    # Total Tokens : 98640 - out:1000 - prompt and query: 640 - Rest : 97000 tokens left *4 for characters = 3,88,000 char
    # max_chars = 388000 
    # Total Tokens : 7520 - out:1000 - prompt and query: 640 - Rest : 5880 tokens left *4 for characters = 23,520 char
    max_chars = 23520 
    print("+++++++++++++++++++++++++++++++++++++++")
    print("Len of Chunk : ", len(chunks))
    print(type(chunks))
    print("max len : ",max_chars)
    # Chunks were produced by splitting at whitespace, so they are re-joined
    # with a space; plain concatenation would fuse the last word of one chunk
    # with the first word of the next.
    finalText = " ".join(chunks)
    print("total Len of chunk: ",len(finalText))

    # Nothing to extract from: skip the request instead of prompting the
    # model with an empty document.
    if not finalText.strip():
        return [""]

    results = []
    if len(finalText) > max_chars:
        print(" -- inside if -- ")
        for start in range(0, len(finalText), max_chars):
            text = finalText[start:start + max_chars]
            print("-- in for loop --")
            prompt = f"""Extract all the details of {label} from the below Text.

            Text: {text}
            
            Output:"""
            results.append(_request_extraction(prompt))
    else:
        print(" -- inside else -- ")
        prompt = f"""Extract all the details of {label} from the below text.
        Text: {finalText}
        
        Output:"""
        results.append(_request_extraction(prompt))

    return [" ".join(results)]

def main(label):
    """Run LLM extraction for one label and return the reply list.

    Looks up the chunks stored for ``label`` in ``segmented_result``, passes
    them to ``paramExtractionUsingLLM``, prints the reply and returns it.
    """
    label_chunks = segmented_result[label]
    extracted = paramExtractionUsingLLM(label_chunks, label=label)
    print(" -- LLM Response -- ")
    print(extracted)
    return extracted

# Collect the LLM reply for every label.
labeled_ans = defaultdict(list)
for label in labels:
    result = main(label)
    # `result` is itself a list (the value returned by main), so each label
    # ends up mapped to a list that contains one list.
    labeled_ans[label].append(result)

# NOTE(review): Windows user-profile path — confirm the intended output
# location for the machine this runs on.
ll_response_out_file_path = r'C:\Users\hetvi.solanki\Desktop\AIProjects\ragllm\ragTechniques\RAG_Techniques\data\lllm_out.json'
# Persist the label -> replies mapping as human-readable UTF-8 JSON.
with open(ll_response_out_file_path, 'w', encoding='utf-8') as out_file:
    json.dump(labeled_ans, out_file, indent=4, ensure_ascii=False)

In [6]:
# pip install line_profiler
# !pip install py-spy

In [None]:
import os
import re
import warnings
import numpy as np
from typing import Dict, List, Any
from sentence_transformers import SentenceTransformer
from langchain.schema import Document
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks import get_openai_callback
from concurrent.futures import ThreadPoolExecutor, as_completed

# Suppress warnings (all categories, for the rest of the session)
warnings.filterwarnings("ignore")

# Environment setup
# Restrict CUDA-aware libraries in this process to the GPU with index 1.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
# NOTE(review): presumably intended to enable PyTorch device-side assertions —
# confirm that setting this variable at runtime has the intended effect.
os.environ['TORCH_USE_CUDA_DSA'] = "1"

class TenderAnalyzer:
    """Analyze a tender document by chunking its text and querying an LLM.

    The document is split into sentences, grouped into chunks at points where
    consecutive sentence embeddings diverge, size-limited, and then every
    query in ``self.queries`` is answered against the re-joined text through
    a LangChain "stuff" question-answering chain.
    """

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """Load the embedding model, configure the chat LLM and the queries.

        Args:
            model_name: SentenceTransformer model used to embed sentences.
        """
        self.model = SentenceTransformer(model_name)
        self.llm = ChatOpenAI(
            model_name="meta-llama/Llama-3.1-8B-Instruct",
            openai_api_base="http://localhost:8000/v1",
            openai_api_key="FAKE",
            max_tokens=1024,
            temperature=0.1
        )
        self.chain = load_qa_chain(self.llm, chain_type='stuff')
        # Maps the question sent to the LLM -> the key used in the result dict.
        self.queries = {
            "Extract clauses that specify Pre-Qualification Criteria or eligibility criteria while strictly avoiding duplicates in any points.": "Prequalification Criteria"
        }
        # Number of LLM requests issued so far (used only for log output).
        self.request_count = 0

    def process_document(self, file_path: str) -> List[str]:
        """Read a UTF-8 text file and return it as size-limited chunks.

        Args:
            file_path: Path of the text file to read.

        Returns:
            The chunks produced by sentence splitting, semantic grouping and
            the final size limit, in document order.
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        sentences = self._split_into_sentences(text)
        chunks = self._create_chunks(sentences)
        return self._chunk_by_tokens(chunks)

    def _split_into_sentences(self, text: str) -> List[Dict[str, Any]]:
        """Split text after '.', '?' or '!' and attach neighbour context.

        Returns:
            One dict per sentence with keys ``sentence``, ``index`` and
            ``combined_sentence`` (added by ``_combine_sentences``).
        """
        sentences = [{'sentence': s, 'index': i}
                     for i, s in enumerate(re.split(r'(?<=[.?!])\s+', text))]
        return self._combine_sentences(sentences)

    def _combine_sentences(self, sentences: List[Dict[str, Any]], buffer_size: int = 1) -> List[Dict[str, Any]]:
        """Add a ``combined_sentence`` key holding each sentence plus context.

        The context is up to ``buffer_size`` sentences before and after the
        current one, joined with single spaces. The input dicts are updated
        in place and returned in a new list.
        """
        combined = []
        for i, sent in enumerate(sentences):
            context = []
            # Preceding sentences (clamped at the start of the document).
            for j in range(max(0, i - buffer_size), i):
                context.append(sentences[j]['sentence'])
            # Current sentence followed by the next ones (clamped at the end).
            context.append(sent['sentence'])
            for j in range(i + 1, min(len(sentences), i + buffer_size + 1)):
                context.append(sentences[j]['sentence'])
            sent['combined_sentence'] = ' '.join(context)
            combined.append(sent)
        return combined

    def _create_chunks(self, sentences: List[Dict[str, Any]]) -> List[str]:
        """Group sentences into chunks, breaking where neighbours diverge.

        A break is placed after sentence ``i`` when the cosine distance
        between the embeddings of ``combined_sentence`` ``i`` and ``i + 1``
        exceeds the 95th percentile of all such distances.

        Args:
            sentences: Dicts carrying ``sentence`` and ``combined_sentence``.

        Returns:
            Chunk strings, each the space-joined sentences of one group.
        """
        # With fewer than two sentences there are no neighbour distances, and
        # taking a percentile of an empty list is not meaningful.
        if not sentences:
            return []
        if len(sentences) == 1:
            return [sentences[0]['sentence']]

        embeddings = np.asarray(
            self.model.encode([s['combined_sentence'] for s in sentences]))

        # Cosine distance between each embedding and its successor.
        distances = []
        for current, following in zip(embeddings[:-1], embeddings[1:]):
            norm_product = np.linalg.norm(current) * np.linalg.norm(following)
            # A zero-length vector has no direction; treat it as dissimilar
            # instead of dividing by zero and propagating NaN.
            similarity = np.dot(current, following) / norm_product if norm_product else 0.0
            distances.append(1 - similarity)

        threshold = np.percentile(distances, 95)
        chunks = []
        start_idx = 0

        for i, distance in enumerate(distances):
            if distance > threshold:
                chunk = ' '.join([s['sentence'] for s in sentences[start_idx:i + 1]])
                chunks.append(chunk)
                start_idx = i + 1

        # Whatever follows the last breakpoint forms the final chunk.
        if start_idx < len(sentences):
            chunk = ' '.join([s['sentence'] for s in sentences[start_idx:]])
            chunks.append(chunk)

        return chunks

    def _chunk_by_tokens(self, texts: List[str], max_tokens: int = 1000) -> List[str]:
        """Split texts into pieces of at most ``max_tokens * 2`` characters.

        The token budget is approximated as two characters per token. Long
        texts are cut at the last whitespace that fits so that words are not
        split (the caller re-joins pieces with spaces, which would otherwise
        insert a space in the middle of a word). A text with no whitespace
        inside the window is cut at the hard limit.

        Args:
            texts: Strings to split.
            max_tokens: Positive token budget per piece.

        Returns:
            The pieces in order; texts already within the limit are returned
            unchanged and empty texts contribute nothing.

        Raises:
            ValueError: If ``max_tokens`` is smaller than 1.
        """
        if max_tokens < 1:
            raise ValueError("max_tokens must be a positive integer")
        max_chars = max_tokens * 2
        chunks = []
        for text in texts:
            remaining = text
            while len(remaining) > max_chars:
                cut = max_chars
                # Look backwards from the limit for a whitespace boundary.
                for pos in range(max_chars, 0, -1):
                    if remaining[pos].isspace():
                        cut = pos
                        break
                piece = remaining[:cut].rstrip()
                if piece:
                    chunks.append(piece)
                remaining = remaining[cut:].lstrip()
            if remaining:
                chunks.append(remaining)
        return chunks

    def process_query(self, query: str, text: str) -> str:
        """Ask the QA chain one question about ``text``.

        Returns:
            The stripped model answer, or ``"Error: <message>"`` if the call
            raised (errors are reported as a result, not propagated).
        """
        try:
            # NOTE(review): this counter is updated from worker threads
            # without synchronisation; it only feeds the log line below.
            self.request_count += 1

            print(f"Request {self.request_count}:")
            print(f"Query: {query}")

            with get_openai_callback():
                response = self.chain.run(
                    input_documents=[Document(page_content=text)],
                    question=query
                )
            return response.strip()
        except Exception as e:
            print(f"Error processing query: {e}")
            return f"Error: {str(e)}"

    def analyze_tender(self, file_path: str) -> Dict[str, str]:
        """Run every configured query against the document.

        Args:
            file_path: Path of the tender text file.

        Returns:
            Mapping of each query's title to the model answer (or to an
            ``"Error: ..."`` string). Empty when no queries are configured.
        """
        # A thread pool cannot be created with zero workers.
        if not self.queries:
            return {}

        chunks = self.process_document(file_path)
        # NOTE(review): the whole document is sent as a single prompt; confirm
        # it fits the served model's context window.
        combined_text = " ".join(chunks)

        # One worker per query so all queries run concurrently.
        results = {}
        with ThreadPoolExecutor(max_workers=len(self.queries)) as executor:
            future_to_query = {
                executor.submit(self.process_query, query, combined_text): title
                for query, title in self.queries.items()
            }

            for future in as_completed(future_to_query):
                title = future_to_query[future]
                try:
                    response = future.result()
                    results[title] = response
                except Exception as e:
                    results[title] = f"Error: {str(e)}"

        return results

def analyze_tender_document(file_path: str) -> Dict[str, str]:
    """Analyze one tender file with a freshly constructed ``TenderAnalyzer``.

    Args:
        file_path (str): Location of the tender document on disk.

    Returns:
        Dict[str, str]: The mapping returned by
        ``TenderAnalyzer.analyze_tender`` for that file.
    """
    return TenderAnalyzer().analyze_tender(file_path)

def main(input_file: str = "/data/Pqmatch/testing/78804029/78804029.txt"):
    """Analyze a tender document and print the results as indented JSON.

    Args:
        input_file: Path of the tender text file. Defaults to the sample
            document this notebook was developed against, so calling
            ``main()`` without arguments behaves as before.

    Returns:
        The result dictionary produced by ``analyze_tender_document``.
    """
    # Imported here because this cell's import list does not include json.
    import json

    results = analyze_tender_document(input_file)
    print(json.dumps(results, indent=4))

    return results

# Run the analysis only when this code executes as the main program.
if __name__ == "__main__":
    main()

Request 1:
Query: Extract clauses that specify Pre-Qualification Criteria or eligibility criteria while strictly avoiding duplicates in any points.
{
    "Prequalification Criteria": "Here are the clauses that specify Pre-Qualification Criteria or eligibility criteria:\n\n**Section 3 - Evaluation and Qualification Criteria**\n\n1. **Nationality**: The Bidder must meet the requirement of nationality as specified in ITB 4.2.\n2. **Conflict of Interest**: The Bidder must not have any conflict of interest as specified in ITB 4.3.\n3. **Government-owned Entity**: The Bidder must meet the requirements of ITB 4.5.\n4. **ADB Eligibility**: The Bidder must meet the eligibility criteria as specified in ITB 4.3.\n5. **UN Eligibility**: The Bidder must meet the eligibility criteria as specified in ITB 4.3.\n6. **Pending Litigation**: The Bidder must not have any pending litigation as specified in ITB 2.2.\n7. **Financial Situation**: The Bidder must meet the financial situation criteria as specifi