# In [1]:
import os,sys
sys.path.append(r"/data/QAAPI/qa/lib/python3.10/site-packages/")

# In [2]:
import openai

# In [1]:
import os
# Restrict CUDA to device index 0 for this process.
# NOTE(review): a later cell reassigns CUDA_VISIBLE_DEVICES to "1" (after
# `import torch`), so this value does not survive — confirm the intended device.
os.environ['CUDA_VISIBLE_DEVICES']='0'

# In [None]:
import os
import torch
import pyodbc
import warnings
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from typing import List
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks import get_openai_callback
from opensearchpy import OpenSearch
from elasticsearch import Elasticsearch

# Query titles: maps each question sent to the QA chain (key) to the section
# title stored alongside its answer (value).
# process_folder submits one query per key, and process_query looks the title up
# by exact key, so the query strings must be passed through unchanged.
query_titles = {
    "List of all the dates mentioned in the tender document...": "Important Dates",
    "Clauses specifying prequalification requirements...": "Prequalification Criteria",
    "Technical Eligibility criteria": "Technical Requirements",
    "Performance criteria": "Performance Criteria",
    "Financial criteria": "Financial Criteria",
    "List all mandatory qualification criteria...": "Mandatory Qualification Criteria",
    "Specifications that bidders must meet": "Specifications",
    "What are the functional requirements...": "Scope of Work"
}

# Environment setup
# Restrict CUDA to device index 1.
# NOTE(review): this overrides the '0' assigned in an earlier cell and runs after
# `import torch` — confirm which device is actually intended.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
# Suppress every warning category for the rest of the process.
warnings.filterwarnings("ignore")

# Embedding model (local HuggingFace instance of BAAI/bge-small-en-v1.5).
# NOTE(review): nothing else in this file references `embedding_model`;
# process_text builds a CustomEmbeddings instance instead — confirm this is still needed.
embedding_model = HuggingFaceEmbeddings(model_name='BAAI/bge-small-en-v1.5')

# LLM model initialization (using locally hosted LLaMA 3)
# ChatOpenAI was referenced here without ever being imported, so the script
# stopped with NameError before any folder was processed. Import it next to its
# only use; langchain_community is already a dependency of this file.
from langchain_community.chat_models import ChatOpenAI

# The client targets an OpenAI-compatible endpoint on localhost; the API key is
# presumably a placeholder that the local server does not validate.
llm_model = ChatOpenAI(
    model_name="meta-llama/Meta-Llama-3-8B-Instruct",
    openai_api_base="http://localhost:8000/v1",
    openai_api_key="FAKE",
    max_tokens=4096,
    temperature=0.1
)

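# NOTE: update_elasticsearch() and process_folder() below use `es_client`,
# `opensearch_client` and `index_name`. Their only definitions in this file are
# the commented-out lines that follow, so they must be defined before running.
# # OpenSearch and Elasticsearch clients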
# index_name = 'tprocanswers'
# opensearch_client = OpenSearch(
#     hosts=['https://10.0.0.109:9200'],
#     http_auth=("admin", "4Z*lwtz,,2T:0TGu"),
#     use_ssl=True,
#     verify_certs=False,
#     ssl_show_warn=False
# )
# es_client = Elasticsearch(['http://10.0.0.200:9200'])

# # Ensure index exists
# if not opensearch_client.indices.exists(index=index_name):
#     opensearch_client.indices.create(index=index_name)

# Custom Embeddings API request
def get_embedding(text: str, timeout: float = 120.0) -> List[float]:
    """Fetch the embedding vector for *text* from the embeddings HTTP service.

    Posts a single-item batch to the ``/embeddings`` endpoint and returns the
    ``embedding`` field of the first entry in the response's ``data`` list.

    Args:
        text: The string to embed.
        timeout: Seconds to wait for the service before giving up. Without a
            timeout ``requests`` waits indefinitely, so one stalled request
            would block its worker thread forever.

    Returns:
        The embedding exactly as returned by the service.

    Raises:
        Exception: If the service answers with a non-200 status code.
        requests.exceptions.Timeout: If the service does not answer within
            *timeout* seconds.
    """
    response = requests.post(
        "http://0.0.0.0:5002/embeddings",
        json={"model": "BAAI/bge-small-en-v1.5", "input": [text]},
        timeout=timeout,
    )
    if response.status_code != 200:
        raise Exception(f"API request failed with status code {response.status_code}")
    return response.json()['data'][0]['embedding']

class CustomEmbeddings(HuggingFaceEmbeddings):
    """Embeddings adapter that delegates every request to ``get_embedding``.

    Both methods ignore the inherited embedding logic and call the HTTP
    embeddings service instead, one request per text.

    NOTE(review): the class still inherits the HuggingFaceEmbeddings
    constructor, which presumably loads a local model on every instantiation
    even though it is never used here — confirm and consider a lighter base.
    """

    def embed_query(self, text: str) -> List[float]:
        """Return the service embedding for a single query string."""
        vector = get_embedding(text)
        return vector

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Return one service embedding per input text, in input order."""
        vectors = []
        for item in texts:
            vectors.append(get_embedding(item))
        return vectors

# Load and process text
def process_text(text_docs):
    """Build a FAISS knowledge base from the loaded documents.

    The ``page_content`` of every document is joined with newlines, split on
    newlines into chunks of up to 2048 characters (32 overlap), embedded through
    ``CustomEmbeddings`` and indexed with FAISS.

    Args:
        text_docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The FAISS vector store built from the chunks.

    Raises:
        ValueError: If the documents contain no text at all.
    """
    text = "\n".join(doc.page_content for doc in text_docs)

    # An empty corpus cannot be indexed. Fail here with a clear message rather
    # than letting the vector-store build fail later with an obscure error.
    if not text.strip():
        raise ValueError("No text content to index: document list is empty or blank")

    text_splitter = CharacterTextSplitter(separator="\n", chunk_size=2048, chunk_overlap=32)
    chunks = text_splitter.split_text(text)

    # Use custom embeddings (served by the embeddings HTTP API).
    embeddings = CustomEmbeddings()
    return FAISS.from_texts(chunks, embedding=embeddings)

# Load text files
def load_text_files_from_directory(folder_path):
    """Load every ``.txt`` file directly inside *folder_path*.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A list with the documents produced by ``TextLoader`` for each matching
        file, in file-name order. Empty when no ``.txt`` file is present.

    Raises:
        OSError: If *folder_path* cannot be listed.
    """
    all_docs = []
    # os.listdir returns entries in arbitrary order. Sorting keeps the document
    # order, and therefore the chunk boundaries built from it, stable across runs.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".txt"):
            continue
        loader = TextLoader(os.path.join(folder_path, filename))
        all_docs.extend(loader.load())
    return all_docs

# Process query
def process_query(query, knowledge_base):
    """Answer one query against the knowledge base with the QA chain.

    Args:
        query: Question text; normally one of the keys of ``query_titles``.
        knowledge_base: Vector store exposing ``similarity_search``.

    Returns:
        ``{'title': ..., 'response': ...}`` on success, or
        ``{'title': ..., 'error': ...}`` when retrieval or the chain fails.
        The function does not raise for such failures.
    """
    # Resolve the title before the try block. Indexing query_titles inside the
    # except handler raised KeyError for an unknown query, which broke the
    # "always return a dict" contract. Unknown queries fall back to their own text.
    title = query_titles.get(query, query)
    try:
        docs = knowledge_base.similarity_search(query)
        chain = load_qa_chain(llm_model, chain_type='stuff')
        # The callback context is kept as in the original flow; its counters
        # are not read here.
        with get_openai_callback():
            response = chain.run(input_documents=docs, question=query)
    except Exception as e:
        return {'title': title, 'error': str(e)}
    return {'title': title, 'response': response}

# Update Elasticsearch
def update_elasticsearch(tcno):
    """Set ``ispq = 1`` on the Elasticsearch documents whose ``tcno`` matches.

    Both tender indices are searched with a term query on ``tcno``; every hit
    returned by the search gets a partial update, and one line is printed per
    updated document.

    NOTE(review): no ``size`` is passed to the search, so only the hits
    contained in the returned page are updated — confirm that is sufficient.

    Args:
        tcno: Tender identifier to match.
    """
    term_query = {"query": {"term": {"tcno": tcno}}}

    for target_index in ('tbl_tendersadvance_migration_embedding', 'tbl_tendersadvance_migration'):
        hits = es_client.search(index=target_index, body=term_query)['hits']
        if not hits['total']['value'] > 0:
            continue
        for hit in hits['hits']:
            es_client.update(index=target_index, id=hit['_id'], body={"doc": {"ispq": 1}})
            print(f"Updated tcno {tcno} with ispq = 1 in {target_index}")

# Update database
def update_database(tcno):
    """Set ``ispq = 1`` for *tcno* in ``apptender.tbl_tender`` on SQL Server.

    Args:
        tcno: Tender identifier, bound as a query parameter.

    Raises:
        pyodbc.Error: If connecting or executing the statement fails.
    """
    # NOTE(review): credentials are hard-coded here; consider loading them from
    # the environment or a secrets store.
    conn_string = (
        'DRIVER=/opt/microsoft/msodbcsql18/lib64/libmsodbcsql-18.3.so.3.1;'
        'SERVER=10.0.0.63;DATABASE=ttneo;UID=aimlpq;PWD=aimlpq;'
        'TrustServerCertificate=yes;'
    )
    # Using a pyodbc connection as a context manager ends the transaction but
    # does not close the connection, so close it explicitly. This function is
    # called from many worker threads.
    conn = pyodbc.connect(conn_string)
    try:
        cursor = conn.cursor()
        try:
            cursor.execute("UPDATE apptender.tbl_tender SET ispq = 1 WHERE tcno = ?", (tcno,))
            conn.commit()
        finally:
            cursor.close()
        print(f"Database updated successfully for tcno {tcno}")
    finally:
        conn.close()

# Folder processing
def process_folder(tcno):
    """Run every configured query against one tender folder and store the answers.

    Steps: load the folder's ``.txt`` files, build the knowledge base, run all
    queries from ``query_titles`` concurrently, index the combined answers in
    OpenSearch under ``id=tcno``, then flag the tender in Elasticsearch and in
    the database.

    A query that fails is stored with its error text as the section response.
    Any exception raised in this function is caught and reported with a printed
    message, so one bad folder does not stop the batch.

    NOTE(review): ``opensearch_client`` and ``index_name`` are only defined in
    commented-out code in this file — they must exist before this runs.

    Args:
        tcno: Folder name (tender identifier) under the daily document root.
    """
    try:
        folder_path = f"/data/txtfolder/dailydocument_11-10-24_txt/{tcno}"
        all_docs = load_text_files_from_directory(folder_path)
        knowledge_base = process_text(all_docs)

        with ThreadPoolExecutor() as executor:
            futures = [executor.submit(process_query, query, knowledge_base) for query in query_titles]
            # Collect in submission order so the stored document lists its
            # sections in query_titles order on every run. Completion order
            # varies between runs.
            results = [future.result() for future in futures]

        json_response = {
            "results": [
                {"title": result["title"], "response": result.get("response", result.get("error"))}
                for result in results
            ]
        }
        opensearch_client.index(index=index_name, id=tcno, body=json_response)

        update_elasticsearch(tcno)
        update_database(tcno)

    except Exception as e:
        # Deliberate best-effort: report and move on to the next folder.
        print(f"Failed to process folder {tcno}: {str(e)}")

# Process all folders in parallel
def process_folders_in_parallel():
    """Dispatch ``process_folder`` for every sub-directory of the daily document root.

    Each directory name is treated as a tender number and handed to a pool of
    24 worker threads; the call returns once the pool has shut down.

    Raises:
        OSError: If the base folder cannot be listed.
    """
    base_folder_path = "/data/txtfolder/dailydocument_11-10-24_txt"

    # Keep only directories; plain files in the root are ignored.
    tcno_folders = []
    for entry in os.listdir(base_folder_path):
        if os.path.isdir(os.path.join(base_folder_path, entry)):
            tcno_folders.append(entry)

    pool = ThreadPoolExecutor(max_workers=24)
    with pool:
        pool.map(process_folder, tcno_folders)

# Main function: start the batch only when this file is executed as a script,
# not when it is imported.
if __name__ == '__main__':
    process_folders_in_parallel()
