In [36]:
import os
import psycopg2
from psycopg2.extras import execute_values
import warnings
import csv
from sentence_transformers import SentenceTransformer
from groq import Groq
import re

# Silence FutureWarning noise emitted by third-party libraries at import/use time.
warnings.filterwarnings("ignore", category=FutureWarning)

# Load SBERT model (used for both paragraph and question embeddings)
model = SentenceTransformer('paraphrase-distilroberta-base-v1')

# Initialize Groq API
# The API key is a secret and must never be committed to source control; it is
# read from the GROQ_API_KEY environment variable instead. If the variable is
# unset the Groq client is expected to raise its own "missing api_key" error.
groq_client = Groq(
    api_key=os.environ.get("GROQ_API_KEY"),
)

# System prompt sent as the "system" message of every Groq chat request.
# Each tuple element is one line of the prompt; empty strings are blank lines.
system_prompt = "\n".join((
    "You are a highly precise and concise assistant. Follow these principles when generating responses:",
    "",
    "1. Keep responses as short as possible while fully addressing the user's query.",
    "2. Provide only essential information—avoid unnecessary details or elaboration.",
    "3. Ensure factual accuracy and avoid guessing or fabricating information.",
    "4. Use bullet points or numbered lists for clarity if needed.",
    "5. If clarification is required, ask brief and specific follow-up questions.",
    "6. Acknowledge uncertainty and suggest external resources only if necessary.",
    "",
    "Additional Behavior:",
    "- If asked your name, respond with: 'EVA'.",
    "- If asked about your designed or developed or developers or created, respond with: 'Rajat, Satyam, Sandeep, students from Sitare University under the guidance of Dr. Kushal Shah.'",
    "- If asked about your purpose or work, respond with: 'I am designed to assist with AI/ML-related queries and provide support for technical tasks.'",
    "- Always stay precise, relevant, and focused on the query.",
))



# Database connection
def get_db_connection():
    """Open and return a new psycopg2 connection to the smartsearch database.

    Settings come from the standard libpq environment variables so that no
    credentials live in source control:

    - PGHOST      (default: "localhost")
    - PGDATABASE  (default: "smartsearch_db")
    - PGUSER      (default: "postgres")
    - PGPASSWORD  (no default)

    The caller owns the returned connection and is responsible for closing it.
    """
    return psycopg2.connect(
        host=os.environ.get("PGHOST", "localhost"),
        database=os.environ.get("PGDATABASE", "smartsearch_db"),
        user=os.environ.get("PGUSER", "postgres"),
        # NOTE(review): when PGPASSWORD is unset this passes None, which
        # psycopg2 is expected to omit from the DSN so libpq can fall back to
        # its own lookup (e.g. ~/.pgpass) - confirm for the deployed version.
        password=os.environ.get("PGPASSWORD"),
    )


def init_db():
    """Create the pgvector extension, the smartsearch table and its index.

    Every statement uses IF NOT EXISTS, so calling this more than once is
    harmless. All statements are committed together; if any of them fails the
    connection is closed without committing.
    """
    conn = get_db_connection()
    try:
        # The cursor context manager closes the cursor even when execute() raises.
        with conn.cursor() as cur:
            cur.execute("CREATE EXTENSION IF NOT EXISTS vector")

            # One row per paragraph; `embedding` holds a 768-dimensional vector.
            cur.execute("""
                CREATE TABLE IF NOT EXISTS smartsearch (
                    id SERIAL PRIMARY KEY,
                    book_author TEXT,
                    book_name TEXT,
                    book_url TEXT,
                    chapter_name TEXT,
                    chapter_number TEXT,
                    page INTEGER,
                    paragraph INTEGER,
                    text TEXT,
                    topic TEXT,
                    embedding vector(768)
                )
            """)

            # Cosine-distance index backing the `<=>` ordering used when querying.
            cur.execute("""
                CREATE INDEX IF NOT EXISTS embedding_idx ON smartsearch USING ivfflat (embedding vector_cosine_ops)
            """)

        conn.commit()
    finally:
        # Always release the connection, including on failure.
        conn.close()

# Compute embeddings for text
def compute_embedding(text):
    """Encode *text* with the module-level SBERT model and return it as a plain list."""
    vector = model.encode(text)
    return vector.tolist()

# Process CSV file
def process_csv_file(csv_file, batch_size=1000):
    """Load a CSV of paragraphs into the smartsearch table in batches.

    Rows are read with csv.DictReader and handed to process_csv_chunk in
    groups of *batch_size*. Everything is committed once at the end, so a
    failure part-way through leaves the table unchanged.

    Args:
        csv_file: Path to a UTF-8 encoded CSV file with a header row.
        batch_size: Maximum number of rows embedded and inserted per batch.
            Values below 1 cause a flush after every row.
    """
    conn = get_db_connection()
    try:
        # newline='' is what the csv module requires so that line breaks
        # embedded inside quoted fields are parsed correctly.
        with conn.cursor() as cur, \
                open(csv_file, 'r', encoding='utf-8', newline='') as file:
            chunk = []
            for row in csv.DictReader(file):
                chunk.append(row)
                # `>=` (not `==`) so a batch_size below 1 still flushes.
                if len(chunk) >= batch_size:
                    process_csv_chunk(chunk, cur)
                    chunk = []

            # Flush the final, partially filled batch.
            if chunk:
                process_csv_chunk(chunk, cur)

        conn.commit()
    finally:
        # Release the connection even if the file is missing or a row is bad.
        conn.close()

# Process a chunk of CSV data
def process_csv_chunk(chunk, cur):
    """Embed every row of *chunk* and bulk-insert the rows through cursor *cur*.

    Each row is a mapping keyed by the CSV header names ('Book Author',
    'Book Name', 'Book URL', 'Chapter Name', 'Chapter Number', 'Page',
    'Paragraph', 'Text', 'Topic'). 'Page' and 'Paragraph' are converted to int.
    Nothing is committed here; that is left to the caller.
    """
    insert_sql = """
        INSERT INTO smartsearch (book_author, book_name, book_url, chapter_name, chapter_number, page, paragraph, text, topic, embedding)
        VALUES %s
    """

    def to_record(row):
        # The embedding is computed first, before the other columns are read.
        vector = compute_embedding(row['Text'])
        return (
            row['Book Author'], row['Book Name'], row['Book URL'], row['Chapter Name'],
            row['Chapter Number'], int(row['Page']), int(row['Paragraph']),
            row['Text'], row['Topic'], vector,
        )

    records = [to_record(row) for row in chunk]
    execute_values(cur, insert_sql, records)

# Querying the database
def query_paragraphs(question, limit=10):
    """Return the stored paragraphs closest to *question* by cosine distance.

    Args:
        question: Free-text query; it is embedded with compute_embedding.
        limit: Maximum number of rows to return (default 10).

    Returns:
        A list of DictCursor rows holding every smartsearch column plus a
        `similarity` column (1 - cosine distance), ordered nearest first.
    """
    # Generate embedding for the question
    query_embedding = compute_embedding(question)

    conn = get_db_connection()
    try:
        with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
            # The embedding is bound twice: once for the similarity column and
            # once for the ORDER BY distance expression.
            cur.execute("""
                SELECT *, 1 - (embedding <=> %s::vector) AS similarity
                FROM smartsearch
                ORDER BY embedding <=> %s::vector
                LIMIT %s
            """, (query_embedding, query_embedding, limit))
            return cur.fetchall()
    finally:
        # Release the connection even when the query fails.
        conn.close()


import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 1. Filtering functions
def contains_inappropriate_content(text):
    """
    Report whether *text* should be rejected.

    Text is rejected when any of the regex patterns below matches
    (case-insensitively), or when is_random_string() flags it.
    """
    flagged_patterns = (
        r"\b(?:f\*?u\*?c\*?k|s\*?h\*?i\*?t|b\*?i\*?t\*?c\*?h)\b",  # Common profanities with optional masking
        r"\b(?:a\*?s\*?s|d\*?a\*?mn|c\*?u\*?n\*?t)\b",
        r"[^\w\s]{3,}",  # Strings with excessive symbols
        r"(.)\1{3,}",  # Repeated characters like "aaaa" or "!!!!"
    )

    # Any single regex hit is enough; the heuristic check is only the fallback.
    if any(re.search(regex, text, re.IGNORECASE) for regex in flagged_patterns):
        return True

    return bool(is_random_string(text))

def is_random_string(text):
    """
    Heuristically flag text that looks too short or too random.

    Returns True when the text has fewer than five whitespace-separated words,
    or when the ratio of distinct characters to total characters exceeds 0.8.
    """
    # The word-count test runs first, so empty text never reaches the division.
    return len(text.split()) < 5 or len(set(text)) / len(text) > 0.8

# 2. Main refinement function
def refine_and_answer_with_groq(question, paragraphs, top_k=3):
    """Pick the most relevant paragraphs for *question* and ask Groq to answer.

    Paragraphs rejected by contains_inappropriate_content are dropped. The
    rest are ranked against the question by TF-IDF cosine similarity, and the
    best *top_k* are sent to the chat model as context.

    Args:
        question: The user's question.
        paragraphs: Row-like objects exposing the paragraph under key 'text'.
        top_k: Number of paragraphs to pass as context (default 3).

    Returns:
        A dict with "response" (the model's answer, stripped) and
        "top_paragraphs" (the context paragraphs, most similar first).
    """
    # Filter inappropriate paragraphs
    filtered_paragraphs = [
        para['text'] for para in paragraphs
        if not contains_inappropriate_content(para['text'])
    ]

    # Rank only when something survived the filter: cosine_similarity cannot
    # be computed against an empty matrix.
    top_indices = []
    if filtered_paragraphs and top_k > 0:
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform([question] + filtered_paragraphs)
        # Row 0 is the question; the remaining rows are the paragraphs.
        similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
        # argsort is ascending, so reverse it and keep the best top_k.
        top_indices = similarities.argsort()[::-1][:top_k]

    top_paragraphs = [filtered_paragraphs[idx] for idx in top_indices]
    # Paragraph numbers refer to positions in the filtered list.
    top_context = "\n\n".join(
        f"Paragraph {idx + 1}: {filtered_paragraphs[idx]}" for idx in top_indices
    )

    # Generate response using system prompt
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Question: {question}\n\nContext:\n{top_context}"}
    ]
    chat_completion = groq_client.chat.completions.create(
        messages=messages,
        model="llama3-8b-8192",
    )
    # Treat a missing message content as an empty answer instead of failing.
    response = (chat_completion.choices[0].message.content or "").strip()

    return {
        "response": response,
        "top_paragraphs": top_paragraphs
    }



# Main Function
def main():
    """Run one interactive question-answer round on the console.

    Steps: initialise the database on first run (tracked by a flag file in
    the working directory), read a question, fetch the nearest paragraphs,
    ask Groq for an answer, then print the answer together with the database
    rows the chosen context paragraphs came from.
    """
    flag_path = 'db_initialized.flag'

    # Initialize the database if needed
    if not os.path.exists(flag_path):
        print("Initializing database...")
        init_db()
        with open(flag_path, 'w') as flag_file:
            flag_file.write('Database initialized')
        print("Database initialized.")
    else:
        print("Database already initialized.")

    # Ask for user input
    question = input("Enter your question: ").strip()
    if not question:
        # Nothing to search for; skip the database and API round-trips.
        print("No question entered.")
        return

    print("\nSearching for relevant paragraphs...\n")
    top_results = query_paragraphs(question)

    if not top_results:
        print("No relevant results found.")
        return

    answer = refine_and_answer_with_groq(question, top_results)

    print("answer ", answer['top_paragraphs'])
    print("top_results ", top_results)
    print("\n\n")
    print("\nGenerated Answer:")
    for key, value in answer.items():
        print(key, value)
        print("\n")

    # Map each context paragraph back to the database row it came from.
    matched_rows = []
    for ans in answer["top_paragraphs"]:
        for idx, result in enumerate(top_results):
            # Compare by column name and exact text: a positional index breaks
            # if the table layout changes, and a substring test could match a
            # longer paragraph that merely contains this one.
            if ans == result['text']:
                print(f"Answer found at index {idx} in top_results")
                matched_rows.append(result)
                break
        else:
            print("Answer not found in top_results")
    print("ans", matched_rows)


if __name__ == "__main__":
    main()



Database already initialized.

Searching for relevant paragraphs...

answer  ['In this section we examine what to do when the data are not i.i.d.; when they can change over time. In this case, it matters when we make a prediction, so we will adopt the perspective called online learning: an agent receives an input xj from nature, predicts the corresponding ONLINE LEARNING yj, and then is told the correct answer. Then the process repeats with xj+1, and so on. One might think this task is hopeless—if nature is adversarial, all the predictions may be wrong.', 'AB = BA = In (11.4) The operation of inverting a matrix is indicated by a −1 superscript next to the matrix; for example, A−1. The result of the operation is referred to as the inverse of the original matrix; for example, B is the inverse of A.', 'This chapter covers the following topics: • Traditional vs. modern (big) data • SMACK in a nutshell • S park, the engine • M esos, the container • A kka, the model • C assandra, the storage