In [None]:
import pyserini
from pyserini.search.lucene import LuceneSearcher
from pyserini.index import LuceneIndexReader
from IPython.core.display import display, HTML
from pyserini.search import get_topics
import heapq
import re
from typing import List, Tuple
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import requests

# Fetch the NLTK data used by extract_keywords_rake: the tokenizer models
# behind word_tokenize and the English stop-word list. Newer NLTK releases
# load the tokenizer from 'punkt_tab' rather than the pickled 'punkt'
# package, so both are requested to keep word_tokenize working either way.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)

def initialize_model(index_path: str) -> Tuple[LuceneSearcher, LuceneIndexReader]:
    """Open one index with both a searcher and an index reader.

    Args:
        index_path: Identifier handed unchanged to ``from_prebuilt_index`` of
            both classes (presumably a Pyserini prebuilt-index name rather
            than a filesystem path -- confirm against callers).

    Returns:
        A ``(searcher, reader)`` pair, created in that order.
    """
    return (
        LuceneSearcher.from_prebuilt_index(index_path),
        LuceneIndexReader.from_prebuilt_index(index_path),
    )

def generate_token_mapping(docid: str, doc_vec: dict, reader: LuceneIndexReader) -> dict:
    """Map analyzed terms of a document back to a readable surface word.

    The raw document text is lower-cased and split on whitespace. Each word is
    run through ``reader.analyze``; every resulting token that is also a key
    of ``doc_vec`` is mapped to that word with all non-word characters
    removed. When several words produce the same token, the last one in the
    document wins.

    Args:
        docid: Identifier of the document to read from the index.
        doc_vec: Document vector whose keys are the analyzed terms of interest.
        reader: Index reader used to fetch the raw text and analyze words.

    Returns:
        A dict from analyzed term to a cleaned surface word. Terms that no
        whitespace-delimited word analyzes to are absent from the result.
    """
    raw_text = reader.doc(docid).raw().lower()
    mapping = {}
    for word in re.split(r'\s+', raw_text):
        tokens = reader.analyze(word)
        if not tokens:
            continue
        # Clean the word once, and only for words that produced tokens.
        surface = re.sub(r'\W+', '', word)
        # Walk the handful of tokens for this word and test membership in
        # doc_vec, instead of scanning the whole document vector per word.
        for token in tokens:
            if token in doc_vec:
                mapping[token] = surface
    return mapping

def get_relevant_terms(query: str, n: int, searcher: LuceneSearcher, reader: LuceneIndexReader, max_terms: int = 5) -> List[Tuple[float, str]]:
    """Score the terms of the top hits for ``query`` and return the best ones.

    For every hit, each term of its document vector receives the score
    ``(term frequency / number of distinct terms in the document) *
    (documents in the collection / document frequency)``. A term whose
    document frequency cannot be determined scores 0.

    Args:
        query: Query string passed to ``searcher.search``.
        n: Number of hits to retrieve.
        searcher: Searcher used to run the query.
        reader: Index reader used for document vectors and term statistics.
        max_terms: Maximum number of scored terms to return.

    Returns:
        Up to ``max_terms`` ``(score, word)`` tuples, highest score first.
        The same word can appear more than once, since one entry is produced
        per hit that contains it.
    """
    hits = searcher.search(query, n)
    # The collection size does not change during the call; read it once
    # rather than once per term.
    num_docs = reader.stats()['documents']

    all_terms = []
    for hit in hits:
        doc_vec = reader.get_document_vector(hit.docid)
        if not doc_vec:
            # No stored (or an empty) document vector: nothing to score.
            continue
        surface_words = generate_token_mapping(hit.docid, doc_vec, reader)
        distinct_terms = len(doc_vec)

        for term, freq in doc_vec.items():
            tf = freq / distinct_terms
            try:
                # Document-vector terms are already analyzed; analyzer=None
                # looks them up verbatim instead of analyzing them again.
                df = reader.get_term_counts(term, analyzer=None)[0]
            except Exception:
                # Best effort: a failed lookup yields a zero score.
                df = 0
            idf = num_docs / df if df != 0 else 0
            # Fall back to the analyzed term when no surface word was found.
            all_terms.append((tf * idf, surface_words.get(term, term)))

    return heapq.nlargest(max_terms, all_terms)

def evaluate_follow_up_query(query: str, relevant_terms: List[str], searcher: LuceneSearcher, reader: LuceneIndexReader, k: int = 10) -> float:
    """Run ``query`` expanded with ``relevant_terms`` and score the result list.

    The expanded query is the original query followed by the space-joined
    terms. The score is the mean of ``(rank + 1) / num_hits`` over all
    returned hits.

    NOTE(review): the score depends only on how many hits come back, not on
    which documents they are -- confirm this is the intended metric.

    Args:
        query: Original query string.
        relevant_terms: Terms appended to the query.
        searcher: Searcher used to run the expanded query.
        reader: Unused here; kept so existing callers keep working.
        k: Number of hits to request.

    Returns:
        The score as a float, or ``0.0`` when the search returns no hits.
    """
    enhanced_query = f"{query} {' '.join(relevant_terms)}"
    hits = searcher.search(enhanced_query, k)
    num_hits = len(hits)
    if num_hits == 0:
        # An empty result list would otherwise divide by zero below.
        return 0.0
    return sum((rank + 1) / num_hits for rank in range(num_hits)) / num_hits

def extract_keywords_rake(text: str, n: int = 5) -> List[str]:
    """Return the most frequent meaningful tokens of ``text``.

    The text is lower-cased and tokenized with ``word_tokenize``. English
    stop words and tokens that are not purely alphanumeric are dropped, and
    the remaining tokens are counted.

    Args:
        text: Input text to extract keywords from.
        n: Number of keywords to keep.

    Returns:
        The first ``n`` distinct tokens ordered by descending count.
    """
    ignored = set(stopwords.words('english'))
    counts = {}

    for token in word_tokenize(text.lower()):
        if token in ignored or not token.isalnum():
            continue
        counts[token] = counts.get(token, 0) + 1

    ranked = sorted(counts, key=counts.get, reverse=True)
    return ranked[:n]

def format_response(query: str, relevant_terms: List[Tuple[float, str]], keywords: List[str]) -> str:
    """Build the user-facing summary for a query.

    Args:
        query: The query as entered by the user.
        relevant_terms: ``(score, term)`` pairs; only the first five are shown,
            numbered from 1 with the score printed to two decimals.
        keywords: Keywords listed comma-separated after the terms.

    Returns:
        The complete multi-line message, ending with the refinement question.
    """
    header = (
        f"Your query: '{query}'\n\n"
        "Based on your query, here are some relevant terms that might help refine your search:\n"
    )
    term_lines = "".join(
        f"{rank}. {term} (relevance score: {score:.2f})\n"
        for rank, (score, term) in enumerate(relevant_terms[:5], 1)
    )
    keyword_line = ", ".join(keywords)

    return "".join([
        header,
        term_lines,
        "\nKeywords extracted from your query:\n",
        keyword_line,
        "\n\nWould you like to refine your search using any of these terms or keywords?",
    ])

def google_search_api(query: str, api_key: str, num_results: int = 10, timeout: float = 10.0):
    """Query the SearchApi Google engine and return the organic results.

    Any failure is reported on stdout and yields an empty list, so callers
    can treat "no results" and "request failed" alike.

    Args:
        query: Search query.
        api_key: SearchApi key, sent as the ``api_key`` request parameter.
        num_results: Value sent as the ``num`` request parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``{"title": ..., "link": ...}`` dicts, one per entry of the
        response's ``organic_results``; missing fields become ``"No Title"``
        or ``"No Link"``. The list is empty on any error or when the response
        has no ``organic_results``.
    """
    base_url = "https://www.searchapi.io/api/v1/search"
    params = {
        "q": query,
        "num": num_results,
        "engine": "google",
        "api_key": api_key
    }

    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(base_url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Report only the exception type: its message can embed the full
        # request URL, which carries the API key.
        print(f"Error: request failed ({type(exc).__name__})")
        return []

    if response.status_code != 200:
        print(f"Error: {response.status_code} - {response.text}")
        return []

    try:
        data = response.json()
    except ValueError:
        # A 200 response with a non-JSON body.
        print("Error: response body is not valid JSON")
        return []

    organic = data["organic_results"] if "organic_results" in data else []
    return [
        {
            "title": result.get("title", "No Title"),
            "link": result.get("link", "No Link")
        }
        for result in organic
    ]

def run_model_with_google_api(query: str, api_key: str):
    """Search for ``query`` and print a report of the results.

    The report lists every result, the first five result titles (used as
    "relevant terms"), a follow-up query built from the query plus the first
    three titles, the keywords extracted from the query, and the formatted
    response. If the search returns nothing, a single notice is printed.

    Args:
        query: Search query entered by the user.
        api_key: API key forwarded to ``google_search_api``.
    """
    print(f"Running query: {query}")

    search_results = google_search_api(query, api_key)

    # Guard clause: nothing to report when the search came back empty.
    if not search_results:
        print("No results returned from the Google Search API.")
        return

    print("\nTop Search Results:")
    for position, hit in enumerate(search_results, start=1):
        print(f"{position}. {hit['title']} - {hit['link']}")

    titles = [hit['title'] for hit in search_results]
    top_titles = titles[:5]

    print("\nRelevant Terms Extracted:")
    for title in top_titles:
        print(title)

    expansion = ' '.join(titles[:3])
    follow_up_query = f"{query} {expansion}"
    print(f"\nFollow-up Query: {follow_up_query}")

    keywords = extract_keywords_rake(query)
    print("\nExtracted Keywords:")
    print(", ".join(keywords))

    # Titles carry no real score, so each is given a fixed 1.0.
    scored_titles = [(1.0, title) for title in top_titles]
    formatted_response = format_response(query, scored_titles, keywords)
    print("\nFormatted Response:")
    print(formatted_response)

def conversational_interface(api_key: str):
    """Run the interactive query loop on the console.

    Each round reads a query, runs it, and offers one refinement. Typing
    ``exit`` ends the session; so does closing stdin or pressing Ctrl-C.
    Surrounding whitespace is ignored in every answer, and blank queries are
    skipped without calling the search API.

    Args:
        api_key: API key forwarded to ``run_model_with_google_api``.
    """
    print("Welcome to the Information Retrieval System!")
    print("Enter your query or type 'exit' to quit.")

    try:
        while True:
            query = input("\nYour query: ").strip()
            if query.lower() == 'exit':
                break
            if not query:
                # Nothing to search for; ask again instead of spending an API call.
                continue

            run_model_with_google_api(query, api_key)

            refine = input("\nWould you like to refine your search? (yes/no): ")
            if refine.strip().lower() == 'yes':
                refined_query = input("Enter your refined query: ").strip()
                if refined_query:
                    run_model_with_google_api(refined_query, api_key)
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or Ctrl-C ends the session the same way 'exit' does,
        # without a traceback. The bare print() finishes the prompt line.
        print()

    print("Thank you for using the Information Retrieval System!")

if __name__ == "__main__":
    import os

    # The API key is read from the environment so that the credential is
    # never stored in the source file or committed to version control.
    API_KEY = os.environ.get("SEARCHAPI_API_KEY", "")
    if not API_KEY:
        raise SystemExit("Set the SEARCHAPI_API_KEY environment variable to your SearchApi API key.")
    conversational_interface(API_KEY)
