In [1]:
!apt-get install openjdk-21-jre-headless -qq > /dev/null
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-21-openjdk-amd64"
!update-alternatives --set java /usr/lib/jvm/java-21-openjdk-amd64/bin/java
!java -version

openjdk version "21.0.5" 2024-10-15
OpenJDK Runtime Environment (build 21.0.5+11-Ubuntu-1ubuntu122.04)
OpenJDK 64-Bit Server VM (build 21.0.5+11-Ubuntu-1ubuntu122.04, mixed mode, sharing)


In [24]:
import pyserini
from pyserini.search.lucene import LuceneSearcher
from pyserini.index import LuceneIndexReader
from pyserini.search import get_topics
from IPython.core.display import display, HTML
import heapq
import re
from typing import List, Tuple
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import requests
import math
from bs4 import BeautifulSoup

NUM_TERMS = 10

nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)

def initialize_model(index_path: str) -> Tuple[LuceneSearcher, LuceneIndexReader]:
    searcher = LuceneSearcher.from_prebuilt_index(index_path)
    reader = LuceneIndexReader.from_prebuilt_index(index_path)
    return searcher, reader

def generate_token_mapping(m: dict, docid: str, doc_vec: dict, reader: LuceneIndexReader) -> dict:
    doc = reader.doc(docid).raw().lower()
    for word in re.split(r'\s+', doc):
        analyzed = reader.analyze(word)
        for t in doc_vec:
            if t in analyzed:
                word = re.sub(r'\W+', '', word)
                if t not in m:
                  m[t] = word

def get_doc_title(docid: str, reader: LuceneIndexReader) -> str:
    soup = BeautifulSoup(reader.doc(docid).raw(), 'html.parser')
    try:
      headline = soup.find("headline").get_text()
      headline = re.sub(r'\s+', ' ', headline)
    except AttributeError:
      headline = ""
    return headline

def get_relevant_terms(query: str, n: int, searcher: LuceneSearcher, reader: LuceneIndexReader) -> List[Tuple[float, str]]:
    hits = searcher.search(query, n)
    all_terms = {}
    m = {}
    appearances = {}
    titles = []
    for i in hits:
        titles.append(get_doc_title(i.docid, reader))
        doc_vec = reader.get_document_vector(i.docid)
        generate_token_mapping(m, i.docid, doc_vec, reader)
        for t in doc_vec:
            tf = doc_vec[t] / len(doc_vec)
            idf = reader.stats()['documents'] / (reader.get_term_counts(t, analyzer=None)[0] + 1)
            tf_idf = math.log(tf * idf) if (tf * idf) > 0 else 0
            if m[t] not in all_terms:
                all_terms[m[t]] = tf_idf
            else:
                all_terms[m[t]] += tf_idf
            # track appearances to add a document appearance threshold
            if t not in appearances:
                appearances[t] = 1
            else:
                appearances[t] += 1
    # penalize terms that only occur in one document
    for t in appearances:
        if appearances[t] < 2:
            all_terms[m[t]] = 0
    most_rel = heapq.nlargest(n, all_terms, key=all_terms.get)
    return titles, most_rel


def evaluate_follow_up_query(query: str, relevant_terms: List[str], searcher: LuceneSearcher, reader: LuceneIndexReader, k: int = 10) -> float:
    enhanced_query = f"{query} {' '.join(relevant_terms)}"
    hits = searcher.search(enhanced_query, k)
    return sum((i + 1) / len(hits) for i in range(len(hits))) / len(hits)

def extract_keywords_rake(text: str, n: int = 5) -> List[str]:
    stop_words = set(stopwords.words('english'))
    word_scores = {}

    words = word_tokenize(text.lower())
    for word in words:
        if word not in stop_words and word.isalnum():
            if word not in word_scores:
                word_scores[word] = 1
            else:
                word_scores[word] += 1

    return sorted(word_scores, key=word_scores.get, reverse=True)[:n]

def format_response(query: str, relevant_terms: List[Tuple[float, str]], keywords: List[str]) -> str:
    response = f"Your query: '{query}'\n\n"
    response += "Based on your query, here are some relevant terms that might help refine your search:\n"
    for i, (score, term) in enumerate(relevant_terms[:5], 1):
        response += f"{i}. {term} (relevance score: {score:.2f})\n"

    response += "\nKeywords extracted from your query:\n"
    response += ", ".join(keywords)

    response += "\n\nWould you like to refine your search using any of these terms or keywords?"
    return response

def google_search_api(query: str, api_key: str, num_results: int = 10):
    base_url = "https://www.searchapi.io/api/v1/search"
    params = {
        "q": query,
        "num": num_results,
        "engine": "google",
        "api_key": api_key
    }

    response = requests.get(base_url, params=params)

    if response.status_code == 200:
        data = response.json()
        results = []
        if "organic_results" in data:
            for result in data["organic_results"]:
                results.append({
                    "title": result.get("title", "No Title"),
                    "link": result.get("link", "No Link")
                })
        return results
    else:
        print(f"Error: {response.status_code} - {response.text}")
        return []

def run_model(query: str, s: LuceneSearcher, r: LuceneIndexReader):
    print(f"Running query: {query}")
    titles, relevant_terms = get_relevant_terms(query, NUM_TERMS, s, r)
    print("\nTop Search Results:")
    for idx, t in enumerate(titles):
        print(f"{idx}. {t}")
    print("\nRelevant Terms Extracted:")
    for term in relevant_terms[:5]:
        print(term)
    follow_up_query = f"{query} {' '.join(relevant_terms[:3])}"
    print(f"\nFollow-up Query: {follow_up_query}")

    keywords = extract_keywords_rake(query)
    print("\nExtracted Keywords:")
    print(", ".join(keywords))

    formatted_response = format_response(query, [(1.0, term) for term in relevant_terms[:5]], keywords)
    print("\nFormatted Response:")
    print(formatted_response)



def run_model_with_google_api(query: str, api_key: str):
    print(f"Running query: {query}")

    search_results = google_search_api(query, api_key)

    if search_results:
        print("\nTop Search Results:")
        for idx, result in enumerate(search_results, start=1):
            print(f"{idx}. {result['title']} - {result['link']}")

        relevant_terms = [result['title'] for result in search_results]

        print("\nRelevant Terms Extracted:")
        for term in relevant_terms[:5]:
            print(term)

        follow_up_query = f"{query} {' '.join(relevant_terms[:3])}"
        print(f"\nFollow-up Query: {follow_up_query}")

        keywords = extract_keywords_rake(query)
        print("\nExtracted Keywords:")
        print(", ".join(keywords))

        formatted_response = format_response(query, [(1.0, term) for term in relevant_terms[:5]], keywords)
        print("\nFormatted Response:")
        print(formatted_response)
    else:
        print("No results returned from the Google Search API.")


def conversational_interface(api_key: str):
    print("Welcome to the Information Retrieval System!")
    print("Enter your query or type 'exit' to quit.")

    s, r = initialize_model("robust04")

    while True:
        query = input("\nYour query: ")
        if query.lower() == 'exit':
            break

        #run_model_with_google_api(query, api_key)
        run_model(query, s, r)

        refine = input("\nWould you like to refine your search? (yes/no): ")
        if refine.lower() == 'yes':
            refined_query = input("Enter your refined query: ")
            #run_model_with_google_api(refined_query, api_key)
            run_model(refined_query)

    print("Thank you for using the Information Retrieval System!")

if __name__ == "__main__":
    API_KEY = API_KEY = "EKU2jbjfUzRv2uWRTZGSEpyk"  # Replace with your actual API key
    conversational_interface(API_KEY)


Welcome to the Information Retrieval System!
Enter your query or type 'exit' to quit.

Your query: telemarketer protection
Running query: telemarketer protection

Top Search Results:
0.  AT&T, MCI SETTLE SUITS INVOLVING THEFT OF CUSTOMERS; COMMUNICATIONS: THE FIRMS ACCUSED EACH OTHER OF SWITCHING CONSUMERS WITHOUT THEIR CONSENT. BOTH NOW WANT THE FCC TO SET CONDUCT STANDARDS. 
1.  FT 18 JUN 92 / Telecommunications In Business (3): Area with high potential -Telemarketing 
2. 
3. 
4.  3 MEN CONVICTED IN $5-MILLION TELEMARKETING SCAM 
5.  'BOILER ROOM' CON ARTISTS USE CRACKS IN LAW 
6.  MAIL FRAUD COUNTS FILED AGAINST BREA MAN 
7.  STIFFER SENTENCES FOR CON MEN CHEER FRAUD TASK FORCE LEADERS 
8.  CINCINNATI BELL BUYS TELEMARKETING FIRM 
9.  VIEWPOINTS; TELEPHONE TECHNOLOGY VS. PRIVACY 

Relevant Terms Extracted:
telemarketers
ratepayers
scam
defrauded
boiler

Follow-up Query: telemarketer protection telemarketers ratepayers scam

Extracted Keywords:
telemarketer, protection

Formatted Res