In [None]:
# Long version - does the entire processing from scratch.

# ---------------------------- SETUP ---------------------------- #

# Imports the necessary libraries
import os
import time
import csv
import pandas as pd
import numpy as np
import openai
import chromadb
from sentence_transformers import SentenceTransformer
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests

# Load the OpenAI API key from the environment. The literal below is only a
# placeholder for manual entry; it is rejected by the validation that follows.
openai_api_key = os.getenv("OPENAI_API_KEY") or "your api key"

# Validate the key BEFORE constructing the client. The placeholder above begins
# with "your", so matching on that prefix catches both the "your api key" and
# the "your-..." spellings of an unfilled placeholder.
if not openai_api_key or openai_api_key.lower().startswith("your"):
    raise ValueError("ERROR: OpenAI API key is missing or incorrect.")

client = openai.OpenAI(api_key=openai_api_key)

# Only the first five characters are echoed so the full secret never reaches
# the notebook output.
print(f"Using OpenAI API Key: {openai_api_key[:5]}********")

# Locations of the input metadata and of the CSV files this pipeline uses.
# Adjust these to match where the files live in your environment.
(
    metadata_file,
    cleaned_file,
    sample_file,
    evaluation_file,
) = (
    "/Code Execution/gutenberg_metadata.csv",
    "/Code Execution/gutenberg_data_cleaned.csv",
    "/Code Execution/gutenberg_sample.csv",
    "/Code Execution/evaluation_results.csv",
)

# Read the book metadata table; download_books() later reads its "Link",
# "Title", "Author" and "Bookshelf" columns.
df_metadata = pd.read_csv(filepath_or_buffer=metadata_file)

In [None]:
# ---------------------------- DATA CLEANING ---------------------------- #
def clean_text(text):
    """
    Cleans raw book text by collapsing whitespace and stripping the
    Project Gutenberg header and footer boilerplate.

    Both spellings of the boundary markers are recognised
    ("*** START OF THIS PROJECT GUTENBERG" and
    "*** START OF THE PROJECT GUTENBERG", likewise for END). The
    boilerplate is only removed when a start marker and an end marker are
    both present; otherwise the whitespace-normalised text is returned.

    Args:
        text (str): Raw text from a book.

    Returns:
        str: Cleaned text, or "" when `text` is not a string.
    """
    if not isinstance(text, str):
        return ""

    # str.split() without arguments splits on any run of whitespace
    # (newlines, tabs, carriage returns, repeated spaces), so re-joining on
    # a single space normalises all of it in one pass.
    text = " ".join(text.split())

    start_markers = (
        "*** START OF THIS PROJECT GUTENBERG",
        "*** START OF THE PROJECT GUTENBERG",
    )
    end_markers = (
        "*** END OF THIS PROJECT GUTENBERG",
        "*** END OF THE PROJECT GUTENBERG",
    )
    start_marker = next((m for m in start_markers if m in text), None)
    end_marker = next((m for m in end_markers if m in text), None)

    if start_marker is None or end_marker is None:
        return text

    body = text.split(start_marker, 1)[1].split(end_marker, 1)[0]

    # The start marker is followed by the rest of its banner, e.g.
    # " EBOOK <TITLE> ***". Drop that remainder so the banner does not leak
    # into the book text. The length guard avoids mistaking a "***" scene
    # break deep inside the book for the banner's closing asterisks.
    max_banner_tail = 300
    banner_tail, closer, rest = body.partition("***")
    if closer and len(banner_tail) <= max_banner_tail:
        body = rest

    return body.strip()

def download_books():
    """
    Downloads book texts from Project Gutenberg using the links in
    `df_metadata` and saves the cleaned texts to `cleaned_file`.

    For every metadata row this function:
    - Parses the numeric book ID from the last path segment of the link.
    - Fetches the book page and locates its "Plain Text UTF-8" link.
    - Downloads the plain text and cleans it using `clean_text()`.
    - Records the text together with the row's metadata.

    Rows that fail at any step are reported and skipped, so one bad link
    does not abort the whole run. The collected rows are written to
    `cleaned_file` as CSV.
    """
    data = {"Author": [], "Title": [], "Link": [], "ID": [], "Bookshelf": [], "Text": []}
    request_timeout = 30  # seconds; keeps a stalled connection from hanging the run

    for _, row in df_metadata.iterrows():
        # Reset per row so a failure never reports the previous book's ID.
        book_id = None
        try:
            # Extracts book ID from URL
            book_id = int(row["Link"].split("/")[-1])

            # Retrieves text from Gutenberg website
            page = requests.get(row["Link"], timeout=request_timeout)
            page.raise_for_status()
            soup = BeautifulSoup(page.content, "html.parser")
            text_link = "http://www.gutenberg.org" + soup.find_all("a", string="Plain Text UTF-8")[0]["href"]
            with urlopen(text_link, timeout=request_timeout) as http_response_object:
                text = http_response_object.read().decode("utf-8", errors="ignore")

            # Build the complete record before touching `data`, so a failure
            # part-way through can never leave the columns with unequal lengths.
            record = {
                "Author": row["Author"],
                "Title": row["Title"],
                "Link": row["Link"],
                "ID": book_id,
                "Bookshelf": row["Bookshelf"],
                "Text": clean_text(text),
            }
        except Exception as exc:
            # Deliberately best-effort per book. `Exception` (unlike a bare
            # `except:`) still lets Ctrl-C / SystemExit stop the run.
            print(f"Could not download {row.get('Title')} (ID: {book_id}): {exc}")
            continue

        for column, value in record.items():
            data[column].append(value)

    # Saves cleaned books to a csv file
    df_cleaned = pd.DataFrame(data)
    df_cleaned.to_csv(cleaned_file, index=False)
    print(f"Preprocessed dataset saved as {cleaned_file}")

In [None]:
# ---------------------------- CHUNKING STRATEGY ---------------------------- #

def chunk_text(text, chunk_size=512, overlap=50):
    """
    Splits text into overlapping chunks to maintain contextual continuity.

    Consecutive chunks start `chunk_size - overlap` words apart. Chunking
    stops as soon as a chunk reaches the end of the text, so no trailing
    chunk consisting solely of already-covered overlap words is produced.

    Args:
        text (str): Input text.
        chunk_size (int): Number of words per chunk. Must be positive.
        overlap (int): Number of words overlapping between consecutive
            chunks. Must satisfy 0 <= overlap < chunk_size.

    Returns:
        list: List of text chunks (empty when `text` contains no words).

    Raises:
        ValueError: If `chunk_size` or `overlap` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # overlap >= chunk_size would give a zero or negative stride.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    stride = chunk_size - overlap
    chunks = []

    for start in range(0, len(words), stride):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # This chunk already reaches the last word; any further chunk would
        # only repeat its tail.
        if start + chunk_size >= len(words):
            break
    return chunks


In [None]:
# ---------------------------- VECTOR DATABASE (CHROMADB) ---------------------------- #

# Sentence-embedding model shared by store_embeddings() (indexing) and
# retrieve_documents() (querying).
embedding_model = SentenceTransformer(
    model_name_or_path="sentence-transformers/all-MiniLM-L6-v2",
)

# Persistent ChromaDB client rooted at the "chromadb_sample" path, plus the
# collection holding the book-chunk embeddings (created on first use).
chroma_client = chromadb.PersistentClient(
    path="chromadb_sample",
)
collection = chroma_client.get_or_create_collection(
    name="gutenberg_sample",
)


def store_embeddings():
    """
    Processes cleaned book texts, generates embeddings, and stores them in ChromaDB.

    - Retrieves book text from `cleaned_file`
    - Splits text into chunks using `chunk_text()`
    - Generates embeddings for all chunks of a book in one call
    - Writes the chunks to the collection in batches

    Records are written with `upsert`, so re-running the pipeline refreshes
    existing "<book_id>_<chunk_index>" entries instead of colliding with
    them. Rows whose text is missing, not a string, or yields no chunks are
    skipped.
    """
    df_cleaned = pd.read_csv(cleaned_file)
    batch_size = 500  # records per write; keeps each ChromaDB call small

    for _, row in df_cleaned.iterrows():
        text = row["Text"]
        if pd.isna(text) or not isinstance(text, str):
            continue

        chunks = chunk_text(text)
        if not chunks:
            # Whitespace-only text produces nothing to embed or store.
            continue

        book_id = str(row["ID"])
        embeddings = embedding_model.encode(chunks, convert_to_numpy=True)
        ids = [f"{book_id}_{i}" for i in range(len(chunks))]
        metadatas = [
            {"book_id": book_id, "chunk_index": i, "title": row["Title"], "author": row["Author"]}
            for i in range(len(chunks))
        ]

        # One write per batch instead of one per chunk.
        for start in range(0, len(chunks), batch_size):
            stop = start + batch_size
            collection.upsert(
                ids=ids[start:stop],
                embeddings=embeddings[start:stop].tolist(),
                metadatas=metadatas[start:stop],
                documents=chunks[start:stop],
            )
    print("Embedding storage completed.")


In [None]:
# ---------------------------- QUERY EXPANSION & DOCUMENT RETRIEVAL ---------------------------- #
def expand_query(query):
    """
    Expands a query using GPT-4 to improve document retrieval.

    Args:
        query (str): Original user query.

    Returns:
        str: Expanded query with synonyms and related terms. Falls back to
        the original `query` when the model returns no text.
    """
    prompt = f"Rewrite the following query with synonyms and related terms to improve document retrieval:\n\nQuery: {query}"
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "system", "content": "You are an expert in search query optimization."},
                  {"role": "user", "content": prompt}]
    )
    # The message content can be empty/None; retrieval should then proceed
    # with the user's own wording rather than crash on `.strip()`.
    expanded = response.choices[0].message.content
    if not expanded or not expanded.strip():
        return query
    return expanded.strip()

def retrieve_documents(query, top_k=5):
    """
    Retrieves the stored chunks most similar to an expanded form of the query.

    Args:
        query (str): Original user query; it is rewritten by
            `expand_query()` before being embedded.
        top_k (int): Number of results requested from the collection.

    Returns:
        The "documents" entry of the ChromaDB query result.
        NOTE(review): ChromaDB groups results per query embedding, so this
        is presumably a list holding one inner list of chunk texts - confirm
        against the installed ChromaDB version.
    """
    query_vector = embedding_model.encode([expand_query(query)], convert_to_numpy=True)
    matches = collection.query(
        query_embeddings=query_vector.tolist(),
        n_results=top_k,
    )
    return matches["documents"]

In [None]:
# ---------------------------- INTERACTIVE QUERY SYSTEM ---------------------------- #

def generate_response(query):
    """
    Generates a response to the user's query based on retrieved book chunks.

    Args:
        query (str): User's input query.

    Returns:
        str: AI-generated response, or "No relevant information found."
        when retrieval yields no chunks.
    """
    retrieved_docs = retrieve_documents(query)

    # retrieve_documents() hands back ChromaDB's "documents" field, which is
    # grouped per query embedding (a list of lists). Flatten it so the chunks
    # can be joined into one context string, and so that an empty result
    # ([[]]) is correctly detected as "nothing found".
    documents = [
        doc
        for group in (retrieved_docs or [])
        for doc in (group or [])
        if doc
    ]

    if not documents:
        return "No relevant information found."

    context = "\n\n".join(documents)
    prompt = f"Answer the following query based only on the retrieved documents:\n\nContext:\n{context}\n\nQuestion: {query}"

    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "system", "content": "You are a research assistant."},
                  {"role": "user", "content": prompt}]
    )

    # Guard against an empty/None message content before stripping.
    return (response.choices[0].message.content or "").strip()


In [None]:
# ---------------------------- EVALUATION WITH BENCHMARKING ---------------------------- #
def evaluate_with_llm(query, expected, generated):
    """
    Asks GPT-4 to grade a generated answer against the expected answer.

    Args:
        query (str): The question that was asked.
        expected (str): Reference answer.
        generated (str): Answer produced by the system.

    Returns:
        str: The model's reply to the 1 (poor) to 5 (excellent) rating
        request, stripped of surrounding whitespace.
    """
    grading_prompt = (
        "Evaluate this response:\n\n"
        f"Query: {query}\n"
        f"Expected Answer: {expected}\n"
        f"Generated Answer: {generated}\n\n"
        "Rate it from 1 (poor) to 5 (excellent)."
    )
    conversation = [{"role": "user", "content": grading_prompt}]

    completion = client.chat.completions.create(model="gpt-4", messages=conversation)
    verdict = completion.choices[0].message.content
    return verdict.strip()

# ---------------------------- INTERACTIVE QUERYING ---------------------------- #
def interactive_query():
    """
    Enables interactive user input for querying the system.

    Reads queries in a loop and prints the answer from
    `generate_response()` for each one. The loop ends when the user types
    "exit" (case-insensitive, surrounding whitespace ignored) or closes the
    input stream / presses Ctrl-C at the prompt. Blank input is ignored
    rather than sent to the model.
    """
    while True:
        try:
            query = input("Enter a query (or type 'exit' to quit): ").strip()
        except (EOFError, KeyboardInterrupt):
            # End the session quietly instead of surfacing a traceback.
            print()
            break

        if not query:
            continue
        if query.lower() == "exit":
            break

        response = generate_response(query)
        print(f"Response: {response}")


# ---------------------------- RUN SYSTEM ---------------------------- #
# Run the full pipeline only when executed directly (a script run or a
# notebook cell, where __name__ is "__main__"). Importing this file from
# another module no longer triggers the network download, the indexing pass,
# or the blocking input loop.
if __name__ == "__main__":
    download_books()
    store_embeddings()
    interactive_query()