**Βήμα 1: Εγκατάσταση απαραίτητων πακέτων**


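Εγκαθιστούμε τις απαραίτητες βιβλιοθήκες που θα χρησιμοποιήσουμε για την επεξεργασία κειμένου και την υλοποίηση της μηχανής αναζήτησης. Το nltk χρησιμοποιείται για την επεξεργασία φυσικής γλώσσας, τα requests και beautifulsoup4 για τη συλλογή των άρθρων, τα scikit-learn και rank-bm25 για τα μοντέλα TF-IDF και BM25, ενώ το pandas και η ενσωματωμένη βιβλιοθήκη json (δεν χρειάζεται εγκατάσταση) θα μας βοηθήσουν στη διαχείριση των δεδομένων.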

In [None]:
!pip install nltk pandas scikit-learn rank-bm25 beautifulsoup4 requests



**Βήμα 2: Συλλογή δεδομένων από τη Wikipedia**


Εδώ χρησιμοποιούμε έναν web scraper που εξάγει άρθρα από τη Wikipedia. Αντί να εισάγουμε δεδομένα χειροκίνητα, δημιουργούμε έναν αυτόματο μηχανισμό που συλλέγει και αποθηκεύει τα άρθρα σε αρχείο JSON.

In [None]:
"""
Dimitrakopoulos Stylianos
AM: 18390149
Προγραμμα Σπουδων ΠΑΔΑ
"""



import requests
from bs4 import BeautifulSoup
import json



def scrape_wikipedia_articles(url, num_articles=5, timeout=10):
    """Crawl English Wikipedia breadth-first from *url* and collect page text.

    Pages are visited in the order their links are discovered. A page counts
    as an article when it has at least one ``<p>`` element with text.

    Args:
        url: Absolute URL of the page the crawl starts from.
        num_articles: Stop once this many articles have been collected.
        timeout: Seconds to wait for each HTTP request before giving up on
            that page.

    Returns:
        A list of dicts with the keys ``"title"``, ``"url"`` and
        ``"content"``, in the order the pages were scraped.
    """
    from collections import deque  # function-scope import: no new file-level dependency

    base_url = "https://en.wikipedia.org"
    articles = []
    # Every URL that has ever been queued; guarantees a page is fetched once.
    visited_urls = {url}

    def get_links(soup):
        """Return the article URLs linked from an already-parsed page."""
        links = []
        for link in soup.find_all("a", href=True):
            # Drop the fragment so "/wiki/X#History" and "/wiki/X" are one page.
            href = link["href"].split("#", 1)[0]
            # A colon marks namespaced pages (File:, Help:, ...), not articles.
            if href.startswith("/wiki/") and ":" not in href:
                links.append(base_url + href)
        return links

    to_visit = deque([url])
    while to_visit and len(articles) < num_articles:
        current_url = to_visit.popleft()
        try:
            response = requests.get(current_url, timeout=timeout)
            # Treat HTTP error pages as failures instead of parsing them.
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Failed to scrape {current_url}: {e}")
            continue

        # Parse once and reuse the tree for both the text and the links.
        soup = BeautifulSoup(response.text, 'html.parser')
        heading = soup.find("h1")
        title = heading.text.strip() if heading is not None else current_url
        content = "\n".join(p.text for p in soup.find_all("p"))
        if content:
            articles.append({"title": title, "url": current_url, "content": content})
            print(f"Scraped: {title}")

        for link_url in get_links(soup):
            if link_url not in visited_urls:
                visited_urls.add(link_url)
                to_visit.append(link_url)

    return articles

# Crawl outward from the English Wikipedia main page and collect 5 articles.
url = "https://en.wikipedia.org/wiki/Main_Page"
articles = scrape_wikipedia_articles(url, num_articles=5)

# Persist the result; ensure_ascii=False writes non-ASCII characters as-is
# instead of \uXXXX escapes, and the file is opened as UTF-8 to match.
with open("wikipedia_articles.json", "w", encoding="utf-8") as f:
    json.dump(articles, f, ensure_ascii=False, indent=4)

print("Scraping complete. Data saved to wikipedia_articles.json.")

Scraped: Main Page
Scraped: Wikipedia
Scraped: Free content
Scraped: Encyclopedia
Scraped: English language
Scraping complete. Data saved to wikipedia_articles.json.


**Βήμα 3: Προεπεξεργασία Κειμένου**


Σε αυτό το στάδιο, επεξεργαζόμαστε τα δεδομένα με τεχνικές όπως tokenization, stemming, και stop-word removal. Αυτές οι διαδικασίες μας επιτρέπουν να μειώσουμε την πολυπλοκότητα του κειμένου και να βελτιώσουμε την ποιότητα των αποτελεσμάτων αναζήτησης.

In [None]:
"""
Dimitrakopoulos Stylianos
AM: 18390149
Προγραμμα Σπουδων ΠΑΔΑ
"""

import json
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Load the scraped articles from the JSON file
with open('wikipedia_articles.json', 'r', encoding='utf-8') as f:
    articles_data = json.load(f)

# Convert the loaded data into a DataFrame
articles_df = pd.DataFrame(articles_data)

# Helper that extracts the words of a text
def extract_tokens(text):
    """Return the lower-cased word tokens of *text* in order of appearance.

    A token is a maximal run of word characters (letters, digits and
    underscore); everything else acts as a separator.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r'\b\w+\b', lowered)]

# Tokenization: one list of lower-cased words per article
articles_df['tokens'] = articles_df['content'].apply(extract_tokens)

# Fetch the NLTK resources used below.
# NOTE(review): 'punkt' backs nltk's word_tokenize, which this cell imports
# but never calls -- confirm whether this download is still needed.
nltk.download('stopwords')
nltk.download('punkt')

# Stop-word removal (a set gives constant-time membership tests)
stop_words_set = set(stopwords.words('english'))
articles_df['filtered_tokens'] = articles_df['tokens'].apply(
    lambda words: [word for word in words if word not in stop_words_set]
)

# Stemming with the Porter stemmer
stemmer = PorterStemmer()
articles_df['stemmed_tokens'] = articles_df['filtered_tokens'].apply(
    lambda words: [stemmer.stem(word) for word in words]
)

# Save the processed dataset to CSV. Only the title and the stemmed tokens are
# kept; the later cells parse the 'stemmed_tokens' column back into lists.
articles_df[['title', 'stemmed_tokens']].to_csv('processed_articles.csv', index=False)
print("Processing complete. Data saved to 'processed_articles.csv'.")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Processing complete. Data saved to 'processed_articles.csv'.


**Βήμα 4: Δημιουργία Αντεστραμμένου Ευρετηρίου (Inverted Index)**


Η δημιουργία του αντεστραμμένου ευρετηρίου μάς επιτρέπει να εντοπίζουμε γρήγορα τα άρθρα που περιέχουν συγκεκριμένους όρους. Η δομή αυτή χρησιμοποιείται σε πραγματικές μηχανές αναζήτησης για αποδοτική ανάκτηση εγγράφων.

In [None]:
"""
Dimitrakopoulos Stylianos
AM: 18390149
Προγραμμα Σπουδων ΠΑΔΑ
"""

import pandas as pd
from collections import defaultdict
import json
import re

# Extract the words of a text
def process_text(text):
    """Lower-case *text* and return its word tokens as a list.

    Tokens are runs of word characters (letters, digits, underscore) and are
    returned in the order they occur.
    """
    word_pattern = r'\b\w+\b'
    normalized = text.lower()
    return re.findall(word_pattern, normalized)

# Build the inverted index
def generate_index(data):
    """Map every term to the titles of the documents that contain it.

    Args:
        data: DataFrame with a ``title`` column and a ``content`` column that
            holds each document's text.

    Returns:
        A dict mapping each term to a sorted list of document titles. Sorting
        makes the index, and any JSON written from it, identical from run to
        run; the iteration order of a set of strings is not stable across
        interpreter runs.
    """
    index = defaultdict(set)

    # Walk the two columns directly instead of materialising a Series per row.
    for doc_title, content in zip(data['title'], data['content']):
        for word in process_text(content):
            index[word].add(doc_title)

    # key=str keeps the sort well-defined even if a title is not a string.
    return {term: sorted(docs, key=str) for term, docs in index.items()}

# Load the preprocessed articles
import ast  # literal_eval parses the stored list text without executing it

try:
    articles_data = pd.read_csv("processed_articles.csv")
    # Each 'stemmed_tokens' cell holds the text form of a token list, e.g.
    # "['free', 'content']". literal_eval accepts only Python literals, so a
    # tampered CSV cannot run arbitrary code the way eval() would allow.
    articles_data['content'] = (
        articles_data['stemmed_tokens'].apply(ast.literal_eval).apply(' '.join)
    )
    print("Processed data loaded successfully.")
except FileNotFoundError:
    print("Error: 'processed_articles.csv' not found.")
    # exit() is a convenience for interactive sessions; raise SystemExit
    # directly and signal the failure with a non-zero status.
    raise SystemExit(1) from None

# Build and save the inverted index
print("Building the inverted index...")
inverted_index = generate_index(articles_data)
print("Inverted index created.")

index_file = "inverted_index.json"
with open(index_file, "w", encoding="utf-8") as f:
    json.dump(inverted_index, f, indent=4)

print(f"Inverted index saved to '{index_file}'.")


Processed data loaded successfully.
Building the inverted index...
Inverted index created.
Inverted index saved to 'inverted_index.json'.


**Βήμα 5: Ανάκτηση Πληροφορίας (Retrieval Models)**


Σε αυτό το σημείο, εφαρμόζουμε τρεις διαφορετικές μεθόδους ανάκτησης πληροφορίας:


*   Boolean Retrieval
*   TF-IDF
*   BM25

In [None]:
"""
Dimitrakopoulos Stylianos
AM: 18390149
Προγραμμα Σπουδων ΠΑΔΑ
"""

import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from rank_bm25 import BM25Okapi
import numpy as np

# Load documents from CSV
def load_documents(file_path="processed_articles.csv"):
    """Read the preprocessed articles and return their titles and texts.

    The CSV must have a ``title`` column and a ``stemmed_tokens`` column whose
    cells contain the text form of a Python list of tokens.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        ``(titles, documents)``: two parallel lists, where each document is
        its tokens joined by single spaces.

    Raises:
        SystemExit: with status 1, after printing the cause, if the file
            cannot be read or a token cell is not a valid literal.
    """
    import ast  # function-scope import: no new file-level dependency

    try:
        df = pd.read_csv(file_path)
        # literal_eval only accepts Python literals, so file contents are
        # parsed, never executed (eval would run whatever the cell contains).
        token_lists = df['stemmed_tokens'].apply(ast.literal_eval)
        titles = df['title'].tolist()
        documents = [' '.join(tokens) for tokens in token_lists]
    except Exception as e:
        # Boundary handler: report any loading problem and stop with a
        # failure status instead of the success status exit() would give.
        print(f"Error loading documents: {e}")
        raise SystemExit(1) from e
    return titles, documents

# Load inverted index
def load_inverted_index(file_path="inverted_index.json"):
    """Read the inverted index stored as JSON at *file_path*.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON content.

    Raises:
        SystemExit: with status 1, after printing a message, if the file
            does not exist.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        # Name the path that was actually requested, and exit with a failure
        # status; exit() is an interactive helper and reports success.
        print(f"Error: {file_path} not found.")
        raise SystemExit(1) from None

# Boolean Retrieval
def boolean_search(query, inverted_index):
    """Evaluate a flat Boolean query from left to right.

    The query is split on whitespace. ``AND``, ``OR`` and ``NOT`` (in any
    letter case) are operators; every other token is looked up in
    *inverted_index* exactly as written. There is no precedence and no
    parentheses: each operand is combined with the running result using the
    most recent operator, so ``a AND NOT b`` removes b's documents from a's.

    Two operands with no operator between them are combined with AND.
    Previously the second operand replaced the running result, which silently
    dropped the first term.

    A query that starts with ``NOT`` yields an empty set, because the index
    alone does not say which documents exist.

    Args:
        query: Query string, e.g. ``"russian AND NOT maria"``.
        inverted_index: Mapping of term to an iterable of document titles.

    Returns:
        The set of matching document titles.
    """
    operators = {"AND", "OR", "NOT"}
    current_docs = set()
    operation = None
    seen_operand = False

    for term in query.split():
        keyword = term.upper()
        if keyword in operators:
            operation = keyword
            continue

        # Fresh set per lookup, so the in-place updates never touch the index.
        docs = set(inverted_index.get(term, []))
        if operation == "AND":
            current_docs &= docs
        elif operation == "OR":
            current_docs |= docs
        elif operation == "NOT":
            current_docs -= docs
        elif seen_operand:
            # No operator given yet: adjacent terms must all match.
            current_docs &= docs
        else:
            current_docs = docs
        seen_operand = True

    return current_docs

# TF-IDF Retrieval
def tfidf_retrieval(query, documents, inverted_index, titles):
    """Rank documents by TF-IDF score, restricted to the Boolean matches.

    The vectorizer is fitted on *documents* on every call. Each document is
    scored by the dot product of its TF-IDF vector with the query's, and only
    documents whose title is returned by ``boolean_search`` for the same
    query are kept.

    Args:
        query: Query string; it is passed unchanged to both the vectorizer
            and ``boolean_search``.
        documents: Document texts, parallel to *titles*.
        inverted_index: Term-to-titles mapping used for the Boolean filter.
        titles: Document titles, parallel to *documents*.

    Returns:
        ``(ranked_indices, scores)``: a list of document positions ordered
        from highest to lowest score, and the array of their scores.
    """
    tfidf = TfidfVectorizer()
    doc_matrix = tfidf.fit_transform(documents)
    query_row = tfidf.transform([query])

    similarity = doc_matrix @ query_row.T
    scores = similarity.toarray().flatten()
    best_first = np.argsort(scores)[::-1]

    # Titles that satisfy the query as a Boolean expression
    permitted = boolean_search(query, inverted_index)

    # Keep the ranking order, dropping documents outside the Boolean result
    ranked_indices = []
    for position in best_first:
        if titles[position] in permitted:
            ranked_indices.append(position)

    return ranked_indices, scores[ranked_indices]

# BM25 Retrieval
def bm25_retrieval(query, documents, inverted_index, titles):
    """Rank documents with Okapi BM25, restricted to the Boolean matches.

    Documents and the query are tokenised by whitespace. Operator words in
    the query are handed to BM25 like any other token. Only documents whose
    title is returned by ``boolean_search`` for the same query are kept.

    Args:
        query: Query string.
        documents: Document texts, parallel to *titles*.
        inverted_index: Term-to-titles mapping used for the Boolean filter.
        titles: Document titles, parallel to *documents*.

    Returns:
        ``(ranked_indices, scores)``: a list of document positions ordered
        from highest to lowest score, and the array of their scores.
    """
    corpus_tokens = [text.split() for text in documents]
    ranker = BM25Okapi(corpus_tokens)

    scores = ranker.get_scores(query.split())
    descending = np.argsort(scores)[::-1]

    # Titles that satisfy the query as a Boolean expression
    permitted = boolean_search(query, inverted_index)

    ranked_indices = [idx for idx in descending if titles[idx] in permitted]
    return ranked_indices, scores[ranked_indices]


**Βήμα 6: Παραδείγματα Αναζητήσεων**


Στο στάδιο αυτό, η μηχανή αναζήτησης παρέχει δύο επιλογές στον χρήστη:



*   Εκτέλεση προκαθορισμένων δοκιμαστικών αναζητήσεων – Χρησιμοποιούνται προκαθορισμένα ερωτήματα ώστε να αξιολογηθεί η ακρίβεια και η αποτελεσματικότητα των τριών μοντέλων ανάκτησης πληροφορίας (Boolean Retrieval, TF-IDF, Okapi BM25).
*   Εισαγωγή προσαρμοσμένης αναζήτησης – Ο χρήστης μπορεί να επιλέξει το μοντέλο ανάκτησης που επιθυμεί και να εισάγει δικό του ερώτημα για αναζήτηση στη συλλογή εγγράφων.



In [11]:
"""
Dimitrakopoulos Stylianos
AM: 18390149
Προγραμμα Σπουδων ΠΑΔΑ
"""

#from search_engine import load_documents, load_inverted_index, boolean_search, tfidf_retrieval, bm25_retrieval

# Load the document collection and the inverted index
titles, documents = load_documents("processed_articles.csv")
inverted_index = load_inverted_index("inverted_index.json")

# Test queries: each entry pairs a query with the set of titles it should return.
# NOTE(review): the recorded run of this cell returns set() for "maria", so
# these expectations appear tied to an earlier snapshot of the scraped pages --
# confirm them against the current wikipedia_articles.json.
test_queries = [
    {"query": "maria", "expected": {"Main Page"}},
    {"query": "maria AND trubnikova", "expected": {"Main Page"}},
    {"query": "maria OR russian", "expected": {"Main Page", "Wikipedia"}},
    {"query": "russian AND NOT maria", "expected": {"Wikipedia"}},
    {"query": "NOT maria", "expected": set()},  # Expect empty results
    {"query": "maria AND NOT russian", "expected": set()},  # Expect empty if "Main Page" contains "russian"
]

# Run the predefined test queries against one retrieval method
def run_tests(retrieval_function, method_name):
    """Run every entry of ``test_queries`` and print expected vs. actual.

    Args:
        retrieval_function: The retrieval function to exercise. When
            *method_name* is ``"Boolean"`` it is called as
            ``f(query, inverted_index)``; otherwise as
            ``f(query, documents, inverted_index, titles)`` and must return
            ``(ranked_indices, scores)``.
        method_name: Label used in the printed report; it also selects the
            calling convention described above.
    """
    print(f"\n🔍 Testing {method_name} Retrieval...")

    index_only = method_name == "Boolean"
    for case in test_queries:
        query, expected = case["query"], case["expected"]

        print(f"\n[DEBUG] Running {method_name} Search for query: {query}")

        if index_only:
            result = retrieval_function(query, inverted_index)
        else:
            ranked_indices, _scores = retrieval_function(query, documents, inverted_index, titles)
            # Ranked models return positions; compare by title like Boolean does.
            result = set()
            if ranked_indices:
                result = {titles[idx] for idx in ranked_indices}

        print(f"🔎 Expected: {expected}")
        print(f"✅ {method_name} Search Result: {result}")
        print(f"🟢 Pass: {result == expected}\n")

# Function for custom user queries
def run_custom_query():
    """Ask for a retrieval model and a query, then print the matching titles.

    Both answers are read with ``input()``. An unknown model choice prints a
    message and returns without asking for a query.
    """
    print("\nSelect a retrieval model:")
    print("1. Boolean Retrieval")
    print("2. TF-IDF Retrieval")
    print("3. Okapi BM25 Retrieval")

    choice = input("Enter your choice (1/2/3): ").strip()
    model_mapping = {
        "1": ("Boolean", boolean_search),
        "2": ("TF-IDF", tfidf_retrieval),
        "3": ("Okapi BM25", bm25_retrieval),
    }

    selection = model_mapping.get(choice)
    if selection is None:
        print("Invalid choice. Exiting...")
        return

    model_name, model_function = selection
    query = input("\nEnter your query: ").strip()

    print(f"\n🔍 Running {model_name} Retrieval for query: {query}")

    if model_name != "Boolean":
        # Ranked models return positions; report them as a set of titles.
        ranked_indices, _scores = model_function(query, documents, inverted_index, titles)
        result = {titles[idx] for idx in ranked_indices} if ranked_indices else set()
    else:
        result = model_function(query, inverted_index)

    print(f"🔎 {model_name} Search Result: {result}\n")

# Entry point: choose between the predefined tests and a custom query
if __name__ == "__main__":
    print("\n".join([
        "Would you like to run the predefined test queries or enter a custom query?",
        "1. Run predefined test queries",
        "2. Enter custom query",
    ]))

    user_choice = input("Enter your choice (1/2): ").strip()

    if user_choice == "2":
        run_custom_query()
    elif user_choice == "1":
        # Same order as the report: Boolean, then TF-IDF, then BM25.
        run_tests(retrieval_function=boolean_search, method_name="Boolean")
        run_tests(retrieval_function=tfidf_retrieval, method_name="TF-IDF")
        run_tests(retrieval_function=bm25_retrieval, method_name="Okapi BM25")
    else:
        print("Invalid choice. Exiting...")


Would you like to run the predefined test queries or enter a custom query?
1. Run predefined test queries
2. Enter custom query
Enter your choice (1/2): 1

🔍 Testing Boolean Retrieval...

[DEBUG] Running Boolean Search for query: maria
🔎 Expected: {'Main Page'}
✅ Boolean Search Result: set()
🟢 Pass: False


[DEBUG] Running Boolean Search for query: maria AND trubnikova
🔎 Expected: {'Main Page'}
✅ Boolean Search Result: set()
🟢 Pass: False


[DEBUG] Running Boolean Search for query: maria OR russian
🔎 Expected: {'Main Page', 'Wikipedia'}
✅ Boolean Search Result: {'Wikipedia'}
🟢 Pass: False


[DEBUG] Running Boolean Search for query: russian AND NOT maria
🔎 Expected: {'Wikipedia'}
✅ Boolean Search Result: {'Wikipedia'}
🟢 Pass: True


[DEBUG] Running Boolean Search for query: NOT maria
🔎 Expected: set()
✅ Boolean Search Result: set()
🟢 Pass: True


[DEBUG] Running Boolean Search for query: maria AND NOT russian
🔎 Expected: set()
✅ Boolean Search Result: set()
🟢 Pass: True


🔍 Testing TF-I

In [None]:
"""
Dimitrakopoulos Stylianos
AM: 18390149
Προγραμμα Σπουδων ΠΑΔΑ
"""


import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score
#from search_engine import load_documents, load_inverted_index, boolean_search, tfidf_retrieval, bm25_retrieval

# Load the document collection and the inverted index
titles, documents = load_documents("processed_articles.csv")
inverted_index = load_inverted_index("inverted_index.json")

# Ground truth for evaluation: query -> set of titles considered relevant.
# NOTE(review): the recorded run below returns set() for "maria", so these
# relevance sets appear tied to an earlier snapshot of the scraped pages --
# confirm them against the current data.
ground_truth = {
    "maria": {"Main Page"},
    "maria AND trubnikova": {"Main Page"},
    "maria OR russian": {"Main Page", "Wikipedia"},
    "russian AND NOT maria": {"Wikipedia"},
    "NOT maria": set(),
    "maria AND NOT russian": set()
}

# Compute evaluation metrics for one retrieval method
def evaluate_retrieval(retrieval_function, method_name, is_boolean=False):
    """Score a retrieval method against ``ground_truth`` and print a report.

    For every query the returned titles and the expected titles are turned
    into 0/1 labels over all distinct titles in the collection, and
    precision, recall and F1 are computed on those labels with
    ``zero_division=1``. The per-query values and their averages are printed.

    Args:
        retrieval_function: Called as ``f(query, inverted_index)`` when
            *is_boolean* is true, otherwise as
            ``f(query, documents, inverted_index, titles)`` returning
            ``(ranked_indices, scores)``.
        method_name: Label used in the printed report.
        is_boolean: Selects the calling convention described above.
    """
    print(f"\n🔍 Evaluating {method_name} Retrieval...")

    per_query = {"precision": [], "recall": [], "f1": []}

    for query, expected in ground_truth.items():
        print(f"\n[DEBUG] Running {method_name} for query: {query}")

        if not is_boolean:
            # Ranked models return positions; convert them to titles.
            ranked_indices, _ = retrieval_function(query, documents, inverted_index, titles)
            result = {titles[idx] for idx in ranked_indices} if ranked_indices else set()
        else:
            result = retrieval_function(query, inverted_index)

        # One 0/1 label per distinct title, for the truth and the prediction
        all_documents = set(titles)
        y_true = [int(doc in expected) for doc in all_documents]
        y_pred = [int(doc in result) for doc in all_documents]

        precision = precision_score(y_true, y_pred, zero_division=1)
        recall = recall_score(y_true, y_pred, zero_division=1)
        f1 = f1_score(y_true, y_pred, zero_division=1)

        per_query["precision"].append(precision)
        per_query["recall"].append(recall)
        per_query["f1"].append(f1)

        print(f"🔎 Expected: {expected}")
        print(f"✅ {method_name} Search Result: {result}")
        print(f"🎯 Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")

    # Average each metric over all queries
    avg_precision = np.mean(per_query["precision"])
    avg_recall = np.mean(per_query["recall"])
    avg_f1 = np.mean(per_query["f1"])

    print(f"\n📊 {method_name} Evaluation Summary:")
    print(f"⚡ Average Precision: {avg_precision:.4f}")
    print(f"📈 Average Recall: {avg_recall:.4f}")
    print(f"📊 Average F1-score: {avg_f1:.4f}\n")

# Evaluate the three retrieval models in turn
if __name__ == "__main__":
    evaluate_retrieval(
        retrieval_function=boolean_search, method_name="Boolean", is_boolean=True
    )
    evaluate_retrieval(retrieval_function=tfidf_retrieval, method_name="TF-IDF")
    evaluate_retrieval(retrieval_function=bm25_retrieval, method_name="Okapi BM25")



🔍 Evaluating Boolean Retrieval...

[DEBUG] Running Boolean for query: maria
🔎 Expected: {'Main Page'}
✅ Boolean Search Result: set()
🎯 Precision: 1.0000, Recall: 0.0000, F1-score: 0.0000

[DEBUG] Running Boolean for query: maria AND trubnikova
🔎 Expected: {'Main Page'}
✅ Boolean Search Result: set()
🎯 Precision: 1.0000, Recall: 0.0000, F1-score: 0.0000

[DEBUG] Running Boolean for query: maria OR russian
🔎 Expected: {'Main Page', 'Wikipedia'}
✅ Boolean Search Result: {'Wikipedia'}
🎯 Precision: 1.0000, Recall: 0.5000, F1-score: 0.6667

[DEBUG] Running Boolean for query: russian AND NOT maria
🔎 Expected: {'Wikipedia'}
✅ Boolean Search Result: {'Wikipedia'}
🎯 Precision: 1.0000, Recall: 1.0000, F1-score: 1.0000

[DEBUG] Running Boolean for query: NOT maria
🔎 Expected: set()
✅ Boolean Search Result: set()
🎯 Precision: 1.0000, Recall: 1.0000, F1-score: 1.0000

[DEBUG] Running Boolean for query: maria AND NOT russian
🔎 Expected: set()
✅ Boolean Search Result: set()
🎯 Precision: 1.0000, Recal