In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_recall_fscore_support
from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer
from nltk.tokenize import word_tokenize
import re

class ArabicTextSearchEngine:
    def __init__(self):
        self.stop_words = set(stopwords.words('arabic'))
        self.stemmer = ISRIStemmer()
        self.load_data()

    def load_data(self):
        self.data = pd.read_csv('data.csv')
        self.setup_documents()

    def normalize_text(self, text):
        text = text.lower()
        text = re.sub(r'[^\w\s]', '', text)
        text = re.sub(r'[إأآا]', 'ا', text)
        text = re.sub(r'[ى]', 'ي', text)
        text = re.sub(r'[ؤئ]', 'ء', text)
        return text

    def tokenize_text(self, text):
        return word_tokenize(text)

    def remove_stopwords_and_stem(self, words):
        return [self.stemmer.stem(word) for word in words if word not in self.stop_words and len(word) > 1]

    def process_text(self, text):
        text = self.normalize_text(text)
        words = self.tokenize_text(text)
        return ' '.join(self.remove_stopwords_and_stem(words))

    def setup_documents(self):
        self.doc_ids = self.data['docno'].tolist()
        self.titles = self.data['titles'].tolist()
        processed_docs = [self.process_text(doc) for doc in self.data['content']]
        self.vectorizer = TfidfVectorizer()
        self.tfidf_matrix = self.vectorizer.fit_transform(processed_docs)
        print("Documents have been indexed and TF-IDF matrix is ready.")

    def search(self, query):
        processed_query = self.process_text(query)
        query_vector = self.vectorizer.transform([processed_query])
        cos_similarities = cosine_similarity(query_vector, self.tfidf_matrix).flatten()
        sorted_results = sorted(zip(self.doc_ids, self.titles, cos_similarities), key=lambda x: x[2], reverse=True)[:5]
        print("Search Results:")
        for doc_id, title, score in sorted_results:
            print(f"Document ID: {doc_id}, Title: {title}, Similarity Score: {score:.3f}")
        return sorted_results

    def evaluate_search(self, predicted_docs, actual_docs):
        y_pred = [1 if doc in predicted_docs else 0 for doc in self.doc_ids]
        y_true = [1 if doc in actual_docs else 0 for doc in self.doc_ids]
        precision, recall, f1_score, _ = precision_recall_fscore_support(y_true, y_pred, average='binary', zero_division=0)
        print(f"Evaluation - Precision: {precision:.3f}, Recall: {recall:.3f}, F1-Score: {f1_score:.3f}")

# Example usage
engine = ArabicTextSearchEngine()

while True:
    query = input("Enter your search query (or type 'exit' to quit): ")
    if query.lower() == 'exit':
        break
    results = engine.search(query)

    # Evaluate (optional)
    actual_docs = [doc_id for doc_id, _, _ in results]  # Assuming all returned docs are relevant for example
    predicted_docs = [doc_id for doc_id, _, _ in results]
    engine.evaluate_search(predicted_docs, actual_docs)


Documents have been indexed and TF-IDF matrix is ready.


Enter your search query (or type 'exit' to quit):  كره القدم


Search Results:
Document ID: d5271, Title: مدير كرة كرة الخليج السابق: المركز الثالث أو الرابع لا يشرفان الكرة البحرينية, Similarity Score: 0.488
Document ID: d4521, Title: البحرين تشارك في دورة ألعاب التضامن الإسلامي الأولى بجدة, Similarity Score: 0.403
Document ID: d5362, Title: مجلس النواب يطالب باستحداث أندية جديدة, Similarity Score: 0.356
Document ID: d5377, Title: ألعاب الرياضة المصاحبة.. فوضى في الصالات, Similarity Score: 0.336
Document ID: d4916, Title: المحرق والأهلي.. مباراة «القمة», Similarity Score: 0.331
Evaluation - Precision: 1.000, Recall: 1.000, F1-Score: 1.000


Enter your search query (or type 'exit' to quit):  السباحه


Search Results:
Document ID: d4900, Title: نادي الرجبي يشيد بالتعاون مع اتحاد السباحة, Similarity Score: 0.645
Document ID: d4594, Title: إيقاف حوض سباحة على كف عفريت, Similarity Score: 0.405
Document ID: d4812, Title: ميداليات ذهبية للمملكة في بطولة مجلس التعاون الثالثة للمجرى القصير بالكويت, Similarity Score: 0.379
Document ID: d5506, Title: الاتحاد الرياضي للأمن العام يشارك في اجتماعات الجمعية العمومية والمكتب التنفيذي, Similarity Score: 0.251
Document ID: d3995, Title: بحريني يطلق سراح فلبيني استحم عاريا في البحر, Similarity Score: 0.220
Evaluation - Precision: 1.000, Recall: 1.000, F1-Score: 1.000


Enter your search query (or type 'exit' to quit):  exit
