السؤال الاول : عمليات preproses 

In [9]:
import nltk
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords 
import re
import os 
import pandas as pd
import pickle 
from autocorrect import Speller

In [10]:
class Preprocessor():
  
  
  @staticmethod
  def process(doc):
        s = doc
        s = Preprocessor.remove_unwanted_characters(s)
        s = Preprocessor.normalize_text(s)
        s = Preprocessor.correct_spelling(s)
        s = Preprocessor.lemmerDocument(s)
        return s  
  @staticmethod
  def tokenizeDocument(sentence) :
      
     return word_tokenize(sentence)
  
  @staticmethod
  def lemmerDocument(sentence):
      lemmatizer = WordNetLemmatizer()
      tokens = Preprocessor.tokenizeDocument(sentence)
      return [lemmatizer.lemmatize(token) for token in tokens]
   
  @staticmethod
  def stemDocument(sentence):
       stemmer = PorterStemmer()
       tokens = Preprocessor.tokenizeDocument(sentence)
       return [stemmer.stem(token) for token in tokens]
    
    
  @staticmethod 
  def rmvstpwrds(sentence):
        terms=[]   
        stopWords= set(stopwords.words('english'))
        for term in sentence.split() :
            if term not in stopWords :
                terms.append(term)
        return terms
    
  @staticmethod
  def remove_unwanted_characters(sentence):
        
        return re.sub(r'[^a-zA-Z\s]', '', sentence)    
      
      
  @staticmethod
  def normalize_text(sentence):
        
        return sentence.lower()
    
    
  @staticmethod
  def correct_spelling(sentence):
       
        spell = Speller()
        
        return ' '.join([spell(word) for word in sentence.split()])  
 

المهمة الثانية : انشاء نظام الاسترجاع 

In [11]:
class IndexModel:
    def __init__(self, documents_df=None, index_file=None, meta_file=None):
        if index_file is None and meta_file is None and documents_df is not None:
            self.create_new_index(documents_df)
        elif index_file is not None and meta_file is not None:
            self.read_index(index_file, meta_file)
        else:
            raise ValueError("Provide either documents_df for creating a new index or index_file and meta_file for reading an existing index")

    def create_new_index(self, documents_df):
        termdoc = documents_df.to_dict('list')
        unique_terms = set()
        doc_ids = termdoc['id']

        for terms in termdoc['ntext']:
            unique_terms.update(terms)

        term_ids = sorted(list(unique_terms))
        term_to_id = {term: idx for idx, term in enumerate(term_ids)}
        doc_to_id = {doc_id: idx for idx, doc_id in enumerate(doc_ids)}

        self._index_matrix = [[0 for _ in range(len(doc_ids))] for _ in range(len(term_ids))]

        for i, terms in enumerate(termdoc['ntext']):
            for term in terms:
                term_idx = term_to_id[term]
                doc_idx = doc_to_id[doc_ids[i]]
                self._index_matrix[term_idx][doc_idx] = 1

        self._term_to_id = term_to_id
        self._doc_to_id = doc_to_id

    def get_term_vector(self, term):
        if term in self._term_to_id:
            term_idx = self._term_to_id[term]
            return self._index_matrix[term_idx]
        else:
            return [0 for _ in range(len(self._doc_to_id))]

    def read_index(self, index_file, meta_file):
        with open(index_file, 'rb') as f:
            self._index_matrix = pickle.load(f)

        with open(meta_file, 'rb') as f:
            meta_data = pickle.load(f)

        self._term_to_id = meta_data['term_to_id']
        self._doc_to_id = meta_data['doc_to_id']

    def save_index(self, index_path, meta_path):
        with open(index_path, 'wb') as f:
            pickle.dump(self._index_matrix, f)

        meta_data = {'term_to_id': self._term_to_id, 'doc_to_id': self._doc_to_id}
        with open(meta_path, 'wb') as f:
            pickle.dump(meta_data, f)

class Retriever:
    def __init__(self):
        self._terms_operator = ['&', '|', '~']

    def boolean_operator_processing(self, bop, prevV, nextV=None):
        if bop == "&":
            return [a & b for a, b in zip(prevV, nextV)]
        elif bop == "|":
            return [a | b for a, b in zip(prevV, nextV)]
        elif bop == "~":
            return [1 - a for a in prevV]

    def retrieve(self, query_terms, index_model):
        ret_docs = []
        bitwiseop = ""
        result = []
        has_previous_term = False
        has_not_operation = False
        inc_vec_prev = []
        inc_vec_next = []

        for term in query_terms:
            if term not in self._terms_operator:
                if has_not_operation:
                    if has_previous_term:
                        inc_vec_next = self.boolean_operator_processing("~", index_model.get_term_vector(term), inc_vec_next)
                    else:
                        inc_vec_prev = self.boolean_operator_processing("~", index_model.get_term_vector(term), inc_vec_next)
                        result = inc_vec_prev
                    has_not_operation = False
                elif has_previous_term:
                    inc_vec_next = index_model.get_term_vector(term)
                else:
                    inc_vec_prev = index_model.get_term_vector(term)
                    result = inc_vec_prev
                    has_previous_term = True
            elif term == "~":
                has_not_operation = True
            else:
                bitwiseop = term

            if len(inc_vec_next) != 0:
                result = self.boolean_operator_processing(bitwiseop, inc_vec_prev, inc_vec_next)
                inc_vec_prev = result
                has_previous_term = True
                inc_vec_next = []

        for i, res in enumerate(result):
            if res == 1:
                ret_docs.append({'id': i, 'score': res})
        ret_docs = pd.DataFrame(ret_docs, columns=['id', 'score']).sort_values(by=['score'], ascending=False)
        return ret_docs

class SearchEngine:
    def __init__(self, preprocessor, retriever, documents, index_file=None, meta_file=None):
        self.preprocessor = preprocessor
        self.retriever = retriever
        self.documents = None
        self.model = None
        self.rebuild(documents, index_file, meta_file)

    def rebuild(self, documents, index_file=None, meta_file=None):
        self.documents = documents
        self.documents['ntext'] = self.documents['text'].apply(self.preprocessor.process)
        
        if index_file and meta_file and os.path.exists(index_file) and os.path.exists(meta_file):
            self.model = IndexModel(index_file=index_file, meta_file=meta_file)
            print(f"Index loaded from {index_file}")
            print(f"Meta data loaded from {meta_file}")
        else:
            self.model = IndexModel(documents_df=documents)
            if index_file and meta_file:
                self.model.save_index(index_file, meta_file)
                print(f"Index saved to {index_file}")
                print(f"Meta data saved to {meta_file}")

    def querying(self, query):
        query_terms = self.preprocessor.process(query)
        term_vectors = [self.model.get_term_vector(term) for term in query_terms]

        if not term_vectors:
            return pd.DataFrame(columns=['id', 'score', 'content'])

        scores = [sum(doc_scores) for doc_scores in zip(*term_vectors)]
        results = [{'id': i, 'score': score, 'content': self.documents.iloc[i]['text']} for i, score in enumerate(scores) if score > 0]
        results_df = pd.DataFrame(results).sort_values(by='score', ascending=False)
        return results_df

    def display_documents(self):
        return self.documents[['id', 'ntext']]

def loadDocuments(directory):
    documents = []
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
                doc_id = len(documents)
                content = file.read()
                documents.append({'id': doc_id, 'text': content})
    return pd.DataFrame(documents)

In [12]:
documents_directory = 'Project1_datacoll'
documents_df = loadDocuments(documents_directory)


In [13]:
index_file_path = 'myindex.index'
meta_file_path = 'idim.meta'

In [15]:
import nltk

nltk.download('punkt')  
nltk.download('stopwords')  


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Error with downloaded zip file
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
search_engine = SearchEngine(Preprocessor(), Retriever(), documents_df, index_file=index_file_path, meta_file=meta_file_path)

processed_documents = search_engine.display_documents()
processed_documents

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/english.pickle[0m

  Searched in:
    - 'C:\\Users\\Lenovo/nltk_data'
    - 'c:\\Program Files\\Python312\\nltk_data'
    - 'c:\\Program Files\\Python312\\share\\nltk_data'
    - 'c:\\Program Files\\Python312\\lib\\nltk_data'
    - 'C:\\Users\\Lenovo\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************


: 

In [None]:
search_engine.querying("success fast")

Unnamed: 0,id,score,content
0,0,1,"﻿If you want to go fast, go alone. If you want..."
1,2,1,﻿A team is like a pack of wolves—always hungry...
2,4,1,﻿Coming together is a beginning. Keeping toget...


In [None]:
search_engine.querying("success together")


Unnamed: 0,id,score,content
3,4,2,﻿Coming together is a beginning. Keeping toget...
0,0,1,"﻿If you want to go fast, go alone. If you want..."
1,2,1,﻿A team is like a pack of wolves—always hungry...
2,3,1,"﻿Alone, we can do so little; together, we can ..."


In [None]:
queries_df = pd.read_excel('Project1_Queries_Qrels (2).xlsx')
queries_df

Unnamed: 0,qid,query
0,1,fast success 💪
1,2,teemwrk feelures
2,3,success v.s failure?????
3,4,qoates about success and cooperations
4,5,funnyyyy teamss prgress


In [None]:
import pandas as pd

# قراءة ملفات الاستعلامات والوثائق ذات الصلة
queries_df = pd.read_excel('Project1_Queries_Qrels (2).xlsx', sheet_name='Queries')
qrels_df = pd.read_excel('Project1_Queries_Qrels (2).xlsx', sheet_name='Qrels')

print(queries_df.head())
print(qrels_df.head())


In [None]:
qrels = queries_df.get_qrels()
qrels.head()

AttributeError: 'DataFrame' object has no attribute 'get_qrels'

In [None]:
import pyterrier as pt
if not pt.started():# this code for checking if jvm is working, if not
   pt.init()

RuntimeError: ("Pyterrier requires Java 11 or newer, we only found Java version %s; install a more recent Java, or change os.environ['JAVA_HOME'] to point to the proper Java installation", '1.8.0_241')

In [None]:
import pandas as pd
queries = pd.DataFrame([["q1", "success together"], ["q2", "success fast"]], columns=["qid", "query"])
br.transform(queries)

NameError: name 'br' is not defined

In [None]:
eval = pt.Utils.evaluate(tf_res,qrels[['qid','docno','label']],
                         metrics=["map","recall","P"])
eval