In [14]:
from bs4 import BeautifulSoup

In [253]:
def read_dataset(xml_path):
    with open(xml_path, 'r', encoding='utf-8') as file:
        content = file.read()

    soup = BeautifulSoup(content)
    
    documents = []
    for doc in soup.find_all('doc'):
        doc_id = int(doc.docno.text.strip())
        title = doc.title.text.strip() if doc.title else ''
        text = doc.text.strip() if doc.text else ''
        
        full_text = f'{title}\n{text}'.strip()
        
        documents += [{'doc_id': doc_id, 'text': full_text}]

    return documents

In [254]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\Proshir-
[nltk_data]     Pc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [255]:
def preprocess_text(text):
    # Tokenization
    words = word_tokenize(text)

    # Remove punctuation
    words = [word for word in words if word.isalnum()]

    # Stemming
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in words]

    return " ".join(stemmed_words)

In [256]:
xml_path = 'datas/cran.all.1400.xml'
documents = read_dataset(xml_path)
for i in range(len(documents)):
    documents[i]['text'] = preprocess_text(documents[i]['text'])
print(documents[1])

{'doc_id': 2, 'text': 'simpl shear flow past a flat plate in an incompress fluid of small viscos 2 simpl shear flow past a flat plate in an incompress fluid of small viscos depart of aeronaut engin renssela polytechn institut troy simpl shear flow past a flat plate in an incompress fluid of small viscos in the studi of viscou flow past a bodi it is usual necessari to consid a curv shock wave emit from the nose or lead edg of the bodi consequ there exist an inviscid rotat flow region between the shock wave and the boundari layer such a situat aris for instanc in the studi of the hyperson viscou flow past a flat plate the situat is somewhat differ from prandtl classic problem in origin problem the inviscid free stream outsid the boundari layer is irrot while in a hyperson problem the inviscid free stream must be consid as rotat the possibl effect of vortic have been recent discuss by ferri and libbi in the present paper the simpl shear flow past a flat plate in a fluid of small viscos is

In [259]:
def read_queries(xml_path):
    with open(xml_path, 'r', encoding='utf-8') as file:
        content = file.read()

    soup = BeautifulSoup(content, features='xml')

    queries = []
    for top in soup.find_all('top'):
        query_id = int(top.num.text.strip())
        query_text = top.title.text.strip()

        queries.append({'query_id': query_id, 'query_text': query_text})

    return queries

In [261]:
queries_path = 'datas/cran.qry.xml'
queries = read_queries(queries_path)
print(queries[2])

{'query_id': 4, 'query_text': 'what problems of heat conduction in composite slabs have been solved so\nfar .'}


In [262]:
def read_relevance(relevance_path):
    relevance = {}
    with open(relevance_path, 'r') as file:
        for line in file:
            topic, iteration, docno, relevancy = map(str.strip, line.split())
            topic = int(topic)
            iteration = int(iteration)
            relevancy = int(relevancy)

            if topic not in relevance:
                relevance[topic] = {}

            relevance[topic][docno] = relevancy

    return relevance

In [263]:
relevance_path = 'datas/cranqrel.trec.txt'
relevances = read_relevance(relevance_path)

In [264]:
!pip install -U scikit-learn




[notice] A new release of pip available: 22.3.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [265]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [266]:
def compute_tfidf_scores(docs):
    corpus = [doc['text'] for doc in docs]
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(corpus)
    feature_names = vectorizer.get_feature_names_out()

    tfidf_scores = {}
    for i, doc in enumerate(docs):
        doc_id = doc['doc_id']
        print(docs)
        feature_index = tfidf_matrix[i, :].nonzero()[1]
        feature_names_doc = [feature_names[index] for index in feature_index]
        tfidf_values_doc = tfidf_matrix[i, feature_index].toarray()[0]
        tfidf_scores[doc_id] = dict(zip(feature_names_doc, tfidf_values_doc))

    return tfidf_scores

In [268]:
tfidf_scores = compute_tfidf_scores(documents)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [269]:
def rank_documents_with_relevance(query, tfidf_scores, tfidf_vectorizer, relevance_judgments):
    query = preprocess_text(query)

    print(tfidf_scores)
    # Get feature names from tfidf_scores
    feature_names = list(tfidf_scores.keys())
    
    tfidf_matrix = tfidf_vectorizer.transform([query])
    
    similarities = {}
    for i, doc_id in enumerate(feature_names):
        similarity = (tfidf_matrix * tfidf_matrix[i].reshape(-1, 1)).sum()
        similarities[doc_id] = similarity

    ranked_docs = sorted(similarities.items(), key=lambda x: x[1], reverse=True)
    ranked_docs_with_relevance = [(doc_id, similarity, relevance_judgments[doc_id]) for doc_id, similarity in ranked_docs]
    print(feature_names)
    return ranked_docs_with_relevance


In [270]:
corpus_texts = [doc['text'] for doc in documents]
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus_texts)

In [272]:
for query_info in queries:
    query_id = query_info['query_id']
    query_text = query_info['query_text']

    ranked_documents_with_relevance = rank_documents_with_relevance(query_text, tfidf_scores, tfidf_vectorizer, relevances)

    print("Results for Query {}: {}".format(query_id, query_text))
    for rank, (doc_id, similarity, relevance) in enumerate(ranked_documents_with_relevance, start=1):
        print("Rank {}: Document ID: {}, Similarity: {}, Relevance: {}".format(rank, doc_id, similarity, relevance))
    print("\n")

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



IndexError: row index (1) out of range