In [2]:
import math
import nltk
from collections import defaultdict
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [4]:
# Fetch the NLTK resources this notebook relies on: the stop-word lists used by
# preprocess() and the Punkt models used by word_tokenize().
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases load the Punkt models from the 'punkt_tab' package rather
# than the pickled 'punkt' one; without it word_tokenize() raises LookupError on
# a fresh install. On older releases the extra download is simply unused.
nltk.download('punkt_tab')

# Toy corpus: one string per document, kept in a fixed order.
documents = [
    "Information Retrieval is a field of computer science",
    "Natural Language Processing is a subfield of computer science",
    "Machine Learning overlaps with Information Retrieval and Natural Language Processing",
    "Probabilistic models are used for prediction in many fields including IR and NLP",
    "Evaluation of Information Retrieval systems is crucial",
]

# Free-text query that the corpus is ranked against.
query = "Probabilistic Information Retrieval"

def preprocess(text):
    """Tokenize ``text``, lowercase every token and drop English stop words.

    Args:
        text: The raw string to process.

    Returns:
        A list of lowercased tokens, in their original order, with every token
        found in NLTK's English stop-word list removed.
    """
    # Load the stop-word list once per call and keep it in a set: the original
    # re-read the corpus list and scanned it linearly for every single token.
    stop_words = set(stopwords.words('english'))
    lowered = (token.lower() for token in word_tokenize(text))
    return [token for token in lowered if token not in stop_words]

# Run the corpus and the query through the same pipeline so their terms match.
tokenized_documents = [preprocess(doc) for doc in documents]
tokenized_query = preprocess(query)

# Distinct query terms in first-seen order. Statistics are collected once per
# distinct term so that a term repeated in the query does not inflate its
# document frequency.
unique_query_terms = list(dict.fromkeys(tokenized_query))

# tf[(term, doc_id)] -> number of occurrences of a query term in a document.
# Document ids are 1-based and follow the order of `documents`.
tf = defaultdict(int)
for term in unique_query_terms:
    for doc_id, doc in enumerate(tokenized_documents, start=1):
        tf[(term, doc_id)] = doc.count(term)

# df[term] -> number of documents that contain the query term at least once.
df = defaultdict(int)
for term in unique_query_terms:
    df[term] = sum(1 for doc in tokenized_documents if term in doc)

# idf[term] = ln(N / df). Terms that occur in no document are left out instead
# of dividing by zero; code that checks `term in idf` can then skip them.
num_documents = len(tokenized_documents)
idf = {
    term: math.log(num_documents / df[term])
    for term in unique_query_terms
    if df[term] > 0
}

# Constant added to both the numerator and the denominator of each term ratio.
backup = 0.5

# Score every document as the product, over the query terms, of
#   (tf - df + backup) / (df - tf + backup)
# where tf is the term's count in the document and df its document frequency.
# NOTE(review): the numerator is negative whenever tf < df, so scores can come
# out negative (the recorded run shows this) and their sign depends on how many
# such terms there are. This does not look like a standard probabilistic
# relevance weight; confirm the intended formula.
scores = []
for doc_id in range(1, len(tokenized_documents) + 1):
    score = 1
    for term in tokenized_query:
        # Only terms that have an idf entry take part in the product.
        if term in idf:
            term_count = tf[(term, doc_id)]
            doc_freq = df[term]
            score *= (term_count - doc_freq + backup) / (doc_freq - term_count + backup)
    scores.append((doc_id, score))

# Highest score first; sorted() is stable, so ties keep document order.
sorted_scores = sorted(scores, key=lambda pair: pair[1], reverse=True)
for doc_id, score in sorted_scores:
    print(f'Document {doc_id}: {score:.4f}')

Document 4: 0.5102
Document 1: -0.1200
Document 3: -0.1200
Document 5: -0.1200
Document 2: -0.1701


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/romanmolochkov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/romanmolochkov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
