In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# function to preprocess text data
def preprocess(text):
    # tokenize the text into words
    tokens = word_tokenize(text.lower())
    # remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if not token in stop_words]
    # lemmatize the words
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # return the preprocessed text as a string
    return ' '.join(tokens)

In [3]:
# function to calculate the cosine similarity between a document and a query
def calculate_similarity(query, document):
    tfidf_vectorizer = TfidfVectorizer()
    # calculate the tf-idf scores for the query and document
    tfidf_query = tfidf_vectorizer.fit_transform([query])
    tfidf_doc = tfidf_vectorizer.transform([document])
    # calculate the cosine similarity between the query and document
    cosine_sim = cosine_similarity(tfidf_query, tfidf_doc)[0][0]
    return cosine_sim

In [4]:
# function to group documents/web pages based on their similarity to the query
def group_documents(query, documents):
    # initialize the clusters
    cluster1 = []
    cluster2 = []
    cluster3 = []
    # loop through each document
    for document in documents:
        # preprocess the document
        document_preprocessed = preprocess(document)
        # calculate the similarity between the query and document
        similarity = calculate_similarity(query, document_preprocessed)
        # group the document into the appropriate cluster based on the number of terms that match
        if similarity == 1:
            cluster1.append(document)
        elif similarity > 0 and similarity < 1:
            cluster2.append(document)
        else:
            cluster3.append(document)
    # return the clusters
    return cluster1, cluster2, cluster3

In [5]:
# function to rank the documents in each cluster based on their similarity to the query
def rank_documents(query, cluster):
    # initialize the dictionary to store the document and its similarity score
    documents_similarity = {}
    # preprocess the query
    query_preprocessed = preprocess(query)
    # loop through each document in the cluster
    for document in cluster:
        # preprocess the document
        document_preprocessed = preprocess(document)
        # calculate the similarity between the query and document
        similarity = calculate_similarity(query_preprocessed, document_preprocessed)
        # add the document and its similarity score to the dictionary
        documents_similarity[document] = similarity
    # rank the documents based on their similarity score and return the ranked list
    ranked_documents = sorted(documents_similarity, key=documents_similarity.get, reverse=True)
    return ranked_documents

In [6]:
# sample usage
query = "Q: data science"
documents = ["A: Introduction to Data Science", "B: Data Science for Business", 
             "C: Python for Data Science Handbook", "D: Data Mining (Concepts and Techniques)"]
cluster1, cluster2, cluster3 = group_documents(query, documents)
ranked_cluster1 = rank_documents(query, cluster1)
ranked_cluster2 = rank_documents(query, cluster2)
ranked_cluster3 = rank_documents(query, cluster3)
ranked_documents = ranked_cluster1 + ranked_cluster2 + ranked_cluster3
print("Ranked Documents:\n")
for document in ranked_documents:
    print(document)

Ranked Documents:

D: Data Mining (Concepts and Techniques)
A: Introduction to Data Science
B: Data Science for Business
C: Python for Data Science Handbook
