<a href="https://colab.research.google.com/github/pnabende/ahumain-big-data-course-development/blob/main/basic_query_processing_and_rankig.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# This is a basic demo of query processing and ranking using Python.
# We use the TF-IDF (Term Frequency - Inverse Document Frequency) method to rank documents against a given query.
# We use the `sklearn` library for this purpose.

In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Toy corpus: each string is one document to be ranked against the query.
# A document's position in this list is its index in every later step.
documents = [
    "The people in Uganda are said to be very hospitable.",
    "I have never imagined being late for good things",
    "Wide smiles really bring a lot of happiness.",
    "There are many countries in Africa",
    "One of the regions in Africa is referred to as sub Saharan region",
    "The longest river in Africa is considered to be the river Nile",
    "The source of the Nile is really not very clear",
]

In [4]:
# Free-text query that the documents will be ranked against.
query = "is Uganda in Africa?"

In [5]:
# Step 1: Create the TF-IDF vectorizer.
# No arguments are passed, so the library defaults apply.
vectorizer = TfidfVectorizer()

In [6]:
# Step 2: Fit the vectorizer on the corpus and, in the same call,
# convert the documents into their TF-IDF matrix.
tfidf_matrix = vectorizer.fit_transform(documents)


In [7]:
# Step 3: Vectorize the query with the already-fitted vectorizer.
# `transform` expects a collection of texts, so the single query is wrapped in a list.
query_tfidf = vectorizer.transform([query])

In [8]:
# Step 4: Score every document against the query with cosine similarity.
# The pairwise result is flattened into a 1-D array of scores.
cosine_similarities = cosine_similarity(
    query_tfidf,
    tfidf_matrix,
).flatten()

In [9]:
# Step 5: Rank the documents by cosine similarity score.
# argsort orders indices from lowest to highest score; reversing puts the best match first.
ranked_indices = np.argsort(cosine_similarities)[::-1]

In [10]:
# Step 6: Present the ranking.
# Reorder both the document texts and their scores by the ranked indices.
ranked_documents = [documents[doc_idx] for doc_idx in ranked_indices]
ranked_scores = list(cosine_similarities[ranked_indices])

# Assemble the two aligned lists into a two-column table.
results_df = pd.DataFrame(
    data={'Document': ranked_documents, 'Score': ranked_scores},
)

# A bare last expression lets the notebook render the table.
results_df

Unnamed: 0,Document,Score
0,The people in Uganda are said to be very hospi...,0.329846
1,One of the regions in Africa is referred to as...,0.287077
2,The longest river in Africa is considered to b...,0.273262
3,There are many countries in Africa,0.267301
4,The source of the Nile is really not very clear,0.118277
5,Wide smiles really bring a lot of happiness.,0.0
6,I have never imagined being late for good things,0.0
