In [1]:
# sample_dataset.py
documents = [
    "The sun rises in the east and sets in the west.",
    "Python is a popular programming language for machine learning.",
    "The Taj Mahal is located in Agra, India.",
    "Renewable energy sources include solar, wind, and hydro power.",
    "Football is the most popular sport in the world."
]

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load dataset
from sample_dataset import documents

# Build TF-IDF index
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# Query
query = "solar energy in India"
query_vec = vectorizer.transform([query])

# Compute similarity
similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()

# Rank results
ranked_indices = similarities.argsort()[::-1]
for idx in ranked_indices:
    print(f"Score: {similarities[idx]:.3f} | {documents[idx]}")

ModuleNotFoundError: No module named 'sample_dataset'

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Create sample documents instead of importing from non-existent module
documents = [
    "Solar energy is becoming popular in India due to government initiatives.",
    "Renewable energy sources include solar, wind, and hydro power.",
    "India has set ambitious targets for solar power generation.",
    "Climate change is driving the adoption of clean energy worldwide.",
    "Solar panels are becoming more affordable for residential use."
]

# Build TF-IDF index
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# Query
query = "solar energy in India"
query_vec = vectorizer.transform([query])

# Compute similarity
similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()

# Rank results
ranked_indices = similarities.argsort()[::-1]
for idx in ranked_indices:
    print(f"Score: {similarities[idx]:.3f} | {documents[idx]}")

Score: 0.526 | Solar energy is becoming popular in India due to government initiatives.
Score: 0.231 | India has set ambitious targets for solar power generation.
Score: 0.181 | Renewable energy sources include solar, wind, and hydro power.
Score: 0.096 | Climate change is driving the adoption of clean energy worldwide.
Score: 0.074 | Solar panels are becoming more affordable for residential use.


In [4]:
from sentence_transformers import SentenceTransformer, util

# Load model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode docs + query
doc_embeddings = model.encode(documents, convert_to_tensor=True)
query = "solar energy in India"
query_embedding = model.encode(query, convert_to_tensor=True)

# Compute similarity
cosine_scores = util.cos_sim(query_embedding, doc_embeddings)[0]

# Rank results
for idx in cosine_scores.argsort(descending=True):
    print(f"Score: {cosine_scores[idx]:.3f} | {documents[idx]}")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Score: 0.830 | Solar energy is becoming popular in India due to government initiatives.
Score: 0.765 | India has set ambitious targets for solar power generation.
Score: 0.565 | Renewable energy sources include solar, wind, and hydro power.
Score: 0.538 | Solar panels are becoming more affordable for residential use.
Score: 0.329 | Climate change is driving the adoption of clean energy worldwide.


In [5]:
import numpy as np

alpha = 0.5  # weight for dense vs sparse
combined_scores = alpha * similarities + (1 - alpha) * cosine_scores.cpu().numpy()

ranked_indices = np.argsort(combined_scores)[::-1]
for idx in ranked_indices:
    print(f"Score: {combined_scores[idx]:.3f} | {documents[idx]}")

Score: 0.678 | Solar energy is becoming popular in India due to government initiatives.
Score: 0.498 | India has set ambitious targets for solar power generation.
Score: 0.373 | Renewable energy sources include solar, wind, and hydro power.
Score: 0.306 | Solar panels are becoming more affordable for residential use.
Score: 0.212 | Climate change is driving the adoption of clean energy worldwide.
