<a href="https://colab.research.google.com/github/sayandas96476/RAG/blob/main/Sparse_retrieval_TF_IDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

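This notebook demonstrates sparse retrieval with TF-IDF: documents and a query are converted to TF-IDF vectors, and the documents most similar to the query are retrieved.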

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


In [19]:


# Step 1: Define the documents (your corpus)
# A small in-memory corpus of five short English sentences; each string in
# the list is one document.
documents = [
    "I love programming in Python.",
    "Python is a versatile programming language.",
    "Machine learning and AI are exciting fields.",
    "I enjoy solving problems using Python.",
    "Cooking and baking are creative activities."
]



In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK data packages this notebook relies on: the stop-word lists
# and WordNet.
for nltk_package in ('stopwords', 'wordnet'):
    nltk.download(nltk_package)

# Lazily filled cache for the NLTK resources shared by all preprocess_text
# calls, so they are built once instead of on every call.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['lemmatizer']


def preprocess_text(text):
    """Normalize raw text into a space-separated string of lemmatized tokens.

    Pipeline: lowercase the text, turn every character that is not an ASCII
    letter or whitespace into a space, split on whitespace, drop English
    stop words, and lemmatize each remaining token (the lemmatizer is called
    without a part-of-speech argument).

    Args:
        text: Raw input string.

    Returns:
        The surviving lemmas joined by single spaces. The result is an empty
        string when no token survives.
    """
    stop_words, lemmatizer = _preprocess_resources()

    # Convert to lowercase
    text = text.lower()

    # Remove punctuation and numbers.
    # Substitute a space rather than the empty string so that words separated
    # only by punctuation or digits stay separate tokens, e.g.
    # "machine-learning" -> "machine learning" instead of "machinelearning".
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    # Tokenize (split() also collapses the extra whitespace introduced above)
    words = text.split()

    # Remove stopwords, then lemmatize what is left
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]

    return ' '.join(words)

# Clean each corpus document with preprocess_text, keeping corpus order.
preprocessed_docs = list(map(preprocess_text, documents))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
# Quick inspection: display the container type of the preprocessed corpus.
type(preprocessed_docs)

list

In [16]:
# Step 2: Initialize the TfidfVectorizer
# No arguments are passed, so the library's default settings apply.
vectorizer = TfidfVectorizer()

# Step 3: Fit and transform the documents to generate the TF-IDF matrix
# The fitted `vectorizer` is reused later to transform the query, so the
# query is projected onto the vocabulary learned from preprocessed_docs.
tfidf_matrix = vectorizer.fit_transform(preprocessed_docs)



In [17]:
# Step 4: Define the search query
query = "Python programming"

# Step 5: Transform the query into a TF-IDF vector
# The corpus was vectorized after passing through preprocess_text, so the
# query must go through the same function before being transformed.
# Otherwise query words that preprocessing would rewrite (for example
# inflected forms changed by lemmatization) may not match the vocabulary
# learned from the corpus.
preprocessed_query = preprocess_text(query)
query_vector = vectorizer.transform([preprocessed_query])


In [8]:
# Quick inspection: display the concrete type of the TF-IDF matrix.
type(tfidf_matrix)

In [9]:
# Rows of the TF-IDF matrix are documents; columns are terms/features.
n_docs, n_terms = tfidf_matrix.shape

print("Number of vectors", n_docs)
print("features of a vector", n_terms)

Number of vectors 5
features of a vector 18


In [18]:

# Step 6: Compute cosine similarity between the query and the documents
# flatten() turns the similarity result into a 1-D array with one score per
# row of tfidf_matrix.
cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

# Step 7: Get the top results based on similarity scores
top_n = 3
# argsort sorts ascending, so reverse it before keeping the first top_n
# indices (highest scores first).
top_indices = np.argsort(cosine_similarities)[::-1][:top_n]

# Step 8: Print the results
# The heading reports how many results are actually listed, so it stays
# correct when top_n is changed or the corpus holds fewer than top_n documents.
print(f"Top {len(top_indices)} Search Results:")
for rank, idx in enumerate(top_indices, start=1):
    print(f"{rank}. Document: {documents[idx]} (Score: {cosine_similarities[idx]:.4f})")

Top 3 Search Results:
1. Document: I love programming in Python. (Score: 0.7237)
2. Document: Python is a versatile programming language. (Score: 0.5956)
3. Document: I enjoy solving problems using Python. (Score: 0.2028)
