In [13]:
!pip install faiss-cpu



In [14]:
import pandas as pd
import numpy as np
import faiss
from sklearn.feature_extraction.text import TfidfVectorizer

filename = 'names_dataset.csv'


def load_names(csv_path, column='Name'):
    """
    Read a CSV of names and return the cleaned frame plus the list of names.

    Rows whose name is missing are dropped *before* the column is cast to
    ``str``; casting first would turn a missing value into the literal string
    'nan', which would then be treated as a real name. Surrounding whitespace
    is stripped and names that end up empty are dropped as well.

    Args:
        csv_path: Path of the CSV file to read.
        column: Name of the column holding the names (default 'Name').

    Returns:
        A ``(frame, names)`` tuple: ``frame`` is the cleaned DataFrame with a
        fresh 0..n-1 index and ``names`` is ``frame[column]`` as a list, so
        ``names[i]`` always corresponds to row ``i`` of ``frame``.

    Raises:
        FileNotFoundError: If ``csv_path`` does not exist.
        KeyError: If ``column`` is not a column of the CSV.
    """
    frame = pd.read_csv(csv_path)

    # Remove missing names first so they never become the string 'nan'.
    frame = frame.dropna(subset=[column]).reset_index(drop=True)
    frame[column] = frame[column].astype(str).str.strip()

    # A blank name carries no character n-grams, so it can never be matched.
    frame = frame[frame[column] != ''].reset_index(drop=True)

    return frame, frame[column].tolist()


# Only the load itself is guarded: a missing file is an expected situation
# (the dataset has not been uploaded yet), anything else should surface.
try:
    df, names_list = load_names(filename)
except FileNotFoundError:
    print(f"‚ùå Error: File '{filename}' not found.")
    print("Please upload 'names_dataset.csv' to the Colab Files section.")
else:
    print(f"‚úÖ Successfully loaded '{filename}'")
    print(f"üìä Total Names in Database: {len(names_list)}")
    print(df.head())

‚úÖ Successfully loaded 'names_dataset.csv'
üìä Total Names in Database: 100
   ID    Name
0   1  Geetha
1   2    Gita
2   3   Geeta
3   4    Gitu
4   5   Githa


In [15]:
# Turn every name into a character n-gram TF-IDF vector and load the vectors
# into a FAISS inner-product index. The module-level names created here
# (`vectorizer`, `index`) are what `search_name` uses at query time.

# 1. Initialize Vectorizer (TF-IDF with Character N-Grams)
# analyzer='char_wb' builds n-grams only from characters inside word boundaries
# ngram_range=(2, 4) uses groups of 2, 3, and 4 characters, so names that are
# spelled similarly share many features
vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 4))

# 2. Fit the n-gram vocabulary on the names and transform them into vectors
name_vectors = vectorizer.fit_transform(names_list)

# 3. Convert to Dense Array (FAISS requires dense float32 arrays)
# Note: densifying stores one float32 per (name, n-gram) pair. That is cheap at
# this scale (100 x 676 in the recorded run) but grows with both the number of
# names and the vocabulary size, so it will not scale to very large datasets.
dataset_vectors = name_vectors.toarray().astype('float32')

# 4. Normalize Vectors for Cosine Similarity (modifies dataset_vectors in place)
# FAISS IndexFlatIP calculates Inner Product.
# Inner Product of normalized vectors = Cosine Similarity (0 to 1 score,
# because TF-IDF weights are never negative)
faiss.normalize_L2(dataset_vectors)

# 5. Build FAISS Index
# Row i of the index corresponds to names_list[i], which is how search results
# are mapped back to names.
dimension = dataset_vectors.shape[1]
index = faiss.IndexFlatIP(dimension) # IP = Inner Product
index.add(dataset_vectors)

print(f"‚úÖ FAISS Index built with {index.ntotal} vectors of dimension {dimension}.")

‚úÖ FAISS Index built with 100 vectors of dimension 676.


In [19]:
def search_name(query, k=5):
    """
    Searches for the query name in the FAISS index.

    The query is embedded with the already-fitted module-level ``vectorizer``,
    L2-normalized, and compared against the module-level ``index`` by inner
    product, so the returned scores are cosine similarities.

    Args:
        query: The name to look up.
        k: Maximum number of matches to return (default 5). Must be >= 1.

    Returns:
        A ``(scores, indices)`` tuple of 1-D arrays ordered best match first.
        ``indices`` holds row numbers of the index. At most
        ``min(k, index.ntotal)`` entries are returned, so the result never
        contains FAISS's ``-1`` padding for missing neighbours; both arrays
        are empty when the index itself is empty.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be >= 1, got {k}")

    # Never request more neighbours than the index holds. FAISS fills the
    # missing slots with index -1, and a caller doing names_list[idx] would
    # silently resolve -1 to the LAST name instead of failing.
    k = min(int(k), index.ntotal)
    if k == 0:
        return np.empty(0, dtype='float32'), np.empty(0, dtype='int64')

    # 1. Vectorize the query (FAISS needs a dense float32 row)
    query_vec = vectorizer.transform([query]).toarray().astype('float32')

    # 2. Normalize the query vector in place so inner product == cosine
    faiss.normalize_L2(query_vec)

    # 3. Search the index
    # D = Distances (Scores), I = Indices (Row numbers)
    D, I = index.search(query_vec, k)

    # A single query was submitted, so return its one row of results.
    return D[0], I[0]

# --- Main Interaction Loop ---
# Prompts for one name, searches the index and prints the best match followed
# by a ranked table of all relevant matches.

# How many candidates to retrieve for each query.
TOP_K = 5
# A best score below this cosine similarity is reported as "no match".
MIN_RELEVANCE_SCORE = 0.1

# Everything the search needs is created by the index-building cell; only run
# when all of it exists so a fresh kernel gets a hint instead of a NameError.
if all(name in globals() for name in ('index', 'vectorizer', 'names_list')):
    user_input = input("Enter a name to search: ").strip()

    if user_input:
        # Get the top matches
        scores, indices = search_name(user_input, k=TOP_K)

        print("\n" + "="*50)
        print(f"üîç FAISS SEARCH RESULTS FOR: '{user_input}'")
        print("="*50)

        # No results at all, or a best score under the threshold, means there
        # is nothing worth showing.
        if len(scores) == 0 or scores[0] < MIN_RELEVANCE_SCORE:
            print("‚ö†Ô∏è No relevant matches found.")
        else:
            # --- Expected Output Format ---

            # Best Match (Top result; results arrive sorted best first)
            best_idx = indices[0]
            best_score = scores[0]
            best_name = names_list[best_idx]

            print(f"\nüåü BEST MATCH:")
            print(f"   Name:  {best_name}")
            print(f"   Relevance Score: {best_score:.4f}")

            # List of Matches (Ranked)
            print(f"\nüìã RELEVANT NAMES (Decreasing Order):")
            print(f"   {'-'*40}")
            print(f"   {'Rank':<5} | {'Name':<20} | {'Score'}")
            print(f"   {'-'*40}")

            for rank, (score, idx) in enumerate(zip(scores, indices)):
                # Skip irrelevant rows (score 0) and any padding row: a
                # negative index would otherwise wrap around to the last name.
                if idx >= 0 and score > 0:
                    found_name = names_list[idx]
                    print(f"   {rank+1:<5} | {found_name:<20} | {score:.4f}")
    else:
        print("‚ö†Ô∏è Please enter a valid name.")
else:
    print("‚ö†Ô∏è Please run Cell 3 first to build the index.")

Enter a name to search: 121212

üîç FAISS SEARCH RESULTS FOR: '121212'
‚ö†Ô∏è No relevant matches found.
