<a href="https://colab.research.google.com/github/s-ravi18/LLMs-From-Scratch/blob/main/Text_classification_using_Sentence_Transformers_and_FAISS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task
Replicate the text classification solution using semantic search and FAISS as described in the article "Transforming Text Classification with Semantic Search Techniques: FAISS". This involves analyzing the article, installing necessary libraries (`faiss-cpu`, `transformers`), preparing sample data, generating text embeddings, building and populating a FAISS index, implementing semantic search and classification logic, and testing the solution.

In [4]:
!pip install -qq faiss-cpu sentence-transformers pandas numpy

In [5]:
import pandas as pd
import numpy as np
import re
import string
import time
import os
import faiss
from typing import List
from collections import Counter
from sentence_transformers import SentenceTransformer

# 1. DATA CLEANING
def clean(text):
    if text is None:
        return ""
    text = text.lower()
    # Remove URLs
    url_removed = re.sub(r'https\S+', '', text, flags=re.MULTILINE)
    # Remove non-alphabetic characters
    text = re.sub("[^a-zA-Z]", " ", url_removed)
    # Remove extra dots and punctuation
    text = re.sub("\.+", " ", text)
    text = "".join([word for word in text if word not in string.punctuation])
    # Remove extra whitespace
    text = re.sub("\s\s+", " ", text).strip()
    return text

# 2. EMBEDDING GENERATION
# Using 'all-MiniLM-L6-v2' as specified in the article
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

def get_embeddings(model, sentences: List[str], parallel: bool = True):
    start = time.time()
    if parallel:
        # Multi-process encoding for faster performance
        os.environ["TOKENIZERS_PARALLELISM"] = "false"
        pool = model.start_multi_process_pool(target_devices=["cpu"] * 5)
        embeddings = model.encode_multi_process(sentences, pool, batch_size=16)
        model.stop_multi_process_pool(pool)
    else:
        os.environ["TOKENIZERS_PARALLELISM"] = "true"
        embeddings = model.encode(
            sentences,
            batch_size=32,
            show_progress_bar=True,
            convert_to_tensor=True,
        )

    # Detach and convert to numpy for Faiss
    if hasattr(embeddings, 'detach'):
        embeddings = embeddings.detach().cpu().numpy()

    print(f"Time taken to encode {len(sentences)} items: {round(time.time() - start, 2)}s")
    return embeddings

# 3. FAISS INDEX MANAGEMENT
def create_index(samples, mappings):
    """
    Creates a Flat Index with Inner Product (Cosine Similarity after normalization)
    """
    dimension = samples.shape[1]
    # Using IndexIDMap to associate embeddings with specific IDs
    index = faiss.IndexIDMap(faiss.IndexFlatIP(dimension))

    # Normalize for Cosine Similarity
    faiss.normalize_L2(samples)

    # Add vectors with their corresponding IDs (keys from mappings)
    ids = np.array(list(mappings.keys())).astype('int64')
    index.add_with_ids(samples, ids)

    # Save the index locally
    faiss.write_index(index, "news_train_index")
    print("Index created and saved successfully.")
    return index

# 4. PREDICTION LOGIC
def predict_embeddings(query):
    query_embedding = model.encode([query])
    query_embedding = np.asarray(query_embedding, dtype="float32")
    return query_embedding

def most_frequent(list_of_categories):
    occurence_count = Counter(list_of_categories)
    return occurence_count.most_common(1)[0][0]

def predict(query, index, train_df, top_n=10):
    # Clean query
    cleaned_query = clean(query)
    # Get embedding
    query_embedding = predict_embeddings(cleaned_query)
    # Normalize for search
    faiss.normalize_L2(query_embedding)

    # Search index (D = distances/scores, I = indices/IDs)
    D, I = index.search(query_embedding, top_n)

    # Map IDs back to categories
    results = []
    retrieved_categories = []

    for idx, score in zip(I[0], D[0]):
        # Assuming the train_df index matches the IDs in the Faiss index
        category = train_df.iloc[idx]['category'] # 'title' field in the article
        retrieved_categories.append(category)
        results.append({
            'text': train_df.iloc[idx]['cleaned_data'],
            'category': category,
            'score': score
        })

    final_class = most_frequent(retrieved_categories)
    return final_class, results



  text = re.sub("\.+", " ", text)
  text = re.sub("\s\s+", " ", text).strip()
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
# --- EXAMPLE USAGE WORKFLOW ---

# 1. Prepare dummy data (Replace with your actual news dataset)
data = {
    'text': [
        "The Mars rover landed successfully.",
        "New breakthroughs in cancer research.",
        "Nvidia launches new GPU architecture.",
        "The history of religious architecture."
    ],
    'category': ['space', 'sci.med', 'graphics', 'religion']
}
train = pd.DataFrame(data)



In [7]:
# 2. Process Data
train["cleaned_data"] = train["text"].apply(clean)
sentences = train["cleaned_data"].tolist()
embeddings = get_embeddings(model, sentences, parallel=False)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Time taken to encode 4 items: 0.26s


In [8]:
sentences

['the mars rover landed successfully',
 'new breakthroughs in cancer research',
 'nvidia launches new gpu architecture',
 'the history of religious architecture']

In [9]:
embeddings

array([[ 0.03203396,  0.01380073,  0.01797963, ...,  0.05775586,
        -0.05601345, -0.02733229],
       [ 0.00484106,  0.00347968, -0.00616621, ..., -0.14910999,
         0.04509946,  0.01670169],
       [ 0.01182352, -0.03713594, -0.03886833, ..., -0.09716566,
         0.05545173, -0.00934162],
       [ 0.05258647,  0.14346823, -0.0277888 , ...,  0.01697277,
        -0.02092341, -0.0092363 ]], dtype=float32)

In [10]:

# 3. Create Mapping and Index
# Mapping of row index to itself for ID retrieval
mappings = {i: i for i in range(len(train))}
index = create_index(embeddings, mappings)

Index created and saved successfully.


In [11]:
index

<faiss.swigfaiss_avx2.IndexIDMap; proxy of <Swig Object of type 'faiss::IndexIDMapTemplate< faiss::Index > *' at 0x7b5eefa66700> >

In [16]:
# 4. Perform Inference
query_text = "Mars"
predicted_category, details = predict(query_text, index, train)

print(f"\nQuery: {query_text}")
print(f"Predicted Class: {predicted_category}")


Query: Mars
Predicted Class: religion
