# Ranking prototype (BM25 + Embeddings)

In [2]:
import numpy as np
import string
from rank_bm25 import BM25Okapi
from pathlib import Path
from nltk.stem import PorterStemmer
from sentence_transformers import SentenceTransformer
from langchain_community.document_loaders import TextLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_core.vectorstores import VectorStoreRetriever

DATA_DIR_PATH = Path.cwd().resolve() / "data"
STOPWORDS_FILE_PATH = DATA_DIR_PATH / "stopwords.txt"
CHROMA_DIR_PATH = Path.cwd().resolve() / "vector_store"


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Sample dataset of 10 creators
# dict where key = creator niche, value = creator bio

sample_dataset = {
    "fashion_minimalist_style": """I post daily outfit inspirations, seasonal lookbooks, and minimalist fashion tips focused on affordable pieces. 
My recent content includes capsule wardrobe guides, styling basics, and color-coordination breakdowns. 
Instagram posts feature outfit photos, thrifted finds, and short styling reels.
""",

"learning_to_code_journey": """I document my journey learning to code from scratch, posting study routines, project logs, and beginner-friendly explanations. 
Recent videos show progress building Python scripts, simple web apps, and solving DSA problems as a newcomer to tech. 
Instagram posts include accountability updates, motivational captions, and resources for absolute beginners.
""",

"backend_go_engineer": """I share backend engineering tutorials in Go, focusing on concurrency, networking, microservice design, and Docker-based workflows. 
My recent content includes deep dives into goroutines, REST API design, production logging, and containerizing real-world services. 
Instagram posts feature short Go tips, dev memes, and workflow optimizations for backend developers.
""",

"fitness_home_workouts": """I create daily home workout routines targeting fat loss, mobility, and functional strength using bodyweight or minimal equipment. 
My latest videos include 10-minute HIIT sessions, beginner full-body circuits, and nutrition tips for sustainable weight loss. 
Instagram posts share motivational progress photos, short workout reels, and simple healthy meal ideas.
""",

"beauty_skincare_reviewer": """I review skincare products and routines for acne-prone, oily, and sensitive skin types, focusing on ingredient science. 
Recent videos compare retinol serums, sunscreen textures, exfoliants, and Korean skincare routines. 
Instagram posts share product flatlays, morning/night routines, and short ingredient breakdowns.
""",

"personal_finance_educator": """I simplify personal finance topics such as index investing, budgeting systems, emergency funds, and tax optimization. 
My recent content includes ETF comparisons, beginner investment strategies, and monthly market breakdowns. 
Instagram posts include budgeting templates, money habits, and short explainers on compounding and inflation.
""",

"healthy_cooking_mealprep": """I share easy, healthy recipes and weekly meal prep ideas designed for busy students and working professionals. 
My latest videos feature quick high-protein dinners, 15-minute lunches, and budget-friendly vegetarian meal preps. 
Instagram posts include grocery hauls, step-by-step meal reels, and simple nutrition tips.
""",

"yoga_mindfulness_coach": """I guide yoga flows, stretching routines, and mindfulness practices aimed at improving mobility, posture, and mental clarity. 
Recent videos include morning yoga sessions, hip-opening flows, and breathwork techniques for stress relief. 
Instagram posts share short mobility drills, inspirational quotes, and meditation reminders.
""",

"cloud_devops_engineer": """I break down cloud engineering concepts including AWS, Kubernetes, Docker, and CI/CD automation. 
My latest tutorials explain Terraform modules, EKS deployments, load balancing, and building secure production pipelines. 
Instagram posts include cloud diagrams, quick DevOps tips, and workflow comparisons for SREs and platform engineers.
""",

"frontend_react_engineer": """I teach React, TypeScript, and modern frontend engineering with a focus on clean UI patterns and reusable components. 
Recent videos cover state management, hooks, responsive layouts, and building production-ready interfaces with React and Tailwind. 
On Instagram, I post quick JavaScript tips, UI design breakdowns, and small project showcases.
"""
}

In [4]:
# Helper functions for pre-processing text for OkapiBM25

def _remove_all_punctuation_lowercase(text: str) -> str:
    tt = str.maketrans("", "", string.punctuation)
    return text.translate(tt).lower()

def _tokenize(text: str) -> list[str]:
    return text.lower().split()

def _remove_stop_words(tokens: list[str]) -> list[str]:
    with open(STOPWORDS_FILE_PATH, "r") as f:
        stop_words = f.readlines()
    
    result = []
    for token in tokens:
        if token in stop_words:
            continue
        result.append(token)

    return result

def _stem_tokens(tokens: list[str]) -> list[str]:
    stemmer = PorterStemmer()
    return list(map(lambda token: stemmer.stem(token), tokens))

def _process_text_to_tokens(text: str) -> list[str]:
    tokens = _remove_all_punctuation_lowercase(text=text)
    tokens = _tokenize(tokens)
    tokens = _remove_stop_words(tokens)
    tokens = _stem_tokens(tokens)
    return tokens

In [None]:
# Keyword search implementation

class HybridSearch:
    def __init__(self, documents: dict = sample_dataset, model_name="all-MiniLM-L6-v2"):
        self.documents = documents
        self.index = None

        # list of lists, each list one tokenized creator 
        self.tokenized_corpus = []

        # save creator niches in same order in which added to corpus
        self.ordered_creators = []

        self.model = HuggingFaceEmbeddings(model_name=model_name)

    def _create_bm25_index(self):
        # tokenized_creators = []
        # creators_ordered = []
        for creator, bio in self.documents.items():
            tokenized_creator = _process_text_to_tokens(bio)
            self.tokenized_corpus.append(tokenized_creator)
            self.ordered_creators.append(creator)

        # self.tokenized_corpus = tokenized_creators
        # self.ordered_creators = creators_ordered

        self.index = BM25Okapi(self.tokenized_corpus)
    
    def _bm25_search(self, query: str):
        if self.index is None:
            self._create_bm25_index()

        tokenized_query = _process_text_to_tokens(query)

        # scores of all the Documents
        scores = self.index.get_scores(tokenized_query)

        results = []
        for i, score in enumerate(scores):
            creator = self.ordered_creators[i]
            results.append((creator, score))

        # sort by score
        sorted_results = sorted(results, key=lambda t: t[1], reverse=True)
        
        final_results = []
        for c, _ in sorted_results:
            final_results.append(c)
        
        return final_results[:5]
    
    def _build_vector_db(self):
        all_docs: list[Document] = []

        # one Document per creator bio
        for creator, bio in self.documents.items():
            doc = Document(
                page_content=bio, 
                metadata={"source": creator}
            )
            all_docs.append(doc)

        # split into chunks, keep constant chunking config for now
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=100,
            chunk_overlap=20,
            length_function=len,
            is_separator_regex=False
        )
        all_chunks = text_splitter.split_documents(all_docs)
        
        # create vector store, vs retriever
        CHROMA_DIR_PATH.mkdir(parents=True, exist_ok=True)
        print(f"- Creating a Vector Store at '{CHROMA_DIR_PATH.name}'...")
        vs = Chroma.from_documents(documents=all_chunks,
                                   embedding=self.model,
                                   collection_name="influencer-pokedex-corpus",
                                   persist_directory=str(CHROMA_DIR_PATH))
        print("- Vector Store created ✔️")
        return vs.as_retriever(search_type="similarity",
                                    search_kwargs={"k":10})
    
    def _load_or_create_vector_db(self) -> VectorStoreRetriever:
        if CHROMA_DIR_PATH.exists() and (CHROMA_DIR_PATH / "chroma.sqlite3").exists():
            print(f"- Loading Vector Store from dsk at '{CHROMA_DIR_PATH.name}'...")
            # load the db, retriever
            vs = Chroma(collection_name="influencer-pokedex-corpus",
                        embedding_function=self.model,
                        persist_directory=str(CHROMA_DIR_PATH))
            print("- Vector Store loaded ✔️")
            return vs.as_retriever(search_type="similarity",
                                   search_kwargs={"k":10})
        else:
            return self._build_vector_db()
        
    def _semantic_search(self, query: str):
        vs_retriever = self._load_or_create_vector_db()

        # chunks (most relevant)
        retrieved_docs = vs_retriever.invoke(query)

        results = []

        # for calculating RRF, only need the rank
        for i, doc in enumerate(retrieved_docs):
            creator = doc.metadata["source"]
            results.append(creator)

        return results[:5]
    
    def _hybrid_search(self, query):
        # cmbine ranking instead of raw scores
        # removes need for normlizing scores
        # formula -> rrf = 1 / (k + rank)
        # where k - rrf smoothing constant (lower gives more weight to top ranked results)

        # both simply return a list of creator strings ranked
        bm25_results = self._bm25_search(query=query)
        semantic_results = self._semantic_search(query=query)

        print(f"Searching for '{query}'...")
        results = []
        for creator, bio in self.documents.items():
            bm25_rank = bm25_results.index(creator) + 1 if creator in bm25_results else len(bm25_results) + 1
            semantic_rank = semantic_results.index(creator) + 1 if creator in semantic_results else len(semantic_results) + 1

            rrf = 0.0
            rrf += _rrf_score(rank=bm25_rank)
            rrf += _rrf_score(rank=semantic_rank)

            results.append({
                "creator": creator,
                "bio": bio[:100],
                "bm25": bm25_rank,
                "semantic": semantic_rank,
                "rrf": rrf
            })
        sorted_results = sorted(results, key=lambda d: d["rrf"], reverse=True)
        return sorted_results
    
    def search(self, query: str):
        results = self._hybrid_search(query=query)
        for i, result in enumerate(results, 1):
            print(f"{i}. {result['creator'].upper()}")
            print(f"RRF Score: {result['rrf']:.4f}")
            print(f"BM25 Rank: {result['bm25']}, Semantic Rank: {result['semantic']}")
            print(f"{result['bio']}")



def _rrf_score(rank: int, k: int=60):
    return 1 / (k + rank)

In [7]:
# HYBRID SEARCHING
example_query = "fitness routine for beginners"

searcher = HybridSearch()
searcher.search(example_query)

- Creating a Vector Store at 'vector_store'...
- Vector Store created ✔️
Searching for 'fitness routine for beginners'...
1. FITNESS_HOME_WORKOUTS
RRF Score: 0.03252247488101534
BM25 Rank: 2, Semantic Rank: 1
I create daily home workout routines targeting fat loss, mobility, and functional strength using bod
2. LEARNING_TO_CODE_JOURNEY
RRF Score: 0.03177805800756621
BM25 Rank: 1, Semantic Rank: 5
I document my journey learning to code from scratch, posting study routines, project logs, and begin
3. YOGA_MINDFULNESS_COACH
RRF Score: 0.03125
BM25 Rank: 4, Semantic Rank: 4
I guide yoga flows, stretching routines, and mindfulness practices aimed at improving mobility, post
4. BEAUTY_SKINCARE_REVIEWER
RRF Score: 0.031024531024531024
BM25 Rank: 3, Semantic Rank: 6
I review skincare products and routines for acne-prone, oily, and sensitive skin types, focusing on 
5. PERSONAL_FINANCE_EDUCATOR
RRF Score: 0.030536130536130537
BM25 Rank: 5, Semantic Rank: 6
I simplify personal finance topics suc