In [91]:
%pip install rapidfuzz

Note: you may need to restart the kernel to use updated packages.


In [92]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


df = pd.read_csv("books_10k.csv")  # or any sample csv

df.fillna("", inplace=True)

#Combine searchable fields
df["search_blob"] = df["Book Title"] + " " + df["Authors"]

#Build the TF-IDF index (mimicking Zebra index) 
#This step is already performed in KOHA library. Its time complexity is O(n).
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df["search_blob"])

#KOHA-like search
#Time complexity is O(log n)
def zebra_style_search(query, top_k=200):
    query_vec = vectorizer.transform([query])
    cosine_similarities = linear_kernel(query_vec, tfidf_matrix).flatten()
    
    top_indices = cosine_similarities.argsort()[::-1][:top_k]
    
    results = df.iloc[top_indices][["Book Title", "Authors"]].copy()
    results["Score"] = cosine_similarities[top_indices]
    return results.reset_index(drop=True)



In [93]:
from rapidfuzz import process, fuzz

def fuzzy_rerank(user_query, zebra_results, top_k=5):
    # Combine title and author for fuzzy matching
    combined_strings = (
        zebra_results["Book Title"] + zebra_results["Authors"]
    ).tolist()

    # Use RapidFuzz to re-rank based on full string
    matches = process.extract(
        user_query,
        combined_strings,
        scorer=fuzz.token_sort_ratio,
        limit=top_k
    )
    
    # Prepare final output
    final_results = []
    for _, fuzzy_score, index in matches:
        row = zebra_results.iloc[index]
        final_results.append({
            "Book Title": row["Book Title"],
            "Authors": row["Authors"],
            "Zebra Score": round(row["Score"], 4),
            "Fuzzy Score": fuzzy_score
        })
    
    return pd.DataFrame(final_results)


In [94]:
import time
'''  In KOHA, the indexing phase (O(n)) is performed only once when books are added,
   and search is performed in real time using Zebra (O(log n)). My local system replicates
   this by separating the indexing and only measuring the time for real-time search (TF-
   IDF + Fuzzy logic). It returns top 5 accurate results in about X seconds on a dataset of N books. '''

'  In KOHA, the indexing phase (O(n)) is performed only once when books are added,\n   and search is performed in real time using Zebra (O(log n)). My local system replicates\n   this by separating the indexing and only measuring the time for real-time search (TF-\n   IDF + Fuzzy logic). It returns top 5 accurate results in about X seconds on a dataset of N books. '

In [95]:
def hybrid_koha_search(query, zebra_limit=200, fuzzy_limit=5):
    start = time.time()

    zebra_results = zebra_style_search(query, top_k=zebra_limit)
    final_results = fuzzy_rerank(query, zebra_results, top_k=fuzzy_limit)

    end = time.time()
    print(f"\nTotal time taken (excluding indexing): {round(end - start, 4)} seconds\n")
    b10=round(end - start, 4)
    
    return final_results

In [96]:
query = "Heinz Graff"
results = hybrid_koha_search(query)

print(results)


Total time taken (excluding indexing): 0.0069 seconds

                            Book Title              Authors  Zebra Score  \
0  Electrical engineering fundamentals          Heinz Graff       0.8721   
1                Electrostatic hazards          Heinz Haase       0.2896   
2                     Bremen wird hell  Heinz-Gerd Hofschen       0.2169   
3                 Powerline technician                            0.0000   
4                  Dzhordzh Vestingauz       Henry G. Prout       0.0000   

   Fuzzy Score  
0    38.596491  
1    37.209302  
2    34.782609  
3    32.258065  
4    31.818182  


In [97]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


df = pd.read_csv("books_50k.csv")  # or any sample csv

df.fillna("", inplace=True)

#Combine searchable fields
df["search_blob"] = df["Book Title"] + " " + df["Authors"]

#Build the TF-IDF index (mimicking Zebra index) 
#This step is already performed in KOHA library. Its time complexity is O(n).
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df["search_blob"])

#KOHA-like search
#Time complexity is O(log n)
def zebra_style_search(query, top_k=200):
    query_vec = vectorizer.transform([query])
    cosine_similarities = linear_kernel(query_vec, tfidf_matrix).flatten()
    
    top_indices = cosine_similarities.argsort()[::-1][:top_k]
    
    results = df.iloc[top_indices][["Book Title", "Authors"]].copy()
    results["Score"] = cosine_similarities[top_indices]
    return results.reset_index(drop=True)
from rapidfuzz import process, fuzz

def fuzzy_rerank(user_query, zebra_results, top_k=5):
    # Combine title and author for fuzzy matching
    combined_strings = (
        zebra_results["Book Title"] + zebra_results["Authors"]
    ).tolist()

    # Use RapidFuzz to re-rank based on full string
    matches = process.extract(
        user_query,
        combined_strings,
        scorer=fuzz.token_sort_ratio,
        limit=top_k
    )
    
    # Prepare final output
    final_results = []
    for _, fuzzy_score, index in matches:
        row = zebra_results.iloc[index]
        final_results.append({
            "Book Title": row["Book Title"],
            "Authors": row["Authors"],
            "Zebra Score": round(row["Score"], 4),
            "Fuzzy Score": fuzzy_score
        })
    
    return pd.DataFrame(final_results)

import time
def hybrid_koha_search(query, zebra_limit=200, fuzzy_limit=5):
    start = time.time()

    zebra_results = zebra_style_search(query, top_k=zebra_limit)
    final_results = fuzzy_rerank(query, zebra_results, top_k=fuzzy_limit)

    end = time.time()
    print(f"\nTotal time taken (excluding indexing): {round(end - start, 4)} seconds\n")
    b50=round(end - start, 4)
    return final_results


In [98]:

query = "Heinz Graff"
results = hybrid_koha_search(query)

print(results)


Total time taken (excluding indexing): 0.0099 seconds

                            Book Title      Authors  Zebra Score  Fuzzy Score
0                          Think In 4D  Erica Heinz       0.2824    48.484848
1                     Kunst ist Utopie   Heinz Ohff       0.2260    43.243243
2                       Gazō to seigyo                    0.0000    40.000000
3  Electrical engineering fundamentals  Heinz Graff       0.8253    38.596491
4                Electrostatic hazards  Heinz Haase       0.2707    37.209302


In [99]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


df = pd.read_csv("books_100k.csv")  # or any sample csv

df.fillna("", inplace=True)

#Combine searchable fields
df["search_blob"] = df["Book Title"] + " " + df["Authors"]

#Build the TF-IDF index (mimicking Zebra index) 
#This step is already performed in KOHA library. Its time complexity is O(n).
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df["search_blob"])

#KOHA-like search
#Time complexity is O(log n)
def zebra_style_search(query, top_k=200):
    query_vec = vectorizer.transform([query])
    cosine_similarities = linear_kernel(query_vec, tfidf_matrix).flatten()
    
    top_indices = cosine_similarities.argsort()[::-1][:top_k]
    
    results = df.iloc[top_indices][["Book Title", "Authors"]].copy()
    results["Score"] = cosine_similarities[top_indices]
    return results.reset_index(drop=True)
from rapidfuzz import process, fuzz

def fuzzy_rerank(user_query, zebra_results, top_k=5):
    # Combine title and author for fuzzy matching
    combined_strings = (
        zebra_results["Book Title"] + zebra_results["Authors"]
    ).tolist()

    # Use RapidFuzz to re-rank based on full string
    matches = process.extract(
        user_query,
        combined_strings,
        scorer=fuzz.token_sort_ratio,
        limit=top_k
    )
    
    # Prepare final output
    final_results = []
    for _, fuzzy_score, index in matches:
        row = zebra_results.iloc[index]
        final_results.append({
            "Book Title": row["Book Title"],
            "Authors": row["Authors"],
            "Zebra Score": round(row["Score"], 4),
            "Fuzzy Score": fuzzy_score
        })
    
    return pd.DataFrame(final_results)

import time
def hybrid_koha_search(query, zebra_limit=200, fuzzy_limit=5):
    start = time.time()

    zebra_results = zebra_style_search(query, top_k=zebra_limit)
    final_results = fuzzy_rerank(query, zebra_results, top_k=fuzzy_limit)

    end = time.time()
    print(f"\nTotal time taken (excluding indexing): {round(end - start, 4)} seconds\n")
    b100=round(end - start, 4)
    return final_results


In [100]:

query = "Heinz Graff"
results = hybrid_koha_search(query)

print(results)


Total time taken (excluding indexing): 0.0093 seconds

                            Book Title         Authors  Zebra Score  \
0                          Think In 4D     Erica Heinz       0.2575   
1                           Mathematik  Heinz Birnbaum       0.3080   
2                     Kunst ist Utopie      Heinz Ohff       0.2031   
3  Electrical engineering fundamentals     Heinz Graff       0.8188   
4                Electrostatic hazards     Heinz Haase       0.2393   

   Fuzzy Score  
0    48.484848  
1    45.714286  
2    43.243243  
3    38.596491  
4    37.209302  
