In this mini-project, we assume that we get the entire KOHA book catalogue via its API — not filtered, just dumped — and then perform fuzzy search locally over it. While this is technically possible, it's extremely inefficient as KOHA is not designed to serve 10k–50k entries on every query. This method only serves to demonstrate why local indexing or Zebra-style narrowing is necessary.

In [69]:
import pandas as pd
from rapidfuzz import process, fuzz
import time

# Load dataset (same as yours)
df = pd.read_csv("books_10k.csv")

df.fillna("", inplace=True)

# Create combined field for fuzzy match
df["query_blob"] = df["Book Title"] + df["Authors"]
query_strings = df["query_blob"].tolist()

# Direct fuzzy search on all rows
def direct_fuzzy_search(query, top_k=5):
    start = time.time()

    matches = process.extract(
        query,
        query_strings,
        scorer=fuzz.token_sort_ratio,
        limit=top_k
    )

    end = time.time()
    print(f"\nTime taken for direct fuzzy search (O(n)): {round(end - start, 4)} seconds")

    results = []
    for _, score, idx in matches:
        results.append({
            "Book Title": df.loc[idx, "Book Title"],
            "Authors": df.loc[idx, "Authors"],
            "Fuzzy Score": score
        })

    return pd.DataFrame(results)

# Example usage
query = "Heinz Graff"
results = direct_fuzzy_search(query)

print(results)



Time taken for direct fuzzy search (O(n)): 0.0108 seconds
                            Book Title      Authors  Fuzzy Score
0                             Frequenz                 42.105263
1                             Frequenz                 42.105263
2                  [Electrical section                 40.000000
3  Electrical engineering fundamentals  Heinz Graff    38.596491
4                Electrostatic hazards  Heinz Haase    37.209302


In [70]:
import pandas as pd
from rapidfuzz import process, fuzz
import time

# Load dataset (same as yours)
df = pd.read_csv("books_50k.csv")

df.fillna("", inplace=True)

# Create combined field for fuzzy match
df["query_blob"] = df["Book Title"] + df["Authors"]
query_strings = df["query_blob"].tolist()

# Direct fuzzy search on all rows
def direct_fuzzy_search(query, top_k=5):
    start = time.time()

    matches = process.extract(
        query,
        query_strings,
        scorer=fuzz.token_sort_ratio,
        limit=top_k
    )

    end = time.time()
    print(f"\nTime taken for direct fuzzy search (O(n)): {round(end - start, 4)} seconds")

    results = []
    for _, score, idx in matches:
        results.append({
            "Book Title": df.loc[idx, "Book Title"],
            "Authors": df.loc[idx, "Authors"],
            "Fuzzy Score": score
        })

    return pd.DataFrame(results)

# Example usage
query = "Heinz Graff"
results = direct_fuzzy_search(query)

print(results)



Time taken for direct fuzzy search (O(n)): 0.0457 seconds
    Book Title          Authors  Fuzzy Score
0  Think In 4D      Erica Heinz    48.484848
1     Men Talk  Alvin S. Baraff    47.058824
2       Hunger       Roxane Gay    44.444444
3       Hunter  Giancarlo Genta    43.750000
4  Drama Kings       Dalma Heyn    43.750000


In [71]:
import pandas as pd
from rapidfuzz import process, fuzz
import time

# Load dataset (same as yours)
df = pd.read_csv("books_100k.csv")

df.fillna("", inplace=True)

# Create combined field for fuzzy match
df["query_blob"] = df["Book Title"] + df["Authors"]
query_strings = df["query_blob"].tolist()

# Direct fuzzy search on all rows
def direct_fuzzy_search(query, top_k=5):
    start = time.time()

    matches = process.extract(
        query,
        query_strings,
        scorer=fuzz.token_sort_ratio,
        limit=top_k
    )

    end = time.time()
    print(f"\nTime taken for direct fuzzy search (O(n)): {round(end - start, 4)} seconds")

    results = []
    for _, score, idx in matches:
        results.append({
            "Book Title": df.loc[idx, "Book Title"],
            "Authors": df.loc[idx, "Authors"],
            "Fuzzy Score": score
        })

    return pd.DataFrame(results)

# Example usage
query = "Heinz Graff"
results = direct_fuzzy_search(query)

print(results)



Time taken for direct fuzzy search (O(n)): 0.0926 seconds
     Book Title            Authors  Fuzzy Score
0   Think In 4D        Erica Heinz    48.484848
1      Men Talk    Alvin S. Baraff    47.058824
2    Mathematik     Heinz Birnbaum    45.714286
3  Construction  Gerhard Heinzmann    45.000000
4        Hunger         Roxane Gay    44.444444
