Run the [setup notebook](./00-setup.ipynb) first.

# Simple text retrieval with Whoosh

In [7]:
from whoosh import index
from whoosh.fields import Schema, TEXT, ID, NUMERIC
from whoosh.qparser import MultifieldParser
from whoosh.query import And, NumericRange
from whoosh import scoring
import os, shutil
from datasets import imdb
import ipywidgets as widgets

## Create the Schema

In [4]:
# Index layout: one document per movie. Every field is declared with
# stored=True, so its value can be read back from a search hit.
movie_fields = {
    "id": ID(stored=True, unique=True),
    "title": TEXT(stored=True),
    "overview": TEXT(stored=True),
    "rating": NUMERIC(stored=True, numtype=float),
    "year": NUMERIC(stored=True, numtype=int),
}
schema = Schema(**movie_fields)

## Load the IMDB data set

In [5]:
# Load the IMDB data set through the local `datasets.imdb` helper
# (1000 movies, per the original note -- count not verified here).
collection = imdb.load()
def doc_format_imdb(doc: dict) -> str:
    """Render one movie record as a single, column-aligned line of text.

    Layout: ``title (year, runtime m, rating)`` left-aligned in 50 columns,
    the genre left-aligned in 20 columns, then the summary followed by the
    actors in square brackets. Titles longer than 30 characters and genres
    longer than 18 characters are cut and suffixed with an ellipsis.

    Args:
        doc: Mapping with the keys ``title``, ``year``, ``runtime``,
            ``rating``, ``genre``, ``summary`` and ``actors``. ``title`` and
            ``genre`` must be strings. Any additional keys are ignored.

    Returns:
        The formatted line, without a trailing newline.

    Raises:
        KeyError: If one of the required keys is missing from ``doc``.
    """
    def trim(text: str, limit: int) -> str:
        # A conditional expression replaces the fragile `cond and a or b` idiom.
        return text[:limit] + "\u2026" if len(text) > limit else text

    title_ex = f"{trim(doc['title'], 30)} ({doc['year']}, {doc['runtime']}m, {doc['rating']})"
    genre_short = trim(doc['genre'], 18)
    # Explicit key lookups (rather than `.format(**doc)`) so that extra keys in
    # `doc` can never collide with the formatter's own keyword names.
    return f"{title_ex:<50} {genre_short:<20} {doc['summary']} [{doc['actors']}]"

# Preview: one formatted line for each of the first five records.
preview = collection[:5]
for movie in preview:
    print(doc_format_imdb(movie))

# Bare last expression, so the notebook displays the first raw record.
collection[0]

The Shawshank Redemption (1994, 142m, 9.3)         Drama                Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency. [Tim Robbins Morgan Freeman Bob Gunton William Sadler]
The Godfather (1972, 175m, 9.2)                    Crime Drama          An organized crime dynasty's aging patriarch transfers control of his clandestine empire to his reluctant son. [Marlon Brando Al Pacino James Caan Diane Keaton]
The Dark Knight (2008, 152m, 9.0)                  Action Crime Drama   When the menace known as the Joker wreaks havoc and chaos on the people of Gotham, Batman must accept one of the greatest psychological and physical tests of his ability to fight injustice. [Christian Bale Heath Ledger Aaron Eckhart Michael Caine]
The Godfather: Part II (1974, 202m, 9.0)           Crime Drama          The early life and career of Vito Corleone in 1920s New York City is portrayed, while his son, Michael, expands and tightens his 

{'title': 'The Shawshank Redemption',
 'year': 1994,
 'runtime': 142,
 'rating': 9.3,
 'genre': 'Drama',
 'actors': 'Tim Robbins Morgan Freeman Bob Gunton William Sadler',
 'summary': 'Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.'}

## Build a new index

In [6]:
# --- Create a new index (an index directory left over from a previous run is discarded)
INDEX_DIR = "movie_index"

if os.path.exists(INDEX_DIR):
    # Remove the folder so that re-running this cell starts from an empty index.
    shutil.rmtree(INDEX_DIR)

os.mkdir(INDEX_DIR)
ix = index.create_in(INDEX_DIR, schema)

# --- Insert movie data
# The writer is used as a context manager: it commits when the block finishes
# normally and cancels (releasing the index write lock) if adding a document
# raises, instead of leaving a dangling, uncommitted writer behind.
with ix.writer() as writer:
    # `doc_id` rather than `id`, so the builtin `id()` is not shadowed in the
    # notebook namespace.
    for doc_id, doc in enumerate(collection):
        writer.add_document(
            id=str(doc_id),            # position in the collection, as a string
            title=doc["title"],
            overview=doc["summary"],   # the data set's "summary" feeds the "overview" field
            year=doc["year"],
            rating=doc["rating"]
        )


## Search for a movie

In [None]:
# --- Find "Star Wars" movies released before 2000, scored with BM25F over title & overview ---

# The parser only needs the schema, so the query is built before a searcher is opened.
parser = MultifieldParser(["title", "overview"], schema=ix.schema)
query = parser.parse("Star Wars")

# year < 2000: no lower bound is given, and 2000 itself is excluded (endexcl=True).
before_2000 = NumericRange("year", None, 2000, startexcl=False, endexcl=True)
combined_query = And([query, before_2000])

with ix.searcher(weighting=scoring.BM25F()) as searcher:
    # At most ten hits; they are consumed inside the block, while the searcher is open.
    results = searcher.search(combined_query, limit=10)

    for hit in results:
        print(f"{hit['title']} ({hit['year']}) - Rating: {hit['rating']}")

Star Wars (1977) - Rating: 8.6
Star Wars: Episode VI - Return of the Jedi (1983) - Rating: 8.3
Star Wars: Episode V - The Empire Strikes Back (1980) - Rating: 8.7


---