Run the [setup notebook](./00-setup.ipynb) first.

# Classic Text Retrieval with SQL

In [2]:
from helpers import bag_of_words, tokenize, eliminate_stopwords
from itertools import groupby
import sqlite3
from datasets import imdb
import ipywidgets as widgets

## Create Inverted Index with SQL

### Create Schema

In [3]:
# helper function to create dict results for SELECT statements
def dict_from_row(cursor, row):
    """Row factory that maps a result row to ``{column_name: value}``.

    Args:
        cursor: cursor that produced the row; the first element of each
            entry in ``cursor.description`` is the column name.
        row: sequence with the values of one result row.

    Returns:
        dict with one entry per result column, values taken by position.
    """
    column_names = [column[0] for column in cursor.description]
    return {name: row[position] for position, name in enumerate(column_names)}

# open in-memory database - we only want to demonstrate the SQL interface
# (nothing is persisted; re-running this cell starts with a fresh, empty database)
db = sqlite3.connect(":memory:")
# return every result row as a dict keyed by column name
db.row_factory = dict_from_row

# create table for inverted index in SQL
#   document:   one row per movie; id is assigned automatically on insert
#   vocabulary: one row per distinct term with document frequency (df) and inverse document frequency (idf)
#   posting:    (term, docId, tf) entries, tf = frequency of the term in that document
#   query:      temporary table holding the terms (and their frequencies) of the current query
db.execute("CREATE TABLE document(id INTEGER PRIMARY KEY AUTOINCREMENT, title TEXT, year INTEGER, runtime INTEGER, rating REAL, genre TEXT, actors TEXT, summary TEXT)")
db.execute("CREATE TABLE vocabulary(term TEXT PRIMARY KEY, df INTEGER, idf REAL)")
db.execute("CREATE TABLE posting(term TEXT, docId INTEGER, tf INTEGER)")
db.execute("CREATE TEMPORARY TABLE query(term TEXT, tf INTEGER)")
# index on posting(term): the postings of a term can be looked up without scanning the whole table
# (the trailing ';' only suppresses the cell output in the notebook)
db.execute("CREATE INDEX inverted_list ON posting(term)");

### Load IMDB data set

In [4]:
# loading the imdb data set (1000 movies)
# each entry is a dict describing one movie: title, year, runtime, rating, genre, actors, summary
collection = imdb.load()
def doc_format_imdb(doc: dict) -> str:
    """Render one movie record as a single, column-aligned text line.

    Layout: ``title (year, runtime m, rating)`` left-aligned to 50 characters,
    genre left-aligned to 20 characters, then the summary and the actors in
    square brackets. Titles longer than 30 and genres longer than 18
    characters are cut and marked with an ellipsis.

    Args:
        doc: dict with the keys title, year, runtime, rating, genre, actors
            and summary; additional keys are ignored.

    Returns:
        The formatted line.
    """
    def trim(text, limit):
        # keep at most `limit` characters and mark the cut with an ellipsis
        if len(text) > limit:
            return text[:limit] + "\u2026"
        return text

    short_title = trim(doc['title'], 30)
    title_ex = '{title_short} ({year}, {runtime}m, {rating})'.format(title_short=short_title, **doc)
    short_genre = trim(doc['genre'], 18)
    line_template = '{title_ex:<50} {genre_short:<20} {summary} [{actors}]'
    return line_template.format(title_ex=title_ex, genre_short=short_genre, **doc)

# preview: print the first five movies as formatted one-line summaries
for item in collection[:5]:
    print(doc_format_imdb(item))

# last expression of the cell: display the raw record of the first movie
collection[0]

The Shawshank Redemption (1994, 142m, 9.3)         Drama                Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency. [Tim Robbins Morgan Freeman Bob Gunton William Sadler]
The Godfather (1972, 175m, 9.2)                    Crime Drama          An organized crime dynasty's aging patriarch transfers control of his clandestine empire to his reluctant son. [Marlon Brando Al Pacino James Caan Diane Keaton]
The Dark Knight (2008, 152m, 9.0)                  Action Crime Drama   When the menace known as the Joker wreaks havoc and chaos on the people of Gotham, Batman must accept one of the greatest psychological and physical tests of his ability to fight injustice. [Christian Bale Heath Ledger Aaron Eckhart Michael Caine]
The Godfather: Part II (1974, 202m, 9.0)           Crime Drama          The early life and career of Vito Corleone in 1920s New York City is portrayed, while his son, Michael, expands and tightens his 

{'title': 'The Shawshank Redemption',
 'year': 1994,
 'runtime': 142,
 'rating': 9.3,
 'genre': 'Drama',
 'actors': 'Tim Robbins Morgan Freeman Bob Gunton William Sadler',
 'summary': 'Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.'}

### Build index in SQL

In [5]:
def get_vector(text: str) -> dict[str, int]:
    """Convert a text into its bag-of-words vector.

    The text is tokenized, stop words are removed, and the remaining tokens
    are handed to ``bag_of_words``. Stemming is not applied.

    Args:
        text: raw text of a document or of a query.

    Returns:
        The result of ``bag_of_words``. Callers in this notebook iterate it
        with ``.items()`` as ``(term, tf)`` pairs, i.e. it is a mapping from
        term to term frequency, not a set.
        NOTE(review): presumably a dict/Counter - confirm against helpers.
    """
    tokens = tokenize(text)
    tokens = eliminate_stopwords(tokens)
    return bag_of_words(tokens)

def add_document(doc: dict):
    """Insert one movie into ``document`` and its terms into ``posting``.

    All string-valued properties of ``doc`` are joined into one text, turned
    into a bag-of-words vector and stored as one posting row per term. The
    ``vocabulary`` table is not updated and no commit is issued here.

    Args:
        doc: movie record; must provide the keys title, year, runtime,
            rating, genre, actors and summary, which are bound to the named
            SQL parameters of the INSERT statement.
    """
    # add new document; the cursor exposes the id assigned to the new row,
    # so no separate "SELECT last_insert_rowid()" round trip is needed
    cursor = db.execute("INSERT INTO document(title, year, runtime, rating, genre, actors, summary) VALUES (:title, :year, :runtime, :rating, :genre, :actors, :summary)", doc)
    doc_id = cursor.lastrowid
    # create vector from str-properties (numeric properties such as year or rating are not indexed)
    text = ' '.join(value for value in doc.values() if isinstance(value, str))
    vector = get_vector(text)
    # insert postings into table posting: one row per distinct term of this document
    db.executemany("INSERT INTO posting(term, docId, tf) VALUES(?,?,?)", [(term, doc_id, tf) for term, tf in vector.items()])
    
def build_index(collection: list[dict]):
    """Rebuild documents, postings and vocabulary from scratch.

    All existing rows are removed, every document of ``collection`` is added
    via ``add_document``, and the vocabulary is derived from the postings:
    ``df`` is the number of posting rows of a term and
    ``idf = ln((N + 1) / (df + 1))`` with ``N`` the number of documents.

    The rebuild runs as one transaction: it is committed when everything
    succeeded and rolled back when any step raises, so a failed rebuild does
    not leave a half-built index pending on the connection.

    Args:
        collection: movie records as accepted by ``add_document``.
    """
    # `with db` commits on success and rolls back on an exception
    with db:
        # remove current entries
        db.execute("DELETE FROM posting")
        db.execute("DELETE FROM vocabulary")
        db.execute("DELETE FROM document")
        # add all documents in collection
        for doc in collection:
            add_document(doc)
        # build inverted index table
        n_docs = db.execute("SELECT count(*) AS count FROM document").fetchone()['count']
        # NOTE: ln() is one of SQLite's optional math functions; on a SQLite
        # build without them this statement fails with "no such function: ln"
        db.execute("INSERT INTO vocabulary(term, df, idf) SELECT term, count(*), ln(1.0 * (? + 1) / (count(*) + 1)) FROM posting GROUP BY term", (n_docs, ))

# build index for movie data set
build_index(collection)

# Each count query below returns a single row, which the row factory turns
# into a dict with the key 'count'.

# print number of documents
n_docs = db.execute("SELECT count(*) AS count FROM document").fetchone()['count']
print('{count} documents in collection'.format(count=n_docs))

# print number of terms (rows of the vocabulary table = distinct terms)
nTerms = db.execute("SELECT count(*) AS count FROM vocabulary").fetchone()['count']
print('{count} distinct terms in collection'.format(count=nTerms))

# print number of postings (entries of all inverted lists together)
nPostings = db.execute("SELECT count(*) AS count FROM posting").fetchone()['count']
print('{count} postings'.format(count=nPostings))

1000 documents in collection
9836 distinct terms in collection
26200 postings


### Print document table

In [6]:
# show the first 10 rows of the document table, prefixed with the document id
# (rows are dicts; the additional 'id' key is ignored by doc_format_imdb)
cur = db.execute("SELECT * FROM document")
for doc in cur.fetchmany(size=10):
    print(f"{doc['id']:>2} {doc_format_imdb(doc)}")

 1 The Shawshank Redemption (1994, 142m, 9.3)         Drama                Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency. [Tim Robbins Morgan Freeman Bob Gunton William Sadler]
 2 The Godfather (1972, 175m, 9.2)                    Crime Drama          An organized crime dynasty's aging patriarch transfers control of his clandestine empire to his reluctant son. [Marlon Brando Al Pacino James Caan Diane Keaton]
 3 The Dark Knight (2008, 152m, 9.0)                  Action Crime Drama   When the menace known as the Joker wreaks havoc and chaos on the people of Gotham, Batman must accept one of the greatest psychological and physical tests of his ability to fight injustice. [Christian Bale Heath Ledger Aaron Eckhart Michael Caine]
 4 The Godfather: Part II (1974, 202m, 9.0)           Crime Drama          The early life and career of Vito Corleone in 1920s New York City is portrayed, while his son, Michael, expands and t

### Print terms

In [7]:
# show the 20 terms with the highest document frequency;
# each row is a dict with the keys term, df and idf
cur = db.execute("SELECT * FROM vocabulary ORDER BY df DESC")
for term in cur.fetchmany(size=20):
    print(term)

{'term': 'drama', 'df': 724, 'idf': 0.32258312446054577}
{'term': 'comedy', 'df': 233, 'idf': 1.4534336639575192}
{'term': 'crime', 'df': 214, 'idf': 1.5381167511875578}
{'term': 'adventure', 'df': 198, 'idf': 1.6154499545907282}
{'term': 'action', 'df': 191, 'idf': 1.6512594072874391}
{'term': 'thriller', 'df': 137, 'idf': 1.9815010941580158}
{'term': 'young', 'df': 131, 'idf': 2.02595285672885}
{'term': 'romance', 'df': 126, 'idf': 2.064567692856629}
{'term': 'man', 'df': 125, 'idf': 2.0724728723637424}
{'term': 'family', 'df': 111, 'idf': 2.1902559080201263}
{'term': 'biography', 'df': 109, 'idf': 2.2082744135228043}
{'term': 'two', 'df': 102, 'idf': 2.274025791085585}
{'term': 'mystery', 'df': 100, 'idf': 2.293634262473961}
{'term': 'life', 'df': 98, 'idf': 2.3136349291806306}
{'term': 'war', 'df': 91, 'idf': 2.3869662022661804}
{'term': 'animation', 'df': 82, 'idf': 2.4899141715186226}
{'term': 'john', 'df': 82, 'idf': 2.4899141715186226}
{'term': 'world', 'df': 77, 'idf': 2.55204

### Print postings

In [8]:
# show 20 posting rows (term, docId, tf); there is no ORDER BY,
# so the rows come in whatever order SQLite returns them
cur = db.execute("SELECT * FROM posting")
for posting in cur.fetchmany(size=20):
    print(posting)

{'term': 'shawshank', 'docId': 1, 'tf': 1}
{'term': 'redemption', 'docId': 1, 'tf': 2}
{'term': 'drama', 'docId': 1, 'tf': 1}
{'term': 'tim', 'docId': 1, 'tf': 1}
{'term': 'robbins', 'docId': 1, 'tf': 1}
{'term': 'morgan', 'docId': 1, 'tf': 1}
{'term': 'freeman', 'docId': 1, 'tf': 1}
{'term': 'bob', 'docId': 1, 'tf': 1}
{'term': 'gunton', 'docId': 1, 'tf': 1}
{'term': 'william', 'docId': 1, 'tf': 1}
{'term': 'sadler', 'docId': 1, 'tf': 1}
{'term': 'two', 'docId': 1, 'tf': 1}
{'term': 'imprisoned', 'docId': 1, 'tf': 1}
{'term': 'men', 'docId': 1, 'tf': 1}
{'term': 'bond', 'docId': 1, 'tf': 1}
{'term': 'number', 'docId': 1, 'tf': 1}
{'term': 'years', 'docId': 1, 'tf': 1}
{'term': 'finding', 'docId': 1, 'tf': 1}
{'term': 'solace', 'docId': 1, 'tf': 1}
{'term': 'eventual', 'docId': 1, 'tf': 1}


## Queries

### Boolean search: AND with 2 terms

In [9]:
# AND = intersection of two inverted lists: posting alias a selects the
# documents containing 'star', alias b those containing 'wars'; joining both
# on docId keeps only documents that contain both terms.
cur = db.execute("""
    SELECT d.* 
      FROM document d, posting a, posting b 
     WHERE a.term = 'star' AND
           b.term = 'wars' AND
           a.docId = b.docId AND
           a.docId = d.id
""")
# print at most 10 matching documents
for doc in cur.fetchmany(size=10):
    print(doc_format_imdb(doc))

Star Wars: Episode V - The Emp… (1980, 124m, 8.7)  Action Adventure F…  After the Rebels are brutally overpowered by the Empire on the ice planet Hoth, Luke Skywalker begins Jedi training with Yoda, while his friends are pursued by Darth Vader and a bounty hunter named Boba Fett all over the galaxy. [Mark Hamill Harrison Ford Carrie Fisher Billy Dee Williams]
Star Wars (1977, 121m, 8.6)                        Action Adventure F…  Luke Skywalker joins forces with a Jedi Knight, a cocky pilot, a Wookiee and two droids to save the galaxy from the Empire's world-destroying battle station, while also attempting to rescue Princess Leia from the mysterious Darth Vader. [Mark Hamill Harrison Ford Carrie Fisher Alec Guinness]
Star Wars: Episode VI - Return… (1983, 131m, 8.3)  Action Adventure F…  After a daring mission to rescue Han Solo from Jabba the Hutt, the Rebels dispatch to Endor to destroy the second Death Star. Meanwhile, Luke struggles to help Darth Vader back from the dark side witho

### Boolean search: OR with 2 terms

In [10]:
# OR = union of the inverted lists of both terms.
# A document that contains both 'star' and 'wars' matches two posting rows;
# DISTINCT makes sure such a document is listed only once.
cur = db.execute("""
    SELECT DISTINCT d.*
      FROM document d, posting a
     WHERE a.term IN ('star', 'wars') AND
           a.docId = d.id
""")
# print at most 10 matching documents
for doc in cur.fetchmany(size=10):
    print(doc_format_imdb(doc))

Star Wars: Episode V - The Emp… (1980, 124m, 8.7)  Action Adventure F…  After the Rebels are brutally overpowered by the Empire on the ice planet Hoth, Luke Skywalker begins Jedi training with Yoda, while his friends are pursued by Darth Vader and a bounty hunter named Boba Fett all over the galaxy. [Mark Hamill Harrison Ford Carrie Fisher Billy Dee Williams]
Star Wars (1977, 121m, 8.6)                        Action Adventure F…  Luke Skywalker joins forces with a Jedi Knight, a cocky pilot, a Wookiee and two droids to save the galaxy from the Empire's world-destroying battle station, while also attempting to rescue Princess Leia from the mysterious Darth Vader. [Mark Hamill Harrison Ford Carrie Fisher Alec Guinness]
Sunset Blvd. (1950, 110m, 8.4)                     Drama Film-Noir      A screenwriter develops a dangerous relationship with a faded film star determined to make a triumphant return. [William Holden Gloria Swanson Erich von Stroheim Nancy Olson]
Star Wars: Episode VI - Re

### Boolean search: AND with arbitrary number of terms

In [11]:
def search_bool(query: str, k: int):
    """Boolean AND search: print up to ``k`` documents containing every query term.

    The query is converted with ``get_vector`` and its terms replace the
    content of the temporary ``query`` table. Postings are joined with the
    query terms and grouped per document; a document qualifies when the
    number of matched postings equals the number of query terms.

    Args:
        query: free-text query.
        k: maximum number of result rows to print.
    """
    # load the terms of this query into the (emptied) query table
    query_terms = get_vector(query)
    db.execute("DELETE FROM query")
    db.executemany("INSERT INTO query(term, tf) VALUES(?,?)", [(word, freq) for word, freq in query_terms.items()])
    sql = """
        SELECT d.* 
          FROM document d, posting p, query q 
         WHERE p.term = q.term AND
               p.docId = d.id
      GROUP BY p.docId
        HAVING COUNT(p.term) = (SELECT COUNT(*) FROM query)
    """
    matches = db.execute(sql)
    # result table: header, then one line per hit with a 1-based rank
    print("\n    r   id   document\n" + '-'*160)
    for position, hit in enumerate(matches.fetchmany(size=k), start=1):
        print('  {rank:>3d} {id:>4d}   {doc}'.format(rank=position, id=hit['id'], doc=doc_format_imdb(hit)))
    db.commit()

# options for the dialog
queries = ['star wars', 'drama morgan freeman', 'comedy']

# interactive selection of scenario
# dropdown -> query text, slider -> k (5 to 50 in steps of 5, initially 20);
# the trailing ';' suppresses the return value in the cell output
widgets.interact(search_bool, 
    query=widgets.Dropdown(options=queries), 
    k=widgets.IntSlider(min=5, max=50, step=5, value=20),
);

interactive(children=(Dropdown(description='query', options=('star wars', 'drama morgan freeman', 'comedy'), v…

### Boolean search: OR with arbitrary number of terms

In [12]:

def search_bool(query: str, k: int):
    """Boolean OR search: print up to ``k`` documents containing at least one query term.

    The query is converted with ``get_vector`` and its terms replace the
    content of the temporary ``query`` table. Postings are joined with the
    query terms and grouped per document, so every document with at least
    one matching posting appears once.

    NOTE: this definition reuses the name ``search_bool`` and therefore
    replaces any earlier function of that name in the notebook namespace.

    Args:
        query: free-text query.
        k: maximum number of result rows to print.
    """
    # load the terms of this query into the (emptied) query table
    query_terms = get_vector(query)
    db.execute("DELETE FROM query")
    db.executemany("INSERT INTO query(term, tf) VALUES(?,?)", [(word, freq) for word, freq in query_terms.items()])
    sql = """
        SELECT d.* 
          FROM document d, posting p, query q 
         WHERE p.term = q.term AND
               p.docId = d.id
      GROUP BY p.docId
    """
    matches = db.execute(sql)
    # result table: header, then one line per hit with a 1-based rank
    print("\n    r   id   document\n" + '-'*160)
    for position, hit in enumerate(matches.fetchmany(size=k), start=1):
        print('  {rank:>3d} {id:>4d}   {doc}'.format(rank=position, id=hit['id'], doc=doc_format_imdb(hit)))
    db.commit()

# options for the dialog
queries = ['star wars', 'drama morgan freeman', 'comedy']

# interactive selection of scenario
# dropdown -> query text, slider -> k (5 to 50 in steps of 5, initially 20);
# the trailing ';' suppresses the return value in the cell output
widgets.interact(search_bool, 
    query=widgets.Dropdown(options=queries), 
    k=widgets.IntSlider(min=5, max=50, step=5, value=20),
);

interactive(children=(Dropdown(description='query', options=('star wars', 'drama morgan freeman', 'comedy'), v…

### Vector Space Model with dot-product

In [13]:
def search_vsm(query: str, k: int):
    """Vector space retrieval: print the ``k`` best documents by tf-idf dot product.

    The query is converted with ``get_vector`` and its terms replace the
    content of the temporary ``query`` table. For every document that shares
    at least one term with the query, the score is the sum over the shared
    terms of ``(p.tf * idf) * (q.tf * idf)``; results are ordered by score,
    highest first.

    Args:
        query: free-text query.
        k: maximum number of result rows to print.
    """
    # load the terms of this query into the (emptied) query table
    query_terms = get_vector(query)
    db.execute("DELETE FROM query")
    db.executemany("INSERT INTO query(term, tf) VALUES(?,?)", [(word, freq) for word, freq in query_terms.items()])
    sql = """
        SELECT SUM(p.tf * v.idf * q.tf * v.idf) AS score, d.*
          FROM document d, posting p, query q, vocabulary v
         WHERE p.term = q.term AND
               p.term = v.term AND
               p.docId = d.id
      GROUP BY p.docId
      ORDER BY 1 DESC
    """
    ranking = db.execute(sql)
    # result table: header, then one line per hit with 1-based rank and score
    print("\n    r   id  score  document\n" + '-'*160)
    for position, hit in enumerate(ranking.fetchmany(size=k), start=1):
        print('  {rank:>3d} {id:>4d} ({score:5.1f})  {doc}'.format(rank=position, id=hit['id'], score=hit['score'], doc=doc_format_imdb(hit)))
    db.commit()

# options for the dialog
queries = ['star wars', 'drama morgan freeman', 'comedy']

# interactive selection of scenario
# dropdown -> query text, slider -> k (5 to 50 in steps of 5, initially 20);
# the trailing ';' suppresses the return value in the cell output
widgets.interact(search_vsm, 
    query=widgets.Dropdown(options=queries), 
    k=widgets.IntSlider(min=5, max=50, step=5, value=20),
);

interactive(children=(Dropdown(description='query', options=('star wars', 'drama morgan freeman', 'comedy'), v…

### Vector Space Model with dot-product with predicate (year > 1990)

In [14]:
def search_vsm(query: str, k: int):
    """Vector space retrieval restricted to movies released after 1990.

    The query is converted with ``get_vector`` and its terms replace the
    content of the temporary ``query`` table. Only documents with
    ``year > 1990`` that share at least one term with the query are scored;
    the score is the sum over the shared terms of
    ``(p.tf * idf) * (q.tf * idf)`` and results are ordered by score,
    highest first.

    NOTE: this definition reuses the name ``search_vsm`` and therefore
    replaces any earlier function of that name in the notebook namespace.

    Args:
        query: free-text query.
        k: maximum number of result rows to print.
    """
    # load the terms of this query into the (emptied) query table
    query_terms = get_vector(query)
    db.execute("DELETE FROM query")
    db.executemany("INSERT INTO query(term, tf) VALUES(?,?)", [(word, freq) for word, freq in query_terms.items()])
    sql = """
        SELECT SUM(p.tf * v.idf * q.tf * v.idf) AS score, d.*
          FROM document d, posting p, query q, vocabulary v
         WHERE p.term = q.term AND
               p.term = v.term AND
               p.docId = d.id AND
               d.year > 1990
      GROUP BY p.docId
      ORDER BY 1 DESC
    """
    ranking = db.execute(sql)
    # result table: header, then one line per hit with 1-based rank and score
    print("\n    r   id  score  document\n" + '-'*160)
    for position, hit in enumerate(ranking.fetchmany(size=k), start=1):
        print('  {rank:>3d} {id:>4d} ({score:5.1f})  {doc}'.format(rank=position, id=hit['id'], score=hit['score'], doc=doc_format_imdb(hit)))
    db.commit()

# options for the dialog
queries = ['star wars', 'drama morgan freeman', 'comedy']

# interactive selection of scenario
# dropdown -> query text, slider -> k (5 to 50 in steps of 5, initially 20);
# the trailing ';' suppresses the return value in the cell output
widgets.interact(search_vsm, 
    query=widgets.Dropdown(options=queries), 
    k=widgets.IntSlider(min=5, max=50, step=5, value=20),
);

interactive(children=(Dropdown(description='query', options=('star wars', 'drama morgan freeman', 'comedy'), v…

---