Run the [setup notebook](./00-setup.ipynb) first.

# Simple text retrieval with PostgreSQL Full Text Search

In [13]:
import os

import psycopg2
from datasets import imdb

## Connect to database

In [14]:
# --- Local PostgreSQL configuration ---
# Each setting is taken from the matching libpq environment variable when it is
# set, so real credentials do not have to be written into the notebook.
# The fallbacks target a local server with the postgres/postgres login.
db_name = os.environ.get("PGDATABASE", "postgres")
db_user = os.environ.get("PGUSER", "postgres")
db_password = os.environ.get("PGPASSWORD", "postgres")
db_host = os.environ.get("PGHOST", "localhost")
db_port = int(os.environ.get("PGPORT", "5432"))  # environment values are strings

# --- Connect to local PostgreSQL ---
# 'conn' and 'cur' are shared by every following cell.
conn = psycopg2.connect(
    dbname=db_name,
    user=db_user,
    password=db_password,
    host=db_host,
    port=db_port
)
cur = conn.cursor()

## Create the Schema

In [15]:
# --- Rebuild the movies table: drop any existing one, then create it ---
# Both statements are sent in a single execute() call and committed together.
# The tsv column is created without a value; this statement does not fill it.
schema_sql = """
DROP TABLE IF EXISTS movies;

CREATE TABLE movies (
    id SERIAL PRIMARY KEY,
    title TEXT NOT NULL,
    overview TEXT NOT NULL,
    rating REAL,
    year INT,
    tsv tsvector
);
"""
cur.execute(schema_sql)
conn.commit()

## Load the IMDB data set

In [16]:
# Load the IMDB data set into 'collection'.
# NOTE(review): the original comment states this yields 1000 movies — confirm against datasets.imdb.
collection = imdb.load()
def doc_format_imdb(doc: dict) -> str:
    """Render one movie record as a single aligned line of text.

    The line consists of ``title (year, runtimem, rating)`` left-justified to
    50 columns, the genre left-justified to 20 columns, the summary, and the
    actors in square brackets.  A title longer than 30 characters or a genre
    longer than 18 characters is cut to that length and gets an ellipsis
    appended.

    Args:
        doc: mapping providing the keys ``title``, ``year``, ``runtime``,
            ``rating``, ``genre``, ``summary`` and ``actors``.

    Returns:
        The formatted line.
    """
    def shorten(text, limit):
        # Keep short text untouched; otherwise cut and mark the cut with an ellipsis.
        if len(text) > limit:
            return text[:limit] + "\u2026"
        return text

    title_short = shorten(doc['title'], 30)
    title_ex = '{title_short} ({year}, {runtime}m, {rating})'.format(title_short=title_short, **doc)
    genre_short = shorten(doc['genre'], 18)
    return '{title_ex:<50} {genre_short:<20} {summary} [{actors}]'.format(
        title_ex=title_ex, genre_short=genre_short, **doc
    )

# Preview the first five movies as formatted one-liners.
for movie in collection[:5]:
    line = doc_format_imdb(movie)
    print(line)

# Leave the first raw record as the cell's displayed value.
collection[0]

The Shawshank Redemption (1994, 142m, 9.3)         Drama                Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency. [Tim Robbins Morgan Freeman Bob Gunton William Sadler]
The Godfather (1972, 175m, 9.2)                    Crime Drama          An organized crime dynasty's aging patriarch transfers control of his clandestine empire to his reluctant son. [Marlon Brando Al Pacino James Caan Diane Keaton]
The Dark Knight (2008, 152m, 9.0)                  Action Crime Drama   When the menace known as the Joker wreaks havoc and chaos on the people of Gotham, Batman must accept one of the greatest psychological and physical tests of his ability to fight injustice. [Christian Bale Heath Ledger Aaron Eckhart Michael Caine]
The Godfather: Part II (1974, 202m, 9.0)           Crime Drama          The early life and career of Vito Corleone in 1920s New York City is portrayed, while his son, Michael, expands and tightens his 

{'title': 'The Shawshank Redemption',
 'year': 1994,
 'runtime': 142,
 'rating': 9.3,
 'genre': 'Drama',
 'actors': 'Tim Robbins Morgan Freeman Bob Gunton William Sadler',
 'summary': 'Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.'}

## Load the data and add it to the movies table

In [17]:
# --- Insert every movie from 'collection' into the movies table ---
# Assumes 'collection' is a list of dictionaries with keys: title, summary, year, rating.
# The 'summary' field is stored in the 'overview' column.
movie_rows = [
    (doc["title"], doc["summary"], doc["rating"], doc["year"])
    for doc in collection
]

# executemany runs the parameterised INSERT once per row tuple, so no loop
# index is needed (the previous loop bound one to 'id', shadowing the builtin).
cur.executemany("""
    INSERT INTO movies (title, overview, rating, year)
    VALUES (%s, %s, %s, %s)
    """, movie_rows)

conn.commit()


# --- Fill the tsv column for every row ---
# Title lexemes are tagged with weight A and overview lexemes with weight B,
# then both vectors are concatenated into one.
populate_tsv_sql = """
UPDATE movies
SET tsv = setweight(to_tsvector('english', title), 'A') ||
          setweight(to_tsvector('english', overview), 'B');
"""
cur.execute(populate_tsv_sql)
conn.commit()


# --- Create GIN index for performance ---
# IF NOT EXISTS keeps this statement from raising when the index is already
# present, e.g. when it is executed a second time without the table being dropped.
cur.execute("""
CREATE INDEX IF NOT EXISTS movies_tsv_idx ON movies USING GIN(tsv);
""")
conn.commit()

## Search for a movie

In [18]:
# --- Search for "Star Wars" before year 2000 across title & overview ---
# Results are ordered by ts_rank, PostgreSQL's built-in full-text ranking
# function (this is not BM25). The tsv column carries the A/B weights.
search_terms = "star & wars"   # to_tsquery syntax: '&' requires both terms to match
max_year = 2000                # exclusive upper bound on the release year

# The tsquery is built once in the FROM clause and reused for both matching
# and ranking; the search terms and year are passed as bound parameters.
cur.execute("""
SELECT id, title, year, ts_rank(tsv, query) AS rank
FROM movies, to_tsquery('english', %s) AS query
WHERE tsv @@ query
  AND year < %s
ORDER BY rank DESC
LIMIT 10;
""", (search_terms, max_year))

results = cur.fetchall()
for _movie_id, title, year, rank in results:
    # The fourth column is the search rank, not the movie's rating.
    print(f"{title} ({year}) - Rank: {rank}")

Star Wars: Episode VI - Return of the Jedi (1983) - Rating: 0.99103653
Star Wars: Episode V - The Empire Strikes Back (1980) - Rating: 0.9910322
Star Wars (1977) - Rating: 0.9910322


## Cleanup

In [19]:
# --- Release database resources: the cursor first, then its connection ---
for resource in (cur, conn):
    resource.close()

---