In [None]:
!pip install llama-index-core # install the package for DfEmbedVectorStore - LlamaIndex's integration with DfEmbedder
!pip install lancedb # install for usage with LanceDB
!pip install pandas # for the lancedb usage
!pip install llama-index-llms-ollama # for the LlamaIndex usage

In [2]:
# --- Notebook configuration ---
# CSV file the movie data is read from (a sample CSV).
CSV_FILE = "TMDB_movie_dataset_v11.csv"
# Database name passed to DfEmbedder and to lancedb.connect().
DATA_BASE_NAME = "example_lancedb"
# Name of the table that is indexed and later queried.
TABLE_NAME = "films_table"

In [3]:
# Build the working dataset with Polars.
import polars as pl

print(f"Reading data from {CSV_FILE}")

# One lazy pipeline: scan the CSV, keep the first 200 rows and the five
# columns of interest, give the columns descriptive names, and only then
# materialize the result into an in-memory DataFrame.
df = (
    pl.scan_csv(CSV_FILE)
    .limit(200)
    .select(["id", "title", "overview", "genres", "vote_average"])
    .rename(
        {
            "title": "movie_title",
            "overview": "movie_description",
            "genres": "movie_genres",
            "vote_average": "movie_vote_average",
        }
    )
    .collect()
)

# Preview the first rows.
df.head()

Reading data from TMDB_movie_dataset_v11.csv


id,movie_title,movie_description,movie_genres,movie_vote_average
i64,str,str,str,f64
27205,"""Inception""","""Cobb, a skilled thief who comm…","""Action, Science Fiction, Adven…",8.364
157336,"""Interstellar""","""The adventures of a group of e…","""Adventure, Drama, Science Fict…",8.417
155,"""The Dark Knight""","""Batman raises the stakes in hi…","""Drama, Action, Crime, Thriller""",8.512
19995,"""Avatar""","""In the 22nd century, a paraple…","""Action, Adventure, Fantasy, Sc…",7.573
24428,"""The Avengers""","""When an unexpected enemy emerg…","""Science Fiction, Action, Adven…",7.71


In [4]:
# Convert the Polars DataFrame into an Arrow table with `to_arrow()`; the
# resulting `arrow_table` is what gets handed to the embedder for indexing.
arrow_table = df.to_arrow() 
# When starting from a pandas DataFrame instead, the equivalent conversion is:
# import pyarrow as pa
# arrow_table = pa.Table.from_pandas(pandas_df)

In [5]:

# Create a DfEmbedder bound to the database name from the configuration cell,
# then index the Arrow table under TABLE_NAME.
# NOTE(review): re-running this cell calls index_table again on the same
# table name; whether that appends or overwrites is not visible here - confirm.
from dfembed import DfEmbedder
embedder = DfEmbedder(database_name=DATA_BASE_NAME) # Using default params for simplicity
embedder.index_table(arrow_table, table_name=TABLE_NAME)

[2m2025-04-16T10:38:26.302814Z[0m [32m INFO[0m [2mdfembed[0m[2m:[0m Initializing Embedder
[2m2025-04-16T10:38:26.646403Z[0m [32m INFO[0m [2mdfembed[0m[2m:[0m Embedder initialized
Analyzing PyArrow table...
Found __arrow_c_stream__ method, using C Data Interface...
Successfully converted using Arrow C Data Interface
[2m2025-04-16T10:38:26.949419Z[0m [32m INFO[0m [2mdfembed::indexer[0m[2m:[0m Starting indexer with 16 workers and embedding chunk size 500 and write buffer size 2000
[2m2025-04-16T10:38:25.870122Z[0m [32m INFO[0m [2mdfembed::indexer[0m[2m:[0m Created embedder for thread id ThreadId(28)
[2m2025-04-16T10:38:25.862744Z[0m [32m INFO[0m [2mdfembed::indexer[0m[2m:[0m Created embedder for thread id ThreadId(27)
[2m2025-04-16T10:38:25.870259Z[0m [32m INFO[0m [2mdfembed::indexer[0m[2m:[0m Embedding thread finished.. closing channel
[2m2025-04-16T10:38:25.870271Z[0m [32m INFO[0m [2mdfembed::indexer[0m[2m:[0m Embedding thread id T

In [6]:
# Search through the embedder itself: request k=5 results from TABLE_NAME
# for a free-text query and print every hit on its own line.
query = "jungle adventures kids"
similar = embedder.find_similar(query, TABLE_NAME, k=5)
# Each hit prints as a "column is value; ..." string (see the cell output).
for film in similar:
    print(film)

id is 353486; movie_title is Jumanji: Welcome to the Jungle; movie_description is Four teenagers in detention discover an old video game console with a game they’ve never heard of. When they decide to play, they are immediately sucked into the jungle world of Jumanji in the bodies of their avatars. They’ll have to complete the adventure of their lives filled with fun, thrills and danger or be stuck in the game forever!; movie_genres is Adventure, Action, Comedy, Fantasy; movie_vote_average is 6.827
id is 329; movie_title is Jurassic Park; movie_description is A wealthy entrepreneur secretly creates a theme park featuring living dinosaurs drawn from prehistoric DNA. Before opening day, he invites a team of experts and his two eager grandchildren to experience the park and help calm anxious investors. However, the park is anything but amusing as the security systems go off-line and the dinosaurs escape.; movie_genres is Adventure, Science Fiction; movie_vote_average is 7.941
id is 269149

In [7]:
# Query the indexed table directly through LanceDB instead of DfEmbedder.
import lancedb

# Open the table under the database and table names used for indexing.
db = lancedb.connect(DATA_BASE_NAME)
table = db.open_table(TABLE_NAME)

# Turn the query text into a vector with the embedder.
query = "jungle adventures kids"
query_vector = embedder.embed_string(query)

# Vector search capped at 5 hits, returned as a pandas DataFrame.
results = (
    table.search(query_vector)
    .limit(5)
    .to_pandas()
)
results

Unnamed: 0,filename,text,vector,_distance
0,id is 353486; movie_title is Jumanji: Welcome ...,id is 353486; movie_title is Jumanji: Welcome ...,"[-0.024141256, 0.05927249, -0.022344539, 0.012...",1.183995
1,id is 329; movie_title is Jurassic Park; movie...,id is 329; movie_title is Jurassic Park; movie...,"[-0.02603601, 0.026940161, -0.0929412, 0.02070...",1.482222
2,id is 269149; movie_title is Zootopia; movie_d...,id is 269149; movie_title is Zootopia; movie_d...,"[-0.04254988, 0.05369326, -0.013325058, -0.005...",1.492725
3,id is 425; movie_title is Ice Age; movie_descr...,id is 425; movie_title is Ice Age; movie_descr...,"[-0.0047214017, -0.010727687, -0.031148905, 0....",1.544784
4,id is 135397; movie_title is Jurassic World; m...,id is 135397; movie_title is Jurassic World; m...,"[-0.045647327, -0.021175912, -0.09689002, 0.03...",1.557309


In [8]:
# LlamaIndex integration: wrap the indexed table in a LlamaIndex vector store
# and answer a natural-language question with an Ollama-served LLM.

from dfembed import DfEmbedVectorStore  # LlamaIndex vector store backed by DfEmbedder
from llama_index.llms.ollama import Ollama
from llama_index.core.settings import Settings
from llama_index.core.indices import VectorStoreIndex
from llama_index.core.embeddings import MockEmbedding

# We use our own embedding model (the DfEmbedder), so LlamaIndex is only
# given a mock embedding model.
Settings.embed_model = MockEmbedding(embed_dim=1024)

# Build the index on top of the table that was indexed earlier.
vector_store = DfEmbedVectorStore(df_embedder=embedder, table_name=TABLE_NAME)
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

# Query engine: retrieve the top 5 matches and let the LLM phrase the answer.
llm = Ollama(model="llama3:8b", request_timeout=120.0)
query_engine = index.as_query_engine(similarity_top_k=5, llm=llm)

# Run one query and show the answer.
query_str = "Please recommend me a movie about adventures in the jungle"
print(f"\nQuery: '{query_str}'")
response = query_engine.query(query_str)
print(response)

DfEmbedVectorStore initialized for table: 'films_table'

Query: 'Please recommend me a movie about adventures in the jungle'
DfEmbedVectorStore: Received query: 'Please recommend me a movie about adventures in the jungle', k=5
DfEmbedVectorStore: find_similar returned 5 results.
[2m2025-04-16T10:38:28.829676Z[0m [32m INFO[0m [1mDatasetRecordBatchStream[0m[2m:[0m [2mlance::execution[0m[2m:[0m [3mtype[0m[2m=[0m"plan_run" [3moutput_rows[0m[2m=[0m5 [3miops[0m[2m=[0m12 [3mrequests[0m[2m=[0m4 [3mbytes_read[0m[2m=[0m825433 [3mindices_loaded[0m[2m=[0m0 [3mparts_loaded[0m[2m=[0m0 [3mindex_comparisons[0m[2m=[0m0
I'd be happy to help you with that! A movie about adventures in the jungle that I think you might enjoy is... Jumanji: Welcome to the Jungle (id = 353486). Give it a try and see how you like it!
