In [1]:
#%%
# Cell 1: Find your downloaded IMDB dataset
import os

# kagglehub typically stores here
possible_paths = [
    os.path.expanduser("~/.cache/kagglehub/datasets/ashirwadsangwan/imdb-dataset"),
    os.path.expanduser("~/Documents/github/date-night-ai/data/imdb"),
]

for base_path in possible_paths:
    if os.path.exists(base_path):
        print(f"üìÇ Found: {base_path}")
        for root, dirs, files in os.walk(base_path):
            for file in files:
                if file.endswith('.tsv') or file.endswith('.csv'):
                    full_path = os.path.join(root, file)
                    size = os.path.getsize(full_path) / (1024*1024)
                    print(f"   üìÑ {file} ({size:.1f} MB)")
                    print(f"      Path: {full_path}")

üìÇ Found: /Users/ramanujam.solaimalai/Documents/github/date-night-ai/data/imdb
   üìÑ title.ratings.tsv (27.1 MB)
      Path: /Users/ramanujam.solaimalai/Documents/github/date-night-ai/data/imdb/title.ratings.tsv
   üìÑ imdb_top_1000.csv (0.4 MB)
      Path: /Users/ramanujam.solaimalai/Documents/github/date-night-ai/data/imdb/imdb_top_1000.csv
   üìÑ title.principals.tsv (4153.6 MB)
      Path: /Users/ramanujam.solaimalai/Documents/github/date-night-ai/data/imdb/title.principals.tsv
   üìÑ title.akas.tsv (2639.0 MB)
      Path: /Users/ramanujam.solaimalai/Documents/github/date-night-ai/data/imdb/title.akas.tsv
   üìÑ name.basics.tsv (887.8 MB)
      Path: /Users/ramanujam.solaimalai/Documents/github/date-night-ai/data/imdb/name.basics.tsv
   üìÑ title.basics.tsv (1012.2 MB)
      Path: /Users/ramanujam.solaimalai/Documents/github/date-night-ai/data/imdb/title.basics.tsv


In [1]:
#%% [markdown]
# # üé¨ Building RAG with IMDB Movies (TSV Files)

#%%
# Cell 1: Set path to your downloaded dataset
import os
import pandas as pd

# Set the path where your TSV files are located
# (Update this based on output from the finder above!)
path = "/Users/ramanujam.solaimalai/Documents/github/date-night-ai/data/imdb"

# List all TSV files
tsv_files = []
for root, dirs, files in os.walk(path):
    for file in files:
        if file.endswith('.tsv'):
            full_path = os.path.join(root, file)
            size = os.path.getsize(full_path) / (1024*1024)
            tsv_files.append((file, full_path, size))
            print(f"üìÑ {file} ({size:.1f} MB)")

print(f"\n‚úÖ Found {len(tsv_files)} TSV files")

üìÑ title.ratings.tsv (27.1 MB)
üìÑ title.principals.tsv (4153.6 MB)
üìÑ title.akas.tsv (2639.0 MB)
üìÑ name.basics.tsv (887.8 MB)
üìÑ title.basics.tsv (1012.2 MB)

‚úÖ Found 5 TSV files


In [8]:
#%%
# Cell 2: Load the main movie data (title.basics.tsv)

# Find the basics file
basics_path = None
for name, full_path, size in tsv_files:
    if 'title.basics' in name.lower():
        basics_path = full_path
        break

if basics_path is None:
    # Use first TSV if no basics file found
    basics_path = tsv_files[0][1]

print(f"üìÇ Loading: {basics_path}")

# Load TSV file
movies_df = pd.read_csv(
    basics_path,
    sep='\t',           # Tab-separated
    low_memory=False,
    na_values='\\N'     # IMDB uses \N for null
)

print(f"\nüìä Total rows: {len(movies_df):,}")
print(f"üìã Columns: {list(movies_df.columns)}")
print(f"\nüé¨ Sample:")
print(movies_df.head())

üìÇ Loading: /Users/ramanujam.solaimalai/Documents/github/date-night-ai/data/imdb/title.basics.tsv

üìä Total rows: 12,256,479
üìã Columns: ['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult', 'startYear', 'endYear', 'runtimeMinutes', 'genres']

üé¨ Sample:
      tconst titleType            primaryTitle           originalTitle  \
0  tt0000001     short              Carmencita              Carmencita   
1  tt0000002     short  Le clown et ses chiens  Le clown et ses chiens   
2  tt0000003     short            Poor Pierrot          Pauvre Pierrot   
3  tt0000004     short             Un bon bock             Un bon bock   
4  tt0000005     short        Blacksmith Scene        Blacksmith Scene   

   isAdult  startYear  endYear runtimeMinutes                    genres  
0        0     1894.0      NaN              1         Documentary,Short  
1        0     1892.0      NaN              5           Animation,Short  
2        0     1892.0      NaN              5  Animation

In [10]:
#%%
# Cell 3: Filter and clean data

# Filter for movies only (not TV, shorts, etc.)
if 'titleType' in movies_df.columns:
    movies_clean = movies_df[movies_df['titleType'] == 'movie'].copy()
    print(f"üìä Movies only: {len(movies_clean):,}")
else:
    movies_clean = movies_df.copy()

# Remove adult content
if 'isAdult' in movies_clean.columns:
    movies_clean = movies_clean[movies_clean['isAdult'] == 0]
    print(f"üìä After removing adult: {len(movies_clean):,}")

# Keep only movies with titles and genres
title_col = 'primaryTitle' if 'primaryTitle' in movies_clean.columns else 'title'
movies_clean = movies_clean.dropna(subset=[title_col])

if 'genres' in movies_clean.columns:
    movies_clean = movies_clean.dropna(subset=['genres'])

print(f"üìä After cleaning: {len(movies_clean):,}")

# Use subset for faster processing
movies_subset = movies_clean.head(10000)
print(f"\n‚úÖ Using {len(movies_subset):,} movies for RAG")
print(movies_subset.head())

üìä Movies only: 737,679
üìä After removing adult: 728,405
üìä After cleaning: 650,676

‚úÖ Using 10,000 movies for RAG
        tconst titleType                   primaryTitle  \
8    tt0000009     movie                     Miss Jerry   
144  tt0000147     movie  The Corbett-Fitzsimmons Fight   
331  tt0000335     movie          Soldiers of the Cross   
570  tt0000574     movie    The Story of the Kelly Gang   
587  tt0000591     movie               The Prodigal Son   

                     originalTitle  isAdult  startYear  endYear  \
8                       Miss Jerry        0     1894.0      NaN   
144  The Corbett-Fitzsimmons Fight        0     1897.0      NaN   
331          Soldiers of the Cross        0     1900.0      NaN   
570    The Story of the Kelly Gang        0     1906.0      NaN   
587              L'enfant prodigue        0     1907.0      NaN   

    runtimeMinutes                      genres  
8               45                     Romance  
144            100   

In [11]:
#%%
# Cell 4: Prepare documents for ChromaDB
documents = []
metadata_list = []
ids = []

# Detect column names
title_col = 'primaryTitle' if 'primaryTitle' in movies_subset.columns else 'title'
year_col = 'startYear' if 'startYear' in movies_subset.columns else 'year'
genre_col = 'genres' if 'genres' in movies_subset.columns else 'genre'

print(f"Using columns: title={title_col}, year={year_col}, genre={genre_col}")

for idx, row in movies_subset.iterrows():
    title = row.get(title_col, 'Unknown')
    year = row.get(year_col, 'Unknown')
    genre = row.get(genre_col, 'Unknown')
    runtime = row.get('runtimeMinutes', 'Unknown')
    
    doc_text = f"Title: {title}. Year: {year}. Genre: {genre}. Runtime: {runtime} minutes."
    
    documents.append(doc_text)
    metadata_list.append({
        "title": str(title) if pd.notna(title) else "Unknown",
        "year": str(year) if pd.notna(year) else "Unknown",
        "genre": str(genre) if pd.notna(genre) else "Unknown",
    })
    ids.append(f"movie_{idx}")

print(f"‚úÖ Prepared {len(documents):,} documents")
print(f"\nüìÑ Example:\n{documents[0]}")

Using columns: title=primaryTitle, year=startYear, genre=genres
‚úÖ Prepared 10,000 documents

üìÑ Example:
Title: Miss Jerry. Year: 1894.0. Genre: Romance. Runtime: 45 minutes.


In [12]:
#%%
# Cell 5: Store in ChromaDB
import chromadb

client = chromadb.PersistentClient(path="../vectorstore")

# Delete old collection
try:
    client.delete_collection("movies")
except:
    pass

collection = client.create_collection(name="movies")

# Add in batches
batch_size = 500
total = len(documents)

for i in range(0, total, batch_size):
    end_idx = min(i + batch_size, total)
    collection.add(
        documents=documents[i:end_idx],
        metadatas=metadata_list[i:end_idx],
        ids=ids[i:end_idx]
    )
    print(f"   Progress: {end_idx:,}/{total:,}")

print(f"\n‚úÖ ChromaDB ready with {collection.count():,} movies!")

   Progress: 500/10,000
   Progress: 1,000/10,000
   Progress: 1,500/10,000
   Progress: 2,000/10,000
   Progress: 2,500/10,000
   Progress: 3,000/10,000
   Progress: 3,500/10,000
   Progress: 4,000/10,000
   Progress: 4,500/10,000
   Progress: 5,000/10,000
   Progress: 5,500/10,000
   Progress: 6,000/10,000
   Progress: 6,500/10,000
   Progress: 7,000/10,000
   Progress: 7,500/10,000
   Progress: 8,000/10,000
   Progress: 8,500/10,000
   Progress: 9,000/10,000
   Progress: 9,500/10,000
   Progress: 10,000/10,000

‚úÖ ChromaDB ready with 10,000 movies!


In [13]:
#%%
# Cell 6: Test search
def search_movies(query: str, n_results: int = 5):
    return collection.query(query_texts=[query], n_results=n_results)

queries = ["Italian crime mafia", "Japanese anime", "romantic comedy", "space sci-fi"]

for q in queries:
    print(f"\nüîç '{q}':")
    results = search_movies(q, 3)
    for m in results['metadatas'][0]:
        print(f"   ‚Üí {m['title']} ({m['year']}) - {m['genre']}")


üîç 'Italian crime mafia':
   ‚Üí The Last of the Mafia (1915.0) - Drama
   ‚Üí The Italian (1915.0) - Drama
   ‚Üí Italy's Flaming Front (1918.0) - Drama,War

üîç 'Japanese anime':
   ‚Üí Josei wa tsuyoshi (1924.0) - Drama
   ‚Üí Meoto boshi (1927.0) - Drama
   ‚Üí Die Japanerin (1919.0) - Crime,Mystery

üîç 'romantic comedy':
   ‚Üí Experimental Marriage (1919.0) - Comedy,Romance
   ‚Üí A California Romance (1922.0) - Comedy,Drama
   ‚Üí Romance and Arabella (1919.0) - Comedy,Romance

üîç 'space sci-fi':
   ‚Üí Metropolis (1927.0) - Drama,Sci-Fi
   ‚Üí A Trip to Mars (1918.0) - Adventure,Fantasy,Sci-Fi
   ‚Üí Earthbound (1920.0) - Crime,Drama


In [20]:
#%%
# Cell 7: Build RAG with LLM
from langchain_ollama import OllamaLLM
from langchain_core.prompts import PromptTemplate

llm = OllamaLLM(model="llama3")

PROMPT = PromptTemplate(
    input_variables=["movie_info", "title"],
    template="""You are a date night food expert! üé¨üçï

Movie: {movie_info}

Suggest perfect food pairing for "{title}":
1. üçΩÔ∏è Main Dish
2. ü•§ Drink  
3. üí° Why it works

Be brief and fun!"""
)

def get_pairing(movie_name: str):
    results = search_movies(movie_name, 2)
    if not results['documents'][0]:
        return "Movie not found!"
    
    info = results['documents'][0][0]
    meta = results['metadatas'][0][0]
    
    response = llm.invoke(PROMPT.format(movie_info=info, title=meta['title']))
    return {"title": meta['title'], "pairing": response}

# Test
result = get_pairing("Turbo")
print(f"üé¨ {result['title']}\n{result['pairing']}")

üé¨ The Racing Strain
What a vintage thriller! Here's my expert recommendation for the perfect date night pairing:

**Main Dish:** Classic Italian Subs with Spicy Pepperoni
Imagine the excitement of a high-stakes horse race, just like in The Racing Strain. These subs will fuel your passion and energy!

**Drink:** Sparkling Chianti Spritzer
A crisp, fruity spritzer that's as refreshing as a victory lap! This Italian-inspired drink complements the spicy pepperoni and adds to the night's thrill.

**Why it works:**
The Racing Strain is an intense drama about horse racing, passion, and betrayal. The classic Italian flavors in these subs evoke the movie's Mediterranean setting, while the spicy pepperoni represents the thrilling competition. The Sparkling Chianti Spritzer adds a touch of sophistication and fun to your date night, just like the movie's high-stakes action!
