In [None]:
# Colab-only helper module; this import is expected to fail outside Google Colab.
from google.colab import files
# Presumably prompts for the file(s) to upload; the next cell reads 'df.csv'
# from the working directory — TODO confirm that is the file uploaded here.
files.upload()

In [None]:
import pandas as pd

# Read the dataset from the CSV in the working directory and display it.
df = pd.read_csv(filepath_or_buffer='df.csv')
df

In [None]:
import ast


def _parse_literal(value):
    """Parse a stringified Python literal; return non-string values unchanged.

    ast.literal_eval only accepts strings (or AST nodes), so calling it on a
    value that has already been parsed (e.g. a list after this cell was run
    once) or on a missing value (NaN) raises. The guard makes the cell safe
    to re-run.
    """
    return ast.literal_eval(value) if isinstance(value, str) else value


# Columns stored in the CSV as string representations of Python literals.
for column in ('genres', 'watch_providers'):
    df[column] = df[column].apply(_parse_literal)
df

In [None]:
!pip install keybert
from keybert import KeyBERT
kw_model = KeyBERT()

overview = "In a dystopian future, a lone warrior fights against a tyrannical regime to free the oppressed citizens."
keywords = kw_model.extract_keywords(overview, keyphrase_ngram_range=(1, 2), stop_words='english', use_mmr=True, diversity=0.7)

print([kw[0] for kw in keywords])

In [None]:
from tqdm import tqdm
# Register tqdm's pandas integration; the progress_apply call later in this
# cell relies on it.
tqdm.pandas()

def extract_key_phrases(text):
    """Extract up to five key phrases from ``text`` with the shared KeyBERT model.

    Args:
        text: The text to analyse. Any value that is not a non-blank string
            (None, NaN, numbers, lists, "" or whitespace only) yields ``[]``.

    Returns:
        A list holding element 0 (the phrase) of each result returned by
        ``kw_model.extract_keywords``, or ``[]`` when there is no usable
        input or the extraction fails.
    """
    # Check the type first. The previous version called pd.isnull() before the
    # isinstance test; for a list-like input pd.isnull() returns an array whose
    # truth value is ambiguous, which raised ValueError. None and NaN are not
    # strings, so this single check also covers missing values.
    if not isinstance(text, str) or not text.strip():
        return []
    try:
        keywords = kw_model.extract_keywords(
            text,
            keyphrase_ngram_range=(1, 2),  # unigrams + bigrams
            stop_words='english',
            use_mmr=True,
            diversity=0.7,
            top_n=5
        )
        return [kw[0] for kw in keywords]
    except Exception as exc:
        # Best effort: one bad row must not abort the whole column, but the
        # failure should be visible instead of being dropped silently.
        import warnings
        warnings.warn(f"Key phrase extraction failed: {exc!r}", RuntimeWarning)
        return []

# Compute one key-phrase list per overview; progress_apply is used in place of
# apply so a tqdm progress bar is shown while the rows are processed.
overview_column = df['overview']
df['key_phrases'] = overview_column.progress_apply(extract_key_phrases)
df

In [None]:
from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model used for the corpus here and for the query below.
model = SentenceTransformer('all-MiniLM-L6-v2')

# The search text is the space-joined key phrases of each row; cells that do
# not hold a list become an empty string.
df['text_for_search'] = df['key_phrases'].map(
    lambda phrases: '' if not isinstance(phrases, list) else ' '.join(phrases)
)

# Encode every search text in one call, returning a tensor.
corpus_embeddings = model.encode(
    list(df['text_for_search']),
    convert_to_tensor=True,
)

In [None]:
user_prompt = "a romance"
query_embedding = model.encode(user_prompt, convert_to_tensor=True)

# Cosine similarity of the prompt against the corpus; row 0 of the returned
# score matrix is the row for this single query.
cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]

# Positions of the five highest-scoring rows, as plain Python ints.
top_results = cos_scores.argsort(descending=True)[:5].tolist()

for idx in top_results:
    movie = df.iloc[idx]  # look the row up once instead of once per field
    print(f"\nID : {idx}")
    print(f"\nTitle: {movie['title']}")
    print(f"Overview: {movie['overview']}")
    print(f"Keywords: {movie['key_phrases']}")
    print(f"Score: {cos_scores[idx].item():.4f}")

In [None]:
# Store one embedding per DataFrame row; iterating corpus_embeddings yields
# its entries in order, matching the row order used to build it.
df['embeddings'] = list(corpus_embeddings)
df

In [None]:
!pip install qdrant-client
qdrant = '<YOUR-QDRANT-API-KEY>'
from qdrant_client import QdrantClient

qdrant_client = QdrantClient(
    url="YOUR-QDRANT-URL",
    api_key=qrdant,
)

print(qdrant_client.get_collections())

In [None]:
from qdrant_client.models import VectorParams, Distance

# (Re)create the "movies" collection with a single named vector, "default":
# 384 dimensions, compared by cosine distance.
# NOTE(review): 384 presumably matches the output size of the embedding model
# used above — confirm before changing models.
qdrant_client.recreate_collection(
    collection_name="movies",
    vectors_config=dict(
        default=VectorParams(size=384, distance=Distance.COSINE),
    ),
)

In [None]:
from qdrant_client.models import PointStruct


def _as_point(position, vec, row):
    """Build the Qdrant point for one DataFrame row and its embedding.

    The point id is the row's position in the DataFrame; the embedding is
    stored under the named vector "default".
    """
    payload = dict(
        title=row.title,
        overview=row.overview,
        original_language=row.original_language,
        genres=row.genres,
        key_phrases=row.key_phrases,
        watch_providers=row.watch_providers,
        movie_id=row.id,
        vote_average=row.vote_average,
    )
    return PointStruct(
        id=position,
        vector={"default": vec.tolist()},
        payload=payload,
    )


points = [
    _as_point(i, vec, row)
    for i, (vec, row) in enumerate(zip(df['embeddings'], df.itertuples()))
]

# Nothing is uploaded here; this cell only reports how many points were built.
len(points)

In [None]:
# Split the points into two batches at position 2902.
# NOTE(review): 2902 is hard-coded; presumably chosen for this dataset's size
# so each upload stays small enough — confirm.
points1, points2 = points[:2902], points[2902:]

In [None]:
# Upload the first batch of points to the "movies" collection.
qdrant_client.upsert(
    collection_name="movies",
    points=points1,
)

In [None]:
# Upload the second batch of points to the "movies" collection.
qdrant_client.upsert(
    collection_name="movies",
    points=points2,
)

In [None]:
# Install the Qdrant client and FastEmbed, which the following cells import.
!pip install qdrant-client fastembed

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.models import Filter, SearchRequest
from fastembed.embedding import DefaultEmbedding

# Client used by the search cells; replace both placeholders with real values.
client = QdrantClient(url="<YOUR-QDRANT-URL>", api_key="<YOUR-QDRANT-API-KEY>")

In [None]:
from fastembed import TextEmbedding

# Query-side embedder; downloaded model files are cached under ".cache".
# NOTE(review): the corpus was embedded with SentenceTransformer('all-MiniLM-L6-v2');
# confirm that this FastEmbed model produces compatible vectors.
embedder = TextEmbedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    cache_dir=".cache",
)

In [None]:
query = "space exploration and alien contact"
# embed() is consumed as an iterable of results for the given texts; a single
# text is passed in, so unpack the one resulting vector.
[query_vector] = embedder.embed([query])

In [None]:
# Nearest-neighbour search against the named vector "default", top 5 hits.
search_result = client.search(
    collection_name="movies",
    query_vector=("default", query_vector),
    limit=5,
)

# Print score and payload for each hit, followed by a separator line.
for hit in search_result:
    print(
        f"Score: {hit.score}",
        f"Payload: {hit.payload}",
        "------",
        sep="\n",
    )

In [None]:
from qdrant_client import models

# Every condition listed under `must` has to match. The name `filter` shadows
# the Python builtin, but it is kept because the search cell refers to it.
filter = models.Filter(
    must=[
        models.FieldCondition(key=field, match=models.MatchValue(value=wanted))
        for field, wanted in (
            ("genres", "Fantasy"),
            ("original_language", "English"),
        )
    ]
)

In [None]:
# Same top-5 search against the "default" vector, restricted by `filter`.
search_result = client.search(
    collection_name="movies",
    query_vector=("default", query_vector),
    query_filter=filter,
    limit=5,
)

# Print score and payload for each hit, followed by a separator line.
for hit in search_result:
    print(
        f"Score: {hit.score}",
        f"Payload: {hit.payload}",
        "------",
        sep="\n",
    )

In [None]:
from qdrant_client.models import PayloadSchemaType

# Arguments shared by all three index requests: each payload field below gets
# a "keyword" index on the "movies" collection.
index_args = dict(collection_name="movies", field_schema="keyword")

client.create_payload_index(field_name="genres", **index_args)
client.create_payload_index(field_name="original_language", **index_args)
client.create_payload_index(field_name="watch_providers", **index_args)