In [19]:
import os
import pandas as pd
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from pymongo import MongoClient
import pinecone
from pinecone import Pinecone as p1, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import Ollama

In [6]:
import os
# The Pinecone API key must come from the environment (or an interactive
# prompt), never from a literal saved in the notebook. Any key that has ever
# been stored in a shared notebook should be treated as leaked and rotated.
# The value is kept in os.environ because a later cell builds a
# PineconeVectorStore without passing a key explicitly (presumably it falls
# back to this variable).
if not os.environ.get("PINECONE_API_KEY"):
    from getpass import getpass

    os.environ["PINECONE_API_KEY"] = getpass("Pinecone API key: ")

# Pinecone client used to obtain the index handle in a later cell.
pc = p1(api_key=os.environ["PINECONE_API_KEY"])

In [7]:
# Name of the Pinecone index that holds the plot vectors; passed to pc.Index
# and PineconeVectorStore below.
INDEX_NAME = "mongo"
# Namespace inside that index used by the upsert and query calls below.
PINECONE_NAMESPACE = "real"

In [8]:
def _has_full_plot(example):
    """Return True when the example carries a non-null 'fullplot' value."""
    return example['fullplot'] is not None


# Load the movie dataset and keep only the training rows that have a full plot,
# since the plot text is what gets embedded later.
dataset = load_dataset("MongoDB/embedded_movies")
train_split = dataset['train']
filtered_dataset = train_split.filter(_has_full_plot)


In [9]:
# Sentence-embedding model used by generate_embeddings() to encode each
# movie's full plot.
model = SentenceTransformer('all-MiniLM-L6-v2')




In [10]:
def generate_embeddings(text):
    """Encode ``text`` with the module-level ``model`` and return the result as a plain list."""
    vector = model.encode(text)
    return vector.tolist()

In [11]:
def _with_plot_embedding(example):
    """Build the 'plot_embedding' column value from this example's full plot."""
    return {'plot_embedding': generate_embeddings(example['fullplot'])}


# Embed every remaining plot; the result is stored under 'plot_embedding'.
filtered_dataset = filtered_dataset.map(_with_plot_embedding)


Map: 100%|██████████| 1452/1452 [01:38<00:00, 14.74 examples/s]


In [12]:
# Materialise the filtered dataset as a pandas DataFrame; the ingestion cell
# below walks it row by row.
df = pd.DataFrame(filtered_dataset)

In [13]:
# MongoDB connection. The URI can be overridden with the MONGODB_URI
# environment variable (e.g. for a remote or authenticated deployment) so that
# no connection string has to be edited into the notebook; the default is the
# local server the notebook was written against.
MONGODB_URI = os.environ.get("MONGODB_URI", 'mongodb://localhost:27017/')
client = MongoClient(MONGODB_URI)
db = client['movie_db']
# Collection that receives one document per movie during ingestion and is
# queried by _id during retrieval.
collection = db['movies']

In [16]:
# Handle to the Pinecone index used for upserts and similarity queries.
# NOTE(review): no create_index call is visible in this notebook, so the index
# presumably has to exist already — confirm before running on a fresh account.
pinecone_index = pc.Index(INDEX_NAME)
# Query-side embedder. It appears to name the same MiniLM checkpoint as the
# SentenceTransformer used for the plots; query and stored vectors are only
# comparable while the two model names stay in sync.
# NOTE(review): the truncated `warn_deprecated(` output under this cell suggests
# one of these LangChain classes is deprecated in the installed version — TODO check.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# LangChain wrapper over the same index and namespace.
# NOTE(review): not referenced by any cell visible here; retrieval below queries
# pinecone_index directly.
vectorstore = PineconeVectorStore(index_name=INDEX_NAME, embedding=embeddings, namespace=PINECONE_NAMESPACE)

  warn_deprecated(


In [17]:
# Columns copied from each DataFrame row into the MongoDB document, in the
# order they are stored.
MOVIE_FIELDS = [
    "title",
    "runtime",
    "genres",
    "fullplot",
    "directors",
    "writers",
    "countries",
    "poster",
    "languages",
    "cast",
    "num_mflix_comments",
    "rated",
    "imdb",
    "awards",
    "type",
    "metacritic",
    "plot_embedding",
]
# Number of vectors sent to Pinecone per request. Batching replaces one
# network round trip per movie with one per batch.
UPSERT_BATCH_SIZE = 100

# NOTE: this cell is not idempotent. Every run inserts a fresh copy of each
# movie (with a new _id) into MongoDB and a matching new vector into Pinecone.
documents = [
    {field: row[field] for field in MOVIE_FIELDS}
    for _, row in df.iterrows()
]

if documents:  # insert_many() raises on an empty list
    # inserted_ids comes back in the same order as `documents`.
    inserted_ids = collection.insert_many(documents).inserted_ids

    for start in range(0, len(documents), UPSERT_BATCH_SIZE):
        stop = start + UPSERT_BATCH_SIZE
        pinecone_index.upsert(
            vectors=[
                {
                    # The vector id is the stringified MongoDB _id, which is
                    # how retrieval maps a match back to its document.
                    "id": str(mongo_id),
                    "values": document["plot_embedding"],
                    "metadata": {},
                }
                for mongo_id, document in zip(
                    inserted_ids[start:stop], documents[start:stop]
                )
            ],
            namespace=PINECONE_NAMESPACE,
        )

In [30]:
def retrieve_similar_movies(query_text, top_n=3):
    """Return the MongoDB movie documents most similar to ``query_text``.

    The query is embedded with ``embeddings``, the ``top_n`` nearest vectors
    are looked up in ``pinecone_index`` (namespace ``PINECONE_NAMESPACE``), and
    the corresponding documents are fetched from ``collection`` by ``_id``.

    Args:
        query_text: Free-text description of the kind of movie wanted.
        top_n: Maximum number of matches to request from Pinecone.

    Returns:
        A list of movie documents (dicts) in the order Pinecone returned the
        matches. Matches without a MongoDB document are omitted; an empty list
        is returned when there are no matches.

    NOTE(review): this notebook defines ``retrieve_similar_movies`` again in a
    later cell; once both cells have run, the later definition is the one in
    effect. Keep a single definition.
    """
    query_embedding = embeddings.embed_query(query_text)

    search_results = pinecone_index.query(
        top_k=top_n,
        vector=query_embedding,
        namespace=PINECONE_NAMESPACE,
    )

    # Vector ids are the stringified MongoDB _ids written at ingestion time.
    object_ids = [ObjectId(match['id']) for match in search_results['matches']]
    if not object_ids:
        return []

    # A find() with $in carries no ordering guarantee, so the similarity
    # ranking has to be restored explicitly after the fetch.
    rank = {object_id: position for position, object_id in enumerate(object_ids)}
    movies = list(collection.find({"_id": {"$in": object_ids}}))
    movies.sort(key=lambda movie: rank[movie["_id"]])
    return movies

In [33]:
from pymongo import MongoClient
from bson import ObjectId
from langchain.prompts import PromptTemplate


In [37]:
def retrieve_similar_movies(query_text, top_n=3):
    """Return the MongoDB movie documents most similar to ``query_text``.

    The query is embedded with ``embeddings``, the ``top_n`` nearest vectors
    are looked up in ``pinecone_index`` (namespace ``PINECONE_NAMESPACE``), and
    the corresponding documents are fetched from ``collection`` by ``_id``.

    Args:
        query_text: Free-text description of the kind of movie wanted.
        top_n: Maximum number of matches to request from Pinecone.

    Returns:
        A list of movie documents (dicts) in the order Pinecone returned the
        matches. Matches without a MongoDB document are omitted; an empty list
        is returned when there are no matches.

    NOTE(review): an earlier cell defines a function with this same name; this
    later definition is the one in effect once both cells have run. Keep a
    single definition.
    """
    query_embedding = embeddings.embed_query(query_text)

    search_results = pinecone_index.query(
        top_k=top_n,
        vector=query_embedding,
        namespace=PINECONE_NAMESPACE,
    )

    # Vector ids are the stringified MongoDB _ids written at ingestion time.
    object_ids = [ObjectId(match['id']) for match in search_results['matches']]
    if not object_ids:
        return []

    # A find() with $in carries no ordering guarantee, so the similarity
    # ranking has to be restored explicitly after the fetch.
    rank = {object_id: position for position, object_id in enumerate(object_ids)}
    movies = list(collection.find({"_id": {"$in": object_ids}}))
    movies.sort(key=lambda movie: rank[movie["_id"]])
    return movies

In [45]:
from langchain import PromptTemplate

# Prompt text for the recommendation step. {context} is filled with the
# retrieved movie titles and plots, {query} with the user's request.
RECOMMENDATION_TEMPLATE = """
    You are a movie recommendation assistant. Given the context and query below, provide a detailed response by recommending only the movies from the context that match the query. Do not suggest any movies that are not in the context.

    Context:
    {context}

    Query:
    {query}

    Response:
    """

prompt_template = PromptTemplate(
    template=RECOMMENDATION_TEMPLATE,
    input_variables=["context", "query"],
)


In [46]:
def generate_response(query_text):
    """Answer ``query_text`` with an LLM grounded in the retrieved movies.

    Retrieves similar movies via ``retrieve_similar_movies``, renders their
    titles and plots into ``prompt_template`` together with the query, and
    sends the single resulting prompt to an Ollama ``llama3`` model.

    Args:
        query_text: The user's free-text request.

    Returns:
        Whatever ``Ollama.generate`` returns for the one-prompt batch.
    """
    retrieved = retrieve_similar_movies(query_text)
    llm = Ollama(model="llama3")

    # One "Title/Plot" entry per retrieved movie, newline-separated.
    entries = []
    for movie in retrieved:
        entries.append(f"Title: {movie.get('title')}\nPlot: {movie.get('fullplot')}")
    context = "\n".join(entries)

    rendered_prompt = prompt_template.format(context=context, query=query_text)
    return llm.generate([rendered_prompt])



In [47]:
# Example request: run the retrieve-then-generate pipeline once.
query_text = "A thrilling adventure through space and time."
response = generate_response(query_text)

In [48]:
# generate() was called with a single prompt, so the answer is taken from the
# first generation of the first (only) prompt.
text_output = response.generations[0][0].text
print(text_output)

What an exciting query!

Based on the context, I highly recommend two movies that match your request:

1. **A.P.E.X.**: This movie fits perfectly with your query, as it involves a thrilling adventure through space and time. The plot revolves around a scientist who finds himself in an alternate timeline, where he must navigate a war-torn world filled with killer robots and find a way to prevent the disaster from occurring.
2. **Space Raiders**: Another great match! This movie is all about an exciting adventure when a 10-year-old boy is accidentally kidnapped by a spaceship filled with space pirates. It's a futuristic tale of confrontation and exploration that should satisfy your craving for a thrilling ride through space.

These two movies are the only ones in the context that fit your query, so I hope you enjoy them!
