In [19]:
import os
import pandas as pd
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from pymongo import MongoClient
import pinecone
from pinecone import Pinecone as p1, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import Ollama

In [6]:
import os
from getpass import getpass

# SECURITY: a literal Pinecone API key used to be hardcoded in this cell. A key
# that has been saved in a notebook must be treated as leaked -- revoke/rotate it.
# The key is now taken from the PINECONE_API_KEY environment variable, and the
# user is prompted only when that variable is missing or empty. The value is
# written back to os.environ because later cells build Pinecone-backed objects
# without passing a key explicitly (presumably they read it from the environment).
if not os.environ.get("PINECONE_API_KEY"):
    os.environ["PINECONE_API_KEY"] = getpass("Pinecone API key: ")

pc = p1(api_key=os.environ["PINECONE_API_KEY"])

In [7]:
# Name of the Pinecone index that this notebook opens via pc.Index(...).
INDEX_NAME = "mongo"
# Namespace inside that index used for every upsert and query in this notebook.
PINECONE_NAMESPACE = "real"

In [8]:
def _has_fullplot(example):
    """Tell whether a record carries a 'fullplot' value (i.e. it is not None)."""
    return example['fullplot'] is not None


# Load the movie dataset and keep only the 'train' records that have a full plot.
dataset = load_dataset("MongoDB/embedded_movies")
filtered_dataset = dataset['train'].filter(_has_fullplot)


In [9]:
# Sentence-transformers model used by generate_embeddings() to encode plot text.
model = SentenceTransformer('all-MiniLM-L6-v2')




In [10]:
def generate_embeddings(text):
    """Encode ``text`` with the module-level ``model`` and return ``.tolist()`` of the result.

    Args:
        text: The input handed straight to ``model.encode``.

    Returns:
        The encoded output converted with ``.tolist()``.
    """
    encoded = model.encode(text)
    return encoded.tolist()

In [11]:
def _attach_plot_embedding(example):
    """Build the extra 'plot_embedding' column from the record's 'fullplot' text."""
    return {'plot_embedding': generate_embeddings(example['fullplot'])}


# Adds a 'plot_embedding' entry to every record; the mapper runs once per record.
filtered_dataset = filtered_dataset.map(_attach_plot_embedding)


Map: 100%|██████████| 1452/1452 [01:38<00:00, 14.74 examples/s]


In [12]:
# Materialize the embedded dataset as a DataFrame; the ingest cell iterates it row by row.
df = pd.DataFrame(filtered_dataset)

In [13]:
# The connection string can be overridden with the MONGODB_URI environment
# variable; when it is not set the notebook keeps using the local default.
MONGODB_URI = os.environ.get('MONGODB_URI', 'mongodb://localhost:27017/')

client = MongoClient(MONGODB_URI)
db = client['movie_db']
# Collection that stores one document per movie (written by the ingest cell,
# read back by retrieve_similar_movies).
collection = db['movies']

In [16]:
# Embedder used for query text in retrieve_similar_movies().
# NOTE(review): intended to be the same MiniLM model that encoded the stored
# plots -- confirm that both model identifiers resolve to the same weights.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Direct handle on the Pinecone index, used for the raw upsert/query calls.
pinecone_index = pc.Index(INDEX_NAME)

# LangChain vector-store wrapper configured with the same index name and namespace.
vectorstore = PineconeVectorStore(
    index_name=INDEX_NAME,
    embedding=embeddings,
    namespace=PINECONE_NAMESPACE,
)

  warn_deprecated(


In [17]:
# Columns copied verbatim from each DataFrame row into its MongoDB document.
DOCUMENT_FIELDS = [
    "title",
    "runtime",
    "genres",
    "fullplot",
    "directors",
    "writers",
    "countries",
    "poster",
    "languages",
    "cast",
    "num_mflix_comments",
    "rated",
    "imdb",
    "awards",
    "type",
    "metacritic",
    "plot_embedding",
]

# Number of movies written per round trip. Batching replaces one MongoDB insert
# and one Pinecone upsert per row with one of each per batch.
INGEST_BATCH_SIZE = 100


def _store_batch(documents):
    """Insert ``documents`` into MongoDB, then upsert their embeddings into Pinecone.

    Each Pinecone vector id is the string form of the ``_id`` MongoDB assigned
    to the matching document, which is how retrieve_similar_movies() maps a
    match back to its document. Does nothing for an empty batch.
    """
    if not documents:
        return

    # inserted_ids is in the same order as the documents that were passed in.
    inserted_ids = collection.insert_many(documents).inserted_ids

    pinecone_index.upsert(
        vectors=[
            {
                "id": str(mongo_id),
                "values": document["plot_embedding"],
                "metadata": {},
            }
            for mongo_id, document in zip(inserted_ids, documents)
        ],
        namespace=PINECONE_NAMESPACE,
    )


# NOTE: this cell is not idempotent -- every run inserts fresh documents (with
# newly generated ids) and upserts new vectors, so re-running it duplicates data.
pending_documents = []
for _, row in df.iterrows():
    pending_documents.append({field: row[field] for field in DOCUMENT_FIELDS})
    if len(pending_documents) >= INGEST_BATCH_SIZE:
        _store_batch(pending_documents)
        pending_documents = []

# Flush whatever is left over after the last full batch.
_store_batch(pending_documents)

In [30]:
def retrieve_similar_movies(query_text, top_n=3):
    """Return the MongoDB movie documents most similar to ``query_text``.

    NOTE: a later cell in this notebook defines ``retrieve_similar_movies``
    again; after a top-to-bottom run that later definition is the one in effect.

    Args:
        query_text: Free-text query; embedded with the module-level ``embeddings``.
        top_n: Number of nearest neighbours requested from Pinecone.

    Returns:
        A list of MongoDB documents in the same order as the Pinecone matches.
        Matches whose id has no document in the collection are skipped.
    """
    query_embedding = embeddings.embed_query(query_text)

    # Search Pinecone for similar embeddings
    search_results = pinecone_index.query(
        top_k=top_n,
        vector=query_embedding,
        namespace=PINECONE_NAMESPACE
    )

    # Pinecone ids are the string form of the MongoDB _id (see the ingest cell).
    object_ids = [ObjectId(match['id']) for match in search_results['matches']]
    if not object_ids:
        return []

    # An "$in" query does not return documents in the order of the id list, so
    # index the results by _id and re-apply the ranking Pinecone produced.
    documents_by_id = {
        document['_id']: document
        for document in collection.find({"_id": {"$in": object_ids}})
    }
    return [
        documents_by_id[object_id]
        for object_id in object_ids
        if object_id in documents_by_id
    ]

In [33]:
from pymongo import MongoClient
from bson import ObjectId
from langchain.prompts import PromptTemplate


In [37]:
def retrieve_similar_movies(query_text, top_n=3):
    """Return the MongoDB movie documents most similar to ``query_text``.

    Args:
        query_text: Free-text query; embedded with the module-level ``embeddings``.
        top_n: Number of nearest neighbours requested from Pinecone.

    Returns:
        A list of MongoDB documents in the same order as the Pinecone matches.
        Matches whose id has no document in the collection are skipped.
    """
    query_embedding = embeddings.embed_query(query_text)

    search_results = pinecone_index.query(
        top_k=top_n,
        vector=query_embedding,
        namespace=PINECONE_NAMESPACE
    )

    # Pinecone ids are the string form of the MongoDB _id (see the ingest cell).
    object_ids = [ObjectId(match['id']) for match in search_results['matches']]
    if not object_ids:
        return []

    # An "$in" query does not return documents in the order of the id list, so
    # index the results by _id and re-apply the ranking Pinecone produced.
    documents_by_id = {
        document['_id']: document
        for document in collection.find({"_id": {"$in": object_ids}})
    }
    return [
        documents_by_id[object_id]
        for object_id in object_ids
        if object_id in documents_by_id
    ]

In [38]:
# Raw prompt text. The {context} and {query} placeholders are filled in when
# generate_response() calls prompt_template.format(...).
_PROMPT_TEXT = """
    You are a movie recommendation assistant. Given the following context and query, provide a detailed response:

    Context:
    {context}

    Query:
    {query}

    Response:
    """

prompt_template = PromptTemplate(
    template=_PROMPT_TEXT,
    input_variables=["context", "query"],
)



In [39]:
def generate_response(query_text, top_n=3, model_name="llama3"):
    """Answer ``query_text`` with an Ollama model, grounded on similar movies.

    The titles and full plots of the movies returned by
    ``retrieve_similar_movies`` are joined into a context block, inserted into
    ``prompt_template`` together with the query, and sent to the model.

    Args:
        query_text: The user's request.
        top_n: How many similar movies to retrieve for the context (default 3).
        model_name: Ollama model to use (default "llama3").

    Returns:
        Whatever ``Ollama.generate`` returns for the single prompt; the notebook
        reads the text from ``.generations[0][0].text``.
    """
    similar_movies = retrieve_similar_movies(query_text, top_n=top_n)

    # One "Title/Plot" entry per retrieved movie, separated by newlines.
    context = "\n".join(
        f"Title: {movie.get('title')}\nPlot: {movie.get('fullplot')}"
        for movie in similar_movies
    )
    prompt = prompt_template.format(context=context, query=query_text)

    llama = Ollama(model=model_name)
    return llama.generate([prompt])



In [40]:
# Example query run through the full retrieve-then-generate pipeline.
query_text = "A thrilling adventure through space and time."
response = generate_response(query_text)


In [36]:
# generate_response() sends a single prompt, so take the first generation of
# the first (only) prompt and print its text.
text_output = response.generations[0][0].text
print(text_output)


What an exciting query! If you're looking for a thrilling adventure that combines elements of space travel, time manipulation, and action-packed excitement, I'd highly recommend the following movies:

**1. Interstellar (2014)**: Directed by Christopher Nolan, this visually stunning film explores the possibilities of wormhole travel and the consequences of playing with time. The movie follows Cooper, a former NASA pilot, as he embarks on a mission to find a new home for humanity. With its mind-bending concepts and breathtaking visuals, Interstellar is sure to thrill you.

**2. Edge of Tomorrow (2014)**: This sci-fi action film stars Tom Cruise as Major William Cage, a soldier who finds himself reliving the same day over and over again in a battle against an alien invasion. As he tries to prevent the apocalypse, Cage must confront his own mortality and the consequences of time travel.

**3. Looper (2012)**: Bruce Willis and Joseph Gordon-Levitt star in this thought-provoking thriller abo