In [4]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import numpy as np

# Load the MongoDB/embedded_movies dataset.
dataset = load_dataset("MongoDB/embedded_movies")

# Keep only the rows of the "train" split whose full plot is present,
# since the plot text is what gets embedded.
filtered_dataset = dataset["train"].filter(
    lambda row: row["fullplot"] is not None
)

# Sentence-embedding model used for both the stored plots and the queries.
model = SentenceTransformer("all-MiniLM-L6-v2")

def generate_embeddings(text):
    """Encode ``text`` with the module-level ``model``.

    The encoder output is converted with ``.tolist()`` so the result is made
    of plain Python lists/floats rather than an array object.
    """
    encoded = model.encode(text)
    return encoded.tolist()

# Embed the plots in batches: with batched=True the callback receives a list
# of plots, so the model encodes many texts per call instead of being invoked
# once per row. generate_embeddings() then returns one vector per plot, which
# map() spreads back over the rows of the batch.
filtered_dataset = filtered_dataset.map(
    lambda batch: {'embedding': generate_embeddings(batch['fullplot'])},
    batched=True,
    batch_size=64,
)


Filter: 100%|██████████| 1500/1500 [00:00<00:00, 1611.01 examples/s]
  StockPickler.save(self, obj, save_persistent_id)
  StockPickler.save(self, obj, save_persistent_id)
  StockPickler.save(self, obj, save_persistent_id)
  StockPickler.save(self, obj, save_persistent_id)
  StockPickler.save(self, obj, save_persistent_id)
  StockPickler.save(self, obj, save_persistent_id)
Map: 100%|██████████| 1452/1452 [01:30<00:00, 16.06 examples/s]


In [6]:
import pandas as pd
from IPython.display import display

# Materialise the embedded dataset as a DataFrame (one row per movie).
df = pd.DataFrame(data=filtered_dataset)

# Preview the first five rows (five is also head()'s default).
display(df.head(n=5))


Unnamed: 0,plot,runtime,genres,fullplot,directors,writers,countries,poster,languages,cast,title,num_mflix_comments,rated,imdb,awards,type,metacritic,plot_embedding,embedding
0,Young Pauline is left a lot of money when her ...,199.0,[Action],Young Pauline is left a lot of money when her ...,"[Louis J. Gasnier, Donald MacKenzie]","[Charles W. Goddard (screenplay), Basil Dickey...",[USA],https://m.media-amazon.com/images/M/MV5BMzgxOD...,[English],"[Pearl White, Crane Wilbur, Paul Panzer, Edwar...",The Perils of Pauline,0,,"{'id': 4465, 'rating': 7.6, 'votes': 744}","{'nominations': 0, 'text': '1 win.', 'wins': 1}",movie,,"[0.0007293965299999999, -0.026834568000000003,...","[-0.06366507709026337, 0.05893127992749214, -0..."
1,A penniless young man tries to save an heiress...,22.0,"[Comedy, Short, Action]",As a penniless man worries about how he will m...,"[Alfred J. Goulding, Hal Roach]",[H.M. Walker (titles)],[USA],https://m.media-amazon.com/images/M/MV5BNzE1OW...,[English],"[Harold Lloyd, Mildred Davis, 'Snub' Pollard, ...",From Hand to Mouth,0,TV-G,"{'id': 10146, 'rating': 7.0, 'votes': 639}","{'nominations': 1, 'text': '1 nomination.', 'w...",movie,,"[-0.022837115, -0.022941574000000003, 0.014937...","[-0.047600891441106796, -0.008872508071362972,..."
2,"Michael ""Beau"" Geste leaves England in disgrac...",101.0,"[Action, Adventure, Drama]","Michael ""Beau"" Geste leaves England in disgrac...",[Herbert Brenon],"[Herbert Brenon (adaptation), John Russell (ad...",[USA],,[English],"[Ronald Colman, Neil Hamilton, Ralph Forbes, A...",Beau Geste,0,,"{'id': 16634, 'rating': 6.9, 'votes': 222}","{'nominations': 0, 'text': '1 win.', 'wins': 1}",movie,,"[0.00023330492999999998, -0.028511643000000003...","[0.022996382787823677, 0.1080184355378151, -0...."
3,"Seeking revenge, an athletic young man joins t...",88.0,"[Adventure, Action]",A nobleman vows to avenge the death of his fat...,[Albert Parker],"[Douglas Fairbanks (story), Jack Cunningham (a...",[USA],https://m.media-amazon.com/images/M/MV5BMzU0ND...,,"[Billie Dove, Tempe Pigott, Donald Crisp, Sam ...",The Black Pirate,1,,"{'id': 16654, 'rating': 7.2, 'votes': 1146}","{'nominations': 0, 'text': '1 win.', 'wins': 1}",movie,,"[-0.005927917, -0.033394486, 0.0015323418, -0....","[-0.07819890975952148, 0.11125773936510086, -0..."
4,An irresponsible young millionaire changes his...,58.0,"[Action, Comedy, Romance]","The Uptown Boy, J. Harold Manners (Lloyd) is a...",[Sam Taylor],"[Ted Wilde (story), John Grey (story), Clyde B...",[USA],https://m.media-amazon.com/images/M/MV5BMTcxMT...,[English],"[Harold Lloyd, Jobyna Ralston, Noah Young, Jim...",For Heaven's Sake,0,PASSED,"{'id': 16895, 'rating': 7.6, 'votes': 918}","{'nominations': 1, 'text': '1 nomination.', 'w...",movie,,"[-0.0059373598, -0.026604708, -0.0070914757000...","[-0.014855039305984974, 0.0959319919347763, -0..."


In [9]:
from pymongo import MongoClient

# Connect to the MongoDB server on localhost and pick the database and
# collection that hold the movie documents and their embeddings.
client = MongoClient(host="mongodb://localhost:27017/")
db = client.get_database("movie_database")
collection = db.get_collection("movie_embeddings")

def store_embeddings_to_mongo(dataset, target_collection=None):
    """Insert one MongoDB document per movie record.

    Args:
        dataset: Iterable of dict-like movie records (anything with ``.get``),
            e.g. ``df.to_dict(orient="records")``.
        target_collection: Optional object exposing ``insert_many``. When
            omitted, the module-level ``collection`` is used.

    Returns:
        None.

    Notes:
        * The genre list is read from the ``"genres"`` key (falling back to
          ``"genre"``) and is still stored under ``"genre"``, so the stored
          document layout is unchanged.
        * A field that is absent from a record is stored as ``None``.
        * Top-level float NaN values are stored as ``None``. Nested values are
          left untouched.
    """
    stored_fields = (
        "title", "fullplot", "year", "genre", "runtime", "directors",
        "writers", "countries", "languages", "cast", "num_mflix_comments",
        "rated", "imdb", "awards", "type", "metacritic", "poster", "embedding",
    )

    def _lookup(item, field):
        # Records produced from the dataset use the plural key "genres".
        if field == "genre":
            return item.get("genres", item.get("genre"))
        return item.get(field)

    def _nan_to_none(value):
        # NaN is the only float that compares unequal to itself. It usually
        # marks a missing numeric cell in records exported from a DataFrame.
        if isinstance(value, float) and value != value:
            return None
        return value

    documents = [
        {field: _nan_to_none(_lookup(item, field)) for field in stored_fields}
        for item in dataset
    ]

    # insert_many() rejects an empty batch, so there is nothing to do.
    if not documents:
        return

    target = collection if target_collection is None else target_collection
    # One bulk insert instead of one server round-trip per document.
    target.insert_many(documents)

# Turn the DataFrame into one dict per row and write them all to MongoDB.
store_embeddings_to_mongo(dataset=df.to_dict(orient="records"))

print("Data successfully inserted into MongoDB!")

Data successfully inserted into MongoDB!


In [13]:
def find_similar_movies(query_text, top_k=4):
    """Return the ``top_k`` stored movies that score highest against ``query_text``.

    The query is encoded with the module-level ``model``. An aggregation on the
    module-level ``collection`` then adds a ``similarity`` field to every
    document: the sum over all positions of (stored ``embedding`` element times
    query-vector element), i.e. a dot product. Documents are sorted by that
    field in descending order, cut to ``top_k`` and projected to ``title``,
    ``fullplot`` and ``similarity``.

    Args:
        query_text: Text to search for.
        top_k: Maximum number of documents to return.

    Returns:
        A list with the documents produced by the aggregation.
    """
    query_embedding = model.encode(query_text).tolist()

    # Product of the two vectors' components at the current index ($$this).
    component_product = {
        "$multiply": [
            {"$arrayElemAt": ["$$embedding_vector", "$$this"]},
            {"$arrayElemAt": ["$$query_vector", "$$this"]}
        ]
    }

    # Walk the indices 0 .. size(embedding) - 1, adding each product to the
    # running total ($$value), which starts at 0.
    dot_product = {
        "$reduce": {
            "input": {
                "$range": [0, {"$size": "$$embedding_vector"}]
            },
            "initialValue": 0,
            "in": {"$add": ["$$value", component_product]}
        }
    }

    # Bind the stored vector and the query vector to variables for the fold.
    score_stage = {
        "$addFields": {
            "similarity": {
                "$let": {
                    "vars": {
                        "embedding_vector": "$embedding",
                        "query_vector": query_embedding
                    },
                    "in": dot_product
                }
            }
        }
    }
    sort_stage = {"$sort": {"similarity": -1}}
    limit_stage = {"$limit": top_k}
    project_stage = {
        "$project": {
            "title": 1,
            "fullplot": 1,
            "similarity": 1
        }
    }

    stages = [score_stage, sort_stage, limit_stage, project_stage]
    return list(collection.aggregate(stages))




Title: The War of the Worlds, Similarity: 0.5173340385577094
Full Plot: H.G. Well's classic novel is brought to life is this tale of alien invasion. The residents of a small town in California are excited when a flaming meteor lands in the hills. Their joy is tempered somewhat when they discover that it has passengers who are not very friendly. The movie itself is understood better when you consider that it was made at the height of the Cold War--just replace Martian with Russian....

Title: Red Planet, Similarity: 0.4236816560884935
Full Plot: In the near future, Earth is dying. A new colony on Mars could be humanity's only hope. A team of American astronauts, each a specialist in a different field, is making the first manned expedition to the red planet and must struggle to overcome the differences in their personalities, backgrounds and ideologies for the overall good of the mission. When their equipment suffers life-threatening damage and the crew must depend on one another for sur

In [14]:
# Free-text description of the kind of movie to look for.
query = "A story about space exploration and alien contact."
similar_movies = find_similar_movies(query_text=query)

# Print every hit with its similarity score, followed by its full plot.
for movie in similar_movies:
    print(f"Title: {movie['title']}, Similarity: {movie['similarity']}")
    print(f"Full Plot: {movie['fullplot']}\n")

Title: The War of the Worlds, Similarity: 0.5173340385577094
Full Plot: H.G. Well's classic novel is brought to life is this tale of alien invasion. The residents of a small town in California are excited when a flaming meteor lands in the hills. Their joy is tempered somewhat when they discover that it has passengers who are not very friendly. The movie itself is understood better when you consider that it was made at the height of the Cold War--just replace Martian with Russian....

Title: Red Planet, Similarity: 0.4236816560884935
Full Plot: In the near future, Earth is dying. A new colony on Mars could be humanity's only hope. A team of American astronauts, each a specialist in a different field, is making the first manned expedition to the red planet and must struggle to overcome the differences in their personalities, backgrounds and ideologies for the overall good of the mission. When their equipment suffers life-threatening damage and the crew must depend on one another for sur

In [18]:
from langchain_community.llms import Ollama
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
# Llama 3, accessed through LangChain's Ollama wrapper.
llm = Ollama(model="llama3")

# Prompt with two placeholders: the user's query and the retrieved documents.
prompt_template = PromptTemplate(
    input_variables=["query", "documents"],
    template="I give you multiple documents from my knowledge base. Combine and explain the following documents related to the query: '{query}'. Provide a comprehensive analysis.\n\nDocuments:\n{documents}. Give only names and some explanation if movie is relateable to my query",
)

# Chain that fills the prompt and sends it to the model.
llm_chain = LLMChain(prompt=prompt_template, llm=llm)

In [19]:
def generate_movie_response(query):
    """Answer ``query`` with the LLM, using the most similar stored movies as context.

    Args:
        query: Free-text question. It is used to retrieve movies through
            ``find_similar_movies`` and is also passed to the prompt as the
            ``query`` variable.

    Returns:
        The text generated by ``llm_chain``.
    """
    similar_movies = find_similar_movies(query)

    # Hits without a usable plot (missing key, None or empty string) are left
    # out of the context passed to the model.
    documents_text = "\n\n".join(
        f"Title: {movie['title']}\nDescription: {movie['fullplot']}"
        for movie in similar_movies
        if movie.get("fullplot")
    )

    # Chain.run() is deprecated in LangChain in favour of invoke(). invoke()
    # returns a dict, and the generated text sits under the chain's output key.
    result = llm_chain.invoke({
        "query": query,
        "documents": documents_text
    })

    return result[llm_chain.output_key]

# Ask the retrieval + LLM pipeline a sample question and show the answer.
response = generate_movie_response(
    query="What are some good movies about space exploration?"
)
print("Response from Llama 3:", response)

  warn_deprecated(
