In [32]:
import numpy as np
import pandas as pd

In [33]:
df=pd.read_csv("../data/cleaned_movies.csv")

In [34]:
df.shape

(278756, 9)

In [35]:
df.head()

Unnamed: 0,id,title,vote_average,overview,popularity,genres,spoken_languages,imdb_rating,combined_text
0,3,Shadows in Paradise,7.3,"Nikander, a rubbish collector and would-be ent...",2.4839,"Comedy, Drama, Romance","svenska, suomi, English",7.4,"nikander, a rubbish collector and would-be ent..."
1,5,Four Rooms,5.876,It's Ted the Bellhop's first night on the job....,4.2181,Comedy,English,6.7,it's ted the bellhop's first night on the job....
2,6,Judgment Night,6.5,"Four young friends, while taking a shortcut en...",4.273,"Action, Crime, Thriller",English,6.6,"four young friends, while taking a shortcut en..."
3,8,Life in Loops (A Megacities RMX),7.2,Timo Novotny labels his new project an experim...,2.2585,Documentary,"English, हिन्दी, 日本語, Pусский, Español",8.1,timo novotny labels his new project an experim...
4,11,Star Wars,8.204,Princess Leia is captured and held hostage by ...,14.7943,"Adventure, Action, Science Fiction",English,8.6,princess leia is captured and held hostage by ...


In [36]:
df["genres"] = df["genres"].fillna("")

In [37]:
df.isnull().sum()

id                  0
title               0
vote_average        0
overview            0
popularity          0
genres              0
spoken_languages    0
imdb_rating         0
combined_text       0
dtype: int64

In [38]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")


In [39]:
df = df[
    (df["imdb_rating"] >= 5) &
    (df["spoken_languages"].str.contains("English|Hindi|हिन्दी|हिंदी", regex=True))
]

In [40]:
df = df.sort_values("popularity", ascending=False).head(30000)
df = df.reset_index(drop=True)

In [41]:
df.shape

(30000, 9)

In [42]:
embeddings = model.encode(
    df["combined_text"].tolist(),
    batch_size=32,
    show_progress_bar=True
)

embeddings.shape

Batches:   0%|          | 0/938 [00:00<?, ?it/s]

(30000, 384)

In [43]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_movies(movie_title, df, embeddings, top_k=5):
    # find index of movie
    idx = df[df["title"].str.lower() == movie_title.lower()].index
    
    if len(idx) == 0:
        return "Movie not found"
    
    idx = idx[0]

    # compute similarity ONLY with one vector
    sim_scores = cosine_similarity(
        embeddings[idx].reshape(1, -1),
        embeddings
    )[0]

    # get top movies
    similar_indices = sim_scores.argsort()[-top_k-1:-1][::-1]

    return df.iloc[similar_indices][["title", "genres", "vote_average"]]


In [44]:
recommend_movies("Inception", df, similarity_matrix)

Unnamed: 0,title,genres,vote_average
5321,Double Jeopardy,"Drama, Thriller, Mystery",6.621
5512,American Me,"Crime, Drama",7.4
3984,Catch Us If You Can,"Comedy, Music",5.2
6016,Billy Preston: That's the Way God Planned It,Documentary,0.0
1722,The Santa Clause,"Fantasy, Drama, Comedy, Family",6.513


In [45]:
recommend_movies("Dangal", df, similarity_matrix)

Unnamed: 0,title,genres,vote_average
1698,Monty Python's The Meaning of Life,Comedy,7.3
2921,The Perfect Snob,"Romance, Comedy",6.3
2412,Chronicle,"Science Fiction, Drama, Thriller",6.838
800,Audrie & Daisy,Documentary,7.142
3016,Monsters In The Afternoon Programming,,0.0


In [48]:
np.save("../data/movie_embeddings.npy", embeddings)
df.to_csv("../data/cleaned_movies_30k.csv", index=False)