In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the cleaned movie metadata and the precomputed embedding matrix.
df = pd.read_csv("../data/cleaned_movies_30k.csv")
embeddings = np.load("../data/movie_embeddings.npy")

# The recommender pairs row i of `df` with row i of `embeddings`, so a length
# mismatch would silently produce wrong recommendations; fail loudly instead.
assert len(df) == len(embeddings), (
    f"metadata rows ({len(df)}) and embedding rows ({len(embeddings)}) differ"
)

In [3]:
# Display the dimensions of the metadata frame and of the embedding array side
# by side; the leading (row) counts are expected to match.
df.shape, embeddings.shape


((30000, 9), (30000, 384))

In [4]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_by_movie(title, df, embeddings, top_n=5, alpha=0.7):
    """Recommend movies similar to ``title``, blending similarity with rating.

    The query movie is located by a case-insensitive title lookup: an exact
    title match is preferred, otherwise the first title containing the query
    as a literal substring is used. Every other movie is then scored as
    ``alpha * cosine_similarity + (1 - alpha) * imdb_rating / 10``.

    Parameters
    ----------
    title : str
        Full or partial movie title. Matched literally (not as a regex).
    df : pandas.DataFrame
        Movie metadata with at least the columns ``title``, ``genres`` and
        ``imdb_rating``. Row ``i`` must describe the same movie as
        ``embeddings[i]``; the index labels of ``df`` are irrelevant.
    embeddings : numpy.ndarray
        2-D array with one embedding row per row of ``df``.
    top_n : int, default 5
        Maximum number of recommendations to return.
    alpha : float, default 0.7
        Weight of the similarity term; ``1 - alpha`` weights the rating term.

    Returns
    -------
    pandas.DataFrame or str
        A frame with columns ``Rank``, ``title``, ``genres``, ``imdb_rating``
        holding at most ``top_n`` rows (fewer when ``df`` is small), or the
        string ``"Movie not found"`` when no title matches.
    """
    query = title.lower().strip()
    lowered_titles = df["title"].str.lower()

    # Prefer an exact title so that e.g. "alien" does not resolve to "Aliens".
    exact_hits = (lowered_titles == query).to_numpy()
    if exact_hits.any():
        hits = exact_hits
    else:
        # regex=False: titles/queries often contain regex metacharacters
        # such as "(", "?" or "+". na=False: missing titles never match.
        hits = lowered_titles.str.contains(query, regex=False, na=False).to_numpy()

    if not hits.any():
        return "Movie not found"

    # Positional offset of the first hit (argmax of a boolean array). Index
    # labels are deliberately not used: they need not be positions into
    # `embeddings` when `df` has been filtered or re-indexed.
    query_pos = int(hits.argmax())

    movie_vec = embeddings[query_pos].reshape(1, -1)
    similarity_scores = cosine_similarity(movie_vec, embeddings)[0]

    # Work on a positionally indexed copy so the caller's frame is untouched
    # and row positions line up with the similarity array.
    scored = df.reset_index(drop=True)
    # IMDb ratings are divided by 10 to bring them onto a scale comparable
    # with the similarity term before blending.
    scored["final_score"] = (
        alpha * similarity_scores + (1 - alpha) * scored["imdb_rating"] / 10
    )

    recommendations = (
        scored
        # Remove the query movie explicitly: once ratings are blended in it
        # is not guaranteed to be the top-scoring row.
        .drop(index=query_pos)
        .sort_values("final_score", ascending=False)
        .iloc[:max(top_n, 0)]
        .reset_index(drop=True)
    )

    # Rank by the number of rows actually returned, which may be < top_n.
    recommendations.insert(0, "Rank", range(1, len(recommendations) + 1))

    return recommendations[["Rank", "title", "genres", "imdb_rating"]]


In [18]:
# Example: recommendations for the first title matching "Interstellar",
# using the default top_n (5) and alpha (0.7).
recommend_by_movie("Interstellar",df,embeddings)

Unnamed: 0,Rank,title,genres,imdb_rating
0,1,Prometheus,"Science Fiction, Adventure, Mystery",7.0
1,2,Star Trek Beyond,"Action, Adventure, Science Fiction",7.0
2,3,Things to Come,"Drama, Science Fiction",6.6
3,4,Passengers,"Drama, Romance, Science Fiction",7.0
4,5,The Carpenters... Space Encounters,"Comedy, Music, Science Fiction, TV Movie",7.2
