In [1]:
# Step 1: Import libraries
import ast

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Step 2: Load the Kaggle dataset
# The path is relative to the notebook's working directory; adjust it if the
# CSV lives somewhere else.
movies = pd.read_csv(filepath_or_buffer="../data/movies_with_posters.csv")
# Preview the first five rows as the cell's rich output.
movies.head(5)


Unnamed: 0,movie_id,title,genres,img,plot
0,tt0000076,Exit of Rip and the Dwarf,['Drama'],/x3vnlvcBM6wBMbmgdzGLAkZYYpP.jpg,A series of short black and white films from d...
1,tt0000131,A Terrible Night,"['Comedy', 'Horror']",/7UDpBTe9mSwGeHUo5DZ2VmUWZMy.jpg,"A man tries to get a good night's sleep, but i..."
2,tt0000172,The X-Ray Fiend,"['Comedy', 'Horror']",/9W5EcN51j8eNrxqUwbQfoS6HUgZ.jpg,A romantic couple are transformed into skeleto...
3,tt0000248,A Kiss in the Tunnel,"['Comedy', 'Romance']",/lwdF4ih4E88Icfdk2nKTDMdUd3K.jpg,"Produced and directed by George Albert Smith, ..."
4,tt0000273,Attack on a China Mission,"['Action', 'Crime', 'Drama']",/2zzTE2p1tEAQmy9ppYP2Da10nQF.jpg,The titles tell us this film is based on an in...


In [3]:
# Step 3: Create poster URL column
# Every value in 'img' is a path fragment; prefix it with the image host so
# 'poster_url' holds a complete link.
poster_base = "https://image.tmdb.org/t/p/w500"
movies['poster_url'] = movies['img'].radd(poster_base)

In [4]:
# Step 4: Create tags for recommendation
# Combine plot + genres as tags
def _genres_to_text(raw):
    """Convert a stringified genre list into a space-separated string.

    The 'genres' column stores Python list literals as text, e.g.
    "['Comedy', 'Horror']". They are parsed with ast.literal_eval, which only
    accepts literals, instead of eval, which would execute arbitrary code
    embedded in the CSV. Non-string values (such as NaN) yield "".
    """
    if not isinstance(raw, str):
        return ""
    return " ".join(ast.literal_eval(raw))


genre_text = movies['genres'].apply(_genres_to_text)
# Missing plots become empty strings so the concatenation never produces NaN;
# lowercase for uniformity.
movies['tags'] = (movies['plot'].fillna('') + " " + genre_text).str.lower()

In [5]:
# Step 5: Convert tags into numerical vectors (TF-IDF)
tfidf = TfidfVectorizer(stop_words='english')
# Keep the result as the sparse matrix that fit_transform returns. Calling
# .toarray() here would materialise every zero entry; at the shape printed
# below that is on the order of 9 GB as float64. The only consumer,
# cosine_similarity, accepts sparse input directly.
vectors = tfidf.fit_transform(movies['tags'])
print("TF-IDF Vectors Shape:", vectors.shape)

TF-IDF Vectors Shape: (24171, 47674)


In [6]:
# Step 6: Compute cosine similarity
# Pairwise similarity of every movie's tag vector against every other one;
# the result is a square matrix with one row and one column per movie.
similarity = cosine_similarity(X=vectors)
print("Similarity matrix shape:", similarity.shape)

Similarity matrix shape: (24171, 24171)


  ret = a @ b
  ret = a @ b
  ret = a @ b


In [7]:
# Step 7: Recommendation function
def recommend(movie_name, top_n=5):
    """Return the titles and poster URLs of the movies most similar to one title.

    The lookup is an exact match against the global ``movies['title']`` column.
    The first matching row's *position* selects a row of the global
    ``similarity`` matrix, and the ``top_n`` highest-scoring other rows are
    returned in descending score order.

    Parameters
    ----------
    movie_name : str
        Title to look up (exact, case-sensitive match).
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    tuple[list, list]
        ``(titles, poster_urls)``; both lists are empty, and
        "Movie not found!" is printed, when the title is absent.
    """
    # Work with row positions, not index labels: `similarity` and `.iloc`
    # are positional, so a label taken from `.index` is only correct when the
    # frame happens to carry a default RangeIndex.
    matches = np.flatnonzero((movies['title'] == movie_name).to_numpy())
    if matches.size == 0:
        print("Movie not found!")
        return [], []

    position = int(matches[0])
    scores = np.asarray(similarity[position])

    # Stable descending order, so ties keep their original row order.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the query movie explicitly instead of assuming it sorts first;
    # another movie with an equal score could otherwise push it into the
    # results.
    ranked = ranked[ranked != position][:max(int(top_n), 0)]

    picks = movies.iloc[ranked]
    return picks['title'].tolist(), picks['poster_url'].tolist()

In [8]:
# Step 8: Test recommendation
# Swap in any title that exists in the dataset; an unknown title yields two
# empty lists, so the loop below prints nothing.
titles, posters = recommend("Avatar")
for title_and_poster in zip(titles, posters):
    print(*title_and_poster)

Movie not found!


In [10]:
# Summary statistics over every entry of the similarity matrix.
# ravel() returns a 1-D view when the memory layout allows it, whereas
# flatten() always allocates a second full copy of the N x N matrix.
sim_values = similarity.ravel()
print("Mean similarity:", np.mean(sim_values))
print("Max similarity:", np.max(sim_values))
print("Min similarity:", np.min(sim_values))

Mean similarity: 0.007938736346709503
Max similarity: 1.0000000000000009
Min similarity: 0.0


In [9]:
import pickle

# Persist the movie table and the similarity matrix for later reuse.
# The `with` blocks guarantee each file is flushed and closed even if
# pickling fails part-way; a bare open() passed to dump() leaves the handle
# open until garbage collection.
with open("../notebooks/movies.pkl", "wb") as movies_file:
    pickle.dump(movies, movies_file)
with open("../notebooks/similarity.pkl", "wb") as similarity_file:
    pickle.dump(similarity, similarity_file)
print("Pickle files saved successfully!")

Pickle files saved successfully!
