In [24]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
# Load dataset
url = "movies.csv"
df = pd.read_csv(url)

# Display first few rows
print(df.head())

# Check columns available
print(df.columns)


   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
Index(['movieId', 'title', 'genres'], dtype='object')


In [25]:
# Normalize column names to lowercase
df.columns = df.columns.str.strip().str.lower()

# Fill missing values in 'genres'
df["genres"].fillna("", inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["genres"].fillna("", inplace=True)


In [26]:
vectorizer = TfidfVectorizer(stop_words="english")
genre_matrix = vectorizer.fit_transform(df["genres"])


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert genres to TF-IDF features
vectorizer = TfidfVectorizer(stop_words="english")
genre_matrix = vectorizer.fit_transform(df["genres"])

print("Genre feature matrix shape:", genre_matrix.shape)


Genre feature matrix shape: (62423, 23)


In [28]:
def recommend_movies(movie_title, num_recommendations=5):
    movie_title = movie_title.lower()

    # Find the movie index
    movie_index = df[df["title"].str.lower() == movie_title].index

    if len(movie_index) == 0:
        return "Movie not found! Please try another title."

    movie_index = movie_index[0]

    # Compute similarity only for the given movie
    movie_vector = genre_matrix[movie_index]  # Get the vector for the given movie
    similarity_scores = cosine_similarity(movie_vector, genre_matrix)[0]  # Compute similarity for only this movie

    # Sort by highest similarity (excluding itself)
    sorted_indices = np.argsort(similarity_scores)[::-1][1:num_recommendations+1]

    # Get recommended movie titles
    recommended_movies = df.iloc[sorted_indices]["title"].tolist()

    return recommended_movies

In [29]:
movie_name = "The Dark Knight"
recommended_movies = recommend_movies(movie_name)

print(f"Movies similar to '{movie_name}':")
for idx, movie in enumerate(recommended_movies, start=1):
    print(f"{idx}. {movie}")

Movies similar to 'The Dark Knight':
1. M
2. o
3. v
4. i
5. e
6.  
7. n
8. o
9. t
10.  
11. f
12. o
13. u
14. n
15. d
16. !
17.  
18. P
19. l
20. e
21. a
22. s
23. e
24.  
25. t
26. r
27. y
28.  
29. a
30. n
31. o
32. t
33. h
34. e
35. r
36.  
37. t
38. i
39. t
40. l
41. e
42. .
