In [1]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd

# Load the movie catalogue from CSV; each row carries a movieId, a title and
# a pipe-separated list of genres.
movies = pd.read_csv("content_movie_recommender/movies.csv")
# Preview the first rows to sanity-check the load.
movies.head()


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
# Check column dtypes and non-null counts before building text features.
movies.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [4]:
# Build the text that represents each movie: its title followed by its genres.
#  - '(no genres listed)' is a placeholder, so it is blanked out instead of
#    being fed to the vectorizer as if it were a genre.
#  - Genres are pipe-separated; the pipes are turned into spaces so the genre
#    names become separate words in the text.
# The second lambda sees the already-cleaned 'genres' column.
movies = movies.assign(
    genres=lambda frame: frame['genres'].replace('(no genres listed)', ''),
    content=lambda frame: (
        frame['title'] + " " + frame['genres'].str.replace('|', ' ', regex=False)
    ),
)

movies[['title', 'genres', 'content']].head()


Unnamed: 0,title,genres,content
0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995) Adventure Animation Children ...
1,Jumanji (1995),Adventure|Children|Fantasy,Jumanji (1995) Adventure Children Fantasy
2,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men (1995) Comedy Romance
3,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale (1995) Comedy Drama Romance
4,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II (1995) Comedy


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Turn each movie's 'content' text into a TF-IDF vector, ignoring common
# English stop words.
tfidf = TfidfVectorizer(stop_words='english')
# Learn the vocabulary from the content column and vectorise it in one step;
# the result has one row per movie and one column per vocabulary term.
tfidf_matrix = tfidf.fit_transform(movies['content'])

tfidf_matrix.shape


(9742, 9058)

In [6]:
# Pairwise cosine similarity between the TF-IDF vectors of all movies.
# With a single matrix argument the rows are compared against themselves,
# so entry [i, j] is the similarity between movie i and movie j.
cosine_sim = cosine_similarity(tfidf_matrix)
cosine_sim.shape


(9742, 9742)

In [7]:
# Moves the current index into an 'index' column and leaves a fresh 0..n-1
# row index, so row labels match the row positions used by cosine_sim.
movies = movies.reset_index()

# Lookup table: title -> row number.
# Series.drop_duplicates() compares *values* (the row numbers, which are all
# unique), so it never removed repeated titles. De-duplicate on the index
# labels instead, keeping the first occurrence, so indices[title] is always
# a single integer rather than a Series.
indices = pd.Series(movies.index, index=movies['title'])
indices = indices[~indices.index.duplicated(keep='first')]

movies.head()


Unnamed: 0,index,movieId,title,genres,content
0,0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995) Adventure Animation Children ...
1,1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji (1995) Adventure Children Fantasy
2,2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men (1995) Comedy Romance
3,3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale (1995) Comedy Drama Romance
4,4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II (1995) Comedy


In [8]:
def recommend(title, n=10):
    """
    Recommend up to n movies similar to the given title, using cosine similarity
    on the TF-IDF content matrix.

    Reads the notebook-level objects `indices` (title -> row number),
    `cosine_sim` (square similarity matrix) and `movies` (the catalogue).

    Parameters
    ----------
    title : str
        Title exactly as it appears in the dataset; surrounding whitespace
        is stripped before the lookup.
    n : int, default 10
        Maximum number of recommendations. Values below 1 give an empty result.

    Returns
    -------
    pandas.DataFrame
        The 'title' and 'genres' columns of the most similar movies, most
        similar first. Empty (with the same two columns) when the title is
        not in the dataset; a message is printed in that case.
    """
    title = title.strip()
    if title not in indices:
        print(f"'{title}' not found in the dataset.")
        return pd.DataFrame(columns=['title', 'genres'])

    idx = indices[title]
    # A title that occurs more than once in the lookup comes back as a Series
    # of row numbers; use the first occurrence so idx is a single row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    scores = cosine_sim[idx]

    # Rank every row by similarity, best first. sorted() is stable, so ties
    # keep their original row order.
    ranked = sorted(range(len(scores)), key=lambda row: scores[row], reverse=True)

    # Drop the query movie by row number rather than assuming it is ranked
    # first: another row with identical content also scores 1.0 and may sort
    # ahead of it.
    top_rows = [row for row in ranked if row != idx][:max(n, 0)]

    # cosine_sim is addressed by position, so select rows by position too.
    return movies[['title', 'genres']].iloc[top_rows]


In [9]:
recommend("Toy Story (1995)", n=5)


Unnamed: 0,title,genres
2355,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy
7355,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX
3595,"Toy, The (1982)",Comedy
2539,We're Back! A Dinosaur's Story (1993),Adventure|Animation|Children|Fantasy
26,Now and Then (1995),Children|Drama
