In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import warnings
warnings.simplefilter('ignore')

In [2]:
# Load the movie catalogue from the CSV file in the working directory.
df = pd.read_csv("movies.csv")


In [3]:
df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
df.shape

(9742, 3)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [6]:
df.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [7]:
df['genres']

0       Adventure|Animation|Children|Comedy|Fantasy
1                        Adventure|Children|Fantasy
2                                    Comedy|Romance
3                              Comedy|Drama|Romance
4                                            Comedy
                           ...                     
9737                Action|Animation|Comedy|Fantasy
9738                       Animation|Comedy|Fantasy
9739                                          Drama
9740                               Action|Animation
9741                                         Comedy
Name: genres, Length: 9742, dtype: object

In [8]:
# Trim leading and trailing whitespace from every genre string (in place on df).
df['genres'] = df['genres'].str.strip()

In [9]:
# Turn the pipe-delimited genre list into space-separated words for the
# vectorizer. '|' is a regex metacharacter and the default of `regex` differs
# between pandas versions, so request a literal replacement explicitly.
df['genres'] = df['genres'].str.replace('|', ' ', regex=False)

In [10]:
df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy


## Text vectorization

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words encoder for the space-separated genre strings: English stop
# words are dropped and the vocabulary is capped at 1000 terms.
# NOTE(review): the default token pattern keeps only runs of word characters,
# so hyphenated or parenthesised genre labels (if the data has any) would be
# split into several tokens - confirm that this is intended.
cv = CountVectorizer(
    stop_words='english',
    max_features=1000,
)

In [24]:
# Learn the vocabulary from the genre strings and encode every movie as a row
# of token counts; .toarray() converts the result into a dense NumPy array.
vectors = cv.fit_transform(df['genres']).toarray()

In [25]:
#test = cv.fit(df['genres'])
#test

In [14]:
vectors.shape

(9742, 23)

In [15]:
len(cv.get_feature_names_out())

23

## Using cosine similarity for the recommendation system

In [29]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between every pair of rows in `vectors`; the
# result is a square movies-by-movies matrix.
# The name `similartity` (sic) is what later cells reference, so the spelling
# is kept as is.
# NOTE(review): for 9742 movies this dense matrix takes roughly 0.76 GB if it
# is stored as float64 - confirm that this fits in memory where the notebook runs.
similartity = cosine_similarity(vectors)

In [30]:
similartity.shape

(9742, 9742)

In [31]:
similartity[0]

array([1.        , 0.77459667, 0.31622777, ..., 0.        , 0.31622777,
       0.4472136 ])

In [32]:
# Rank every movie by its genre similarity to the first movie (row 0), highest
# score first, then show positions 1-5 of that ranking; position 0 is skipped
# on the expectation that it is the first movie itself.
sorted(enumerate(similartity[0]), key=lambda pair: pair[1], reverse=True)[1:6]

[(1706, 0.9999999999999999),
 (2355, 0.9999999999999999),
 (2809, 0.9999999999999999),
 (3000, 0.9999999999999999),
 (3568, 0.9999999999999999)]

In [40]:
def recommend(movie, top_n=5):
    """Print the titles of the movies whose genres are most similar to `movie`.

    Reads the module-level DataFrame `df` (column 'title') and the module-level
    similarity matrix `similartity`, whose row/column i is expected to
    correspond to the i-th row of `df`.

    Parameters
    ----------
    movie : str
        Exact title to look up in df['title']. If the title occurs more than
        once, the first occurrence is used.
    top_n : int, default 5
        Number of recommendations to print.

    Returns
    -------
    None
        The titles are printed, one per line, most similar first.

    Raises
    ------
    IndexError
        If `movie` is not present in df['title'].
    ValueError
        If `top_n` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n!r}")

    # Work with the row *position*, not the index label: the similarity matrix
    # is addressed positionally, and labels only coincide with positions when
    # df has a default RangeIndex.
    matches = np.flatnonzero((df['title'] == movie).to_numpy())
    if matches.size == 0:
        raise IndexError(f"movie title not found in df['title']: {movie!r}")
    movie_pos = int(matches[0])

    scores = np.asarray(similartity[movie_pos])

    # Stable descending sort: ties keep their original row order.
    ranked = np.argsort(-scores, kind='stable')

    # Drop the queried movie explicitly instead of assuming it is ranked first:
    # another movie with an identical genre vector ties with it and, if it
    # appears earlier in df, would be sorted ahead of it.
    neighbours = ranked[ranked != movie_pos][:top_n]

    for title in df['title'].iloc[neighbours]:
        print(title)

    return

It takes a movie title (movie) as input.

It finds the index of the movie in the DataFrame (df) by checking the 'title' column for a match.

It retrieves this movie's row of similarity scores from the similarity matrix (the variable is spelled `similartity` in the code).

It pairs each similarity score with its row index, sorts the pairs in descending order of score, skips the first entry (expected to be the queried movie itself), and keeps the next 5 as the most similar movies.

It then prints the titles of these 5 movies; the function itself returns nothing.

In [44]:
recommend('Grumpier Old Men (1995)')

Sabrina (1995)
Clueless (1995)
Two if by Sea (1996)
French Twist (Gazon maudit) (1995)
If Lucy Fell (1996)
