In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import ast
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
%matplotlib inline

In [37]:
movies = pd.read_csv('movies.csv')

In [38]:
movies.drop('type', inplace=True, axis=1)

In [39]:
movies.isnull().sum()

title            0
year             0
rating         288
description     25
dtype: int64

In [40]:
movies.dropna(inplace=True)

In [41]:
movies['description'] = movies['description'].apply(lambda x:x.lower())

In [42]:
ps = PorterStemmer()

In [43]:
def stem(text):
    y=[]
    for i in text.split():
        y.append(ps.stem(i))
        
    return " ".join(y)

In [44]:
movies['description'] = movies['description'].apply(stem)

In [45]:
cv = CountVectorizer(max_features=5000, stop_words='english')

In [46]:
vectors = cv.fit_transform(movies['description']).toarray()

In [47]:
cv.get_feature_names_out()

array(['000', '10', '100', ..., 'zombies', 'zone', 'zoo'], dtype=object)

In [48]:
similarity = cosine_similarity(vectors)

In [49]:
sorted(list(enumerate(similarity[1])),reverse=True,key=lambda x:x[1])[1:6]

[(10989, 0.36380343755449945),
 (1844, 0.280056016805602),
 (7878, 0.280056016805602),
 (2810, 0.27500954910846337),
 (4925, 0.25928148942086576)]

In [52]:
def recommend(movie):
    movie_index = movies[movies['title'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)),reverse=True,key=lambda x:x[1])[1:6]
    
    for i in movies_list:
        print(movies.iloc[i[0]].title)

In [54]:
recommend('Kingdom')

Solace
Manhunt
The Limehouse Golem
The Gates
Suspect Zero


In [55]:
pickle.dump(similarity,open('similarity.pkl','wb'))