In [52]:
import numpy as np
import pandas as pd

In [53]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [54]:
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

In [55]:
credits.columns

Index(['movie_id', 'title', 'cast', 'crew'], dtype='object')

In [56]:
movies = movies.merge(credits,on='title')

In [57]:
movies.shape

(4809, 23)

In [58]:
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'movie_id', 'cast', 'crew'],
      dtype='object')

In [60]:
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]

In [61]:
movies.isnull().sum()


Unnamed: 0,0
movie_id,0
title,0
overview,3
genres,0
keywords,0
cast,0
crew,0


In [62]:
movies.dropna(inplace = True)

Functions for data preprocessing

In [63]:
import ast
def convert(obj):
  L = []
  for i in ast.literal_eval(obj):
    L.append(i['name'])
  return L

In [64]:

def top3(obj):
  L = []
  count = 0
  for i in ast.literal_eval(obj):
    if count != 3:
      L.append(i['name'])
      count += 1
    else:
      break
  return L

In [65]:
def get_dir(obj):
  L = []
  for i in ast.literal_eval(obj):
    if i['job'] == 'Director':
      L.append(i['name'])
      break
  return L

In [66]:
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)
movies['cast'] = movies['cast'].apply(top3)
movies['crew'] = movies['crew'].apply(get_dir)
movies['overview'] = movies['overview'].apply(lambda x:x.split())

In [67]:
movies['genres'] = movies['genres'].apply(lambda x: [i.replace(" ","") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(" ","") for i in x])
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(" ","") for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(" ","") for i in x])


In [68]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]


In [69]:
movies['tags'] = movies['genres'] +  movies['keywords'] + movies['cast'] + movies['crew'] +  movies['overview']

In [70]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"[Action, Adventure, Fantasy, ScienceFiction, c..."


In [71]:
movies.drop(['genres','keywords','cast','crew','overview'],axis = 1,inplace = True)

In [72]:
movies.head(1)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction, c..."


In [73]:
# converting list as string
movies['tags'] = movies['tags'].apply(lambda x: " ".join(x)).apply(lambda x: x.lower())

In [77]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [78]:
movies['tags'] = movies['tags'].apply(lambda x: " ".join([word for word in x.split() if word not in stop_words]))

In [79]:
movies.head(1)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,action adventure fantasy sciencefiction cultur...


In [81]:
X.shape


(4806, 5000)

In [97]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# td = TfidfVectorizer(max_features = 5000,stop_words = 'english',min_df = 2, max_df = 0.8)


from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english')

X = cv.fit_transform(movies['tags']).toarray()

In [98]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(X)


In [99]:
def recommend_sys(movie_title, num_recommendations=5):
    try:
        movie_index = movies[movies['title'] == movie_title].index[0]
        distances = similarity[movie_index]
        movies_list = sorted(
            list(enumerate(distances)), reverse=True, key=lambda x: x[1]
        )[1 : num_recommendations + 1]
        print(f"Recommendations for '{movie_title}':")
        for i in movies_list:
            print(movies.iloc[i[0]].title)
    except IndexError:
        print(f"Movie '{movie_title}' not found in the dataset.")


In [95]:
# def recommend_sys(movie):
#   movie_index = movies[movies['title'] == movie].index[0]
#   distances = similarity[movie_index]
#   movies_list = sorted(list(enumerate(distances)),reverse = True,key = lambda x:x[1])[1:6]
#   for i in movies_list:
#     print(movies.iloc[i[0]].title)

In [100]:
recommend_sys('Gandhi')

Recommendations for 'Gandhi':
Gandhi, My Father
The Wind That Shakes the Barley
A Passage to India
Guiana 1838
Bloody Sunday


In [101]:
def coverage():
    recommended_movies = set()
    for index in range(len(movies)):
        try:
            recommendations = [
                movies.iloc[i[0]].title
                for i in sorted(
                    list(enumerate(similarity[index])),
                    reverse=True,
                    key=lambda x: x[1]
                )[1:6]  # Top 5 recommendations
            ]
            recommended_movies.update(recommendations)
        except IndexError:
            continue

    coverage_score = len(recommended_movies) / len(movies)
    print(f"Coverage: {coverage_score:.2%}")


In [102]:
coverage()

Coverage: 86.58%
