In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
# Locations of the two TMDB CSV files inside the Kaggle input directory.
MOVIES_CSV = "/kaggle/input/tmdb-movie-metadata/tmdb_5000_movies.csv"
CREDITS_CSV = "/kaggle/input/tmdb-movie-metadata/tmdb_5000_credits.csv"

# Load the movie metadata and the credits table.
movies = pd.read_csv(MOVIES_CSV)
credits = pd.read_csv(CREDITS_CSV)

In [4]:
# Preview the first row of the movies metadata table.
movies.head(1)

In [5]:
# Preview the first row of the credits table.
credits.head(1)

In [6]:
# Merge the credits table into the movie metadata.
# Join on the movie identifier rather than on 'title': titles are not unique, so a
# title join pairs every movie with every credits row sharing that title and creates
# spurious extra rows. The credits 'title' column is dropped first so the merged frame
# keeps a single, unsuffixed 'title' column.
# NOTE(review): assumes credits.movie_id holds the same identifier as movies.id — confirm.

movies=movies.merge(credits.drop(columns="title"),left_on="id",right_on="movie_id")

In [7]:
# Preview the first row of the merged frame.
movies.head(1)

In [8]:
# To keep the model simple, many features that may or may not contribute
# significantly to recommending a movie are dropped:

#1. budget
#2. homepage
#3. original language - the values in this column turned out to be heavily imbalanced
#4. original title - redundant with title
#5. popularity - not useful, since the aim is to create tags
#6. production company
#7. release date - treated as numeric, so it will not help in creating tags
#8. revenue - numeric
#9. runtime
#10. spoken languages
#11. status
#12. tagline - a little vague
#13. vote_average - numeric
#14. vote_count - numeric
#15. movie_id - duplicates "id"

# The columns kept for building tags are:

movies=movies[["id","title","overview","genres","keywords","cast","crew"]]

In [9]:
# Preview the first rows after column selection.
movies.head()

In [10]:
# Count the missing values in each column.

movies.isnull().sum()

In [11]:
# Drop every row that contains a missing value (3 rows in the author's run), then
# renumber the index 0..n-1. The similarity matrix built later is addressed by row
# position, so index labels must stay equal to positions; dropna() alone leaves gaps.

movies=movies.dropna().reset_index(drop=True)

In [12]:
# Count fully duplicated rows.

movies.duplicated().sum()

In [13]:
# Inspect the raw genres value of the first row.
movies.iloc[0].genres

In [14]:
import ast

In [15]:

# Demonstrate that ast.literal_eval turns a stringified list of dicts into Python objects.
ast.literal_eval('[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]')

In [16]:
# since the genres are not in the proper format for our model, so  we are going to extract the value of the key name from each string 
# my objective is to get genres in the below manner-
# ["Action","Adventure","Fantasy","Scifi"]

def convert(obj):
    """Return the ``"name"`` value of every dict in a stringified list of dicts.

    ``obj`` is text holding a Python literal such as
    ``'[{"id": 28, "name": "Action"}]'``. It is parsed with ``ast.literal_eval``
    and the names are returned as a list, in their original order.
    """
    return [entry["name"] for entry in ast.literal_eval(obj)]

In [17]:
# Replace each stringified genre list with the plain list of genre names.
movies["genres"]=movies["genres"].map(convert)

In [18]:
# Check the parsed genres on the first row.
movies.head(1)

In [19]:
# Replace each stringified keyword list with the plain list of keyword names.
movies["keywords"]=movies["keywords"].map(convert)

In [20]:
# Check the parsed keywords on the first row.
movies.head(1)

In [21]:
# Inspect the raw cast values.
movies["cast"].values

In [22]:
# under the cast column i am only interested in name .so i am going to create a function to extract the
# name from the dictionary 

def convert3(obj, limit=3):
    """Return the ``"name"`` of the first ``limit`` entries of a stringified list of dicts.

    Parameters
    ----------
    obj : str
        Text holding a Python literal such as ``'[{"name": "Sam Worthington"}, ...]'``,
        parsed with ``ast.literal_eval``.
    limit : int or None, default 3
        Maximum number of names to return, counted from the start of the list.
        Expected to be non-negative; ``None`` returns every name.

    Returns
    -------
    list
        At most ``limit`` names, in their original order. Entries beyond the cut-off
        are never inspected.
    """
    entries = list(ast.literal_eval(obj))
    return [entry["name"] for entry in entries[:limit]]
        

In [23]:
# Keep only the names of the leading cast members for each movie.
movies["cast"]=movies["cast"].map(convert3)

In [24]:
# Check the trimmed cast list on the first row.
movies.head(1)

In [25]:
# under the column crew i am considering job=director as the feature value contributing towards model building

def fetch_director(obj):
    """Return a one-element list with the first director's name, or ``[]`` if none.

    ``obj`` is text holding a Python literal list of crew dicts; it is parsed with
    ``ast.literal_eval`` and scanned in order for the first entry whose ``"job"``
    equals ``"Director"``.
    """
    for member in ast.literal_eval(obj):
        if member["job"] != "Director":
            continue
        # Only the first director is kept; later ones are ignored.
        return [member["name"]]
    return []


In [26]:
# Reduce each crew list to the director's name (empty list when there is none).
movies["crew"]=movies["crew"].map(fetch_director)

In [27]:
# Check the extracted director on the first row.
movies.head(1)

In [28]:
# Split each overview string on whitespace into a list of words, so it can be
# concatenated with the other list-valued columns later.

movies["overview"]=movies["overview"].str.split()

In [29]:
# Check the tokenized overview on the first row.
movies.head(1)

In [30]:
# Remove the spaces inside multi-word entries (e.g. a full name) so that each entry
# becomes a single token when the tags are vectorized.

for column in ("genres","keywords","cast","crew"):
    movies[column]=movies[column].apply(lambda entries:[entry.replace(" ","") for entry in entries])


In [31]:
# Check the space-stripped entries on the first row.
movies.head(1)

In [32]:
# Every feature column now holds a list of tokens; concatenate them, in this order,
# into a single "tags" list per movie.

tag_columns=["overview","genres","keywords","cast","crew"]
combined=movies[tag_columns[0]]
for column in tag_columns[1:]:
    combined=combined+movies[column]
movies["tags"]=combined

In [33]:
# Preview the frame with the new tags column.
movies.head()

In [34]:
# Keep only the columns needed for recommending. Take an explicit copy: later cells
# assign to new_df["tags"], and assigning into a slice of `movies` triggers pandas'
# SettingWithCopyWarning and may not behave as intended.
new_df=movies[["id","title","tags"]].copy()

In [35]:
# Preview the first row of the reduced frame.
new_df.head(1)

In [36]:
# Inspect the tags of the row labelled 0 (still a list of tokens at this point).
new_df["tags"][0]

In [37]:
# Join each token list into one space-separated string.
new_df["tags"]=new_df["tags"].map(" ".join)

In [38]:
# Preview after joining the tokens into one string per movie.
new_df.head()

In [39]:
# Lower-case every tag string.

new_df["tags"]=new_df["tags"].str.lower()

In [40]:
# Preview after lower-casing.
new_df.head()

In [41]:
# Stemming 

import nltk

In [42]:
from nltk.stem.porter import PorterStemmer
# Porter stemmer instance used by stem() below.
ps=PorterStemmer()

In [43]:
def stem(text):
    """Stem every whitespace-separated word of ``text`` with the notebook-level ``ps``.

    Returns the stemmed words joined back together with single spaces.
    """
    return " ".join(ps.stem(word) for word in text.split())

In [44]:
# Stem every word of every tag string.
new_df["tags"]=new_df["tags"].map(stem)

In [45]:
from sklearn.feature_extraction.text import CountVectorizer
# Count vectorizer configured with a 5000-feature cap and English stop words.
cv=CountVectorizer(
    stop_words="english",
    max_features=5000,
)

In [46]:
# Fit the vectorizer on the tag strings, then convert the result to a dense array.
tag_counts=cv.fit_transform(new_df["tags"])
vectors=tag_counts.toarray()
vectors

In [47]:
# Count vector of the first movie.
vectors[0]

In [48]:
# Show the vocabulary learned by the vectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so prefer
# get_feature_names_out() and fall back to the old name only on older releases.
cv.get_feature_names_out() if hasattr(cv,"get_feature_names_out") else cv.get_feature_names()

In [49]:
# (rows, columns) of the tag frame.
new_df.shape

In [50]:
from sklearn.metrics.pairwise import cosine_similarity

In [51]:
# Cosine similarity computed from the count vectors; rows follow the row order of `vectors`.
similarity=cosine_similarity(vectors)

In [52]:
# Shape of the similarity matrix.
similarity.shape

In [53]:
# Look up the index label of a movie by its title.
# NOTE(review): .index[0] is an index *label*; it equals the row position used by
# `similarity` only if new_df's index is a gap-free 0..n-1 range (rows are dropped
# earlier in the notebook) — confirm before using it as a position.

new_df[new_df["title"]=="Avatar"].index[0]

In [54]:
# Pair each row position with its cosine similarity to row 0.
# NOTE: no sort key is given, so the (position, similarity) tuples are ordered by
# position (descending), not by similarity; the keyed sort in the next cell ranks
# by similarity.

sorted(list(enumerate(similarity[0])),reverse=True)

In [55]:
# Rank by similarity (tuple element 1), highest first, and keep entries 1..5.
# Entry 0 is skipped, presumably because it is row 0 itself — TODO confirm.
sorted(list(enumerate(similarity[0])),reverse=True,key=lambda x:x[1])[1:6]

In [56]:
def recommend(movie, top_n=5):
    """Print the titles of the ``top_n`` movies most similar to ``movie``.

    Reads the notebook-level ``new_df`` (for titles) and ``similarity`` (a square
    matrix addressed by row position).

    Parameters
    ----------
    movie : str
        Exact title to look up in ``new_df["title"]``; the first match is used.
    top_n : int, default 5
        Number of titles to print.

    Raises
    ------
    IndexError
        If no row of ``new_df`` has this title.
    """
    # Work with row *positions*, not index labels: `similarity` is positional, and the
    # labels of new_df stop matching positions as soon as the index has gaps
    # (e.g. after rows were dropped without resetting the index).
    matches=np.flatnonzero((new_df["title"]==movie).to_numpy())
    if matches.size==0:
        raise IndexError(f"movie {movie!r} not found in new_df['title']")
    movie_pos=int(matches[0])

    distances=similarity[movie_pos]
    ranked=sorted(enumerate(distances),reverse=True,key=lambda x:x[1])

    # Exclude the query movie by position instead of assuming it sorts first; a tie
    # with an identical-tag movie could otherwise print the query itself.
    neighbours=[pos for pos,_ in ranked if pos!=movie_pos][:top_n]
    for pos in neighbours:
        print(new_df["title"].iloc[pos])

In [57]:
# Example: print recommendations for one title.
recommend("Batman Begins")

In [58]:
# Next step: turn all of this into a website.
# I built the website using PyCharm Community.


In [59]:
import pickle

In [60]:
# Save the tag frame for the website. The context manager guarantees the file is
# flushed and closed; a bare open() passed to pickle.dump leaves the handle open.
with open("movies_dictry1.pkl","wb") as movies_file:
    pickle.dump(new_df,movies_file)

In [61]:
# Save the similarity matrix for the website. The context manager guarantees the file
# is flushed and closed; a bare open() passed to pickle.dump leaves the handle open.
with open('similarity_arr.pkl','wb') as similarity_file:
    pickle.dump(similarity,similarity_file)

In [62]:
# Final look at the frame that was pickled above.
new_df.head()