In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the two TMDB 5000 CSV files from the attached Kaggle dataset:
# `movies` from the movies file and `credits` from the credits file.
movies = pd.read_csv('/kaggle/input/tmdb-movie-metadata/tmdb_5000_movies.csv')
credits = pd.read_csv('/kaggle/input/tmdb-movie-metadata/tmdb_5000_credits.csv')

In [None]:
movies.head()

In [None]:
movies.shape

In [None]:
credits.head()

In [None]:
credits.shape

# **Merging both the datasets on the basis of 'title' column**

In [None]:
# Join the credits onto the movie metadata by TMDB id rather than by title.
# Movie titles are not guaranteed to be unique (remakes, same-named films), so
# a title join cross-matches those films and produces extra rows that carry
# another film's cast/crew. The redundant 'title' column is dropped from
# `credits` first so the merged frame keeps a single, unsuffixed 'title'.
# NOTE(review): assumes the movies file exposes the id as 'id' and the credits
# file as 'movie_id' (the TMDB 5000 schema) — confirm against the CSV headers.
movies = movies.merge(
    credits.drop(columns='title'),
    left_on='id',
    right_on='movie_id',
)

In [None]:
movies.shape

**Movies dataset**
1. genres
2. movie_id
3. keywords
4. title
5. overview
6. cast
7. crew

In [None]:
movies.info()

In [None]:
# Keep only the identifier, the title and the columns that feed the tags.
columns_to_keep = ['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']
movies = movies[columns_to_keep]

In [None]:
movies.head()

# Checking for null values

In [None]:
movies.isnull().sum()

**Dropping the rows whose overview is missing**

In [None]:
# Drop the rows that have no overview.
# Reassigning the result (instead of inplace=True on a frame that was itself
# selected out of another frame) avoids pandas' SettingWithCopyWarning.
# Resetting the index keeps row labels equal to row positions; the
# recommendation step looks a movie up by index label and then uses that value
# as a row position in the similarity matrix, so gaps would point at the wrong row.
movies = movies.dropna(subset=['overview']).reset_index(drop=True)

In [None]:
movies.isnull().sum()

In [None]:
movies.duplicated().sum()

# **Genres column**

In [None]:
movies.iloc[0]['genres']

In [None]:
import ast

In [None]:
def genres_and_keywords(text):
    """Return the 'name' value of every entry encoded in `text`.

    `text` is parsed with ast.literal_eval and must evaluate to an iterable of
    mappings that each contain a 'name' key. The names are returned as a new
    list, in the order in which they appear.
    """
    return [entry['name'] for entry in ast.literal_eval(text)]

In [None]:
movies['genres'] = movies['genres'].apply(genres_and_keywords)

In [None]:
movies.head()

# Keywords column

In [None]:
movies.iloc[0]['keywords']

In [None]:
movies['keywords'] = movies['keywords'].apply(genres_and_keywords)

In [None]:
movies.head()

# Overview column

In [None]:
# Turn each overview string into a list of whitespace-separated words so it can
# later be concatenated with the other list-valued columns.
movies['overview'] = movies['overview'].apply(str.split)

In [None]:
movies.head()

# **Cast column**

In [None]:
def process_cast(text, top_n=3):
    """Return the names of the first `top_n` cast entries encoded in `text`.

    Parameters
    ----------
    text : str
        String form of a list of dicts, each with a 'name' key. It is parsed
        with ast.literal_eval.
    top_n : int, default 3
        Maximum number of leading entries to keep. The default reproduces the
        previously hard-coded limit of three.

    Returns
    -------
    list
        Up to `top_n` names, in the order in which they appear in `text`.
        Empty when `text` encodes no entries or `top_n` is not positive.
    """
    names = []
    for position, member in enumerate(ast.literal_eval(text)):
        # Stop before touching entries beyond the requested limit.
        if position >= top_n:
            break
        names.append(member['name'])
    return names

In [None]:
movies['cast'] = movies['cast'].apply(process_cast)

In [None]:
movies.head()

# **Crew column**

In [None]:
def process_crew(text):
    """Return a one-element list with the name of the first 'Director' entry.

    `text` is parsed with ast.literal_eval and must evaluate to an iterable of
    mappings that each contain 'job' and 'name' keys. Only the first entry
    whose 'job' equals 'Director' is used; an empty list is returned when
    there is none.
    """
    for member in ast.literal_eval(text):
        if member['job'] == 'Director':
            return [member['name']]
    return []

In [None]:
movies['crew'] = movies['crew'].apply(process_crew)

In [None]:
movies.head()

# **Removing the spaces inside each name so that multi-word names become single tokens**

In [None]:
# Strip the spaces inside every entry of the list-valued columns, so that a
# multi-word name is treated as one token.
for column in ('genres', 'keywords', 'cast', 'crew'):
    movies[column] = movies[column].apply(
        lambda names: [name.replace(" ", "") for name in names]
    )

In [None]:
movies.head()

# **Concatenating the list-valued columns into a single 'tags' column**

In [None]:
# Every one of these columns holds a Python list per row, so `+` concatenates
# the lists row by row, giving one combined token list per movie.
movies['tags'] = movies['genres'] + movies['keywords'] + movies['overview'] + movies['cast'] + movies['crew']

In [None]:
movies.head()

# **Dropping all the unnecessary columns**

In [None]:
# Keep only the columns needed for recommending.
# An explicit copy is taken because later cells assign to final['tags'];
# assigning to a column of a frame selected out of `movies` without .copy()
# raises pandas' SettingWithCopyWarning.
final = movies[['movie_id', 'title', 'tags']].copy()
final.head()

# **Converting the 'tags' column from list to string**

In [None]:
# Collapse each movie's token list into one space-separated string.
final['tags'] = final['tags'].apply(" ".join)

In [None]:
final['tags'][0]

# **Converting the strings in the tags column to lowercase**

In [None]:
# Lowercase every tag string so that matching is case-insensitive.
final['tags'] = final['tags'].apply(str.lower)

In [None]:
final.head()

# **Removing stop words**

In [None]:
import nltk
from nltk.corpus import stopwords

In [None]:
def removeStopWords(text, stop_words=None):
    """Split `text` on whitespace and drop the tokens that are stop words.

    Parameters
    ----------
    text : str
        Text to filter. Comparison is exact, so callers should lowercase the
        text first if the stop-word list is lowercase.
    stop_words : iterable of str, optional
        Words to remove. When omitted, NLTK's English list
        (``stopwords.words('english')``) is used.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    if stop_words is None:
        # Fetch the corpus list once per call. Calling stopwords.words() inside
        # the loop re-reads the word list for every single token.
        stop_words = stopwords.words('english')
    # A set gives constant-time membership tests instead of a linear list scan.
    blocked = frozenset(stop_words)
    return [token for token in text.split() if token not in blocked]

In [None]:
final['tags'] = final['tags'].apply(removeStopWords)

# **Stemming the text**

In [None]:
from nltk.stem.porter import PorterStemmer
pt = PorterStemmer()

In [None]:
def stem(text):
    """Stem every token in `text` and join the results with single spaces.

    `text` is an iterable of tokens; each one is passed to the module-level
    stemmer `pt`, and the stemmed tokens are returned as one string.
    """
    return " ".join(pt.stem(token) for token in text)

In [None]:
final['tags'] = final['tags'].apply(stem)

# **Text Vectorization**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bag-of-words vectorizer: ignores scikit-learn's built-in English stop words
# and caps the vocabulary at the 5000 most frequent terms in the corpus.
cv = CountVectorizer(stop_words = 'english',max_features=5000)

In [None]:
# Learn the vocabulary from the tag strings and encode every movie as a row of
# term counts; .toarray() converts the sparse result into a dense NumPy array.
vectors = cv.fit_transform(final['tags']).toarray()

In [None]:
vectors.shape

In [None]:
# Values of the second column of `final` (the titles) as a NumPy array.
# NOTE(review): `y` is not referenced by any later cell visible here — confirm
# whether it is still needed.
y = final.iloc[:,1].values

# **Calculating the cosine similarity of each movie with every other movie**

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Pairwise cosine similarity between all rows of `vectors`. With a single
# argument the result is a square matrix with one row and one column per movie,
# addressed by row position.
similarity = cosine_similarity(vectors)

In [None]:
similarity.shape

In [None]:
def recommend(movie, top_n=10):
    """Print the titles of the `top_n` movies most similar to `movie`.

    Reads the module-level `final` DataFrame (for titles) and the module-level
    `similarity` matrix (row i holds the similarity of movie i to all others).

    Parameters
    ----------
    movie : str
        Exact title to look up in ``final['title']``. If several rows share
        the title, the first one is used.
    top_n : int, default 10
        Number of recommendations to print.

    Raises
    ------
    IndexError
        If no row of `final` has this title. The exception type matches what
        an empty ``.index[0]`` lookup raised before, with a readable message.
    """
    # `similarity` is addressed by row position, so locate the movie by
    # position. An index label only equals the position while the index has no
    # gaps, which is not the case after rows have been dropped.
    positions = np.flatnonzero((final['title'] == movie).to_numpy())
    if positions.size == 0:
        raise IndexError(f"movie {movie!r} not found in the dataset")
    movie_pos = int(positions[0])

    scores = similarity[movie_pos]
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
    # Exclude the query movie explicitly instead of assuming it is ranked
    # first: another movie with identical tags ties with it at the top.
    neighbours = [pos for pos, _ in ranked if pos != movie_pos][:top_n]

    titles = final['title']
    for pos in neighbours:
        print(titles.iloc[pos])

In [None]:
recommend("Spider-Man")

In [None]:
import pickle

In [None]:
## pickle.dump(final.to_dict(), open('movies_dict.pkl', 'wb'))

In [None]:
## pickle.dump(similarity, open('similarity.pkl', 'wb'))