In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/tmdb-movie-metadata/tmdb_5000_movies.csv
/kaggle/input/tmdb-movie-metadata/tmdb_5000_credits.csv


# Movies Recommender System


Steps:

1. Combine movies and credits dataset and create a single pandas dataframe.
2. Check and Remove Duplicate (if any)
3. Check and Handle null values (if any)
4. Perform Feature Engineering and drop unwanted features
5. Create tags for movies by combining different features
6. Convert list of tags into String
7. Convert All the tags into lower case (Recommended)
8. Perform Stemming on tags to bring different forms of words to its root version
9. Perform Vectorizarion and Create vectors of these tags. (Represent each movie with a vector in multidimentional space)
10. Find Similarity matrix (cosin similarity) of the movies
11. Based on similarity matrix recommend 5 similar movies
   

# Read Movies Dataframe

In [2]:
movies_df = pd.read_csv('/kaggle/input/tmdb-movie-metadata/tmdb_5000_movies.csv')
movies_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

# Read Credits Dataframe

In [3]:
credits_df = pd.read_csv('/kaggle/input/tmdb-movie-metadata/tmdb_5000_credits.csv')
credits_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4803 non-null   int64 
 1   title     4803 non-null   object
 2   cast      4803 non-null   object
 3   crew      4803 non-null   object
dtypes: int64(1), object(3)
memory usage: 150.2+ KB


# Step 1: Combine movies and credits dataset and create a single pandas dataframe.

In [4]:
movies = pd.merge(movies_df,credits_df, on='title')


# Step 2: Check and Remove Duplicate (if any)


# Step 3: Check and Handle null values (if any)

In [5]:
movies.isnull().sum()

budget                     0
genres                     0
homepage                3096
id                         0
keywords                   0
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
movie_id                   0
cast                       0
crew                       0
dtype: int64

In [6]:
# Dropping columns that has many null values and are NOT important.
movies = movies.drop(['homepage','tagline','release_date','runtime'], axis=1)

# Step 4: Perform Feature Engineering and drop unwanted features

Select only below features which can contribute in movie Tags.
1. movie ID
2. genres
3. keywords
4. overview
5. title
6. cast
7. crew

In [7]:
movies = movies[['id','title','overview','keywords','genres','cast','crew']]

In [8]:
movies.isnull().sum()

id          0
title       0
overview    3
keywords    0
genres      0
cast        0
crew        0
dtype: int64

In [9]:
movies[movies['overview'].isna()]

Unnamed: 0,id,title,overview,keywords,genres,cast,crew
2658,370980,Chiamatemi Francesco - Il Papa della gente,,"[{""id"": 717, ""name"": ""pope""}, {""id"": 5565, ""na...","[{""id"": 18, ""name"": ""Drama""}]","[{""cast_id"": 5, ""character"": ""Jorge Mario Berg...","[{""credit_id"": ""5660019ac3a36875f100252b"", ""de..."
4145,459488,"To Be Frank, Sinatra at 100",,"[{""id"": 6027, ""name"": ""music""}, {""id"": 225822,...","[{""id"": 99, ""name"": ""Documentary""}]","[{""cast_id"": 0, ""character"": ""Narrator"", ""cred...","[{""credit_id"": ""592b25e4c3a368783e065a2f"", ""de..."
4437,292539,Food Chains,,[],"[{""id"": 99, ""name"": ""Documentary""}]",[],"[{""credit_id"": ""5470c3b1c3a368085e000abd"", ""de..."


In [10]:
# Drop rows with null values in 'overview'
movies.shape
movies = movies.dropna()


# Split the Sentence and Create tokens from 'overview'

In [11]:
movies['overview'] = movies['overview'].apply(lambda x: x.split())

# Create a Function which extracts name from 'Genres' and 'keywords'

In [12]:
def name_extractor(text):
    lst = []
    for item in eval(text):
        lst.append(item['name'])
    return lst

In [13]:
movies['keywords'] = movies['keywords'].apply(name_extractor)
movies['genres'] = movies['genres'].apply(name_extractor)

# Extract Top 3 Cast

In [14]:
def cast_extractor(text):
    cast = []
    count = 0
    for item in eval(text):
        if(count < 3):
            cast.append(item['name'])
            count += 1
        else:
            break;
    return cast
        

In [15]:
movies['cast'] = movies['cast'].apply(cast_extractor)

# Extract Director from Crew

In [16]:
def fetch_director(text):
    director = []
    for item in eval(text):
        if(item['job'] == 'Director'):
            director.append(item['name'])
            break;
    return director
            

In [17]:
movies['crew'] = movies['crew'].apply(fetch_director)

In [18]:

movies['keywords'] = movies['keywords'].apply(lambda x: [item.replace(" ","") for item in x])
movies['genres'] = movies['genres'].apply(lambda x: [item.replace(" ","") for item in x])
movies['cast'] = movies['cast'].apply(lambda x: [item.replace(" ","") for item in x])
movies['crew'] = movies['crew'].apply(lambda x: [item.replace(" ","") for item in x])



# Step 5: Create tags for movies by combining different features. like overview, keywords, genres, cast, crew.

In [19]:
movies['tags'] = movies['overview'] + movies['keywords'] + movies['genres'] + movies['cast'] + movies['crew']

In [20]:
movies_new = movies[['id','title','tags']]
movies_new.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili..."


# Step 6: Convert list of tags into String

In [21]:
movies_new['tags'] = movies_new['tags'].apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_new['tags'] = movies_new['tags'].apply(lambda x: " ".join(x))


In [22]:
movies_new['tags'] = movies_new['tags'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_new['tags'] = movies_new['tags'].apply(lambda x: x.lower())


# Step 7: Convert All the tags into lower case (Recommended)

In [23]:
movies_new.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


# Step 8: Perform Stemming on tags to bring different forms of words to its root version
# e.g ['love' ,'loving' , 'loved'] -> after Stemming - ['love' ,'love' , 'love']

In [24]:
# Install nltk library (if not already installed) for Stemming
#!pip install nltk

In [25]:
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [26]:
def stem(text):
    stem_words =[]
    for i in text.split():
        stem_words.append(ps.stem(i))
    return " ".join(stem_words)

In [27]:
movies_new['tags'] = movies_new['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_new['tags'] = movies_new['tags'].apply(stem)


# Step 9: Perform Vectorizarion and Create vectors of these tags. (Represent each movie with a vector in multidimentional space)

In [28]:
# perform vectorization of tags using CountVector
from sklearn.feature_extraction.text import CountVectorizer

In [29]:
cv = CountVectorizer(max_features=5000,stop_words='english')
vectors = cv.fit_transform(movies_new['tags']).toarray()

In [30]:
# To check vectors
#cv.get_feature_names_out()


# Step 10: Find Similarity matrix (cosin similarity) of the movies

In [31]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vectors)

In [32]:
#movies_new[movies_new['title'] == 'Spider Man']
#movies_new[movies_new['id'] == '1214']
#similarity[0]
#movies_new.iloc[1214]['title']
#sorted(list(enumerate(similarity[0])),reverse=True,key=lambda x:x[1])[1:6]
#list(enumerate(similarity[0]))

In [33]:
def recommend(movie):
    recommended_movies = []
    # fetch index from movie name
    index = movies_new[movies_new['title'] == movie].index[0]

    # find distance matrix
    distances = similarity[index]

    '''The enumerate() function adds a counter and returns pairs of (index, value) 
    We will Sort the movie distances(on actual distance not index) in ascending order (higher the distance more similar) 
    and pick Top 5 '''
    sorted_movies = sorted(list(enumerate(distances)),reverse=True,key=lambda x:x[1])[1:6]

    # Finally will fetch the movie name from the index
    for item in sorted_movies:
        recommended_movies.append(movies_new.iloc[item[0]]['title'])
    return recommended_movies

In [34]:
#recommend('Batman')
index = movies_new[movies_new['title'] == 'Avatar']['id'][0]
index

19995

In [35]:
import pickle as pkl

pkl.dump(movies_new.to_dict(), open('movies_dict.pkl','wb'))
pkl.dump(similarity,open('similarity_matrix','wb'))