In [2]:
import numpy as np
import pandas as pd
import ast

from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.porter import PorterStemmer
import pickle

# Data Collection

In [3]:
movie = pd.read_csv('tmdb_5000_movies.csv')
credit = pd.read_csv('tmdb_5000_credits.csv')

In [4]:
df = movie.merge(credit, on=['title'])

# Data Merging

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4809 entries, 0 to 4808
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4809 non-null   int64  
 1   genres                4809 non-null   object 
 2   homepage              1713 non-null   object 
 3   id                    4809 non-null   int64  
 4   keywords              4809 non-null   object 
 5   original_language     4809 non-null   object 
 6   original_title        4809 non-null   object 
 7   overview              4806 non-null   object 
 8   popularity            4809 non-null   float64
 9   production_companies  4809 non-null   object 
 10  production_countries  4809 non-null   object 
 11  release_date          4808 non-null   object 
 12  revenue               4809 non-null   int64  
 13  runtime               4807 non-null   float64
 14  spoken_languages      4809 non-null   object 
 15  status               

In [6]:
df.shape

(4809, 23)

# Necessary Column
genres
id
keywords
overview
title
crew
cast

In [7]:
df = df[['id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

In [8]:
df.shape

(4809, 7)

In [9]:
df.head()

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


# Preprocessing Data

In [10]:
type(df['genres'][0])

str

In [11]:
def convert(obj, counter = None, producer = False):
    key = []
    if producer is False : 
        if counter is None: 
            for i in ast.literal_eval(obj):
                key.append(i['name'])
            return key
        elif counter is not None:
            for i in ast.literal_eval(obj):
                if counter-1 >= 0 :
                    key.append(i['name'])
                    counter = counter-1
                else :
                    break
            return key
    
    else:
        for i in ast.literal_eval(obj):
            if i['job'] == 'Producer':
                key.append(i['name'])
        return key
        

In [12]:
df['genres']=df['genres'].apply(convert)
df['keywords'] = df['keywords'].apply(lambda x:convert(x))
df['cast'] = df['cast'].apply(lambda x: convert(x, counter = 3))
df['crew'] = df['crew'].apply(lambda x: convert(x, producer = True))

In [13]:
df.head()

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[James Cameron, Jon Landau]"
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]","[Jerry Bruckheimer, Eric McLeod, Chad Oman, Pe..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux]","[Barbara Broccoli, Michael G. Wilson]"
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman]","[Charles Roven, Christopher Nolan, Emma Thomas]"
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton]","[Colin Wilson, Jim Morris, Lindsey Collins]"


# removing Space

In [14]:
df['keywords'] = df['keywords'].apply(lambda x:[i.replace(" ", "")for i in x])
df['cast'] = df['cast'].apply(lambda x: [i.replace(" ", "") for i in x])
df['crew'] = df['crew'].apply(lambda x: [i.replace(" ", "") for i in x])
df['genres'] = df['genres'].apply(lambda x: [i.replace(" ", "") for i in x])

In [15]:
df.head()

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]","[JamesCameron, JonLandau]"
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]","[JerryBruckheimer, EricMcLeod, ChadOman, Peter..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux]","[BarbaraBroccoli, MichaelG.Wilson]"
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[ChristianBale, MichaelCaine, GaryOldman]","[CharlesRoven, ChristopherNolan, EmmaThomas]"
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[TaylorKitsch, LynnCollins, SamanthaMorton]","[ColinWilson, JimMorris, LindseyCollins]"


# Making Overview into a List

In [16]:
df['overview'] = df['overview'].astype(str)

In [17]:
type(df['overview'])

pandas.core.series.Series

In [18]:
df['overview'] = df['overview'].apply(lambda x: x.split())

In [19]:
df['overview'] = df['overview'] + df['keywords'] + df['cast'] + df['crew']

In [20]:
df.rename(columns={'overview' : 'tag'}, inplace = True)

In [21]:
df = df.drop(columns = ['genres', 'cast', 'crew', 'keywords'])

In [22]:
df['tag'] = df['tag'].apply(lambda x: " ".join(x))

In [23]:
df.head()

Unnamed: 0,id,title,tag
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


# Root Word Formation 

In [24]:
ps = PorterStemmer()

In [25]:
def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i.lower()))
    return " ".join(y)

In [26]:
df['tag'] = df['tag'].apply(stem)

# text Vectorisation

In [27]:
vector = CountVectorizer(max_features = 5000, stop_words = 'english', lowercase = True)

In [28]:
movie = vector.fit_transform(df['tag']).toarray()

In [29]:
vector.get_feature_names_out()

array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],
      dtype=object)

# Similarity Finding 

In [30]:
similarity = cosine_similarity(movie)

In [31]:
similarity.shape

(4809, 4809)

# Recommended Movie Name

In [32]:
print(df[df['title'] == 'Avatar'].index[0])

0


In [33]:
def recommendation(name):
    index = df[df['title'] == name].index[0]
    distance = similarity[index]
    movie_list = sorted(list(enumerate(distance)), reverse = True, key = lambda x: x[1])[1:6]
    
    # printing Index of the Movie
    # first_elements = [element[0] for element in movie_list]
    # 
    # for i in first_elements:
    #     print(i)
    
    #Printing Name of the Movie
    for i in movie_list:
        print(df['id'][i[0]])
        # print(df['id'].iloc[i[0]])
        

In [34]:
recommendation('I Am Legend')

170
11683
41439
35791
44943


In [35]:
# pickle.dump(df.to_dict(), open('df.pkl', 'wb'))


In [36]:
# pickle.dump(similarity, open('similarity.pkl', 'wb'))

In [37]:
df[df['id'] == 170]['title']

3243    28 Days Later
Name: title, dtype: object

In [38]:
similarity[0]

array([1.        , 0.        , 0.03175003, ..., 0.02578553, 0.        ,
       0.        ])