In [1]:
import ast
import pickle
import pandas as pd
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
import warnings
warnings.simplefilter('ignore')

In [3]:
movies = pd.read_csv('tmdb_5000_movies.csv')
print(movies.head())

      budget                                             genres  \
0  237000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
1  300000000  [{"id": 12, "name": "Adventure"}, {"id": 14, "...   
2  245000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
3  250000000  [{"id": 28, "name": "Action"}, {"id": 80, "nam...   
4  260000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   

                                       homepage      id  \
0                   http://www.avatarmovie.com/   19995   
1  http://disney.go.com/disneypictures/pirates/     285   
2   http://www.sonypictures.com/movies/spectre/  206647   
3            http://www.thedarkknightrises.com/   49026   
4          http://movies.disney.com/john-carter   49529   

                                            keywords original_language  \
0  [{"id": 1463, "name": "culture clash"}, {"id":...                en   
1  [{"id": 270, "name": "ocean"}, {"id": 726, "na...                en   
2  [{"id": 470, "nam

In [4]:
credits = pd.read_csv('tmdb_5000_credits.csv')
print(credits.head())

   movie_id                                     title  \
0     19995                                    Avatar   
1       285  Pirates of the Caribbean: At World's End   
2    206647                                   Spectre   
3     49026                     The Dark Knight Rises   
4     49529                               John Carter   

                                                cast  \
0  [{"cast_id": 242, "character": "Jake Sully", "...   
1  [{"cast_id": 4, "character": "Captain Jack Spa...   
2  [{"cast_id": 1, "character": "James Bond", "cr...   
3  [{"cast_id": 2, "character": "Bruce Wayne / Ba...   
4  [{"cast_id": 5, "character": "John Carter", "c...   

                                                crew  
0  [{"credit_id": "52fe48009251416c750aca23", "de...  
1  [{"credit_id": "52fe4232c3a36847f800b579", "de...  
2  [{"credit_id": "54805967c3a36829b5002c41", "de...  
3  [{"credit_id": "52fe4781c3a36847f81398c3", "de...  
4  [{"credit_id": "52fe479ac3a36847f813eaa3",

In [5]:
merged_dataset = movies.merge(credits,on='title')
print(merged_dataset.head())

      budget                                             genres  \
0  237000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
1  300000000  [{"id": 12, "name": "Adventure"}, {"id": 14, "...   
2  245000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
3  250000000  [{"id": 28, "name": "Action"}, {"id": 80, "nam...   
4  260000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   

                                       homepage      id  \
0                   http://www.avatarmovie.com/   19995   
1  http://disney.go.com/disneypictures/pirates/     285   
2   http://www.sonypictures.com/movies/spectre/  206647   
3            http://www.thedarkknightrises.com/   49026   
4          http://movies.disney.com/john-carter   49529   

                                            keywords original_language  \
0  [{"id": 1463, "name": "culture clash"}, {"id":...                en   
1  [{"id": 270, "name": "ocean"}, {"id": 726, "na...                en   
2  [{"id": 470, "nam

In [6]:
print(merged_dataset.shape)

(4809, 23)


In [7]:
movies_dataset = merged_dataset[['movie_id','title','overview','genres','keywords','cast','crew']]
print(movies_dataset.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4809 entries, 0 to 4808
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4809 non-null   int64 
 1   title     4809 non-null   object
 2   overview  4806 non-null   object
 3   genres    4809 non-null   object
 4   keywords  4809 non-null   object
 5   cast      4809 non-null   object
 6   crew      4809 non-null   object
dtypes: int64(1), object(6)
memory usage: 300.6+ KB
None


In [8]:
movies_dataset.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [9]:
movies_dataset.dropna(inplace=True)


In [10]:
print(movies_dataset.isnull().sum())

movie_id    0
title       0
overview    0
genres      0
keywords    0
cast        0
crew        0
dtype: int64


In [11]:
print(movies_dataset.duplicated().sum())

0


In [12]:
def convert1(obj):
    list1 = []
    for dict in ast.literal_eval(obj):
        list1.append(dict['name'])
    return list1

In [13]:
movies_dataset['genres'] = movies_dataset['genres'].apply(convert1)
movies_dataset['keywords'] = movies_dataset['keywords'].apply(convert1)

In [14]:
def convert2(obj):
    list1 = []
    counter=3
    for dict in ast.literal_eval(obj):
        list1.append(dict['name'])
        if(counter==0):
            break
        counter -= 1
    return list1

In [15]:
movies_dataset['cast'] = movies_dataset['cast'].apply(convert2)

In [16]:
def convert3(obj):
    list1 = []
    for dict in ast.literal_eval(obj):
        if(dict['job']=='Director'):
            list1.append(dict['name'])
            break
    return list1

In [17]:
movies_dataset['crew'] = movies_dataset['crew'].apply(convert3)

In [18]:
movies_dataset['overview'] = movies_dataset['overview'].apply(lambda x:x.split())

In [19]:
movies_dataset['genres'] = movies_dataset['genres'].apply(lambda x:[item.replace(" ","") for item in x])
print(movies_dataset['genres'])

0       [Action, Adventure, Fantasy, ScienceFiction]
1                       [Adventure, Fantasy, Action]
2                         [Action, Adventure, Crime]
3                   [Action, Crime, Drama, Thriller]
4                [Action, Adventure, ScienceFiction]
                            ...                     
4804                       [Action, Crime, Thriller]
4805                               [Comedy, Romance]
4806               [Comedy, Drama, Romance, TVMovie]
4807                                              []
4808                                   [Documentary]
Name: genres, Length: 4806, dtype: object


In [20]:
movies_dataset['keywords'] = movies_dataset['keywords'].apply(lambda x:[item.replace(" ","") for item in x])
print(movies_dataset['keywords'])

0       [cultureclash, future, spacewar, spacecolony, ...
1       [ocean, drugabuse, exoticisland, eastindiatrad...
2       [spy, basedonnovel, secretagent, sequel, mi6, ...
3       [dccomics, crimefighter, terrorist, secretiden...
4       [basedonnovel, mars, medallion, spacetravel, p...
                              ...                        
4804    [unitedstates–mexicobarrier, legs, arms, paper...
4805                                                   []
4806    [date, loveatfirstsight, narration, investigat...
4807                                                   []
4808             [obsession, camcorder, crush, dreamgirl]
Name: keywords, Length: 4806, dtype: object


In [21]:
movies_dataset['cast'] = movies_dataset['cast'].apply(lambda x:[item.replace(" ","") for item in x])
print(movies_dataset['cast'])

0       [SamWorthington, ZoeSaldana, SigourneyWeaver, ...
1       [JohnnyDepp, OrlandoBloom, KeiraKnightley, Ste...
2       [DanielCraig, ChristophWaltz, LéaSeydoux, Ralp...
3       [ChristianBale, MichaelCaine, GaryOldman, Anne...
4       [TaylorKitsch, LynnCollins, SamanthaMorton, Wi...
                              ...                        
4804    [CarlosGallardo, JaimedeHoyos, PeterMarquardt,...
4805    [EdwardBurns, KerryBishé, MarshaDietlein, Cait...
4806    [EricMabius, KristinBooth, CrystalLowe, GeoffG...
4807     [DanielHenney, ElizaCoupe, BillPaxton, AlanRuck]
4808    [DrewBarrymore, BrianHerzlinger, CoreyFeldman,...
Name: cast, Length: 4806, dtype: object


In [22]:
movies_dataset['crew'] = movies_dataset['crew'].apply(lambda x:[item.replace(" ","") for item in x])
print(movies_dataset['crew'])

0           [JamesCameron]
1          [GoreVerbinski]
2              [SamMendes]
3       [ChristopherNolan]
4          [AndrewStanton]
               ...        
4804     [RobertRodriguez]
4805         [EdwardBurns]
4806          [ScottSmith]
4807          [DanielHsia]
4808     [BrianHerzlinger]
Name: crew, Length: 4806, dtype: object


In [23]:
movies_dataset['tags'] = movies_dataset['overview'] + movies_dataset['genres'] + movies_dataset['keywords'] + movies_dataset['cast'] + movies_dataset['crew']
print(movies_dataset['tags'].values[0])

['In', 'the', '22nd', 'century,', 'a', 'paraplegic', 'Marine', 'is', 'dispatched', 'to', 'the', 'moon', 'Pandora', 'on', 'a', 'unique', 'mission,', 'but', 'becomes', 'torn', 'between', 'following', 'orders', 'and', 'protecting', 'an', 'alien', 'civilization.', 'Action', 'Adventure', 'Fantasy', 'ScienceFiction', 'cultureclash', 'future', 'spacewar', 'spacecolony', 'society', 'spacetravel', 'futuristic', 'romance', 'space', 'alien', 'tribe', 'alienplanet', 'cgi', 'marine', 'soldier', 'battle', 'loveaffair', 'antiwar', 'powerrelations', 'mindandsoul', '3d', 'SamWorthington', 'ZoeSaldana', 'SigourneyWeaver', 'StephenLang', 'JamesCameron']


In [24]:
movies_new_dataset = movies_dataset[['movie_id','title','tags']]

In [25]:
movies_new_dataset['tags'] = movies_new_dataset['tags'].apply(lambda x:" ".join(x))

In [26]:
movies_new_dataset['tags'] = movies_new_dataset['tags'].apply(lambda x:x.lower())

In [27]:
print(movies_new_dataset['tags'].values[0])

in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. action adventure fantasy sciencefiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d samworthington zoesaldana sigourneyweaver stephenlang jamescameron


In [28]:
ps = PorterStemmer()

In [29]:
def stemming(text):
    list2=[]
    for word in text.split():
        list2.append(ps.stem(word))
    return " ".join(list2)

In [30]:
movies_new_dataset['tags'] = movies_new_dataset['tags'].apply(stemming)
print(movies_new_dataset['tags'].values[0])

in the 22nd century, a parapleg marin is dispatch to the moon pandora on a uniqu mission, but becom torn between follow order and protect an alien civilization. action adventur fantasi sciencefict cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi marin soldier battl loveaffair antiwar powerrel mindandsoul 3d samworthington zoesaldana sigourneyweav stephenlang jamescameron


In [31]:
cv = CountVectorizer(max_features=4000,stop_words='english')
vectors = cv.fit_transform(movies_new_dataset['tags']).toarray()

In [32]:
print(vectors[0])

[0 0 0 ... 0 0 0]


In [33]:
print(cv.get_feature_names())

['000', '007', '10', '100', '11', '12', '13', '14', '15', '16', '17', '18', '18th', '19', '1910', '1930', '1940', '1950', '1960', '1960s', '1970', '1970s', '1980', '1990', '19th', '19thcenturi', '20', '200', '20th', '24', '25', '30', '3d', '40', '50', '60', '70', 'aaron', 'aaroneckhart', 'abandon', 'abduct', 'abigailbreslin', 'abil', 'abl', 'aboard', 'abov', 'abus', 'academi', 'accept', 'access', 'accid', 'accident', 'accompani', 'accomplish', 'account', 'accus', 'ace', 'achiev', 'act', 'action', 'activ', 'activist', 'actor', 'actress', 'actual', 'adam', 'adamsandl', 'adapt', 'add', 'addict', 'adjust', 'admir', 'admit', 'adolesc', 'adopt', 'ador', 'adrienbrodi', 'adult', 'adulteri', 'adulthood', 'advanc', 'adventur', 'adventure', 'advertis', 'advic', 'advis', 'affair', 'affect', 'afghanistan', 'africa', 'african', 'africanamerican', 'aftercreditssting', 'afterlif', 'aftermath', 'ag', 'age', 'agediffer', 'agency', 'agenda', 'agent', 'aggress', 'ago', 'agre', 'ahead', 'aid', 'ail', 'aim'

In [34]:
similarity = cosine_similarity(vectors)

In [35]:
print(similarity.shape)

(4806, 4806)


In [36]:
def recommend(movie):
    movie_index = movies_new_dataset[movies_new_dataset['title']==movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)),reverse=True,key=lambda x:x[1])[1:6]
    for new_movie in movies_list:
        print(movies_new_dataset.iloc[new_movie[0]].title)
        
recommend('Avatar')

Aliens vs Predator: Requiem
Independence Day
Falcon Rising
Titan A.E.
Jupiter Ascending


In [37]:
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(movies_new_dataset.to_dict(),open('movies_dict.pkl','wb'))