In [1]:
import numpy as np
import pandas as pd
from ast import literal_eval
import pickle

In [16]:
movies = pd.read_csv("./data/tmdb_5000_movies.csv")
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [17]:
movies = movies[['id','title','genres','keywords','original_language','production_companies','production_countries','overview']]

In [9]:
credits = pd.read_csv("./data/tmdb_5000_credits.csv", usecols=['movie_id','cast','crew'])
credits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4803 non-null   int64 
 1   cast      4803 non-null   object
 2   crew      4803 non-null   object
dtypes: int64(1), object(2)
memory usage: 112.7+ KB


In [18]:
movies = movies.merge(credits, left_on='id', right_on='movie_id')
movies.drop('movie_id', axis=1, inplace=True)

In [19]:
movies.columns

Index(['id', 'title', 'genres', 'keywords', 'original_language',
       'production_companies', 'production_countries', 'overview', 'cast',
       'crew'],
      dtype='object')

In [20]:
movies.shape

(4803, 10)

In [21]:
movies.isnull().sum()

id                      0
title                   0
genres                  0
keywords                0
original_language       0
production_companies    0
production_countries    0
overview                3
cast                    0
crew                    0
dtype: int64

# Data Cleaning

In [22]:
from ast import literal_eval

In [24]:
movies['genres'] = movies['genres'].apply(lambda x: [i['name'] for i in literal_eval(x)])
movies['keywords'] = movies['keywords'].apply(lambda x: [i['name'] for i in literal_eval(x)])

In [25]:
movies[['genres','keywords']].head()

Unnamed: 0,genres,keywords
0,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon..."
1,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ..."
2,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi..."
3,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i..."
4,"[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel..."


In [26]:
def fetch_cast(obj):
    L = []
    counter = 0
    for i in literal_eval(obj):
        if counter!=3:
            L.append(i['name'])
            counter+=1
        else:
            break
    return L

def fetch_director(x):
    L = []
    for i in literal_eval(x):
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L

In [27]:
movies['cast'] = movies['cast'].apply(fetch_cast)
movies['crew'] = movies['crew'].apply(fetch_director)

In [28]:
movies[['cast','crew']].head()

Unnamed: 0,cast,crew
0,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,"[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,"[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]
3,"[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan]
4,"[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton]


In [29]:
# production companies
movies['production_companies'] = movies['production_companies'].apply(lambda x: [i['name'] for i in literal_eval(x)])
movies['production_countries'] = movies['production_countries'].apply(lambda x: [i['name'] for i in literal_eval(x)])

In [30]:
movies[['production_companies','production_countries']].head()

Unnamed: 0,production_companies,production_countries
0,"[Ingenious Film Partners, Twentieth Century Fo...","[United States of America, United Kingdom]"
1,"[Walt Disney Pictures, Jerry Bruckheimer Films...",[United States of America]
2,"[Columbia Pictures, Danjaq, B24]","[United Kingdom, United States of America]"
3,"[Legendary Pictures, Warner Bros., DC Entertai...",[United States of America]
4,[Walt Disney Pictures],[United States of America]


In [38]:
movies['genres'] = movies['genres'].apply(lambda x: [str.lower(i.replace(" ","")) for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [str.lower(i.replace(" ","_")) for i in x])
movies['cast'] = movies['cast'].apply(lambda x: [str.lower(i.replace(" ","_")) for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [str.lower(i.replace(" ","_")) for i in x])
movies['production_companies'] = movies['production_companies'].apply(lambda x: [str.lower(i.replace(" ","_")) for i in x])
movies['production_countries'] = movies['production_countries'].apply(lambda x: [str.lower(i.replace(" ","_")) for i in x])
movies['original_language'] = movies['original_language'].apply(lambda x: [x])

In [39]:
movies['overview'][0]

['nd centuri parapleg marin dispatch moon pandora uniqu mission becom torn follow order protect alien civil']

In [32]:
from nltk.stem.porter import PorterStemmer
import re
ps = PorterStemmer()
from nltk.corpus import stopwords
stwords = stopwords.words('english')

def stem(text):
    y = []
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower()
    text = text.split()
    for word in text:
        if not word in stwords:
            y.append(ps.stem(word))

    return  [" ".join(y)]

In [33]:
movies['overview'] = movies['overview'].fillna('')
movies['overview'] = movies['overview'].apply(stem)

In [34]:
movies['overview']

0       [nd centuri parapleg marin dispatch moon pando...
1       [captain barbossa long believ dead come back l...
2       [cryptic messag bond past send trail uncov sin...
3       [follow death district attorney harvey dent ba...
4       [john carter war weari former militari captain...
                              ...                        
4798    [el mariachi want play guitar carri famili tra...
4799    [newlyw coupl honeymoon upend arriv respect si...
4800    [sign seal deliv introduc dedic quartet civil ...
4801    [ambiti new york attorney sam sent shanghai as...
4802    [ever sinc second grade first saw e extraterre...
Name: overview, Length: 4803, dtype: object

In [35]:
movies.columns

Index(['id', 'title', 'genres', 'keywords', 'original_language',
       'production_companies', 'production_countries', 'overview', 'cast',
       'crew'],
      dtype='object')

In [40]:
movies.head()

Unnamed: 0,id,title,genres,keywords,original_language,production_companies,production_countries,overview,cast,crew
0,19995,Avatar,"[action, adventure, fantasy, sciencefiction]","[culture_clash, future, space_war, space_colon...",[en],"[ingenious_film_partners, twentieth_century_fo...","[united_states_of_america, united_kingdom]",[nd centuri parapleg marin dispatch moon pando...,"[sam_worthington, zoe_saldana, sigourney_weaver]",[james_cameron]
1,285,Pirates of the Caribbean: At World's End,"[adventure, fantasy, action]","[ocean, drug_abuse, exotic_island, east_india_...",[en],"[walt_disney_pictures, jerry_bruckheimer_films...",[united_states_of_america],[captain barbossa long believ dead come back l...,"[johnny_depp, orlando_bloom, keira_knightley]",[gore_verbinski]
2,206647,Spectre,"[action, adventure, crime]","[spy, based_on_novel, secret_agent, sequel, mi...",[en],"[columbia_pictures, danjaq, b24]","[united_kingdom, united_states_of_america]",[cryptic messag bond past send trail uncov sin...,"[daniel_craig, christoph_waltz, léa_seydoux]",[sam_mendes]
3,49026,The Dark Knight Rises,"[action, crime, drama, thriller]","[dc_comics, crime_fighter, terrorist, secret_i...",[en],"[legendary_pictures, warner_bros., dc_entertai...",[united_states_of_america],[follow death district attorney harvey dent ba...,"[christian_bale, michael_caine, gary_oldman]",[christopher_nolan]
4,49529,John Carter,"[action, adventure, sciencefiction]","[based_on_novel, mars, medallion, space_travel...",[en],[walt_disney_pictures],[united_states_of_america],[john carter war weari former militari captain...,"[taylor_kitsch, lynn_collins, samantha_morton]",[andrew_stanton]


In [41]:
movies['tags'] = movies['genres']+movies['keywords']+movies['original_language']+movies['production_companies'] \
            +movies['production_countries']+movies['cast']+movies['crew']
print(movies['tags'][0])

['action', 'adventure', 'fantasy', 'sciencefiction', 'culture_clash', 'future', 'space_war', 'space_colony', 'society', 'space_travel', 'futuristic', 'romance', 'space', 'alien', 'tribe', 'alien_planet', 'cgi', 'marine', 'soldier', 'battle', 'love_affair', 'anti_war', 'power_relations', 'mind_and_soul', '3d', 'en', 'ingenious_film_partners', 'twentieth_century_fox_film_corporation', 'dune_entertainment', 'lightstorm_entertainment', 'united_states_of_america', 'united_kingdom', 'sam_worthington', 'zoe_saldana', 'sigourney_weaver', 'james_cameron']


In [42]:
movies['tags'] = movies['tags'].apply(lambda x: " ".join(x))
movies['tags'][0]

'action adventure fantasy sciencefiction culture_clash future space_war space_colony society space_travel futuristic romance space alien tribe alien_planet cgi marine soldier battle love_affair anti_war power_relations mind_and_soul 3d en ingenious_film_partners twentieth_century_fox_film_corporation dune_entertainment lightstorm_entertainment united_states_of_america united_kingdom sam_worthington zoe_saldana sigourney_weaver james_cameron'

In [43]:
movies['overview'] = movies['overview'].apply(lambda x: " ".join(x))
movies['overview'][0]

'nd centuri parapleg marin dispatch moon pandora uniqu mission becom torn follow order protect alien civil'

In [44]:
df = movies[['id','title','tags','overview']]

In [45]:
pickle.dump(df.to_dict(), open('movies.pkl','wb'))

In [97]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=5000)

In [98]:
vectors = cv.fit_transform(new_df).toarray()

In [99]:
vectors.shape

(4803, 5000)

In [100]:
from sklearn.metrics.pairwise import cosine_similarity

In [101]:
similarity = cosine_similarity([vectors[0]],vectors)

In [102]:
similarity.shape

(1, 4803)

In [49]:
def get_recommendations(movie):
    idx = movies[movies['title']==movie].index[0]
    sim_scores = list(enumerate(similarity[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    return movies['title'].iloc[movie_indices]

In [50]:
get_recommendations('Avatar')

1216                  Aliens vs Predator: Requiem
507                              Independence Day
539                                    Titan A.E.
2409                                       Aliens
1204                                    Predators
260                                  Ender's Game
61                              Jupiter Ascending
778                                     Meet Dave
1920                                    Lifeforce
172     The Twilight Saga: Breaking Dawn - Part 2
Name: title, dtype: object

In [51]:
get_recommendations('The Twilight Saga: Breaking Dawn - Part 2')

1340                                             Twilight
898                           The Twilight Saga: New Moon
612                            The Twilight Saga: Eclipse
4355                                       Love Me Tender
777                 The Mortal Instruments: City of Bones
2699                                   New York, New York
4641    The Incredibly True Adventure of Two Girls In ...
912                            Interview with the Vampire
2480                              Arn: The Knight Templar
4314                                               Desire
Name: title, dtype: object

In [56]:
get_recommendations('The Avengers')

7                     Avengers: Age of Ultron
31                                 Iron Man 3
26                 Captain America: Civil War
169        Captain America: The First Avenger
68                                   Iron Man
182                                   Ant-Man
215    Fantastic 4: Rise of the Silver Surfer
85        Captain America: The Winter Soldier
511                                     X-Men
126                      Thor: The Dark World
Name: title, dtype: object

In [65]:
import pickle

In [72]:
pickle.dump(movies.to_dict(), open('movie_dict.pkl','wb'))

In [67]:
pickle.dump(similarity, open('similarity.pkl','wb'))

In [78]:
import requests
def fetch_poster(movie_id):
    response = requests.get("https://api.themoviedb.org/3/movie/{}?api_key=4d3a04e97e46d3313ca2082c381d09e2&language=en-US".format(movie_id))
    data = response.json()
    return data

def get_popular_movies(df):
    popular_movies = dict()
    for i in list(df.sort_values('popularity_score', ascending=False)['movie_id'].head(8)):
        popular_movies[i] = fetch_poster(i)
        
    return popular_movies

In [79]:
d = get_popular_movies(movies)

In [80]:
d

{278: {'adult': False,
  'backdrop_path': '/kXfqcdQKsToO0OUXHcrrNCHDBzO.jpg',
  'belongs_to_collection': None,
  'budget': 25000000,
  'genres': [{'id': 18, 'name': 'Drama'}, {'id': 80, 'name': 'Crime'}],
  'homepage': '',
  'id': 278,
  'imdb_id': 'tt0111161',
  'original_language': 'en',
  'original_title': 'The Shawshank Redemption',
  'overview': 'Framed in the 1940s for the double murder of his wife and her lover, upstanding banker Andy Dufresne begins a new life at the Shawshank prison, where he puts his accounting skills to work for an amoral warden. During his long stretch in prison, Dufresne comes to be admired by the other inmates -- including an older prisoner named Red -- for his integrity and unquenchable sense of hope.',
  'popularity': 88.33,
  'poster_path': '/q6y0Go1tsGEsmtFryDOJo3dEmqu.jpg',
  'production_companies': [{'id': 97,
    'logo_path': '/7znWcbDd4PcJzJUlJxYqAlPPykp.png',
    'name': 'Castle Rock Entertainment',
    'origin_country': 'US'}],
  'production_cou

In [84]:
for i in d.keys():
    print(i)

278
550
155
680
27205
238
157336
13


In [83]:
d[278]['poster_path']

'/q6y0Go1tsGEsmtFryDOJo3dEmqu.jpg'

In [85]:
d[278]

{'adult': False,
 'backdrop_path': '/kXfqcdQKsToO0OUXHcrrNCHDBzO.jpg',
 'belongs_to_collection': None,
 'budget': 25000000,
 'genres': [{'id': 18, 'name': 'Drama'}, {'id': 80, 'name': 'Crime'}],
 'homepage': '',
 'id': 278,
 'imdb_id': 'tt0111161',
 'original_language': 'en',
 'original_title': 'The Shawshank Redemption',
 'overview': 'Framed in the 1940s for the double murder of his wife and her lover, upstanding banker Andy Dufresne begins a new life at the Shawshank prison, where he puts his accounting skills to work for an amoral warden. During his long stretch in prison, Dufresne comes to be admired by the other inmates -- including an older prisoner named Red -- for his integrity and unquenchable sense of hope.',
 'popularity': 88.33,
 'poster_path': '/q6y0Go1tsGEsmtFryDOJo3dEmqu.jpg',
 'production_companies': [{'id': 97,
   'logo_path': '/7znWcbDd4PcJzJUlJxYqAlPPykp.png',
   'name': 'Castle Rock Entertainment',
   'origin_country': 'US'}],
 'production_countries': [{'iso_3166_1'

In [93]:
for i in d[278]['spoken_languages']:
    print(i['name'])

English


In [90]:
d[278]['genres'][0]

{'id': 18, 'name': 'Drama'}

In [94]:
def get_recommendations(id):
    idx = movies[movies['movie_id']==id].index[0]
    sim_scores = list(enumerate(similarity[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    return movies['movie_id'].iloc[movie_indices]

In [96]:
list(get_recommendations(14161))

[9341, 30379, 47933, 88751, 56288, 19585, 49521, 10195, 12589, 24432]