In [1]:
# dataset was taken from kaggle https://www.kaggle.com/stefanoleone992/imdb-extensive-dataset?select=IMDb+movies.csv

In [1]:
import pandas as pd

In [2]:
movies = pd.read_csv('datasets/initial.csv')

In [3]:
movies.describe()

Unnamed: 0,year,duration,avg_vote,votes,metascore,reviews_from_users,reviews_from_critics
count,85854.0,85854.0,85854.0,85854.0,13304.0,78257.0,74057.0
mean,1993.500594,100.351329,5.898642,9493.321,55.895144,46.03969,27.479036
std,24.216405,22.553964,1.234988,53574.65,17.784415,178.512268,58.338977
min,1894.0,41.0,1.0,99.0,1.0,1.0,1.0
25%,1979.0,88.0,5.2,205.0,43.0,4.0,3.0
50%,2003.0,96.0,6.1,484.0,57.0,9.0,8.0
75%,2013.0,108.0,6.8,1766.0,69.0,27.0,23.0
max,2020.0,808.0,9.9,2278845.0,100.0,10472.0,999.0


In [4]:
movies.columns

Index(['imdb_title_id', 'title', 'original_title', 'year', 'date_published',
       'genre', 'duration', 'country', 'language', 'director', 'writer',
       'production_company', 'actors', 'description', 'avg_vote', 'votes',
       'budget', 'usa_gross_income', 'worlwide_gross_income', 'metascore',
       'reviews_from_users', 'reviews_from_critics'],
      dtype='object')

In [5]:
# only get columns we will use in our application
movies = movies[['imdb_title_id','title','year','genre','duration','country','language','director','production_company','actors','description']]

In [6]:
movies.columns

Index(['imdb_title_id', 'title', 'year', 'genre', 'duration', 'country',
       'language', 'director', 'production_company', 'actors', 'description'],
      dtype='object')

In [7]:
movies = movies.fillna('')

In [8]:
movies.size

944394

In [9]:
# we will be using imdb_title_id as a primary key for each movie so we do not want any rows where it is empty
movies = movies.dropna(subset = ['imdb_title_id'])

In [10]:
movies.dtypes

imdb_title_id         object
title                 object
year                   int64
genre                 object
duration               int64
country               object
language              object
director              object
production_company    object
actors                object
description           object
dtype: object

In [11]:
# free version of heroku postgreSQL limits number of rows in our database so only getting recent movies
movies = movies.loc[movies['year'] >= 1990]

In [12]:
movies.size

624426

In [13]:
countries_to_drop = ['French','German','Danish','Hungarian','Spanish','Arabic','Italian','Czech','Russian','Polish','Persian','Bengali','Neapolitan','Croatian','Vietnamese','Hindi','Serbo-Croatian','Sioux','Romanian','Swedish','Finnish','Hebrew','Norwegian','Turkish','Aboriginal','Greek','Yiddish','Dyula','Wolof','Quechua','Slovenian','Urdu','Latin','Scots','Georgian','Tamil','Telugu','Serbian','Malayalam','Kannada','Gallegan','Macedonian','Kurdish','Luxembourgish','Armenian','Hokkien','Malay','Azerbaijani','Marathi','Latvian','Estonian','Filipino','Tagalog','Min Nan','Bambara','Tibetan','Bulgarian','Romany','Faroese','Catalan','Tatar','Lithuanian','Icelandic','Kirghiz','Punjabi','Sinhalese','Slovak','Saami','Indonesian','Afrikaans','Nepali','Samoan','Tonga','Maori']

In [14]:
# free version of heroku postgreSQL limits number of rows in our database so only getting select langugages
movies = movies[~movies['language'].isin(countries_to_drop)]

In [15]:
movies.size

433411

In [16]:
# adding imdb_url for each movie so web application can link to it later
movies['imdb_url'] = [f'http://www.imdb.com/title/{x}/reference' for x in movies['imdb_title_id']]

In [17]:
movies

Unnamed: 0,imdb_title_id,title,year,genre,duration,country,language,director,production_company,actors,description,imdb_url
4334,tt0035423,Kate & Leopold,2001,"Comedy, Fantasy, Romance",118,USA,"English, French",James Mangold,Konrad Pictures,"Meg Ryan, Hugh Jackman, Liev Schreiber, Brecki...",An English Duke from 1876 is inadvertedly drag...,http://www.imdb.com/title/tt0035423/reference
14127,tt0064730,Nihon boryoku-dan: Kumicho,2000,"Action, Crime",97,Japan,Japanese,Kinji Fukasaku,Toei Company,"Kôji Tsuruta, Tomisaburô Wakayama, Bunta Sugaw...",Coming out of jail and hoping for a quiet life...,http://www.imdb.com/title/tt0064730/reference
14739,tt0066498,L'orecchio,1990,"Drama, Thriller",94,Czechoslovakia,"Czech, Russian",Karel Kachyna,Filmové studio Barrandov,"Jirina Bohdalová, Radoslav Brzobohatý, Gustav ...",After coming home from a Party gathering one n...,http://www.imdb.com/title/tt0066498/reference
15675,tt0069049,L'altra faccia del vento,2018,Drama,122,"France, Iran, USA","English, German",Orson Welles,Royal Road Entertainment,"John Huston, Oja Kodar, Peter Bogdanovich, Sus...",A Hollywood director emerges from semi-exile w...,http://www.imdb.com/title/tt0069049/reference
19545,tt0081145,Harry e Carota,1993,"Comedy, Crime, Drama",94,USA,English,Dan Curtis,Dan Curtis Productions,"Danny Aiello, Alex Zuckerman, Joe Pantoliano, ...",A lonely and emotionally neglected rich kid fo...,http://www.imdb.com/title/tt0081145/reference
...,...,...,...,...,...,...,...,...,...,...,...,...
85840,tt9899880,Columbus,2018,"Comedy, Drama",82,Iran,"Persian, English",Hatef Alimardani,,"Farhad Aslani, Majid Salehi, Saeed Poursamimi,...",A rich family are deciding to immigrate to the...,http://www.imdb.com/title/tt9899880/reference
85841,tt9900060,Lupin III: La menzogna di Fujiko Mine,2019,"Animation, Adventure, Crime",58,Japan,Japanese,Takeshi Koike,TMS Entertainment,"Kan'ichi Kurita, Kiyoshi Kobayashi, Miyuki Saw...",Fujiko befriends a young boy whose late father...,http://www.imdb.com/title/tt9900060/reference
85844,tt9904250,La reina de los lagartos,2019,Fantasy,63,,"Spanish, Catalan","Juan González, Nando Martínez",Aquí y Allí Films,"Javier Botet, Bruna Cusí, Miki Esparbé, Ivan L...","A spaceship is about to come to pick up Javi, ...",http://www.imdb.com/title/tt9904250/reference
85845,tt9904802,Enemy Lines,2020,War,92,UK,"English, Polish, Russian, German",Anders Banke,Happy Hour Films,"Ed Westwick, John Hannah, Tom Wisdom, Corey Jo...","In the frozen, war torn landscape of occupied ...",http://www.imdb.com/title/tt9904802/reference


In [25]:
# this function will get the movie poster image url for each movie which we will display in our HTML
import requests
def getPosterURL(imdb_title_id):
    response = requests.get(f"https://api.themoviedb.org/3/find/{imdb_title_id}?api_key=2bb89f5e38215f75ec3c5bdfc756831a&language=en-US&external_source=imdb_id").json()
    try:
        poster_url = response['movie_results'][0]['poster_path']
        return f"https://image.tmdb.org/t/p/original{poster_url}"
    except IndexError:
        return "No Image"

In [26]:
movies['poster_url'] = movies.apply(lambda row: getPosterURL(row['imdb_title_id']), axis=1)

In [31]:
movies['poster_url']

4334     https://image.tmdb.org/t/p/original/mUvikzKJJS...
14127    https://image.tmdb.org/t/p/original/rP0KbvKVDC...
14739    https://image.tmdb.org/t/p/original/2imasBvAgb...
15675    https://image.tmdb.org/t/p/original/kFky1paYEf...
19545              https://image.tmdb.org/t/p/originalNone
                               ...                        
85840    https://image.tmdb.org/t/p/original/q7QEAOfycg...
85841    https://image.tmdb.org/t/p/original/x2goFsoI2u...
85844    https://image.tmdb.org/t/p/original/gujDX4I8uB...
85845    https://image.tmdb.org/t/p/original/vG8qBkByy9...
85850    https://image.tmdb.org/t/p/original/qZNlF6i3MO...
Name: poster_url, Length: 39401, dtype: object

In [35]:
pd.options.mode.chained_assignment = None

In [36]:
def clean_data(x):
    return str.lower(x.replace(" ",""))

focus_columns = ['title','genre','director','production_company','actors','description']
focus = movies[focus_columns]
for column in focus_columns:
    focus[column] = focus[column].apply(clean_data)

In [37]:
# create a soup to use in our vectorizer to generate recommendations
def create_soup(x):
    return x['title'] + ' ' + x['genre'] + x['director'] + x['production_company'] + x['actors'] + x['description']

focus['soup'] = focus.apply(create_soup, axis=1)

In [48]:
focus['soup']

4334     kate&leopold comedy,fantasy,romancejamesmangol...
14127    nihonboryoku-dan:kumicho action,crimekinjifuka...
14739    l'orecchio drama,thrillerkarelkachynafilmovést...
15675    l'altrafacciadelvento dramaorsonwellesroyalroa...
19545    harryecarota comedy,crime,dramadancurtisdancur...
                               ...                        
85840    columbus comedy,dramahatefalimardanifarhadasla...
85841    lupiniii:lamenzognadifujikomine animation,adve...
85844    lareinadeloslagartos fantasyjuangonzález,nando...
85845    enemylines warandersbankehappyhourfilmsedwestw...
85850    debeentjesvansint-hildegard comedy,dramajohann...
Name: soup, Length: 39401, dtype: object

In [49]:
# this will get top 10 recommendations

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(focus['soup'])

cosine_sim = cosine_similarity(count_matrix, count_matrix)
focus = focus.reset_index()
indices = pd.Series(focus.index, index=focus['title'])

def recommend(x, cosine_sim=cosine_sim):
    try:
        title = x['title'].str.replace(' ', '').str.lower()
    except AttributeError:
        title = x['title'].replace(' ', '').lower()
    idx = indices[title]
    try:
        sim_scores = sorted(list(enumerate(cosine_sim[idx])), key=lambda y: y[1], reverse=True)[1:11]
    except ValueError:
        return 'None'
    movie_indices = [i[0] for i in sim_scores]
    result = movies['title'].iloc[movie_indices].values
    return '~'.join(result)



In [50]:
movies['recommendations'] = movies.apply(lambda row: recommend(row, cosine_sim), axis=1)

In [51]:
movies['recommendations']

4334     The Infinite Man~La rivolta delle ex~Er dai ya...
14127    Kimi ga wakamono nara~Buraddo~Gonin~Kuro no te...
14739    Stay~Centigrade~Gerry~Ruoma de shi qi sui~The ...
15675    X Moor~The Last Heist~Il tagliaerbe 2 - The Cy...
19545    Tinseltown~Benvenuti a Radioland~Casper~Vidas ...
                               ...                        
85840                                                 None
85841    Lupin III: La tomba di Jigen Daisuke~Lupin III...
85844    El rei borni~Le distanze~Barcelona, nit d'hive...
85845    Corbo~Le double de ma moitié~Il giorno del per...
85850    L'apprendista mago~Rocks in My Pockets~Ciao Ci...
Name: recommendations, Length: 39401, dtype: object

In [52]:
movies

Unnamed: 0,imdb_title_id,title,year,genre,duration,country,language,director,production_company,actors,description,imdb_url,poster_url,recommendations
4334,tt0035423,Kate & Leopold,2001,"Comedy, Fantasy, Romance",118,USA,"English, French",James Mangold,Konrad Pictures,"Meg Ryan, Hugh Jackman, Liev Schreiber, Brecki...",An English Duke from 1876 is inadvertedly drag...,http://www.imdb.com/title/tt0035423/reference,https://image.tmdb.org/t/p/original/mUvikzKJJS...,The Infinite Man~La rivolta delle ex~Er dai ya...
14127,tt0064730,Nihon boryoku-dan: Kumicho,2000,"Action, Crime",97,Japan,Japanese,Kinji Fukasaku,Toei Company,"Kôji Tsuruta, Tomisaburô Wakayama, Bunta Sugaw...",Coming out of jail and hoping for a quiet life...,http://www.imdb.com/title/tt0064730/reference,https://image.tmdb.org/t/p/original/rP0KbvKVDC...,Kimi ga wakamono nara~Buraddo~Gonin~Kuro no te...
14739,tt0066498,L'orecchio,1990,"Drama, Thriller",94,Czechoslovakia,"Czech, Russian",Karel Kachyna,Filmové studio Barrandov,"Jirina Bohdalová, Radoslav Brzobohatý, Gustav ...",After coming home from a Party gathering one n...,http://www.imdb.com/title/tt0066498/reference,https://image.tmdb.org/t/p/original/2imasBvAgb...,Stay~Centigrade~Gerry~Ruoma de shi qi sui~The ...
15675,tt0069049,L'altra faccia del vento,2018,Drama,122,"France, Iran, USA","English, German",Orson Welles,Royal Road Entertainment,"John Huston, Oja Kodar, Peter Bogdanovich, Sus...",A Hollywood director emerges from semi-exile w...,http://www.imdb.com/title/tt0069049/reference,https://image.tmdb.org/t/p/original/kFky1paYEf...,X Moor~The Last Heist~Il tagliaerbe 2 - The Cy...
19545,tt0081145,Harry e Carota,1993,"Comedy, Crime, Drama",94,USA,English,Dan Curtis,Dan Curtis Productions,"Danny Aiello, Alex Zuckerman, Joe Pantoliano, ...",A lonely and emotionally neglected rich kid fo...,http://www.imdb.com/title/tt0081145/reference,https://image.tmdb.org/t/p/originalNone,Tinseltown~Benvenuti a Radioland~Casper~Vidas ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85840,tt9899880,Columbus,2018,"Comedy, Drama",82,Iran,"Persian, English",Hatef Alimardani,,"Farhad Aslani, Majid Salehi, Saeed Poursamimi,...",A rich family are deciding to immigrate to the...,http://www.imdb.com/title/tt9899880/reference,https://image.tmdb.org/t/p/original/q7QEAOfycg...,
85841,tt9900060,Lupin III: La menzogna di Fujiko Mine,2019,"Animation, Adventure, Crime",58,Japan,Japanese,Takeshi Koike,TMS Entertainment,"Kan'ichi Kurita, Kiyoshi Kobayashi, Miyuki Saw...",Fujiko befriends a young boy whose late father...,http://www.imdb.com/title/tt9900060/reference,https://image.tmdb.org/t/p/original/x2goFsoI2u...,Lupin III: La tomba di Jigen Daisuke~Lupin III...
85844,tt9904250,La reina de los lagartos,2019,Fantasy,63,,"Spanish, Catalan","Juan González, Nando Martínez",Aquí y Allí Films,"Javier Botet, Bruna Cusí, Miki Esparbé, Ivan L...","A spaceship is about to come to pick up Javi, ...",http://www.imdb.com/title/tt9904250/reference,https://image.tmdb.org/t/p/original/gujDX4I8uB...,"El rei borni~Le distanze~Barcelona, nit d'hive..."
85845,tt9904802,Enemy Lines,2020,War,92,UK,"English, Polish, Russian, German",Anders Banke,Happy Hour Films,"Ed Westwick, John Hannah, Tom Wisdom, Corey Jo...","In the frozen, war torn landscape of occupied ...",http://www.imdb.com/title/tt9904802/reference,https://image.tmdb.org/t/p/original/vG8qBkByy9...,Corbo~Le double de ma moitié~Il giorno del per...
