In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity 
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df = pd.read_csv('movie_metadata.csv')
df.head(1)

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000


In [3]:
df.shape

(5043, 28)

In [3]:
pd.set_option('display.max_columns',None) # display all columns

In [4]:
df['movie_title']=df['movie_title'].apply(lambda n: n.strip())

In [5]:
print(df.columns.tolist())

['color', 'director_name', 'num_critic_for_reviews', 'duration', 'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name', 'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name', 'movie_title', 'num_voted_users', 'cast_total_facebook_likes', 'actor_3_name', 'facenumber_in_poster', 'plot_keywords', 'movie_imdb_link', 'num_user_for_reviews', 'language', 'country', 'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes', 'imdb_score', 'aspect_ratio', 'movie_facebook_likes']


In [6]:
subset = df.select_dtypes('object').drop(columns=['color','movie_imdb_link','language']).copy()

In [7]:
print(subset.columns.tolist())

['director_name', 'actor_2_name', 'genres', 'actor_1_name', 'movie_title', 'actor_3_name', 'plot_keywords', 'country', 'content_rating']


In [8]:
subset.dropna(inplace=True)

In [9]:
subset['text'] = subset['director_name'] + subset['actor_2_name'] + subset['genres'] + subset['actor_1_name'] + subset['movie_title']+subset['actor_3_name']+subset['plot_keywords']+subset['country']+subset['content_rating']

In [10]:
subset['text'] # out final data

0       James CameronJoel David MooreAction|Adventure|...
1       Gore VerbinskiOrlando BloomAction|Adventure|Fa...
2       Sam MendesRory KinnearAction|Adventure|Thrille...
3       Christopher NolanChristian BaleAction|Thriller...
5       Andrew StantonSamantha MortonAction|Adventure|...
                              ...                        
5034    Neill Dela LlanaEdgar TancangcoThrillerIan Gam...
5035    Robert RodriguezPeter MarquardtAction|Crime|Dr...
5036    Anthony ValloneJohn ConsidineCrime|DramaRichar...
5037    Edward BurnsCaitlin FitzGeraldComedy|DramaKerr...
5042    Jon GunnBrian HerzlingerDocumentaryJohn August...
Name: text, Length: 4595, dtype: object

In [11]:
vec = TfidfVectorizer(stop_words='english')
movie_mat = vec.fit_transform(subset['text'][:1000]).toarray()
movie_mat.shape

  movie_mat = vec.fit_transform(subset['text'][:1000]).toarray()


(1000, 8243)

In [12]:
cs = cosine_similarity(movie_mat, movie_mat, dense_output=True)
cs

array([[1.        , 0.01470073, 0.01852516, ..., 0.02218542, 0.        ,
        0.00498001],
       [0.01470073, 1.        , 0.01561103, ..., 0.00441   , 0.        ,
        0.00419662],
       [0.01852516, 0.01561103, 1.        , ..., 0.00555727, 0.        ,
        0.00528838],
       ...,
       [0.02218542, 0.00441   , 0.00555727, ..., 1.        , 0.        ,
        0.00426742],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.00498001, 0.00419662, 0.00528838, ..., 0.00426742, 0.        ,
        1.        ]])

In [13]:
def get_movie_loc(name):
    try:
        return subset[subset['movie_title'] == name ].index[0]
    except Exception as e:
        print(f'Error {name} not found, {e}')
        return None

In [14]:
get_movie_loc('The Dark Knight Rises')

3

In [15]:
def recommend(movie, k=5):
    if subset['movie_title'].str.contains(movie).any():
        try:
            idx = get_movie_loc(movie)
            print(idx)
            if idx != -1:
                sim_scores = list(enumerate(cs[idx]))
                sim_scores.sort(key=lambda i:i[1], reverse=True)
                movie_idxs = [i[0] for i in sim_scores]
                return subset.iloc[movie_idxs]['movie_title'].head(k).tolist()
            else:
                return None
        except Exception as e:
            print("Error+>",e)
            return None
    else:
        print('movie not found')
        return None

In [20]:
recommend('The Dark Knight Rises', k=10)

3


['The Dark Knight Rises',
 'Jack Reacher',
 'Silent Hill',
 'Inception',
 'Mission: Impossible II',
 'Under Siege 2: Dark Territory',
 "The Devil's Own",
 'The Dark Knight',
 'Mission: Impossible - Rogue Nation',
 'Need for Speed']