In [61]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

import warnings 

# Suppress every warning category for the rest of the session
# (presumably to keep cell output uncluttered).
# NOTE(review): a blanket 'ignore' also hides pandas/scikit-learn deprecation
# and copy warnings; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Load the pre-processed IMDB and Netflix title tables from the working directory.
imdb = pd.read_csv('processed_imdb.csv')
netflix = pd.read_csv('processed_netflix.csv')

In [4]:
# Load the table that pairs IMDB titles with a matched Netflix title and a match score.
match_list = pd.read_csv('match_list.csv')

In [6]:
# Sanity check: (rows, columns) of each of the three loaded tables.
imdb.shape, netflix.shape, match_list.shape

((28041, 783), (7787, 781), (1348, 3))

In [5]:
# Preview the first few IMDB-to-Netflix title matches.
match_list.head()

Unnamed: 0,IMDB Title,Matched Netflix Title,Match Score
0,The Great Train Robbery,the great train robbery,100
1,By the Sea,by the sea,100
2,The Captive,the captive,100
3,In the Park,in the dark,91
4,The Tramp,the trap,94


In [139]:
def get_recommendation(input_title, top_k=10, use_genre=True):
    """Recommend the IMDB titles most similar to ``input_title``.

    Candidates come from the notebook-level ``imdb`` frame. Rows sharing the
    input's title and rows whose title appears in ``match_list['IMDB Title']``
    are excluded, and the remaining rows are ranked by cosine similarity
    between their ``Semantic *`` columns and those of the input row.

    Parameters
    ----------
    input_title : str
        Exact value of ``imdb['IMDB Title']`` identifying the query title.
    top_k : int, default 10
        Maximum number of recommendations to return.
    use_genre : bool, default True
        When True, only rows whose ``'Genre Group'`` equals the input's are
        considered.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_k`` rows sorted by descending ``'Similarity Score'``,
        restricted to the display columns listed in ``columns_to_show``.

    Raises
    ------
    ValueError
        If ``input_title`` matches no row, or more than one row, of ``imdb``.

    Notes
    -----
    Besides returning the frame, the function prints a header and displays
    both the input row and the recommendations in the notebook.
    """
    columns_to_show = [
        'IMDB Title',
        'Release Year',
        'averageRating',
        'numVotes',
        'plot',
        'genres',
        'country',
        'Similarity Score'
    ]

    # Resolve the query row up front and fail with a clear message instead of
    # the opaque error `.item()` gives for zero or several matches.
    query = imdb[imdb['IMDB Title'] == input_title]
    if query.empty:
        raise ValueError(f'No title named {input_title!r} found in the IMDB data.')
    if len(query) > 1:
        raise ValueError(
            f'{len(query)} titles are named {input_title!r}; '
            'the input title must be unique.'
        )

    # Select the feature columns by name, the same rule used to drop them
    # below, rather than by a hard-coded positional slice.
    semantic_columns = imdb.columns[imdb.columns.str.contains('Semantic')]

    # First, optionally keep only titles in the same Genre Group as the input.
    candidates = imdb
    if use_genre:
        candidates = candidates[candidates['Genre Group'] == query['Genre Group'].iloc[0]]

    # Second, remove the input itself, and third, remove titles already
    # matched in match_list, since those should not be recommended.
    titles = candidates['IMDB Title']
    already_matched = titles.isin(match_list['IMDB Title'])
    candidates = candidates[(titles != input_title) & ~already_matched]

    # Compare the single query row against the candidates only. This yields a
    # 1 x N result instead of an (N+1) x (N+1) matrix, and the query never
    # enters the ranking, so it cannot be confused with a candidate that ties
    # at similarity 1.0.
    if candidates.empty:
        scores = []
    else:
        scores = cosine_similarity(
            query[semantic_columns], candidates[semantic_columns]
        )[0]

    # Build the ranked result on new frames so that `imdb` is never mutated.
    # The stable sort keeps tied scores in their original order.
    recommended = (
        candidates
        .drop(columns=semantic_columns)
        .assign(**{'Similarity Score': scores})
        .sort_values('Similarity Score', ascending=False, kind='stable')
        .reset_index(drop=True)
    )
    recommended = recommended[columns_to_show].iloc[:top_k]

    print('The original title information')

    # The input row has no similarity score, hence every column but the last.
    display(query[columns_to_show[:-1]])

    print()

    display(recommended)

    return recommended

As an example, the movie *Fast & Furious* (2009) is used as the input title.

In [141]:
# Show every IMDB row whose title contains 'Fast &' to locate the example input.
imdb[imdb['IMDB Title'].str.contains('Fast &')]

Unnamed: 0,IMDB ID,titleType,IMDB Title,originalTitle,isAdult,Release Year,runtimeMinutes,genres,averageRating,numVotes,...,Semantic 759,Semantic 760,Semantic 761,Semantic 762,Semantic 763,Semantic 764,Semantic 765,Semantic 766,Semantic 767,Semantic 768
28036,tt1013752,movie,Fast & Furious,Fast & Furious,0,2009,107,"Action,Thriller",6.6,264960,...,-0.583039,-0.363652,-0.208181,-0.398622,-0.009654,0.449042,-0.085996,-0.281167,0.036825,-0.253933


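The `use_genre` argument controls whether genre is used as a feature: when it is `True`, candidates are first restricted to the input title's genre group. The two cells below show the different lists of recommended titles without and with the genre filter, respectively.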

In [None]:
# Top 15 recommendations for 'Fast & Furious' with the genre filter disabled,
# saved to CSV without the index column.
recommended = get_recommendation('Fast & Furious', top_k=15, use_genre=False)
recommended.to_csv('Recommended_titles_without_genres.csv', index=False)

In [None]:
# Top 15 recommendations for 'Fast & Furious' with the default genre filter
# (use_genre=True), saved to CSV without the index column.
recommended = get_recommendation('Fast & Furious', top_k=15)
recommended.to_csv('Recommended_titles_with_genres.csv', index=False)

As you can see above, the dataset contains only one *Fast & Furious* entry. If other installments of the series were present in the data, the model could include them among the recommended titles.