In [13]:
import pandas as pd
import numpy as np

In [14]:
# Data source: MovieLens 25M archive
#   https://files.grouplens.org/datasets/movielens/ml-25m.zip
# Reads movies.csv from the current working directory
# (presumably extracted from the archive above — confirm before running).
movies = pd.read_csv("movies.csv")

In [15]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [16]:
#cleaning movie titles with regex
import re

In [17]:
def clean_title(title):
    """Strip every character that is not an ASCII letter, a digit or a space.

    Removed characters are deleted, not replaced, so
    "Toy Story (1995)" becomes "Toy Story 1995".
    """
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)

In [32]:
movies["clean_title"] = movies["title"].apply(clean_title)

In [33]:
#creating a tfidf matrix 
from sklearn.feature_extraction.text import TfidfVectorizer

In [34]:
# ngram_range=(1,2): extract both single words (unigrams) and pairs of
# adjacent words (bigrams) as features.
vectorizer = TfidfVectorizer(ngram_range=(1,2))

In [35]:
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [36]:
#creating a search function
from sklearn.metrics.pairwise import cosine_similarity

In [45]:
title="Toy Story 1995"
title = clean_title(title)
query_vec = vectorizer.transform([title])
query_vec

<1x170073 sparse matrix of type '<class 'numpy.float64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [46]:
similarity = cosine_similarity(query_vec, tfidf).flatten()
similarity

array([1.        , 0.09681098, 0.06531543, ..., 0.        , 0.        ,
       0.        ])

In [47]:
# Row positions of the 5 largest similarity scores.
# np.argpartition only guarantees that the last 5 positions hold the 5 largest
# values; it does not sort those 5 relative to each other.
indices = np.argpartition(similarity, -5)[-5:]
indices

array([20497, 14813, 59767,  3021,     0], dtype=int64)

In [48]:
results = movies.iloc[indices]                  # gives the names of 5 most similar movies titles
results

Unnamed: 0,movieId,title,genres,clean_title
20497,106022,Toy Story of Terror (2013),Animation|Children|Comedy,Toy Story of Terror 2013
14813,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
59767,201588,Toy Story 4 (2019),Adventure|Animation|Children|Comedy,Toy Story 4 2019
3021,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995


In [50]:
# Select the 5 candidate rows and reverse their order.
# NOTE: because np.argpartition leaves the top 5 unsorted, the reversed rows
# are not guaranteed to be ordered by similarity (ascending or descending).
results = movies.iloc[indices].iloc[::-1]
results

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
59767,201588,Toy Story 4 (2019),Adventure|Animation|Children|Comedy,Toy Story 4 2019
14813,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
20497,106022,Toy Story of Terror (2013),Animation|Children|Comedy,Toy Story of Terror 2013


In [51]:
def search(title, top_n=5):
    """Return the movies whose cleaned titles best match ``title``.

    The query is cleaned with ``clean_title``, transformed with the fitted
    ``vectorizer`` and compared against every row of ``tfidf`` using cosine
    similarity.

    Parameters
    ----------
    title : str
        Free-text movie title to look up.
    top_n : int, default 5
        Maximum number of rows to return. Clamped to the number of rows in
        ``tfidf`` so small corpora do not raise.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movies``, ordered from most to least similar.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Clamp k: np.argpartition raises if kth falls outside the array.
    k = min(top_n, similarity.size)
    if k <= 0:
        return movies.iloc[0:0]

    # argpartition picks the k best candidates without a full sort, but leaves
    # them unordered among themselves ...
    top = np.argpartition(similarity, -k)[-k:]
    # ... so rank just those k by descending similarity.
    ranked = top[np.argsort(-similarity[top], kind="stable")]
    return movies.iloc[ranked]

In [52]:
# Building an interactive search box with Jupyter
# pip install ipywidgets
# jupyter labextension install @jupyter-widgets/jupyterlab-manager

In [53]:
import ipywidgets as widgets
from IPython.display import display

In [55]:
movie_input = widgets.Text(
    value='',
    description='Movie Title:',
    disabled=False
)
movie_list = widgets.Output()      # Output widget

In [57]:
def on_type(data):
    """Refresh the result list whenever the text box value changes.

    ``data`` is the change notification passed by ``observe``; its ``"new"``
    entry is read as the current text. The output area is always cleared, and
    a search is run only when the text is longer than 5 characters.
    """
    with movie_list:
        movie_list.clear_output()
        typed = data["new"]
        if len(typed) <= 5:
            # Too short to search; leave the output area empty.
            return
        display(search(typed))

movie_input.observe(on_type, names='value')

In [58]:
display(movie_input, movie_list)

Text(value='', description='Movie Title:')

Output()