In [16]:
import pandas as pd

# Load the movie catalogue; the preview below shows movieId, title and genres columns.
movies = pd.read_csv("movies.csv")


In [17]:
movies.head()


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [28]:
import re

def clean_title(title):
    """Return ``title`` with everything except ASCII letters, digits and spaces removed.

    Dropped characters are deleted outright rather than replaced by a space,
    so "Toy Story (1995)" becomes "Toy Story 1995".
    """
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)



In [29]:
# Add a punctuation-free copy of each title; the TF-IDF vectorizer is fitted on this column.
movies["clean_title"] = movies["title"].apply(clean_title)

In [19]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
# ngram_range=(1,2): index both single words and pairs of adjacent words from each title.
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Fit on the cleaned titles and keep the resulting matrix; search() compares query vectors against it.
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [41]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
    """Return the rows of ``movies`` whose cleaned titles best match ``title``.

    Parameters
    ----------
    title : str
        Free-text query. It is passed through ``clean_title`` so that it is
        normalised the same way as the indexed titles.
    top_n : int, default 5
        Maximum number of matches to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movies``, ordered from most to least similar
        by cosine similarity between the query and the TF-IDF title matrix.
    """
    # Use the caller's query (previously it was overwritten by a hard-coded title).
    query_vec = vectorizer.transform([clean_title(title)])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more rows than exist: argpartition raises on an out-of-range kth.
    top_n = min(top_n, len(similarity))
    if top_n <= 0:
        return movies.iloc[[]]

    # argpartition only guarantees that the last top_n slots hold the largest
    # scores; their relative order is undefined ...
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    # ... so rank just those candidates explicitly, best match first.
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]
    return movies.iloc[ranked]


In [45]:
import ipywidgets as widgets
from IPython.display import display

# Text box pre-filled with a sample query, and an output area for the matches.
movie_input = widgets.Text(value="Toy Story", description="Movie Title:", disabled=False)
movie_list = widgets.Output()


def on_type(data):
    """Refresh the list of matching movies when the text box changes.

    ``data`` is the change notification handed over by ``observe``; its "new"
    entry is read as the updated text. The output is always cleared, and a
    search is only run for queries longer than five characters.
    """
    with movie_list:
        movie_list.clear_output()
        typed = data["new"]
        if len(typed) <= 5:
            return
        display(search(typed))


# Run on_type on every change of the box's `value` trait.
movie_input.observe(on_type, names='value')
display(movie_input, movie_list)


Text(value='Toy Story', description='Movie Title:')

Output()

In [51]:
import ipywidgets as widgets
from IPython.display import display

# Query box seeded with a sample title, plus the area the results are drawn into.
movie_input = widgets.Text(value='Toy Story', description='Movie Title:', disabled=False)
movie_list = widgets.Output()


def on_type(data):
    """Redraw the search results for the text currently in the box.

    `data` is the change notification delivered by `observe`; `data["new"]`
    is read as the updated text. Queries of five characters or fewer only
    clear the output.
    """
    with movie_list:
        movie_list.clear_output()
        query = data["new"]
        if len(query) <= 5:
            return
        display(search(query))


# Call on_type for every change to the box's `value` trait.
movie_input.observe(on_type, names='value')

display(movie_input, movie_list)


Text(value='Toy Story', description='Movie Title:')

Output()

In [52]:
# Example movie used to prototype the recommendation steps in the following cells
# (the merged results table further down lists movieId 89745 as "Avengers, The (2012)").
movie_id = 89745

# Intended function header; left commented out while the steps are run inline first.
#def find_similar_movies(movie_id):
# Catalogue row(s) for the example movie.
movie = movies[movies["movieId"] == movie_id]

In [55]:
# Load the user ratings; the dtypes output below lists userId, movieId, rating and timestamp.
ratings = pd.read_csv("ratings.csv")

In [56]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp    float64
dtype: object

In [57]:
# Users who rated the example movie strictly above 4; they act as the "similar" audience.
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()


In [58]:
# movieId of every rating above 4 given by those users (one entry per qualifying rating row).
similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]


In [59]:
# Turn the per-movie like counts into the share of similar users who liked each movie.
# NOTE(review): this cell overwrites its own input, so re-run the previous cell
# before re-running this one.
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

# Keep only movies liked by more than 10% of the similar users.
similar_user_recs = similar_user_recs[similar_user_recs > .10]


In [60]:
# Ratings above 4, from any user, for the candidate movies in similar_user_recs.
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]


In [61]:
all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
# Share of the users in all_users (anyone who rated at least one candidate movie
# above 4) who liked each movie.


In [62]:
# Put the two shares side by side, aligned on the movieId index:
# "similar" = share among similar users, "all" = share among everyone in all_users.
rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
rec_percentages.columns = ["similar", "all"]

In [63]:
rec_percentages


Unnamed: 0,similar,all
89745,1.000000,0.040380
58559,0.705882,0.127078
79132,0.647059,0.133017
59315,0.529412,0.048694
112852,0.500000,0.051069
...,...,...
53125,0.117647,0.009501
1265,0.117647,0.076010
7147,0.117647,0.029691
84772,0.117647,0.007126


In [64]:
# Ratio of the two shares: how much more often similar users liked a movie than the wider group did.
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]


In [65]:
# Highest score first, so the strongest recommendations come out on top.
rec_percentages = rec_percentages.sort_values("score", ascending=False)


In [66]:
# Attach catalogue columns (title, genres, ...) to the ten highest-scoring movies;
# rec_percentages is indexed by movieId, hence left_index / right_on.
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")


Unnamed: 0,similar,all,score,movieId,title,genres,clean title,clean_title
17067,1.0,0.04038,24.764706,89745,"Avengers, The (2012)",Action|Adventure|Sci-Fi|IMAX,", ()",Avengers The 2012
25068,0.176471,0.008314,21.226891,122914,Avengers: Infinity War - Part II (2019),Action|Adventure|Sci-Fi,: - (),Avengers Infinity War Part II 2019
19891,0.117647,0.005938,19.811765,103253,Elysium (2013),Action|Drama|Sci-Fi|IMAX,(),Elysium 2013
20018,0.117647,0.005938,19.811765,103772,"Wolverine, The (2013)",Action|Adventure|Fantasy|Sci-Fi,", ()",Wolverine The 2013
15674,0.176471,0.009501,18.573529,82461,Tron: Legacy (2010),Action|Adventure|Sci-Fi|IMAX,: (),Tron Legacy 2010
16523,0.176471,0.009501,18.573529,87222,Kung Fu Panda 2 (2011),Action|Adventure|Animation|Children|Comedy|IMAX,(),Kung Fu Panda 2 2011
19807,0.176471,0.009501,18.573529,102903,Now You See Me (2013),Crime|Mystery|Thriller,(),Now You See Me 2013
17178,0.147059,0.008314,17.689076,90249,Real Steel (2011),Action|Drama|Sci-Fi|IMAX,(),Real Steel 2011
30139,0.147059,0.008314,17.689076,135569,Star Trek Beyond (2016),Action|Adventure|Sci-Fi,(),Star Trek Beyond 2016
17470,0.205882,0.011876,17.335294,91542,Sherlock Holmes: A Game of Shadows (2011),Action|Adventure|Comedy|Crime|Mystery|Thriller,: (),Sherlock Holmes A Game of Shadows 2011


In [67]:
def find_similar_movies(movie_id, min_rating=4, min_share=.10, top_n=10):
    """Recommend movies that fans of ``movie_id`` like disproportionately often.

    A rating counts as a "like" when it is strictly greater than ``min_rating``.
    Users who liked ``movie_id`` form the similar audience. Each movie liked by
    more than ``min_share`` of that audience is scored as

        score = (share of similar users who liked it)
                / (share of all users who liked any candidate movie and liked it)

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to find recommendations for.
    min_rating : float, default 4
        Ratings strictly above this value count as a like.
    min_share : float, default 0.10
        Minimum share of the similar audience that must like a movie for it to
        be considered a candidate.
    top_n : int, default 10
        Number of highest-scoring candidates to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres`` for at most ``top_n`` movies,
        taken from the candidates sorted by descending score.
    """
    # Filter the ratings table down to "likes" once instead of re-evaluating
    # the same rating mask over the full table for every step.
    liked = ratings[ratings["rating"] > min_rating]

    # Audience: users who liked the target movie, and what else they liked.
    similar_users = liked[liked["movieId"] == movie_id]["userId"].unique()
    similar_user_recs = liked[liked["userId"].isin(similar_users)]["movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # Baseline: how popular each candidate is among everyone who liked any candidate.
    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    # rec_percentages is indexed by movieId, hence left_index / right_on.
    return rec_percentages.head(top_n).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]


In [68]:
import ipywidgets as widgets
from IPython.display import display

# Title box seeded with a sample query, and the area recommendations are drawn into.
movie_name_input = widgets.Text(value='Toy Story', description='Movie Title:', disabled=False)
recommendation_list = widgets.Output()


def on_type(data):
    """Show recommendations for the movie found from the typed title.

    `data` is the change notification delivered by `observe`; `data["new"]`
    is read as the updated text. The output is always cleared. For queries
    longer than five characters, the movieId in the first row returned by
    `search` is passed to `find_similar_movies` and the result is displayed.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        query = data["new"]
        if len(query) <= 5:
            return
        first_hit_id = search(query).iloc[0]["movieId"]
        display(find_similar_movies(first_hit_id))


# Call on_type for every change to the box's `value` trait.
movie_name_input.observe(on_type, names='value')

display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()