In [1]:
import pandas as pd
# Load the movie metadata table; the path is relative to the notebook's working directory.
# The preview in the next cell shows its movieId, title and genres columns.
movies = pd.read_csv("ml-25m/movies.csv")

In [2]:
# Bare expression: renders the frame (pandas elides the middle rows in the output).
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [3]:
import re

def clean_movie_titles(title):
    """Return ``title`` with everything except ASCII letters, digits and spaces removed.

    Punctuation such as parentheses, commas and apostrophes is dropped (not
    replaced by a space), so "Toy Story (1995)" becomes "Toy Story 1995".
    """
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)

In [4]:
# Store a punctuation-free copy of every title in a new column, used for text matching.
movies['cleaned_titles'] = movies['title'].map(clean_movie_titles)

# Building the TF-IDF (Term Frequency–Inverse Document Frequency) Matrix

In [5]:
pip install sklearn

Note: you may need to restart the kernel to use updated packages.


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over single words and two-word sequences (ngram_range=(1,2)) of the cleaned titles.
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Fit the vocabulary on the cleaned titles and keep the resulting matrix; `search`
# compares queries against it and maps the matching row positions back onto `movies`.
tfidf =  vectorizer.fit_transform(movies['cleaned_titles'])

In [40]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, n_results=5):
    """Return the rows of ``movies`` whose titles best match ``title``.

    The query is cleaned with ``clean_movie_titles``, vectorised with the
    fitted ``vectorizer`` and compared to every row of ``tfidf`` by cosine
    similarity.

    Parameters
    ----------
    title : str
        Free-text movie title typed by the user.
    n_results : int, default 5
        Maximum number of matches to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``n_results`` rows of ``movies``, ordered from most to least
        similar, so ``.iloc[0]`` is the best match.
    """
    title = clean_movie_titles(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more rows than exist; argpartition raises if kth is out of bounds.
    k = min(n_results, similarity.size)
    if k <= 0:
        return movies.iloc[0:0]

    # argpartition isolates the k highest scores cheaply but leaves them in
    # arbitrary order, so they are ranked explicitly afterwards.
    top = np.argpartition(similarity, -k)[-k:]
    top = top[np.argsort(-similarity[top])]
    return movies.iloc[top]

In [53]:
import ipywidgets as widgets
from IPython.display import display

# Text box the user types a movie title into.
movie_input = widgets.Text(value='Toy Story', description='Movie Title:', disabled=False)

# Output area that the search results are rendered into.
movie_list = widgets.Output()


def on_type(data):
    """Refresh ``movie_list`` with search results for the text box's new value.

    ``data`` is the change notification passed by ``observe``; its ``'new'``
    entry is the current text. The previous output is always cleared, and
    nothing is shown for text shorter than 5 characters.
    """
    with movie_list:
        movie_list.clear_output()
        title = data['new']
        if len(title) < 5:
            return
        display(search(title))


# Call on_type for changes to the text box's `value`.
movie_input.observe(on_type, names='value')

display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [54]:
# Load the ratings table; the path is relative to the notebook's working directory.
# The dtypes shown in the next cell list its userId, movieId, rating and timestamp columns.
ratings = pd.read_csv("ml-25m/ratings.csv")

In [58]:
# Check the column types before filtering on movieId and rating.
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [59]:
# Ids of the users who rated movieId 1 (Toy Story in the movies table) above 4.
similar_users = ratings.loc[
    (ratings["movieId"] == 1) & (ratings["rating"] > 4), "userId"
].unique()

In [60]:
# Show the array of selected user ids.
similar_users

array([    36,     75,     86, ..., 162527, 162530, 162533])

In [67]:
# movieId of every rating above 4 given by one of the similar users (one entry per rating).
similar_user_recs = ratings.loc[
    ratings["userId"].isin(similar_users) & (ratings["rating"] > 4), "movieId"
]

In [70]:
# Per movie: number of >4 ratings from similar users divided by the number of similar users.
# NOTE(review): this cell rebinds `similar_user_recs` from its own previous value, so running
# it a second time without first re-running the cell that builds the Series gives different results.
similar_user_recs = similar_user_recs.value_counts()/len(similar_users)
# Keep only the movies whose share exceeds 10%.
similar_user_recs = similar_user_recs[similar_user_recs > .1]

In [76]:
# Every rating above 4, from any user, for the movies kept in `similar_user_recs`.
all_users = ratings.loc[
    ratings["movieId"].isin(similar_user_recs.index) & (ratings["rating"] > 4)
]

In [78]:
# Per movie: number of rows in `all_users` divided by the number of distinct users in it.
all_users_recs = all_users["movieId"].value_counts() / all_users["userId"].nunique()

In [79]:
# Show the per-movie shares computed over all users.
all_users_recs

318      0.342220
296      0.284674
2571     0.244033
356      0.235266
593      0.225909
           ...   
551      0.040918
50872    0.039111
745      0.037031
78499    0.035131
2355     0.025091
Name: movieId, Length: 113, dtype: float64

In [82]:
# Put the two per-movie shares side by side; both Series are indexed by movie id,
# so concat along axis=1 aligns them on that index.
rec_percent = pd.concat([similar_user_recs, all_users_recs], axis=1)
rec_percent.columns = ["similar", "all"]
rec_percent

Unnamed: 0,similar,all
1,1.000000,0.124728
318,0.445607,0.342220
260,0.403770,0.222207
356,0.370215,0.235266
296,0.367295,0.284674
...,...,...
953,0.103053,0.045792
551,0.101195,0.040918
1222,0.100876,0.066877
745,0.100345,0.037031


In [83]:
# score = ratio of the "similar" share to the "all" share; values above 1 mean the movie
# is liked proportionally more often by the similar users.
rec_percent["score"] = rec_percent["similar"]/rec_percent["all"]
# Highest ratio first.
rec_percent = rec_percent.sort_values("score", ascending=False)
rec_percent

Unnamed: 0,similar,all,score
1,1.000000,0.124728,8.017414
3114,0.280648,0.053706,5.225654
2355,0.110539,0.025091,4.405452
78499,0.152960,0.035131,4.354038
4886,0.235147,0.070811,3.320783
...,...,...,...
2858,0.216724,0.167634,1.292845
296,0.367295,0.284674,1.290232
79132,0.166817,0.131384,1.269693
4973,0.142501,0.112405,1.267747


In [84]:
# Attach title and genres to the ten highest-scoring rows by matching the index of
# rec_percent (movie ids) against the movieId column of `movies`.
rec_percent.head(10).merge(movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,cleaned_titles
0,1.0,0.124728,8.017414,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.280648,0.053706,5.225654,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
2264,0.110539,0.025091,4.405452,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
14813,0.15296,0.035131,4.354038,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
4780,0.235147,0.070811,3.320783,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
580,0.216618,0.067513,3.208539,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
6258,0.228139,0.072268,3.156862,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,Finding Nemo 2003
587,0.1794,0.059977,2.99115,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
8246,0.203504,0.068453,2.972889,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,Incredibles The 2004
359,0.253411,0.085764,2.954762,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,Lion King The 1994


In [87]:
def find_similar_movies(movie_id, min_rating=4, min_share=.1, top_n=10):
    """Recommend movies liked disproportionately by users who liked ``movie_id``.

    Reads the module-level ``ratings`` and ``movies`` frames.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to base the recommendations on.
    min_rating : float, default 4
        A rating counts as a "like" only when it is strictly greater than this.
    min_share : float, default 0.1
        Candidate movies must be liked by strictly more than this fraction of
        the similar users.
    top_n : int, default 10
        Number of highest-scoring movies to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres`` for up to ``top_n`` movies,
        ordered by descending score.
    """
    liked = ratings["rating"] > min_rating

    # Users who liked the given movie, and the movies those users liked.
    similar_users = ratings.loc[(ratings["movieId"] == movie_id) & liked, "userId"].unique()
    similar_user_recs = ratings.loc[ratings["userId"].isin(similar_users) & liked, "movieId"]
    # Per-movie like count as a share of the similar users; drop the rarely liked ones.
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # The same share for the candidate movies, measured over every user who
    # liked at least one of them.
    all_users = ratings.loc[ratings["movieId"].isin(similar_user_recs.index) & liked]
    all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    # Both Series are indexed by movie id, so concat aligns them per movie.
    rec_percent = pd.concat([similar_user_recs, all_users_recs], axis=1)
    rec_percent.columns = ["similar", "all"]

    # Ratio above 1: the movie is liked proportionally more by the similar users.
    rec_percent["score"] = rec_percent["similar"] / rec_percent["all"]
    rec_percent = rec_percent.sort_values("score", ascending=False)

    top = rec_percent.head(top_n).merge(movies, left_index=True, right_on="movieId")
    return top[["score", "title", "genres"]]
    

In [89]:
find_similar_movies(1)

Unnamed: 0,score,title,genres
0,8.017414,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3021,5.225654,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy
2264,4.405452,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy
14813,4.354038,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX
4780,3.320783,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy
580,3.208539,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical
6258,3.156862,Finding Nemo (2003),Adventure|Animation|Children|Comedy
587,2.99115,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX
8246,2.972889,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy
359,2.954762,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX


In [93]:
# Text box for the title to base recommendations on.
input_movie = widgets.Text(value="Toy Story", description="Movie Title:", disabled=False)

# Output area that holds the recommendation table.
recommedation_list = widgets.Output()


def on_type(data):
    """Refresh ``recommedation_list`` with recommendations for the typed title.

    ``data`` is the change notification passed by ``observe``; its ``"new"``
    entry is the current text. The previous output is always cleared, and
    nothing is shown unless the text is longer than 5 characters.

    Note: this rebinds the name ``on_type`` that the search-widget cell also defines.
    """
    with recommedation_list:
        recommedation_list.clear_output()
        title = data["new"]
        if len(title) <= 5:
            return
        # The first row returned by search() is taken as the movie the user meant.
        results = search(title)
        movieID = results.iloc[0]["movieId"]
        display(find_similar_movies(movieID))


# Call on_type for changes to the text box's `value`.
input_movie.observe(on_type, names="value")
display(input_movie, recommedation_list)

Text(value='Toy Story', description='Movie Title:')

Output()