In [1]:
import pandas as pd

In [2]:
import string

In [3]:
movies=pd.read_csv('movies.csv')

In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
def clean_title(title):
    cleantitle=[c for c in title if c not in string.punctuation]
    cleantitle=''.join(cleantitle)
    return cleantitle

In [6]:
movies['clean_title']=movies['title'].apply(clean_title)

In [7]:
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,2))

tfidf = vectorizer.fit_transform(movies["clean_title"])

In [9]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title):
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argpartition(similarity, -5)[-5:]
    results = movies.iloc[indices].iloc[::-1]
    
    return results

In [10]:
search('Toy story(1994)')

Unnamed: 0,movieId,title,genres,clean_title
14813,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
3021,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
4823,4929,"Toy, The (1982)",Comedy,Toy The 1982
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
59767,201588,Toy Story 4 (2019),Adventure|Animation|Children|Comedy,Toy Story 4 2019


In [11]:
import ipywidgets as widgets
from IPython.display import display

movie_input = widgets.Text(
    description='Movie Title:')
movie_list=widgets.Output()

def on_type(data):
    with movie_list:
        movie_list.clear_output()
        title=data['new']
        if len(title)>5:
            display(search(title))
movie_input.observe(on_type, names='value')

In [12]:
display(movie_input, movie_list)

Text(value='', description='Movie Title:')

Output()

In [13]:
ratings=pd.read_csv('ratings.csv')

In [14]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [15]:
movie_id = 89745

#def find_similar_movies(movie_id):
movie = movies[movies["movieId"] == movie_id]

In [16]:
#tim nguoi cung thich 1 bo phim
similar_user = ratings[(ratings['movieId']==movie_id) & (ratings['rating']>=4)]['userId'].unique()

In [17]:
#tim nhung bo phim khac ma similar_user thich
similar_user_recs = ratings[(ratings['userId']).isin(similar_user)&(ratings['rating']>=4)]['movieId']

In [18]:
similar_user_recs=similar_user_recs.value_counts()/len(similar_user)
similar_user_recs = similar_user_recs[similar_user_recs > .10]

In [19]:
similar_user_recs

89745     1.000000
58559     0.708627
79132     0.684536
2571      0.665968
59315     0.655780
            ...   
38061     0.101504
61132     0.101219
88810     0.101028
4720      0.100743
103042    0.100457
Name: movieId, Length: 370, dtype: float64

đến đây có thể recommend rồi, nhưng để tối ưu thì sẽ tránh việc rcmd 1 bộ phim mà thích theo số đông (ví dụ mọi người đều thích Avengers nhưng thực sự gout phim của họ không phải siêu anh hùng ...)  


In [20]:
all_users = ratings[(ratings['movieId'].isin(similar_user_recs.index))&(ratings['rating']>=4)]

In [21]:

all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

In [22]:
all_user_recs

318       0.440875
296       0.390244
356       0.368104
593       0.362440
2571      0.348516
            ...   
103228    0.012117
122906    0.011686
122914    0.011673
106072    0.011404
103042    0.009210
Name: movieId, Length: 370, dtype: float64

In [23]:
rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
rec_percentages.columns = ["similar", "all"]

In [24]:
rec_percentages

Unnamed: 0,similar,all
89745,1.000000,0.065662
58559,0.708627,0.201418
79132,0.684536,0.185193
2571,0.665968,0.348516
59315,0.655780,0.099681
...,...,...
38061,0.101504,0.027573
61132,0.101219,0.018438
88810,0.101028,0.023971
4720,0.100743,0.046054


In [25]:
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

In [26]:
rec_percentages = rec_percentages.sort_values("score", ascending=False)

In [27]:
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
17067,1.0,0.065662,15.229575,89745,"Avengers, The (2012)",Action|Adventure|Sci-Fi|IMAX,Avengers The 2012
25058,0.289088,0.022877,12.63651,122892,Avengers: Age of Ultron (2015),Action|Adventure|Sci-Fi,Avengers Age of Ultron 2015
20513,0.142354,0.011404,12.482574,106072,Thor: The Dark World (2013),Action|Adventure|Fantasy|IMAX,Thor The Dark World 2013
19678,0.274424,0.023021,11.920596,102125,Iron Man 3 (2013),Action|Sci-Fi|Thriller|IMAX,Iron Man 3 2013
21348,0.356504,0.030543,11.672371,110102,Captain America: The Winter Soldier (2014),Action|Adventure|Sci-Fi|IMAX,Captain America The Winter Soldier 2014
25071,0.253856,0.021952,11.56424,122920,Captain America: Civil War (2016),Action|Sci-Fi|Thriller,Captain America Civil War 2016
16725,0.288897,0.025022,11.54586,88140,Captain America: The First Avenger (2011),Action|Adventure|Sci-Fi|Thriller|War,Captain America The First Avenger 2011
25061,0.208151,0.018932,10.994667,122900,Ant-Man (2015),Action|Adventure|Sci-Fi,AntMan 2015
19841,0.100457,0.00921,10.907809,103042,Man of Steel (2013),Action|Adventure|Fantasy|Sci-Fi|IMAX,Man of Steel 2013
16312,0.246048,0.022746,10.817268,86332,Thor (2011),Action|Adventure|Drama|Fantasy|IMAX,Thor 2011


In [28]:
def find_similar_movies(movie_id):
    movie = movies[movies["movieId"] == movie_id]
    similar_users=ratings[(ratings['movieId']==movie_id)&(ratings['rating']>=4)]['userId'].unique()
    similar_user_recs=ratings[(ratings['userId'].isin(similar_users))&(ratings['rating']>=4)]['movieId']
    similar_user_recs=similar_user_recs.value_counts()/len(similar_users)
    
    similar_user_recs = similar_user_recs[similar_user_recs > .10]
    all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

In [29]:
import ipywidgets as widgets
from IPython.display import display

movie_name_input = widgets.Text(
    description='Movie Title:',
    disabled=False
)
recommendation_list = widgets.Output()

def on_type(data):
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            results = search(title)
            movie_id = results.iloc[0]["movieId"]
            display(find_similar_movies(movie_id))

movie_name_input.observe(on_type, names='value')

display(movie_name_input, recommendation_list)

Text(value='', description='Movie Title:')

Output()