In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
movies=pd.read_csv("movies.csv")

In [3]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
movies.shape

(62423, 3)

In [6]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  62423 non-null  int64 
 1   title    62423 non-null  object
 2   genres   62423 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


In [7]:
#cleaning movies title's with regex
import re
def clean_title(title):
    return re.sub("[^a-zA-Z0-9 ]","",title)

In [8]:
movies["clean_title"]=movies["title"].apply(clean_title)

In [9]:
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer=TfidfVectorizer(ngram_range=(1,2))
tfidf=vectorizer.fit_transform(movies["clean_title"])

In [11]:
#creating search function
from sklearn.metrics.pairwise import cosine_similarity
def search(title):
    
    titles=clean_title(title)
    query_vec=vectorizer.transform([title])#convert the entered search terms into numbers 
    similarity=cosine_similarity(query_vec,tfidf).flatten()#returns the similar titles
    indices=np.argpartition(similarity,-5)[-10:]
    results=movies.iloc[indices][::-1]
    return results

In [14]:
results

NameError: name 'results' is not defined

In [16]:
search("the hulk")

Unnamed: 0,movieId,title,genres,clean_title
6411,6534,Hulk (2003),Action|Adventure|Sci-Fi,Hulk 2003
12425,60040,"Incredible Hulk, The (2008)",Action|Sci-Fi,Incredible Hulk The 2008
51827,183983,Hulk: Where Monsters Dwell (2016),Action|Animation|Fantasy|Sci-Fi,Hulk Where Monsters Dwell 2016
32940,142056,Iron Man & Hulk: Heroes United (2013),Action|Adventure|Animation,Iron Man Hulk Heroes United 2013
45854,171251,"Nobody Speak: Hulk Hogan, Gawker and Trials of...",Documentary,Nobody Speak Hulk Hogan Gawker and Trials of a...
7028,7153,"Lord of the Rings: The Return of the King, The...",Action|Adventure|Drama|Fantasy,Lord of the Rings The Return of the King The 2003
9160,27307,Night of the Day of the Dawn of the Son of the...,Comedy|Horror,Night of the Day of the Dawn of the Son of the...
4152,4256,"Center of the World, The (2001)",Drama,Center of the World The 2001
59531,201062,The House at the Edge of the Galaxy (2013),Drama|Mystery,The House at the Edge of the Galaxy 2013
51742,183805,"The Good, the Bad & the Corny (2017)",Comedy,The Good the Bad the Corny 2017


In [29]:
#creating interactive jupyter notebook
import ipywidgets as widgets
from IPython.display import display
movie_input=widgets.Text(
        description="Movie Title",
        disable=False)
movie_list=widgets.Output()

def on_type(data):
    with movie_list:
        movie_list.clear_output()
        title=data["new"]
        if len(title)>5:
            display(search(title))
movie_input.observe(on_type,names="value")
display(movie_input,movie_list)


Text(value='', description='Movie Title')

Output()

In [30]:
ratings=pd.read_csv("ratings.csv")

In [31]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [32]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [135]:
#finding users who liked same movies
movie_id=1
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()

In [136]:
similar_users

array([    36,     75,     86, ..., 162527, 162530, 162533], dtype=int64)

In [137]:
#finding movies that peoples(same as users) likes 
similar_user_recs=ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"]>4)]["movieId"]

In [138]:
#

In [139]:
#finding only the movies that pepols similar to user likes 
similar_user_recs

5101            1
5105           34
5111          110
5114          150
5127          260
            ...  
24998854    60069
24998861    67997
24998876    78499
24998884    81591
24998888    88129
Name: movieId, Length: 1358326, dtype: int64

In [140]:
similar_user_recs=similar_user_recs.value_counts()/len(similar_users)


In [141]:
#takking movies only greater than 10% of users like
similar_user_recs=similar_user_recs[similar_user_recs>.10]

In [142]:
similar_user_recs

movieId
1        1.000000
318      0.445607
260      0.403770
356      0.370215
296      0.367295
           ...   
953      0.103053
551      0.101195
1222     0.100876
745      0.100345
48780    0.100186
Name: count, Length: 113, dtype: float64

finding how much all users likes movies

In [143]:
 all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]

In [144]:
  all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

In [145]:
rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)

In [146]:
 rec_percentages.columns = ["similar", "all"]

In [147]:
rec_percentages

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.000000,0.124728
318,0.445607,0.342220
260,0.403770,0.222207
356,0.370215,0.235266
296,0.367295,0.284674
...,...,...
953,0.103053,0.045792
551,0.101195,0.040918
1222,0.100876,0.066877
745,0.100345,0.037031


In [148]:
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

In [149]:
rec_percentages = rec_percentages.sort_values("score", ascending=False)


In [150]:
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")


Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,1.0,0.124728,8.017414,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.280648,0.053706,5.225654,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
2264,0.110539,0.025091,4.405452,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
14813,0.15296,0.035131,4.354038,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
4780,0.235147,0.070811,3.320783,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
580,0.216618,0.067513,3.208539,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
6258,0.228139,0.072268,3.156862,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,Finding Nemo 2003
587,0.1794,0.059977,2.99115,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
8246,0.203504,0.068453,2.972889,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,Incredibles The 2004
359,0.253411,0.085764,2.954762,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,Lion King The 1994


In [151]:
def find_similar_movies(movie_id):
    similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()
    similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

    similar_user_recs = similar_user_recs[similar_user_recs > .10]
    all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]


In [152]:
import ipywidgets as widgets
from IPython.display import display

movie_name_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)
recommendation_list = widgets.Output()

def on_type(data):
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            results = search(title)
            movie_id = results.iloc[0]["movieId"]
            display(find_similar_movies(movie_id))

movie_name_input.observe(on_type, names='value')

display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()