<a href="https://colab.research.google.com/github/saketh1999/MovieRec/blob/main/MovieRec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
# Load the movie catalogue; later cells use its movieId, title and genres columns.
movies = pd.read_csv("movies.csv")

In [None]:
import re  # Python's regular-expression library


def clean_title(title):
  """Return ``title`` with every character except letters, digits and spaces removed.

  Args:
    title: Raw movie title, e.g. ``"Toy Story (1995)"``.

  Returns:
    The cleaned title, e.g. ``"Toy Story 1995"``.
  """
  # The space must sit INSIDE the character class. With it outside
  # ("[^a-zA-Z0-9] ") the pattern only matched a special character that was
  # followed by a space, so "(1995)" survived and "Life, A" became "LifeA".
  return re.sub("[^a-zA-Z0-9 ]", "", title)

In [None]:
# Add a search-friendly copy of every title, produced by clean_title.
movies["Clean_Title"] = movies["title"].map(clean_title)

In [None]:
# Show the frame to inspect the new Clean_Title column next to the raw title.
movies

Unnamed: 0,movieId,title,genres,Clean_Title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995)
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji (1995)
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II (1995)
...,...,...,...,...
62418,209157,We (2018),Drama,We (2018)
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul (2001)
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems (2018)
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing (2001)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Index single words as well as two-word phrases, so a query such as
# "toy story" can match on the pair and not only on the individual words.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
)

# Fit on the cleaned titles and turn each one into a row of the TF-IDF matrix.
tfidf = vectorizer.fit_transform(movies["Clean_Title"])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
def search(title, top_n=5):
  """Return the ``top_n`` movies whose cleaned titles best match ``title``.

  The query is cleaned with ``clean_title``, transformed by the module-level
  ``vectorizer`` and compared with every row of ``tfidf`` using cosine
  similarity.

  Args:
    title: Free-text movie title to look up.
    top_n: Maximum number of matches to return (defaults to 5, as before).

  Returns:
    Rows of ``movies`` ordered from most to least similar, so ``.iloc[0]`` is
    the best match. Empty if there is nothing to search.
  """
  title = clean_title(title)
  query_vec = vectorizer.transform([title])  # the query as a TF-IDF vector
  similarity = cosine_similarity(query_vec, tfidf).flatten()

  # Never request more rows than exist; argpartition raises when kth is out of range.
  top_n = min(top_n, len(similarity))
  if top_n <= 0:
    return movies.iloc[[]]

  # argpartition only guarantees that the last top_n slots hold the largest
  # scores, not that they are ordered, so rank those candidates explicitly.
  candidates = np.argpartition(similarity, -top_n)[-top_n:]
  indices = candidates[np.argsort(similarity[candidates])[::-1]]
  return movies.iloc[indices]


In [None]:
# `results` is only a local variable inside search(), so reading it here fails
# on a fresh kernel. Run an explicit example query instead.
# NOTE(review): the query that produced the saved output is not recorded in the
# notebook; "Men" is an illustrative stand-in.
results = search("Men")
results

Unnamed: 0,movieId,title,genres,Clean_Title
11003,47484,G Men (1935),Crime|Drama,G Men (1935)
3692,3793,X-Men (2000),Action|Adventure|Sci-Fi,X-Men (2000)
28489,131824,Men... (1985),Comedy,Men..(1985)
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men (1995)
1126,1154,T-Men (1947),Film-Noir,T-Men (1947)


In [None]:
import ipywidgets as widgets
from IPython.display import display

# Text box for the movie title, pre-filled with an example query.
movie_input = widgets.Text(
    value="Toy Story",
    description="Movie Title: ",
    disabled=False,  # was misspelled "diable", which is not a Text widget option
)
# Output area that on_type fills with the search results.
movie_list = widgets.Output()

def on_type(data):
  """Refresh the search results whenever the text box changes.

  ``data`` is the change notification passed by ``observe``; its ``"new"``
  entry is read as the current text. Queries of five characters or fewer are
  ignored and leave the output area empty.
  """
  with movie_list:
    movie_list.clear_output()
    typed = data["new"]
    if len(typed) <= 5:
      return
    display(search(typed))

# Call on_type on every change of the text box's value, then render the text
# box together with its output area.
movie_input.observe(on_type, names="value")
display(movie_input, movie_list)


Text(value='Toy Story', description='Movie Title: ')

Output()

In [None]:
# Load the user ratings; later cells use its userId, movieId and rating columns.
ratings = pd.read_csv("ratings.csv")


In [None]:
# Show the ratings frame to check what was loaded.
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296.0,5.0,1.147880e+09
1,1,306.0,3.5,1.147869e+09
2,1,307.0,5.0,1.147869e+09
3,1,665.0,5.0,1.147879e+09
4,1,899.0,3.5,1.147869e+09
...,...,...,...,...
1821983,12132,953.0,5.0,1.547748e+09
1821984,12132,954.0,4.0,1.547749e+09
1821985,12132,969.0,4.0,1.547749e+09
1821986,12132,1028.0,3.5,1.547750e+09


In [None]:
# Movie used for the step-by-step walkthrough below (movieId 1 is "Toy Story (1995)" in movies).
movie_id = 1

In [None]:
# Similar users: the distinct users who rated the selected movie 5 or higher.
similar_users = ratings.loc[
    (ratings["movieId"] == movie_id) & (ratings["rating"] >= 5),
    "userId",
].unique()


In [None]:
# Inspect the array of user ids selected above.
similar_users

array([   36,    75,    86, ..., 12113, 12115, 12131])

In [None]:
# Every movie that one of the similar users rated above 4 (one entry per rating).
similar_user_recs = ratings.loc[
    ratings["userId"].isin(similar_users) & (ratings["rating"] > 4),
    "movieId",
]

In [None]:
# Turn the per-movie counts into the share of similar users who liked each movie...
similar_user_recs = similar_user_recs.value_counts().div(len(similar_users))
# ...and keep only movies liked by more than 10% of them.
similar_user_recs = similar_user_recs.loc[similar_user_recs.gt(0.1)]

In [None]:
# Print the share of similar users who liked each remaining candidate movie.
print(similar_user_recs)

1.0       1.000000
318.0     0.411708
260.0     0.377159
296.0     0.338772
356.0     0.334933
            ...   
1259.0    0.103647
4973.0    0.103647
1387.0    0.103647
1278.0    0.102687
2396.0    0.101727
Name: movieId, Length: 89, dtype: float64


In [None]:
# Ratings above 4, from any user, for the candidate movies found above.
all_users = ratings.loc[
    ratings["movieId"].isin(similar_user_recs.index) & (ratings["rating"] > 4)
]
print(all_users)

         userId  movieId  rating     timestamp
0             1    296.0     5.0  1.147880e+09
29            1   4973.0     4.5  1.147869e+09
72            2    110.0     5.0  1.141417e+09
76            2    260.0     5.0  1.141417e+09
79            2    318.0     5.0  1.141417e+09
...         ...      ...     ...           ...
1821936   12132    111.0     4.5  1.547748e+09
1821943   12132    541.0     4.5  1.547748e+09
1821948   12132    608.0     4.5  1.547748e+09
1821958   12132    904.0     4.5  1.547748e+09
1821963   12132    912.0     4.5  1.547748e+09

[108212 rows x 4 columns]


In [None]:
# Share of the users in all_users who liked each candidate movie.
all_users_rec = all_users["movieId"].value_counts().div(
    len(pd.unique(all_users["userId"]))
)
print(all_users_rec)

318.0      0.347342
296.0      0.291307
2571.0     0.248922
356.0      0.233926
593.0      0.232489
             ...   
377.0      0.042744
1278.0     0.040769
78499.0    0.037177
745.0      0.036009
2355.0     0.023886
Name: movieId, Length: 89, dtype: float64


In [None]:
# Put both shares side by side, aligned on movieId.
rec_percentages = pd.concat(
    [similar_user_recs, all_users_rec],
    axis=1,
)
rec_percentages.columns = ["similar", "all"]
print(rec_percentages)

         similar       all
1.0     1.000000  0.127425
318.0   0.411708  0.347342
260.0   0.377159  0.218930
296.0   0.338772  0.291307
356.0   0.334933  0.233926
...          ...       ...
1259.0  0.103647  0.050198
4973.0  0.103647  0.108657
1387.0  0.103647  0.047324
1278.0  0.102687  0.040769
2396.0  0.101727  0.045169

[89 rows x 2 columns]


In [None]:
# Score = how much more often similar users liked the movie than users overall.
rec_percentages["score"] = rec_percentages["similar"].div(rec_percentages["all"])

In [None]:
# Highest score first.
rec_percentages = rec_percentages.sort_values(by="score", ascending=False)
print(rec_percentages)

          similar       all     score
1.0      1.000000  0.127425  7.847780
3114.0   0.290787  0.052173  5.573500
2355.0   0.112284  0.023886  4.700735
588.0    0.247601  0.070312  3.521433
78499.0  0.127639  0.037177  3.433308
...           ...       ...       ...
79132.0  0.136276  0.131286  1.038012
2858.0   0.171785  0.165499  1.037981
7153.0   0.173704  0.172683  1.005914
4973.0   0.103647  0.108657  0.953893
2959.0   0.193858  0.221354  0.875782

[89 rows x 3 columns]


In [None]:
# Attach title and genres to the ten highest-scoring movies
# (rec_percentages is indexed by movieId).
(
    rec_percentages
    .head(10)
    .merge(movies, left_index=True, right_on="movieId")
)

Unnamed: 0,similar,all,score,movieId,title,genres,Clean_Title
0,1.0,0.127425,7.84778,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995)
3021,0.290787,0.052173,5.5735,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 (1999)
2264,0.112284,0.023886,4.700735,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bug's LifeA (1998)
580,0.247601,0.070312,3.521433,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin (1992)
14813,0.127639,0.037177,3.433308,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 (2010)
587,0.211132,0.064386,3.279178,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast (1991)
33,0.166027,0.051994,3.193221,34,Babe (1995),Children|Drama,Babe (1995)
729,0.108445,0.036009,3.011588,745,Wallace & Gromit: A Close Shave (1995),Animation|Children|Comedy,Wallace GromitA Close Shave (1995)
1047,0.153551,0.053251,2.883545,1073,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical,Willy Wonka the Chocolate Factory (1971)
359,0.254319,0.088991,2.857812,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,Lion KingThe (1994)


In [None]:
def find_similar_movies(movie_id, ratings_df=None, movies_df=None):
    """Recommend movies liked disproportionately by fans of ``movie_id``.

    "Similar users" are those who rated ``movie_id`` 5 or higher. A candidate
    is any movie that more than 10% of them rated above 4. Each candidate's
    score is the share of similar users who liked it divided by the share of
    all users (among those who liked any candidate) who liked it.

    Args:
        movie_id: ``movieId`` of the movie to base recommendations on.
        ratings_df: Ratings frame with ``userId``, ``movieId`` and ``rating``
            columns. Defaults to the notebook-level ``ratings``.
        movies_df: Movie frame with ``movieId``, ``title`` and ``genres``
            columns. Defaults to the notebook-level ``movies``.

    Returns:
        DataFrame with columns ``score``, ``title`` and ``genres`` holding up
        to ten movies, highest score first. Empty when nobody rated
        ``movie_id`` 5 or higher.
    """
    if ratings_df is None:
        ratings_df = ratings
    if movies_df is None:
        movies_df = movies

    similar_users = ratings_df[
        (ratings_df["movieId"] == movie_id) & (ratings_df["rating"] >= 5)
    ]["userId"].unique()
    if len(similar_users) == 0:
        # Nothing to base a recommendation on; also avoids dividing by zero below.
        return pd.DataFrame(columns=["score", "title", "genres"])

    similar_user_recs = ratings_df[
        ratings_df["userId"].isin(similar_users) & (ratings_df["rating"] > 4)
    ]["movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > .1]

    all_users = ratings_df[
        ratings_df["movieId"].isin(similar_user_recs.index) & (ratings_df["rating"] > 4)
    ]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    # Use the baseline computed just above. The previous code read the global
    # `all_users_rec` left over from the exploratory cells, so every movie was
    # scored against the baseline of whichever movie those cells last analysed.
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(10).merge(
        movies_df, left_index=True, right_on="movieId"
    )[["score", "title", "genres"]]

In [None]:
# Text box for the title to base recommendations on, pre-filled with an example.
movie_name_input = widgets.Text(
    value="Toy Story",
    description="Movie Title:",
    disabled=False,
)
# Output area that on_type fills with the recommendation table.
recommendation_list = widgets.Output()
def on_type(data):
  """Refresh the recommendations whenever the text box changes.

  ``data`` is the change notification passed by ``observe``; its ``"new"``
  entry is read as the current text. Queries of five characters or fewer are
  ignored. Otherwise the first row returned by ``search`` supplies the
  ``movieId`` handed to ``find_similar_movies``.
  """
  with recommendation_list:
    recommendation_list.clear_output()
    typed = data["new"]
    if len(typed) <= 5:
      return
    first_match = search(typed).iloc[0]
    display(find_similar_movies(first_match["movieId"]))

# Call on_type on every change of the text box's value, then render the text
# box together with the recommendation output area.
movie_name_input.observe(on_type, names="value")
display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()