In [1]:
#Import the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
#Give an intuition about User-Item Matrix by using an example of movie rating
data = {'userId': [1,1,1,1,1,1,
                   2,2,2,2,2,2,
                   3,3,3,3,3,3,
                   4,4,4,4,4,4,
                   5,5,5,5,5,5],
        'movieId': ['Action1', 'Action2', 'Action3','Romantic1','Romantic2','Romantic3',
                    'Action1', 'Action2', 'Action3','Romantic1','Romantic2','Romantic3',
                    'Action1', 'Action2', 'Action3','Romantic1','Romantic2','Romantic3',
                    'Action1', 'Action2', 'Action3','Romantic1','Romantic2','Romantic3',
                    'Action1', 'Action2', 'Action3','Romantic1','Romantic2','Romantic3'],
        'rating': [4,5,3,0,2,1,
                   5,3,3,2,2,0,
                   1,0,0,4,5,4,
                   0,2,1,4,0,3,
                   1,0,2,3,3,4]}

ratings=pd.DataFrame(data)

In [4]:
ratings.head(6)

Unnamed: 0,userId,movieId,rating
0,1,Action1,4
1,1,Action2,5
2,1,Action3,3
3,1,Romantic1,0
4,1,Romantic2,2
5,1,Romantic3,1


In [5]:
ratings=ratings.fillna(0) #replace any missing values with 0

In [7]:
ratings.head(10)

Unnamed: 0,userId,movieId,rating
0,1,Action1,4
1,1,Action2,5
2,1,Action3,3
3,1,Romantic1,0
4,1,Romantic2,2
5,1,Romantic3,1
6,2,Action1,5
7,2,Action2,3
8,2,Action3,3
9,2,Romantic1,2


In [8]:
ratings=ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)
#Transpose values   

In [9]:
ratings.head()

movieId,Action1,Action2,Action3,Romantic1,Romantic2,Romantic3
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,4,5,3,0,2,1
2,5,3,3,2,2,0
3,1,0,0,4,5,4
4,0,2,1,4,0,3
5,1,0,2,3,3,4


In [10]:
#Apply standardization to bring all values to same scale 
def standardize(row):
  new_row=(row-row.mean())/(row.max()-row.min())
  return new_row

In [11]:
ratings=ratings.apply(standardize)

In [12]:
from sklearn.metrics.pairwise import cosine_similarity ## find similarity row wise

In [13]:
# user_similarity=cosine_similarity(ratings)
item_similarity=cosine_similarity(ratings.T)

In [14]:
item_sim_df=pd.DataFrame(item_similarity,index=ratings.columns,columns=ratings.columns)
# user_sim_df=pd.DataFrame(user_similarity,index=ratings.index,columns=ratings.index)

In [15]:
item_sim_df.head()

movieId,Action1,Action2,Action3,Romantic1,Romantic2,Romantic3
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Action1,1.0,0.706689,0.813682,-0.799411,-0.025392,-0.914106
Action2,0.706689,1.0,0.723102,-0.845154,-0.518999,-0.843374
Action3,0.813682,0.723102,1.0,-0.847946,-0.37998,-0.802181
Romantic1,-0.799411,-0.845154,-0.847946,1.0,0.148039,0.723747
Romantic2,-0.025392,-0.518999,-0.37998,0.148039,1.0,0.393939


In [16]:
# user_sim_df.head()

In [17]:
#Define a function to get similar movie name based on similarity score
def get_similar_movie(movie_name,user_rating):
  # similar_score=item_sim_df[movie_name]*(user_rating) find most similar element and put at top liked disliked both at top
  similar_score=item_sim_df[movie_name]*(user_rating-2.5)
  return similar_score.sort_values(ascending=False)

In [18]:
print(get_similar_movie('Romantic1',1))

movieId
Action3      1.271919
Action2      1.267731
Action1      1.199116
Romantic2   -0.222059
Romantic3   -1.085620
Romantic1   -1.500000
Name: Romantic1, dtype: float64


In [19]:
action_lover=[('Action1',5),('Romantic2',1),('Romantic3',1)]

In [20]:
#Create a blank dataframe
similar_movies=pd.DataFrame()


for movie,rating in action_lover:
  similar_movies=similar_movies.append(get_similar_movie(movie,rating),ignore_index=True)

similar_movies.head()
similar_movies.sum().sort_values(ascending=False)

  similar_movies=similar_movies.append(get_similar_movie(movie,rating),ignore_index=True)
  similar_movies=similar_movies.append(get_similar_movie(movie,rating),ignore_index=True)
  similar_movies=similar_movies.append(get_similar_movie(movie,rating),ignore_index=True)


movieId
Action1      3.909247
Action2      3.810282
Action3      3.807445
Romantic2   -2.154389
Romantic1   -3.306206
Romantic3   -4.376174
dtype: float64

In [70]:
!wget http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
!unzip ml-latest-small.zip

--2024-01-10 19:03:41--  http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘ml-latest-small.zip’


2024-01-10 19:03:41 (8.08 MB/s) - ‘ml-latest-small.zip’ saved [978202/978202]

Archive:  ml-latest-small.zip
   creating: ml-latest-small/
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  


## Real Life dataset

In [98]:
movies=pd.read_csv('movies.csv')
ratings=pd.read_csv('ratings.csv').sample(10000)
ratings=pd.merge(movies,ratings).drop(['genres','timestamp'],axis=1)
# ratings.set_index(ratings.columns[0], inplace=True)

In [100]:
ratings

Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),134643,4.0
1,1,Toy Story (1995),1641,4.5
2,1,Toy Story (1995),23186,4.0
3,1,Toy Story (1995),128623,3.0
4,1,Toy Story (1995),8337,5.0
...,...,...,...,...
9995,115824,Mr. Turner (2014),68026,5.0
9996,117511,Hello Ladies: The Movie (2014),71,3.0
9997,118696,The Hobbit: The Battle of the Five Armies (2014),10788,4.0
9998,119145,Kingsman: The Secret Service (2015),73873,1.5


In [101]:
ratings=ratings[['userId','title','rating']]

In [102]:
ratings=ratings.pivot(index='userId', columns='title', values='rating').fillna(0)

In [103]:
ratings=ratings.apply(standardize)

In [104]:
ratings.head()

title,'Round Midnight (1986),"'burbs, The (1989)",(500) Days of Summer (2009),10 Things I Hate About You (1999),101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),13 Going on 30 (2004),13 Tzameti (2005),"13th Warrior, The (1999)",...,Zazie dans le métro (1960),Zelig (1983),Zero Effect (1998),Zombie Strippers! (2008),Zoolander (2001),[REC] (2007),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
24,-0.000112,-0.000179,-0.000485,-0.000616,-0.000496,-0.000504,-0.000694,-0.000314,-0.000212,-0.000594,...,-0.000112,-0.000112,-0.000269,-0.000112,-0.000291,-0.000112,-0.000112,-0.000112,-0.000432,-0.000112
36,-0.000112,-0.000179,-0.000485,-0.000616,-0.000496,-0.000504,-0.000694,-0.000314,-0.000212,-0.000594,...,-0.000112,-0.000112,-0.000269,-0.000112,-0.000291,-0.000112,-0.000112,-0.000112,-0.000432,-0.000112
63,-0.000112,-0.000179,-0.000485,-0.000616,-0.000496,-0.000504,-0.000694,-0.000314,-0.000212,-0.000594,...,-0.000112,-0.000112,-0.000269,-0.000112,-0.000291,-0.000112,-0.000112,-0.000112,-0.000432,-0.000112
71,-0.000112,-0.000179,-0.000485,-0.000616,-0.000496,-0.000504,-0.000694,-0.000314,-0.000212,-0.000594,...,-0.000112,-0.000112,-0.000269,-0.000112,-0.000291,-0.000112,-0.000112,-0.000112,-0.000432,-0.000112
78,-0.000112,-0.000179,-0.000485,-0.000616,-0.000496,-0.000504,-0.000694,-0.000314,-0.000212,-0.000594,...,-0.000112,-0.000112,-0.000269,-0.000112,-0.000291,-0.000112,-0.000112,-0.000112,-0.000432,-0.000112


In [107]:
item_similarity=cosine_similarity(ratings.T)
item_sim_df=pd.DataFrame(item_similarity,index=ratings.columns,columns=ratings.columns)

In [109]:
print(get_similar_movie('Toy Story (1995)',4).head())

title
Toy Story (1995)                        1.500000
Paths of Glory (1957)                   0.180099
African Queen, The (1951)               0.151606
How to Make an American Quilt (1995)   -0.000718
Beginners (2010)                       -0.000718
Name: Toy Story (1995), dtype: float64


In [113]:
temp_user=[('Toy Story (1995)',1),('Father of the Bride Part II (1995)',1),('Sabrina (1995)',1),('Grumpier Old Men (1995)',1)]

In [117]:
similar_movies=pd.DataFrame()

for movie,rating in temp_user:
  similar_movies=similar_movies.append(get_similar_movie(movie,rating),ignore_index=True)

  similar_movies=similar_movies.append(get_similar_movie(movie,rating),ignore_index=True)
  similar_movies=similar_movies.append(get_similar_movie(movie,rating),ignore_index=True)
  similar_movies=similar_movies.append(get_similar_movie(movie,rating),ignore_index=True)
  similar_movies=similar_movies.append(get_similar_movie(movie,rating),ignore_index=True)


In [118]:
similar_movies.sum().sort_values(ascending=False).head(10)

title
Shawshank Redemption, The (1994)             0.012936
Forrest Gump (1994)                          0.012381
Fargo (1996)                                 0.011623
Silence of the Lambs, The (1991)             0.011356
Jurassic Park (1993)                         0.011317
Terminator 2: Judgment Day (1991)            0.011132
Star Wars: Episode IV - A New Hope (1977)    0.010893
Apollo 13 (1995)                             0.010763
Braveheart (1995)                            0.010672
Fugitive, The (1993)                         0.010535
dtype: float64