In [1]:
import numpy as np
import pandas as pd
import random

#import the rating data
original_ratings=pd.read_csv('/Users/chenrui/Desktop/erg3020/ratings.csv')
links=pd.read_csv('/Users/chenrui/Desktop/erg3020/links.csv')
movies=pd.read_csv('/Users/chenrui/Desktop/erg3020/movies.csv')

In [2]:
user_max=1000
test_max=200
movie_max=10000


ratings=original_ratings.loc[(user_max>=original_ratings.userId)&(original_ratings.movieId<=movie_max)]
test_user=original_ratings.loc[(user_max<original_ratings.userId)&(original_ratings.movieId<=movie_max)]

In [3]:
def get_correlation_matrix():
    correlation_matrix=np.zeros((movie_max,movie_max),dtype=float)
    rated_movie=[]
    for user in range(user_max):
        rated_movie.append(list(ratings.movieId[ratings['userId']==user+1]))
    for user in range(user_max,user_max+test_max):
        rated_movie.append(list(test_user.movieId[test_user['userId']==user+1]))  
    for i in range(user_max):
        for j in range(len(rated_movie[i])):
            for k in range(j+1,len(rated_movie[i])):
                correlation_matrix[rated_movie[i][j]-1,rated_movie[i][k]-1] += 1
                correlation_matrix[rated_movie[i][k]-1,rated_movie[i][j]-1] += 1
    normalized_correlation_matrix=correlation_matrix/correlation_matrix.sum(axis=0,keepdims=True)
    where_are_NaNs = np.isnan(normalized_correlation_matrix)
    normalized_correlation_matrix[where_are_NaNs] = 0
    return normalized_correlation_matrix,rated_movie

In [4]:
normalized_correlation_matrix,rated_movie=get_correlation_matrix()

  del sys.path[0]


In [5]:
def user_score(user_id,rate_movie=rated_movie):
    user_rating=np.zeros((movie_max,1),dtype=float)
    if user_id in range(1001,1201):
        for i in rate_movie[user_id-1]:
            user_rating[i-1][0]=test_user.rating[(test_user['userId']==user_id)&(test_user['movieId']==i)]
    else:
        for i in rate_movie[user_id-1]:
            user_rating[i-1][0]=ratings.rating[(ratings['userId']==user_id)&(ratings['movieId']==i)]
    return(user_rating/sum(user_rating))

In [9]:
def itemRank(user_id,IR=normalized_correlation_matrix,alpha=0.85,max_ite=100,eplison=1e-8):

    N=IR.shape[0]
    v=np.repeat(1/N,N)
    v=v.reshape((N,1))
    d_ui=user_score(user_id)
    for ite in range(max_ite):
        v_new=np.zeros_like(v)
        for i in range(N):
            v_new[i]=alpha*np.dot(IR[i],v)+(1-alpha)*d_ui[i]

        #check difference between iteration result
        if np.allclose(v, v_new, atol=eplison):
            v=v_new
            break
        else:
            v=v_new
    return v

itemRank(112)

array([[0.00105002],
       [0.00066291],
       [0.00137922],
       ...,
       [0.        ],
       [0.        ],
       [0.        ]])

In [10]:
def item_recommend(user_id,matrix=normalized_correlation_matrix,recommend_num=10):
    final_rank=itemRank(user_id,matrix).reshape(movie_max,)
    top_ten=np.argpartition(final_rank, -recommend_num)[-recommend_num:]
    rank_order=[top for i,top in sorted(zip(final_rank[top_ten],top_ten))]
    id_order=[rank+1 for rank in rank_order]
    count=['1st','2nd','3rd','4th','5th','6th','7th','8th','9th','10th']
    print('Recommending 10 movies for user '+str(user_id)+' .....')
    print('\n')
    for i in range(recommend_num):
        print('The '+str(count[i])+' recommendation is'
              +str(list(movies.title[movies.movieId==id_order[i]])).replace('[',' ').replace(']',' '))
        print("It's a(an)"+str(list(movies.genres[movies.movieId==id_order[i]])).replace('[',' ').replace(']',' ')
              +'movie, IMDB link: https://www.imdb.com/title/tt00'
              +str(list(links.imdbId[links.movieId==id_order[i]])).replace('[','').replace(']','')
              +'\n'+'TMDB link: https://www.themoviedb.org/movie/'
              +str(int(float(str(list(links.tmdbId[links.movieId==id_order[i]])).replace('[','').replace(']','')))))
        print('\n')
        

In [110]:
item_recommend(1011)

Recommending 10 movies for user 1011 .....


The 1st recommendation is 'Terminator 2: Judgment Day (1991)' 
It's a(an) 'Action|Sci-Fi' movie, IMDB link: https://www.imdb.com/title/tt00103064
TMDB link: https://www.themoviedb.org/movie/280


The 2nd recommendation is 'American Beauty (1999)' 
It's a(an) 'Comedy|Drama' movie, IMDB link: https://www.imdb.com/title/tt00169547
TMDB link: https://www.themoviedb.org/movie/14


The 3rd recommendation is 'Star Wars: Episode V - The Empire Strikes Back (1980)' 
It's a(an) 'Action|Adventure|Sci-Fi' movie, IMDB link: https://www.imdb.com/title/tt0080684
TMDB link: https://www.themoviedb.org/movie/1891


The 4th recommendation is 'Star Wars: Episode IV - A New Hope (1977)' 
It's a(an) 'Action|Adventure|Sci-Fi' movie, IMDB link: https://www.imdb.com/title/tt0076759
TMDB link: https://www.themoviedb.org/movie/11


The 5th recommendation is 'Jurassic Park (1993)' 
It's a(an) 'Action|Adventure|Sci-Fi|Thriller' movie, IMDB link: https://www.imdb.com/tit

In [18]:
def user_correlation_matrix():
    user_correlation=np.zeros((user_max,user_max),dtype=float)
    for i in range(user_max):
        for j in range(i+1,user_max):
            if i!=j:
                if len(rated_movie[i])>=len(rated_movie[j]):
                    user_correlation[i,j]=len(list(set(rated_movie[i]).intersection(rated_movie[j])))
                    user_correlation[j,i]=len(list(set(rated_movie[i]).intersection(rated_movie[j])))
                else:
                    user_correlation[i,j]=len(list(set(rated_movie[j]).intersection(rated_movie[i])))
                    user_correlation[j,i]=len(list(set(rated_movie[i]).intersection(rated_movie[j])))
    normalized_user_correlation=user_correlation/user_correlation.sum(axis=0,keepdims=True)
    where_are_NaNs = np.isnan(normalized_user_correlation)
    normalized_user_correlation[where_are_NaNs] = 0
    return normalized_user_correlation
normalized_user_correlation=user_correlation_matrix()

In [22]:
def pageRank(M,d=0.15,max_ite=1000,eplison=1e-8):
    #start with equal probability for initial point
    N=M.shape[0]
    v=np.repeat(1/N,N)

    for ite in range(max_ite):
        v_new=np.zeros_like(v)
        for i in range(N):
            v_new[i]=(1-d)*np.dot(M[i],v)+d/N

        #check difference between iteration result
        if np.allclose(v, v_new, atol=eplison):
            v=v_new
            break
        else:
            v=v_new
    return v

array([0.00127432, 0.00052993, 0.0014384 , 0.0004696 , 0.00094283,
       0.00037799, 0.00154214, 0.00104119, 0.00033198, 0.00055941,
       0.00227456, 0.00066661, 0.00100245, 0.00141722, 0.00085096,
       0.00052789, 0.00041524, 0.00078501, 0.00045829, 0.00040523,
       0.00128775, 0.00118048, 0.00093417, 0.00293576, 0.00153072,
       0.00100229, 0.00060088, 0.00058352, 0.00143539, 0.00032283,
       0.00077025, 0.0010511 , 0.00042397, 0.00109892, 0.00123458,
       0.00026556, 0.00024051, 0.00084229, 0.00039439, 0.00065157,
       0.00051094, 0.00060136, 0.00092803, 0.00044793, 0.00038574,
       0.00106335, 0.00079173, 0.00105497, 0.00043732, 0.00107275,
       0.00091835, 0.0004925 , 0.00104695, 0.0035861 , 0.00046696,
       0.00135172, 0.00037677, 0.00364345, 0.00038463, 0.00059661,
       0.00106243, 0.00030682, 0.00046859, 0.00068592, 0.00053107,
       0.00068721, 0.00074426, 0.00048746, 0.0021873 , 0.00104143,
       0.00034256, 0.00099863, 0.00135075, 0.00062522, 0.00076

In [101]:
def pageSim(movie1_id,movie2_id):
    rank=pageRank(normalized_user_correlation)
    rated_both_user=[]
    for i in range(user_max):
        if (movie1_id in rated_movie[i]) & (movie2_id in rated_movie[i]):
            rated_both_user.append(i+1)
    user_average=[]
    user_rank=[]
    movie1_ratings=[]
    movie2_ratings=[]
    for i in rated_both_user:
        user_rank.append(rank[i])
        user_average.append(np.mean(ratings.rating[ratings.userId==i]))
        movie1_ratings.append(ratings.rating[(ratings.userId==i)&(ratings.movieId==movie1_id)].values[0])
        movie2_ratings.append(ratings.rating[(ratings.userId==i)&(ratings.movieId==movie2_id)].values[0])
    
    user_average=np.array(user_average)
    movie1_ratings=np.array(movie1_ratings)
    movie2_ratings=np.array(movie2_ratings)
    user_rank=np.array(user_rank)
    
    if ((sum(np.power(movie1_ratings-user_average,2)*np.power(user_rank,2))**(1/2))\
          *sum(np.power(movie2_ratings-user_average,2)*np.power(user_rank,2))**(1/2))==0:
        return -1
    
    else:
        sim=sum((movie1_ratings-user_average)*(movie2_ratings-user_average)*np.power(user_rank,2))\
        /((sum(np.power(movie1_ratings-user_average,2)*np.power(user_rank,2))**(1/2))\
          *sum(np.power(movie2_ratings-user_average,2)*np.power(user_rank,2))**(1/2))
        return sim           

In [102]:
pageSim(2,4400)

-1

In [117]:
def pageSim_recommend(movie_id,recommend_num=10):
    sim=[]
    for i in range(100):
        if i!=movie_id:
            sim.append(pageSim(movie_id,i+1))
        else:
            sim.append(-1)
    sim=np.array(sim).reshape((100,))
    top_ten=np.argpartition(sim, -recommend_num)[-recommend_num:]
    rank_order=[top for i,top in sorted(zip(sim[top_ten],top_ten))]
    id_order=[rank+1 for rank in rank_order]
    count=['1st','2nd','3rd','4th','5th','6th','7th','8th','9th','10th']
    print('Recommending 10 movies for movie '
          +str(list(movies.title[movies.movieId==movie_id])).replace('[',' ').replace(']',' ')
          +'id: '+ str(movie_id)
          +' .....')
    print('\n')
    for i in range(recommend_num):
        print('The '+str(count[i])+' recommendation is'
              +str(list(movies.title[movies.movieId==id_order[i]])).replace('[',' ').replace(']',' '))
        print("It's a(an)"+str(list(movies.genres[movies.movieId==id_order[i]])).replace('[',' ').replace(']',' ')
              +'movie, IMDB link: https://www.imdb.com/title/tt00'
              +str(list(links.imdbId[links.movieId==id_order[i]])).replace('[','').replace(']','')
              +'\n'+'TMDB link: https://www.themoviedb.org/movie/'
              +str(int(float(str(list(links.tmdbId[links.movieId==id_order[i]])).replace('[','').replace(']','')))))
        print('\n')

In [121]:
pageSim_recommend(81)

Recommending 10 movies for movie 81 "Things to Do in Denver When You're Dead (1995)"  .....


The 1st recommendation is 'Screamers (1995)' 
It's a(an) 'Action|Sci-Fi|Thriller' movie, IMDB link: https://www.imdb.com/title/tt00114367
TMDB link: https://www.themoviedb.org/movie/9102


The 2nd recommendation is 'Last Summer in the Hamptons (1995)' 
It's a(an) 'Comedy|Drama' movie, IMDB link: https://www.imdb.com/title/tt00113612
TMDB link: https://www.themoviedb.org/movie/188588


The 3rd recommendation is 'In the Bleak Midwinter (1995)' 
It's a(an) 'Comedy|Drama' movie, IMDB link: https://www.imdb.com/title/tt00113403
TMDB link: https://www.themoviedb.org/movie/40154


The 4th recommendation is 'Waiting to Exhale (1995)' 
It's a(an) 'Comedy|Drama|Romance' movie, IMDB link: https://www.imdb.com/title/tt00114885
TMDB link: https://www.themoviedb.org/movie/31357


The 5th recommendation is 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)' 
It's a(an) 'Crime|Drama' movie, IMDB link: http