In [40]:

import numpy as np;
import pandas as pd;



In [41]:
#Loading the dataset
rating_data = pd.io.parsers.read_csv('ml-1m/ratings.dat', 
    names=['user_id', 'movie_id', 'rating', 'time'],
    engine='python', delimiter='::')
movie_data = pd.io.parsers.read_csv('ml-1m/movies.dat',
    names=['movie_id', 'title', 'genre'],
    engine='python', delimiter='::')

In [42]:
rating_data

Unnamed: 0,user_id,movie_id,rating,time
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [43]:
movie_data

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [44]:
#Calculating the rating matrices
ratings_mat = np.ndarray(
    shape=(np.max(rating_data.movie_id.values), np.max(rating_data.user_id.values)),
    dtype=np.uint8)
ratings_mat[rating_data.movie_id.values-1, rating_data.user_id.values-1] = rating_data.rating.values

In [45]:
ratings_mat

array([[5, 0, 0, ..., 0, 0, 3],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

In [46]:
#Normalization
normalised_mat = ratings_mat - np.asarray([(np.mean(ratings_mat, 1))]).T
normalised_mat

array([[ 3.57400662, -1.42599338, -1.42599338, ..., -1.42599338,
        -1.42599338,  1.57400662],
       [-0.37152318, -0.37152318, -0.37152318, ..., -0.37152318,
        -0.37152318, -0.37152318],
       [-0.23874172, -0.23874172, -0.23874172, ..., -0.23874172,
        -0.23874172, -0.23874172],
       ...,
       [-0.03278146, -0.03278146, -0.03278146, ..., -0.03278146,
        -0.03278146, -0.03278146],
       [-0.02582781, -0.02582781, -0.02582781, ..., -0.02582781,
        -0.02582781, -0.02582781],
       [-0.24288079, -0.24288079, -0.24288079, ..., -0.24288079,
        -0.24288079, -0.24288079]])

In [47]:
#Calculating U, S, V with the help of SVD

cov_mat = np.cov(normalised_mat)
print('Covariance Matrix:\n', cov_mat)

Covariance Matrix:
 [[ 4.13030618e+00  5.75769310e-01  2.58605324e-01 ...  3.68692352e-02
  -1.89683174e-03  1.01352495e-01]
 [ 5.75769310e-01  1.16348766e+00  1.58844772e-01 ...  2.35864233e-02
   1.72608728e-04  4.73547767e-02]
 [ 2.58605324e-01  1.58844772e-01  7.54054386e-01 ...  1.23744228e-02
  -3.18657205e-03  2.89395420e-02]
 ...
 [ 3.68692352e-02  2.35864233e-02  1.23744228e-02 ...  1.28748016e-01
   2.31637842e-02  7.54942213e-02]
 [-1.89683174e-03  1.72608728e-04 -3.18657205e-03 ...  2.31637842e-02
   1.07297708e-01  5.63190257e-02]
 [ 1.01352495e-01  4.73547767e-02  2.89395420e-02 ...  7.54942213e-02
   5.63190257e-02  9.15498240e-01]]


In [48]:
# Computing Eigen values abd Eigen Vectots
eigen_val_cov, eigen_vec_cov = np.linalg.eig(cov_mat)
eigen_val_cov

array([279.30422154+0.j,  74.63191754+0.j,  54.66225966+0.j, ...,
         0.        +0.j,   0.        +0.j,   0.        +0.j])

In [49]:
eigen_vec_cov

array([[ 0.05722455+0.j,  0.02094612+0.j, -0.02863335+0.j, ...,
         0.        +0.j,  0.        +0.j,  0.        +0.j],
       [ 0.02693498+0.j,  0.02978433+0.j,  0.00862002+0.j, ...,
         0.        +0.j,  0.        +0.j,  0.        +0.j],
       [ 0.0144301 +0.j,  0.01669989+0.j, -0.01365576+0.j, ...,
         0.        +0.j,  0.        +0.j,  0.        +0.j],
       ...,
       [ 0.00350289+0.j, -0.00187546+0.j, -0.00191671+0.j, ...,
         0.        +0.j,  0.        +0.j,  0.        +0.j],
       [ 0.00081997+0.j, -0.00226486+0.j, -0.00347437+0.j, ...,
         0.        +0.j,  0.        +0.j,  0.        +0.j],
       [ 0.01254195+0.j, -0.00502192+0.j, -0.02208081+0.j, ...,
         0.        +0.j,  0.        +0.j,  0.        +0.j]])

In [50]:
# Cosine similarity
def cosine_similarity(data, movie_id, top_n=10):
    index = movie_id - 1 # Movie id starts from 1
    movie_row = data[index, :]
    magnitude = np.sqrt(np.einsum('ij, ij -> i', data, data))
    similarity = np.dot(movie_row, data.T) / (magnitude[index] * magnitude)
    sort_indexes = np.argsort(-similarity)
    return sort_indexes[:top_n]

In [51]:
# Function for printing similar movie
def print_similar_movies(movie_data, movie_id, top_indexes):
    print('Recommendations movies for {0}: \n'.format(
    movie_data[movie_data.movie_id == movie_id].title.values[0]))
    for id in top_indexes + 1:
        print(movie_data[movie_data.movie_id == id].title.values[0])

## Predicting top 15 movies when  we selected top 50 featues 

In [52]:
#Predicting movies
features = 50
movie_id = 22
top_n = 10

sliced = eigen_vec_cov.T[:, :features] 
indexes = cosine_similarity(sliced, movie_id, top_n)
print_similar_movies(movie_data, movie_id, indexes)

Recommendations movies for Copycat (1995): 

Copycat (1995)
Jane Eyre (1996)
Every Other Weekend (1990)
Terminator 2: Judgment Day (1991)
Loser (1991)
Tales From the Crypt Presents: Demon Knight (1995)
No Escape (1994)
Apartment, The (1960)
Two or Three Things I Know About Her (1966)
Nemesis 2: Nebula (1995)


  


## Predicting top 15 movies when  we selected top 20 featues 

In [53]:
features = 20
movie_id = 22
top_n = 10

sliced = eigen_vec_cov.T[:, :features] 
indexes = cosine_similarity(sliced, movie_id, top_n)
print_similar_movies(movie_data, movie_id, indexes)

Recommendations movies for Copycat (1995): 

Copycat (1995)
Loser (1991)
Heathers (1989)
Maverick (1994)
Ed (1996)
Celtic Pride (1996)
Gaslight (1944)
8 Heads in a Duffel Bag (1997)
Every Other Weekend (1990)
Suture (1993)


  


There is a significant difference between the predictions having 20 features and 50 features respectively. Only one movie that is 'Loser (1991)' is the common movie in both the predictions which is also in different rank. It seems that , when we take 20 featues there is a significant information loss which can be helpful in making predictions much aligned to user preference.