# SVD 

In [3]:
import pandas as pd 
from main.py import Dataset
data = pd.io.parsers.read_csv('dataset/ratings.dat',
    names=['userid', 'movieid', 'ratings'],encoding='latin-1',
    engine='python', delimiter='::')

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [2]:
data

Unnamed: 0,Data
0,1:1193:5:978300760
1,1:661:3:978302109
2,1:914:3:978301968
3,1:3408:4:978300275
4,1:2355:5:978824291
...,...
1000204,6040:1091:1:956716541
1000205,6040:1094:5:956704887
1000206,6040:562:5:956704746
1000207,6040:1096:4:956715648


In [None]:
movie_data = pd.io.parsers.read_csv('movies.dat',
    names=['movie_id', 'title', 'genre'],encoding='latin-1',
    engine='python', delimiter='::')

In [8]:
data

Unnamed: 0,user_id,movie_id,rating,time
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [9]:
import numpy as np
ratings_mat = np.ndarray(
    shape=(np.max(data.movie_id.values), np.max(data.user_id.values)),
    dtype=np.uint8)
ratings_mat[data.movie_id.values-1, data.user_id.values-1] = data.rating.values

In [10]:
ratings_mat

array([[5, 0, 0, ..., 0, 0, 3],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

In [12]:
normalised_mat = ratings_mat - np.asarray([(np.mean(ratings_mat, 1))]).T

In [13]:
normalised_mat

array([[ 3.57400662, -1.42599338, -1.42599338, ..., -1.42599338,
        -1.42599338,  1.57400662],
       [-0.37152318, -0.37152318, -0.37152318, ..., -0.37152318,
        -0.37152318, -0.37152318],
       [-0.23874172, -0.23874172, -0.23874172, ..., -0.23874172,
        -0.23874172, -0.23874172],
       ...,
       [-0.03278146, -0.03278146, -0.03278146, ..., -0.03278146,
        -0.03278146, -0.03278146],
       [-0.02582781, -0.02582781, -0.02582781, ..., -0.02582781,
        -0.02582781, -0.02582781],
       [-0.24288079, -0.24288079, -0.24288079, ..., -0.24288079,
        -0.24288079, -0.24288079]])

In [14]:
A = normalised_mat.T / np.sqrt(ratings_mat.shape[0] - 1)
U, S, V = np.linalg.svd(A)

In [15]:
def top_cosine_similarity(data, movie_id, top_n=10):
    index = movie_id - 1 # Movie id starts from 1 in the dataset
    movie_row = data[index, :]
    magnitude = np.sqrt(np.einsum('ij, ij -> i', data, data))
    similarity = np.dot(movie_row, data.T) / (magnitude[index] * magnitude)
    sort_indexes = np.argsort(-similarity)
    return sort_indexes[:top_n]

In [16]:
def print_similar_movies(movie_data, movie_id, top_indexes):
    print('Recommendations for {0}: \n'.format(
    movie_data[movie_data.movie_id == movie_id].title.values[0]))
    for id in top_indexes + 1:
        print(movie_data[movie_data.movie_id == id].title.values[0])

In [30]:
k = 50
movie_id = 3793 # (getting an id from movies.dat)
top_n = 20
sliced = V.T # representative data
indexes = top_cosine_similarity(sliced, movie_id, top_n)
# Print the top N similar movies.

print_similar_movies(movie_data, movie_id, indexes)

Recommendations for X-Men (2000): 

X-Men (2000)
Moonstruck (1987)
Erin Brockovich (2000)
Starman (1984)
English Patient, The (1996)
Three Kings (1999)
And Now for Something Completely Different (1971)
Sixth Sense, The (1999)
Edward Scissorhands (1990)
West Side Story (1961)
Hot Shots! Part Deux (1993)
Thomas Crown Affair, The (1999)
Sneakers (1992)
Thin Red Line, The (1998)
Great Escape, The (1963)
Sting, The (1973)
Children of a Lesser God (1986)
Fisher King, The (1991)
Men in Black (1997)
Space Cowboys (2000)


# SVD with 90% retained energy

In [18]:
S

array([2.06617808e+01, 1.06804930e+01, 9.14055972e+00, ...,
       1.71271486e-15, 1.45774792e-15, 8.47072761e-16])

In [19]:
S.shape

(3952,)

In [20]:
U.shape

(6040, 6040)

In [21]:
V.shape

(3952, 3952)

In [22]:
A.shape

(6040, 3952)

In [23]:
temp=np.square(S).sum()*0.9

In [24]:
sum=0
for i in range(len(S)):
    sum+=S[i]*S[i]
    if sum>temp:
        break
k=i

In [25]:
Uk = U[:, :k]
Vk = V[:k, :].T
Sk = np.diag(S[:k])

In [26]:
A_approx = np.dot(Uk, np.dot(Sk, Vk.T))

In [27]:
U_r, S_r, V_r = np.linalg.svd(A_approx)

In [31]:
movie_id = 3793 # (getting an id from movies.dat)
top_n = 20
sliced = V_r.T # representative data
indexes = top_cosine_similarity(sliced, movie_id, top_n)
# Print the top N similar movies.

print_similar_movies(movie_data, movie_id, indexes)

Recommendations for X-Men (2000): 

X-Men (2000)
Untouchables, The (1987)
Taxi Driver (1976)
Breakfast Club, The (1985)
Amadeus (1984)
In the Line of Fire (1993)
Last of the Mohicans, The (1992)
Perfect Storm, The (2000)
Monty Python and the Holy Grail (1974)
Christmas Story, A (1983)
Scream (1996)
Mission: Impossible 2 (2000)
Alien³ (1992)
Maverick (1994)
League of Their Own, A (1992)
Cinema Paradiso (1988)
Frequency (2000)
Total Recall (1990)
Edward Scissorhands (1990)
Blade (1998)


In [2]:
data["userid"].value_counts()

NameError: name 'data' is not defined