In [2]:
import surprise
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import KFold

# Load Surprise's built-in 'ml-100k' ratings dataset.
# NOTE(review): prompt=False presumably skips the interactive download
# confirmation when the dataset is not cached locally — confirm in Surprise docs.
data = Dataset.load_builtin('ml-100k', prompt=False)
# Preview the first ten raw rating tuples; later cells unpack each tuple as
# (user id, movie id, rating, timestamp).
data.raw_ratings[:10]


[('196', '242', 3.0, '881250949'),
 ('186', '302', 3.0, '891717742'),
 ('22', '377', 1.0, '878887116'),
 ('244', '51', 2.0, '880606923'),
 ('166', '346', 1.0, '886397596'),
 ('298', '474', 4.0, '884182806'),
 ('115', '265', 2.0, '881171488'),
 ('253', '465', 5.0, '891628467'),
 ('305', '451', 3.0, '886324817'),
 ('6', '86', 3.0, '883603013')]

In [4]:
# Surprise SVD recommender. A fixed random_state seeds the model's internal
# randomness so that re-running the notebook fits the same model for the same
# training data instead of a slightly different one each time.
model = SVD(random_state=0)

In [6]:
# Evaluate `model` with 5-fold cross-validation, reporting RMSE and MAE per fold.
# The splitter is Surprise's own KFold, reached through the `surprise` package
# because the bare name `KFold` in this notebook is sklearn's class. Giving it a
# fixed random_state keeps the fold assignment identical between runs; a plain
# `cv=5` reshuffles the ratings every time the cell is executed.
cross_validate(
    model,
    data,
    measures=['rmse', 'mae'],
    cv=surprise.model_selection.KFold(n_splits=5, random_state=0),
    verbose=True,
)


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9373  0.9389  0.9367  0.9315  0.9323  0.9353  0.0029  
MAE (testset)     0.7383  0.7375  0.7400  0.7345  0.7355  0.7372  0.0020  
Fit time          0.43    0.55    0.56    0.41    0.42    0.47    0.07    
Test time         0.08    0.11    0.18    0.08    0.08    0.11    0.04    


{'test_rmse': array([0.93727485, 0.93891519, 0.93667973, 0.93149028, 0.93229237]),
 'test_mae': array([0.73834199, 0.73754296, 0.73995697, 0.73446107, 0.73554232]),
 'fit_time': (0.42996692657470703,
  0.5458259582519531,
  0.5619261264801025,
  0.4144458770751953,
  0.41840291023254395),
 'test_time': (0.08258605003356934,
  0.10776090621948242,
  0.18478822708129883,
  0.07706022262573242,
  0.07650923728942871)}

In [11]:
# Turn the list of raw rating tuples into a single 2-D integer array.
# dtype=int makes NumPy coerce every field (string ids, float rating, string
# timestamp) to an integer; the columns are later unpacked as
# (user id, movie id, rating, timestamp).
raw_data = np.array(data.raw_ratings, dtype=int)
# Show the array (NumPy abbreviates the middle rows in its repr).
raw_data

array([[      196,       242,         3, 881250949],
       [      186,       302,         3, 891717742],
       [       22,       377,         1, 878887116],
       ...,
       [      276,      1090,         1, 874795795],
       [       13,       225,         2, 882399156],
       [       12,       203,         3, 879959583]])

In [14]:
# Rebuild the id array from the untouched source ratings instead of decrementing
# `raw_data` in place. An in-place `-= 1` shifts the ids again on every
# re-execution of this cell, which silently shrinks `shape` and makes later
# indexing point at the wrong rows/columns.
raw_data = np.array(data.raw_ratings, dtype=int)
# Shift user ids (column 0) and movie ids (column 1) down by one so they can be
# used directly as array indices (assumes the dataset ids start at 1 — TODO confirm).
raw_data[:, :2] -= 1

# NOTE: despite their names these hold the largest zero-based index, i.e.
# (count - 1); the names are kept so other cells that read them keep working.
n_users = np.max(raw_data[:, 0])
n_movies = np.max(raw_data[:, 1])
# (rows, columns) of the user x movie matrix built in the following cells.
shape = (n_users + 1, n_movies + 1)
shape

(941, 1680)

# 기본 추천 알고리즘.
- 영화를 봤는지 보지 않았는지를 기준으로 유사도를 측정한다.
- 가장 유사한 유저가 본 영화 중, 내가 아직 보지 않은 영화를 추천해준다.


In [62]:
# Binary user x movie matrix: 1 where a rating exists, 0 everywhere else.
# np.zeros is required here: np.ndarray(shape) only allocates memory without
# initialising it, so cells that are never written could contain arbitrary
# leftover values instead of 0.
adj_matrix = np.zeros(shape, dtype=int)
# Mark every (user, movie) pair present in the ratings with a single indexed
# assignment; repeated pairs are harmless because the value is always 1.
adj_matrix[raw_data[:, 0], raw_data[:, 1]] = 1
adj_matrix


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 1],
       ...,
       [0, 0, 0, ..., 0, 1, 0],
       [1, 1, 1, ..., 0, 1, 1],
       [0, 0, 0, ..., 0, 1, 1]])

In [76]:
# Find the user whose row has the largest dot product with user 0's row.
# While adj_matrix holds 0/1 flags, that dot product is the number of movies
# both users have an entry for.
my_id, my_vector = 0, adj_matrix[0]
best_match, best_match_id, best_match_vector = -1, -1, []

for user_id, user_vector in enumerate(adj_matrix):
    if user_id == my_id:
        # Never compare the target user against their own row.
        continue
    similarity = my_vector @ user_vector
    if similarity > best_match:
        # Strictly greater, so the earliest user wins when scores tie.
        best_match, best_match_id, best_match_vector = similarity, user_id, user_vector
print('Best match : {}, Best match Id : {} '.format(best_match, best_match_id))

Best match : 47, Best match Id : 10 


In [None]:
# Recommend every movie index where the best match has an entry (> 0) and the
# target user has none (< 1).
recommend_list = [
    movie_idx
    for movie_idx, (mine, theirs) in enumerate(zip(my_vector, best_match_vector))
    if mine < 1 and theirs > 0
]

recommend_list

# 유클리디안 거리 기반 추천


In [82]:
# Nearest neighbour of user 0 by Euclidean distance between rows of adj_matrix.
my_id, my_vector = 0, adj_matrix[0]
# Start from +inf instead of the magic number 9999 so that any finite distance,
# however large, can become the first best match.
best_match, best_match_id, best_match_vector = np.inf, -1, []

for user_id, user_vector in enumerate(adj_matrix):
    if user_id == my_id:
        # Skip the target user's own row (distance would trivially be 0).
        continue
    euclidean_dist = np.sqrt(np.sum(np.square(my_vector - user_vector)))
    if euclidean_dist < best_match:
        # Strictly smaller, so the earliest user wins when distances tie.
        best_match, best_match_id, best_match_vector = euclidean_dist, user_id, user_vector
print('Best match : {}, Best match Id : {} '.format(best_match, best_match_id))

# Recommend movies the nearest user has an entry for and the target user lacks.
recommend_list = [
    movie_idx
    for movie_idx, (mine, theirs) in enumerate(zip(my_vector, best_match_vector))
    if mine < 1 and theirs > 0
]

recommend_list

Best match : 6.48074069840786, Best match Id : 314 


[310, 675, 680, 745, 876]

# 코사인 유사도 기반 추천


In [84]:
def compute_cos_similarity(v1, v2):
    """Return the cosine similarity between two 1-D numeric vectors.

    Args:
        v1: First vector (array-like of numbers).
        v2: Second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (||v1|| * ||v2||)``. When either vector has zero
        length (all entries 0) the ratio is undefined; 0.0 is returned in that
        case instead of letting the 0/0 division produce NaN and a
        RuntimeWarning.
    """
    norm1 = np.sqrt(np.sum(np.square(v1)))
    norm2 = np.sqrt(np.sum(np.square(v2)))
    denominator = norm1 * norm2
    if denominator == 0:
        # A user with no entries has no direction to compare against.
        return 0.0
    return np.dot(v1, v2) / denominator

# Find the user whose row is most similar to user 0's row by cosine similarity.
my_id, my_vector = 0, adj_matrix[0]
best_match, best_match_id, best_match_vector = -1, -1, []

for user_id, user_vector in enumerate(adj_matrix):
    if user_id == my_id:
        # Never compare the target user against their own row.
        continue
    cos_similarity = compute_cos_similarity(my_vector, user_vector)
    if cos_similarity > best_match:
        # Strictly greater, so the earliest user wins when scores tie.
        best_match, best_match_id, best_match_vector = cos_similarity, user_id, user_vector
print('Best match : {}, Best match Id : {} '.format(best_match, best_match_id))

# Recommend movies the best match has an entry for and the target user lacks.
recommend_list = [
    movie_idx
    for movie_idx, (mine, theirs) in enumerate(zip(my_vector, best_match_vector))
    if mine < 1 and theirs > 0
]

print(recommend_list)

Best match : 0.5188106719705753, Best match Id : 613 
[266, 283, 286, 289, 298, 310, 312, 313, 359, 675, 686, 745, 747, 870, 876, 892, 934, 1310]


## 시청 여부(0/1)로 채웠던 행렬에 이번에는 영화 평점을 넣는다.

In [86]:
# User x movie rating matrix: the rating where one exists, 0 where the user has
# not rated the movie.
# np.zeros is required here: np.ndarray(shape) only allocates memory without
# initialising it, so unrated cells could contain arbitrary leftover values
# that the distance/similarity computations would treat as real ratings.
adj_matrix = np.zeros(shape, dtype=int)
# Written as a loop so that, should a (user, movie) pair occur more than once,
# the last rating in `raw_data` deterministically wins.
for user_id, movie_id, rating, time in raw_data:
    adj_matrix[user_id, movie_id] = rating
adj_matrix

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 4, 3],
       ...,
       [0, 0, 0, ..., 0, 5, 0],
       [4, 3, 3, ..., 0, 5, 3],
       [0, 0, 0, ..., 0, 4, 5]])

In [87]:
# Nearest neighbour of user 0 by Euclidean distance between rows of adj_matrix
# (which holds ratings at this point in the notebook).
my_id, my_vector = 0, adj_matrix[0]
# Start from +inf instead of the magic number 9999 so that any finite distance,
# however large, can become the first best match.
best_match, best_match_id, best_match_vector = np.inf, -1, []

for user_id, user_vector in enumerate(adj_matrix):
    if user_id == my_id:
        # Skip the target user's own row (distance would trivially be 0).
        continue
    euclidean_dist = np.sqrt(np.sum(np.square(my_vector - user_vector)))
    if euclidean_dist < best_match:
        # Strictly smaller, so the earliest user wins when distances tie.
        best_match, best_match_id, best_match_vector = euclidean_dist, user_id, user_vector
print('Best match : {}, Best match Id : {} '.format(best_match, best_match_id))

# Recommend movies the nearest user has rated and the target user has not.
recommend_list = [
    movie_idx
    for movie_idx, (mine, theirs) in enumerate(zip(my_vector, best_match_vector))
    if mine < 1 and theirs > 0
]

recommend_list

Best match : 20.952326839756964, Best match Id : 314 


[310, 675, 680, 745, 876]

In [88]:
def compute_cos_similarity(v1, v2):
    """Return the cosine similarity between two 1-D numeric vectors.

    NOTE(review): this notebook defines a function with this exact name in an
    earlier cell as well; whichever definition ran last is the one in effect.
    Consider keeping a single definition.

    Args:
        v1: First vector (array-like of numbers).
        v2: Second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (||v1|| * ||v2||)``. When either vector has zero
        length (all entries 0) the ratio is undefined; 0.0 is returned in that
        case instead of letting the 0/0 division produce NaN and a
        RuntimeWarning.
    """
    norm1 = np.sqrt(np.sum(np.square(v1)))
    norm2 = np.sqrt(np.sum(np.square(v2)))
    denominator = norm1 * norm2
    if denominator == 0:
        # A user with no ratings has no direction to compare against.
        return 0.0
    return np.dot(v1, v2) / denominator

# Find the user whose rating row is most similar to user 0's row by cosine
# similarity (adj_matrix holds ratings at this point in the notebook).
my_id, my_vector = 0, adj_matrix[0]
best_match, best_match_id, best_match_vector = -1, -1, []

for user_id, user_vector in enumerate(adj_matrix):
    if user_id == my_id:
        # Never compare the target user against their own row.
        continue
    cos_similarity = compute_cos_similarity(my_vector, user_vector)
    if cos_similarity > best_match:
        # Strictly greater, so the earliest user wins when scores tie.
        best_match, best_match_id, best_match_vector = cos_similarity, user_id, user_vector
print('Best match : {}, Best match Id : {} '.format(best_match, best_match_id))

# Recommend movies the best match has rated and the target user has not.
recommend_list = [
    movie_idx
    for movie_idx, (mine, theirs) in enumerate(zip(my_vector, best_match_vector))
    if mine < 1 and theirs > 0
]

print(recommend_list)

Best match : 0.4890317942714801, Best match Id : 860 
[239, 256, 259, 266, 267, 283, 286, 289, 298, 301, 302, 303, 307, 310, 312, 313, 356, 358, 359, 535, 679, 680, 687, 688, 745, 746, 747, 748, 749, 751, 869, 870, 873, 874, 876, 879, 882, 883, 884, 892, 895, 897, 898, 899, 900, 903, 905, 906, 907, 987, 1019, 1021, 1035, 1059, 1124, 1231, 1234, 1240, 1291, 1293, 1310, 1392, 1428, 1431, 1604, 1675, 1676, 1677]


# 협업 필터링
- 사용자와 항목의 유사성을 동시에 고려하여 추천
- 기존에 내 관심사가 아닌 항목이라도 추천 가능
- 자동으로 임베딩 학습 가능
- 협업 필터링은 다음과 같은 장단점을 갖고 있음.
    - 장점
        - 자동으로 임베딩을 학습하기 때문에 도메인 지식이 필요 없음.
        - 기존의 관심사가 아니더라도 추천 가능
    - 단점
        - 학습 과정에 나오지 않은 항목은 임베딩을 만들 수 없음
        - 추가 특성을 사용하기 어려움
        
