## 컨텐츠 기반 필터링

In [1]:
# 라이브러리 설치
!pip install scikit-surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3095407 sha256=29f562360512903e284c41fba143be8b5ab4fa09f2403b7d2c5786837a165587
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


In [2]:
import numpy as np
import pandas as pd
from surprise import Dataset

In [4]:
data = Dataset.load_builtin('ml-100k', prompt=False)
df = pd.DataFrame(data.raw_ratings, columns=['user-id','movie-id','rating','timestamp'])
df.head()

Unnamed: 0,user-id,movie-id,rating,timestamp
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596


In [8]:
df.shape, df['user-id'].nunique(), df['movie-id'].nunique()

((100000, 4), 943, 1682)

##### 1. Adjacent Matrix 생성
- 행: 사용자 id
- 열: 영화 id
- 내용: 평점

In [11]:
raw_data = np.array(data.raw_ratings, dtype=int)
np.min(raw_data, axis=0)

array([        1,         1,         1, 874724710])

In [13]:
# user-id, movie-id 가 0부터 시작하도록 만들어 줌
raw_data[:, :2 ] -=1
raw_data[:5]

array([[      195,       241,         3, 881250949],
       [      185,       301,         3, 891717742],
       [       21,       376,         1, 878887116],
       [      243,        50,         2, 880606923],
       [      165,       345,         1, 886397596]])

#### 1) 본 영화/안 본 영화로만 구분, 1/0

In [20]:
nrows = df['user-id'].nunique()
ncols = df['movie-id'].nunique()
adj_matrix = np.zeros([nrows, ncols], dtype= int)
for user_id, movie_id, _, _ in raw_data:
  adj_matrix[user_id, movie_id] = 1 
adj_matrix[:5]


array([[1, 1, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0]])

In [21]:
# 0번 데이터를 '나'라고 가정
my_id, my_vector = 0, adj_matrix[0]

In [23]:
# 유사도 - 이진 벡터의 내적
# 나와 10, 20번 사용자와의 유사도
np.dot(my_vector, adj_matrix[10]), np.dot(my_vector, adj_matrix[20])

(71, 42)

In [24]:
# 누가 나랑 가장 닮았나
best_score, best_match_id = 0, 0


In [32]:
for i in range(1,len(adj_matrix)):
  dot = np.dot(my_vector, adj_matrix[i])
  if dot > best_score:
    best_score, best_match_id = dot, i
best_score, best_match_id

(183, 275)

In [34]:
# 내가 본 영화 갯수, 가장 닮은 사람이 본 영화 갯수
my_vector.sum(), adj_matrix[best_match_id].sum()

(272, 518)

In [35]:
# 내가 보지 않은 영화중에서 가장 닮은 사람이 본 영화 추천
recommend_list = []
best_vector = adj_matrix[best_match_id]
for i in range(len(my_vector)):
  if my_vector[i] == 0 and best_vector[i] ==1:
    recommend_list.append(i)
len(recommend_list), recommend_list[:10]

(335, [272, 273, 275, 280, 281, 283, 287, 288, 289, 290])

2) 평점 점수를 주는 경우

In [39]:
adj_matrix = np.zeros([nrows, ncols], dtype= int)
for user_id, movie_id, rating, _ in raw_data:
  adj_matrix[user_id, movie_id] = rating
adj_matrix[:5]

array([[5, 3, 4, ..., 0, 0, 0],
       [4, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [4, 3, 0, ..., 0, 0, 0]])

- Case 1) 유클리드 거리

In [44]:

# 누가 나랑 가자 '나'와 닮았나
best_score, best_match_id = 10000000, 0
my_vector = adj_matrix[0]
for i in range(1,len(adj_matrix)):
  euc = np.sqrt(np.sum(np.square(my_vector - adj_matrix[i])))
  if euc < best_score:
    best_score, best_match_id = euc, i
best_score, best_match_id

(55.06359959174482, 737)

In [47]:
# 내가 보지 않은 영화중에서 가장 닮은 사람이 본 영화중 평점이 4 이상인 영화 추천
recommend_list = []
best_vector = adj_matrix[best_match_id]
for i in range(len(my_vector)):
  if my_vector[i] == 0 and best_vector[i] > 4:
    recommend_list.append(i)

len(recommend_list), recommend_list[:10]

(6, [312, 317, 384, 407, 526, 602])

- Case 2) 코사인 유사도

In [48]:
def cos_similarity(v1,v2):
  v1_norm = np.sqrt(np.sum(np.square(v1)))
  v2_norm = np.sqrt(np.sum(np.square(v2)))
  return np.dot(v1, v2) / (v1_norm*v2_norm)

In [49]:
best_score, best_match_id = -1, 0

for i in range(1, len(adj_matrix)):
    sim = cos_similarity(my_vector, adj_matrix[i])
    if sim > best_score:
        best_score, best_match_id = sim, i

best_score, best_match_id

(-1, 0)

In [1]:
recommend_list = []
best_vector = adj_matrix[best_match_id]
for i in range(len(my_vector)):
    if my_vector[i] == 0 and best_vector[i] >= 4:
        recommend_list.append(i)
                
len(recommend_list), recommend_list[:10]
     

NameError: name 'adj_matrix' is not defined