<a href="https://colab.research.google.com/github/senasung37/recommendation/blob/main/movie_recommender_surprise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

코드출처: 
- 이수안 컴퓨터 연구소 <br>
https://www.youtube.com/watch?v=6TP51jvjLsE&t=698s
- towardsdatascience.com <br>
https://towardsdatascience.com/how-you-can-build-simple-recommender-systems-with-surprise-b0d32a8e4802

# 라이브러리 & 데이터 불러오기

surprise: 추천 시스템 개발을 위한 라이브러리 

In [None]:
#surprise 라이브러리 설치
!pip install scikit-surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 7.1 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1633980 sha256=edca6f62ae96911c70a0ca4633a22249cbe5cb1e1b6c5d1bda3b87c803a8dfe2
  Stored in directory: /root/.cache/pip/wheels/76/44/74/b498c42be47b2406bd27994e16c5188e337c657025ab400c1c
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.1


In [None]:
#라이브러리 불러오기
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

In [None]:
#Load the data ('ml-100k' is the name of the built-in dataset)
#Each raw rating is a tuple of (user id, movie id, rating, time)
#NOTE(review): prompt=False presumably skips the download confirmation prompt - confirm in the Surprise docs
data = Dataset.load_builtin('ml-100k', prompt=False)
data.raw_ratings[:10]

Trying to download dataset from http://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /root/.surprise_data/ml-100k


[('196', '242', 3.0, '881250949'),
 ('186', '302', 3.0, '891717742'),
 ('22', '377', 1.0, '878887116'),
 ('244', '51', 2.0, '880606923'),
 ('166', '346', 1.0, '886397596'),
 ('298', '474', 4.0, '884182806'),
 ('115', '265', 2.0, '881171488'),
 ('253', '465', 5.0, '891628467'),
 ('305', '451', 3.0, '886324817'),
 ('6', '86', 3.0, '883603013')]

In [None]:
#Set up the model (SVD - a matrix factorization algorithm that predicts the missing values)
model = SVD()

In [None]:
#Validate the model: 5-fold cross-validation reporting RMSE and MAE
cross_validate(model, data, measures=['rmse', 'mae'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9360  0.9338  0.9348  0.9372  0.9360  0.9356  0.0012  
MAE (testset)     0.7379  0.7361  0.7365  0.7381  0.7388  0.7375  0.0010  
Fit time          5.42    6.42    4.10    4.06    4.11    4.82    0.95    
Test time         0.24    0.21    0.16    0.17    0.15    0.19    0.03    


{'fit_time': (5.421935319900513,
  6.41520357131958,
  4.104832649230957,
  4.06043004989624,
  4.110447406768799),
 'test_mae': array([0.73791046, 0.73613901, 0.73653169, 0.73806921, 0.73878262]),
 'test_rmse': array([0.93603889, 0.93377945, 0.93484755, 0.93721403, 0.93598863]),
 'test_time': (0.2397444248199463,
  0.20917224884033203,
  0.1646127700805664,
  0.16756963729858398,
  0.1454753875732422)}

# 컨텐츠 기반 필터링
사용자가 좋아하는 상품과 유사한 상품 추천
- 장점: 많은 수의 사용자 대상으로 쉽게 확장, 다른 사용자들은 관심 없는 틈새 상품도 추천 가능
- 단점: 입력 특성을 직접 설계해야 돼서 도메인 지식 필요, 사용자 기존 관심사항 기반으로만 추천 가능

# 협업 필터링 - 사용자 기반
- 자신과 유사한 다른 사용자의 취향을 기반으로 상품 추천

## dot product를 사용한 추천

In [None]:
import numpy as np
from surprise import Dataset

In [None]:
#Turn the raw ratings into an integer array
#(one row per rating; columns are user id, movie id, rating, time)
raw_data = np.array(data.raw_ratings, dtype=int)
raw_data

array([[      196,       242,         3, 881250949],
       [      186,       302,         3, 891717742],
       [       22,       377,         1, 878887116],
       ...,
       [      276,      1090,         1, 874795795],
       [       13,       225,         2, 882399156],
       [       12,       203,         3, 879959583]])

In [None]:
#Shift user ids and movie ids so that they start at 0.
#The array is rebuilt from data.raw_ratings first, so re-running this cell
#cannot shift the ids a second time (a bare in-place `-= 1` is not idempotent).
raw_data = np.array(data.raw_ratings, dtype=int)
raw_data[:, :2] -= 1
raw_data


array([[      195,       241,         3, 881250949],
       [      185,       301,         3, 891717742],
       [       21,       376,         1, 878887116],
       ...,
       [      275,      1089,         1, 874795795],
       [       12,       224,         2, 882399156],
       [       11,       202,         3, 879959583]])

In [None]:
#Build the matrix shape from the largest user id and movie id.
#n_users / n_movies hold the largest 0-based ids, hence the +1 for each dimension.
n_users = raw_data[:, 0].max()
n_movies = raw_data[:, 1].max()
shape = (n_users + 1, n_movies + 1)
shape

(943, 1682)

In [None]:
#Build the user x movie adjacency matrix (1 where a rating exists, 0 otherwise).
#np.zeros is used because np.ndarray(shape) returns uninitialized memory, so
#cells that are never assigned could hold arbitrary values instead of 0.
adj_matrix = np.zeros(shape, dtype=int)
adj_matrix[raw_data[:, 0], raw_data[:, 1]] = 1
adj_matrix

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]])

In [None]:
#Dot-product similarity search: read a user id, then find the other user
#whose row has the largest dot product with that user's row
my_id = int(input())
my_vector = adj_matrix[my_id]
best_match = -1
best_match_id = -1
best_match_vector = []

for user_id, user_vector in enumerate(adj_matrix):
  if user_id == my_id:
    continue  # never compare the user with themselves
  similarity = np.dot(my_vector, user_vector)
  if similarity > best_match:
    # strict '>' keeps the earliest user when several share the best score
    best_match, best_match_id, best_match_vector = similarity, user_id, user_vector

print("Best Match Similarity: {}, Best Match ID: {}".format(best_match, best_match_id))

In [None]:
#Build the recommendation list (movies I have not seen but the most similar user has)
recommend_lst = [
  movie_idx
  for movie_idx, (mine, theirs) in enumerate(zip(my_vector, best_match_vector))
  if mine < 1. and theirs > 0.
]
print(recommend_lst)

## 유클리드 거리를 사용한 추천

In [None]:
#Euclidean-distance similarity search: read a user id, then find the other
#user whose row is closest to that user's row
my_id = int(input())
my_vector = adj_matrix[my_id]
# Start from +infinity instead of the magic number 999 so the first candidate
# always becomes the best match, however large the distances are.
best_match, best_match_id, best_match_vector = np.inf, -1, []

for user_id, user_vector in enumerate(adj_matrix):
  if my_id != user_id:
    euc_dist = np.sqrt(np.sum(np.square(my_vector - user_vector)))
    if euc_dist < best_match:
      best_match = euc_dist
      best_match_id = user_id
      best_match_vector = user_vector

print("Best Match Euclidean Distance: {}, Best Match ID: {}".format(best_match, best_match_id))

In [None]:
#Build the recommendation list (movies I have not seen but the closest user has)
recommend_lst = [
  movie_idx
  for movie_idx, (mine, theirs) in enumerate(zip(my_vector, best_match_vector))
  if mine < 1. and theirs > 0.
]
print(recommend_lst)

## 코사인 유사도를 사용한 추천

In [None]:
#Cosine similarity function
def compute_cos_sim(v1, v2):
  """Return the cosine similarity between two vectors.

  Args:
    v1: first vector (array-like of numbers).
    v2: second vector, same length as v1.

  Returns:
    dot(v1, v2) / (|v1| * |v2|), or 0.0 when either vector has zero norm.
    Without that guard the division is 0/0, which yields NaN and a
    RuntimeWarning.
  """
  norm1 = np.sqrt(np.sum(np.square(v1)))
  norm2 = np.sqrt(np.sum(np.square(v2)))
  denom = norm1 * norm2
  if denom == 0:
    # An all-zero vector has no direction; treat it as not similar to anything.
    return 0.0
  return np.dot(v1, v2) / denom

In [None]:
#Cosine-similarity search: read a user id, then find the other user whose
#row has the highest cosine similarity with that user's row
my_id = int(input())
my_vector = adj_matrix[my_id]
best_match, best_match_id, best_match_vector = -1, -1, []

for user_id, user_vector in enumerate(adj_matrix):
  if my_id != user_id:
    cos_sim = compute_cos_sim(my_vector, user_vector)
    if cos_sim > best_match:
      best_match = cos_sim
      best_match_id = user_id
      best_match_vector = user_vector

# The value reported here is a cosine similarity, so label it as such
# (the label previously said "Euclidean Distance").
print("Best Match Cosine Similarity: {}, Best Match ID: {}".format(best_match, best_match_id))

In [None]:
#Build the recommendation list (movies I have not seen but the most similar user has)
recommend_lst = [
  movie_idx
  for movie_idx, (mine, theirs) in enumerate(zip(my_vector, best_match_vector))
  if mine < 1. and theirs > 0.
]
print(recommend_lst)

## 평점을 활용한 추천

In [None]:
#Rebuild the user x movie matrix with the rating in each cell (0 where no rating exists).
#np.zeros is used because np.ndarray(shape) returns uninitialized memory, so
#cells that are never assigned could hold arbitrary values instead of 0.
adj_matrix = np.zeros(shape, dtype=int)
adj_matrix[raw_data[:, 0], raw_data[:, 1]] = raw_data[:, 2]
adj_matrix

In [None]:
#Euclidean-distance similarity search on the rating matrix: read a user id,
#then find the other user whose rating row is closest to that user's row
my_id = int(input())
my_vector = adj_matrix[my_id]
# Start from +infinity instead of the magic number 999 so the first candidate
# always becomes the best match, however large the distances are.
best_match, best_match_id, best_match_vector = np.inf, -1, []

for user_id, user_vector in enumerate(adj_matrix):
  if my_id != user_id:
    euc_dist = np.sqrt(np.sum(np.square(my_vector - user_vector)))
    if euc_dist < best_match:
      best_match = euc_dist
      best_match_id = user_id
      best_match_vector = user_vector

print("Best Match Euclidean Distance: {}, Best Match ID: {}".format(best_match, best_match_id))

In [None]:
#Cosine-similarity search on the rating matrix: read a user id, then find the
#other user whose rating row has the highest cosine similarity with that user's row
my_id = int(input())
my_vector = adj_matrix[my_id]
best_match, best_match_id, best_match_vector = -1, -1, []

for user_id, user_vector in enumerate(adj_matrix):
  if my_id != user_id:
    cos_sim = compute_cos_sim(my_vector, user_vector)
    if cos_sim > best_match:
      best_match = cos_sim
      best_match_id = user_id
      best_match_vector = user_vector

# The value reported here is a cosine similarity, so label it as such
# (the label previously said "Euclidean Distance").
print("Best Match Cosine Similarity: {}, Best Match ID: {}".format(best_match, best_match_id))

In [None]:
from surprise import KNNBasic, SVD, SVDpp, NMF
from surprise import Dataset
from surprise.model_selection import cross_validate

## 다양한 모델을 사용한 협업필터링
- SVD, KNN, NMF, SVDpp

In [None]:
#SVD model: 5-fold cross-validation reporting RMSE and MAE, called with n_jobs=4
model = SVD()
cross_validate(model, data, measures=['rmse', 'mae'], cv=5, n_jobs=4, verbose=True)

In [None]:
#KNN model (KNNBasic): 5-fold cross-validation reporting RMSE and MAE, called with n_jobs=4
model = KNNBasic()
cross_validate(model, data, measures=['rmse', 'mae'], cv=5, n_jobs=4, verbose=True)

In [None]:
#NMF model: 5-fold cross-validation reporting RMSE and MAE, called with n_jobs=4
model = NMF()
cross_validate(model, data, measures=['rmse', 'mae'], cv=5, n_jobs=4, verbose=True)

In [None]:
#SVD plus plus model - takes a long time to run, so it is left commented out
#model = SVDpp()
#cross_validate(model, data, measures=['rmse', 'mae'], cv=5, n_jobs=4, verbose=True)

# 하이브리드 모델
- 컨텐츠, 협업 필터링 조합
- 많은 방식 존재
- 여기에서는 협업 필터링으로 임베딩 학습 + 컨텐츠 필터링으로 유사도 기반 추천

## 프로젝트

In [None]:
#Set up the model (a fresh SVD instance)
model = SVD()

In [None]:
#Train the model: build a trainset from the full dataset and fit on it
trainset = data.build_full_trainset()
model.fit(trainset)

In [None]:
#Predict the rating of a specific user for a specific item.
#The raw ids in this dataset are strings (see data.raw_ratings, e.g. ('196', '242', 3.0, ...)),
#so the ids are passed as strings; integer ids would not match any known user or item.
model.predict(uid='10', iid='50')

In [None]:
#Display the current contents of adj_matrix
adj_matrix

In [None]:
#Build a DataFrame of the matrix: one row per user, one column per movie
import pandas as pd
# Column labels are derived from the matrix width instead of hard-coding 1682,
# so the cell keeps working if the matrix shape changes.
cols = range(adj_matrix.shape[1])
df = pd.DataFrame(adj_matrix, columns = cols)
df

In [None]:
#Fill in a predicted rating
def format(my_id, x, movie_id):
  """Return the model's estimated rating of `movie_id` for user `my_id`.

  Note: this name shadows the built-in `format`; it is kept unchanged so
  existing calls keep working.

  Args:
    my_id: raw user id as it appears in the ratings data.
    x: current value of the cell; no prediction is made when it equals the
      string "nan".
    movie_id: raw movie id as it appears in the ratings data.

  Returns:
    The estimated rating (`pred.est`), or None when `x` equals "nan".
  """
  # NOTE(review): comparing against the string "nan" does not detect a float
  # NaN - confirm which missing-value check is intended here.
  if x != "nan":
    # The raw ids in data.raw_ratings are strings, so the ids are cast to str
    # before the lookup; integer ids would not match any known user or item.
    # NOTE(review): adj_matrix/df indices are raw id - 1; callers must pass raw ids.
    pred = model.predict(uid=str(my_id), iid=str(movie_id))
    return pred.est
  return None

In [None]:
#Recommend movies whose predicted rating is 4 or higher
my_id = int(input())

# Rows and columns of df are 0-based (raw id - 1), while the model was fitted on
# the raw ids, which are 1-based strings - hence str(... + 1) for both ids.
user_ratings = df.loc[my_id]
unrated_movie_ids = user_ratings.index[user_ratings <= 0]  # only movies without a rating
predicted = pd.Series(
  [model.predict(uid=str(my_id + 1), iid=str(movie_id + 1)).est for movie_id in unrated_movie_ids],
  index=unrated_movie_ids,
  dtype=float,
)
predicted[predicted >= 4].sort_values(ascending=False)


In [None]:
#Cosine-similarity search: read a user id, then find the other user whose
#row has the highest cosine similarity with that user's row
my_id = int(input())
my_vector = adj_matrix[my_id]
best_match, best_match_id, best_match_vector = -1, -1, []

for user_id, user_vector in enumerate(adj_matrix):
  if my_id != user_id:
    cos_sim = compute_cos_sim(my_vector, user_vector)
    if cos_sim > best_match:
      best_match = cos_sim
      best_match_id = user_id
      best_match_vector = user_vector

# The value reported here is a cosine similarity, so label it as such
# (the label previously said "Euclidean Distance").
print("Best Match Cosine Similarity: {}, Best Match ID: {}".format(best_match, best_match_id))

In [None]:
#Build the recommendation list (movies I have not seen but the most similar user has)
recommend_lst = [
  movie_idx
  for movie_idx, (mine, theirs) in enumerate(zip(my_vector, best_match_vector))
  if mine < 1. and theirs > 0.
]
print(recommend_lst)