In [76]:
# module import
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize, MinMaxScaler
import math


# Task 1) Item based Collaborative Filtering

In [77]:
# Load the training ratings. The three column Series are reused by later cells.
d = pd.read_csv("./data/ratings_train.csv")
user = d["userId"]
movie = d["movieId"]
rating = d["rating"]

# A (userId, movieId) pair must appear at most once; otherwise there is no single
# rating to place in the matrix cell.
if d.duplicated(subset=["userId", "movieId"]).any():
  raise ValueError("ratings_train.csv contains duplicate (userId, movieId) pairs")

movie_set = sorted(set(movie))
user_set = sorted(set(user))
print(len(movie_set))
print(len(user_set))

# Lookup tables between a movieId and its row position in the item-user matrix.
movie_index = {m: i for i, m in enumerate(movie_set)}
index_movie = {i: m for i, m in enumerate(movie_set)}

# Item-user rating matrix: one row per movie (in movie_set order) and one column per
# user id 1..max(user_set), so user u lives in column u - 1. Cells without a rating stay 0.
# NOTE(review): assumes userId values are integers >= 1 -- confirm against the dataset.
iu_sparse_mat = np.zeros((len(movie_set), max(user_set)))
# Scatter every rating into its (movie row, user column) cell in one vectorized step
# instead of filtering the DataFrame once per movie and once per rated cell.
iu_sparse_mat[movie.map(movie_index).to_numpy(), user.to_numpy() - 1] = rating.to_numpy()
print(iu_sparse_mat.shape)

7357
547
(7357, 671)


### movie - movie 사이의 cosine similarity 계산

In [78]:
# Pairwise cosine similarity between the rows of the item-user matrix, i.e. between
# every pair of movies; the result is square with one row/column per movie.
movie_cossim = cosine_similarity(X=iu_sparse_mat, Y=iu_sparse_mat)
print(movie_cossim.shape)

(7357, 7357)


### 추천 점수 계산

In [79]:
# user_rating: userId -> list of (movieId, rating) pairs, in the order they appear
# in the training data.
user_rating = {}
for u, m, r in zip(user, movie, rating):
  user_rating.setdefault(u, []).append((m, r))
print(user_rating[1])
print(user_rating[2])
print(len(user_rating[1]))
print(len(user_rating[2]))

# user_sim: userId -> list of (movieId, similarity row) pairs, one per movie the user
# rated. Each similarity row is that movie's row of movie_cossim (its similarity to
# every movie), kept in the same order as user_rating[u].
user_sim = {}
for u, rated_pairs in user_rating.items():
  for m, r in rated_pairs:
    user_sim.setdefault(u, []).append((m, movie_cossim[movie_index[m]]))
print(len(user_sim[1]))
print(len(user_sim[2]))



[(31, 2.5), (1029, 3.0), (1061, 3.0), (1129, 2.0), (1172, 4.0), (1263, 2.0), (1287, 2.0), (1293, 2.0), (1339, 3.5), (1343, 2.0), (1371, 2.5), (1405, 1.0), (1953, 4.0), (2105, 4.0), (2150, 3.0), (2193, 2.0), (2294, 2.0), (2455, 2.5), (2968, 1.0), (3671, 3.0)]
[(10, 4.0), (17, 5.0), (39, 5.0), (47, 4.0), (50, 4.0), (52, 3.0), (62, 3.0), (110, 4.0), (144, 3.0), (150, 5.0), (153, 4.0), (161, 3.0), (165, 3.0), (168, 3.0), (185, 3.0), (186, 3.0), (208, 3.0), (222, 5.0), (223, 1.0), (225, 3.0), (235, 3.0), (248, 3.0), (253, 4.0), (261, 4.0), (265, 5.0), (266, 5.0), (272, 3.0), (273, 4.0), (292, 3.0), (296, 4.0), (300, 3.0), (314, 4.0), (317, 2.0), (319, 1.0), (339, 3.0), (349, 4.0), (350, 4.0), (356, 3.0), (357, 3.0), (364, 3.0), (367, 3.0), (370, 2.0), (371, 3.0), (372, 3.0), (377, 3.0), (382, 3.0), (405, 2.0), (410, 3.0), (454, 4.0), (457, 3.0), (468, 4.0), (474, 2.0), (480, 4.0), (485, 3.0), (497, 3.0), (500, 4.0), (508, 4.0), (509, 4.0), (515, 4.0), (527, 4.0), (537, 4.0), (539, 3.0), (55

In [80]:
# Estimated score of every movie for every training user (item-based CF).
item_based_recommender_score = {}
for u in user_set:
  # Ratings the user gave and the similarity rows of those same movies; both lists
  # follow the order of user_rating[u], so entry i of one matches entry i of the other.
  user_rating_mat = [r for _, r in user_rating[u]]
  user_sim_mat = [sim for _, sim in user_sim[u]]
  sim_rows = np.array(user_sim_mat)
  sim_sum = sim_rows.sum(axis=0)
  # Similarity-weighted sum of the user's ratings, divided by (sum of similarities + 1).
  weighted_ratings = sim_rows.T @ np.array(user_rating_mat)
  item_based_recommender_score[u] = weighted_ratings / (sim_sum + 1)
print(len(item_based_recommender_score))
print(item_based_recommender_score[1].shape)

547
(7357,)


### 계산한 추천 점수와 "ratings_val.csv"의 RMSE 계산

In [81]:
# Load the validation ratings; the column Series and val_user_rating are reused by later cells.
vd = pd.read_csv("./data/ratings_val.csv")
user_val = vd["userId"]
movie_val = vd["movieId"]
rating_val = vd["rating"]

# val_user_rating: userId -> list of (movieId, rating) pairs from the validation file.
val_user_rating = {}
for u, m, r in zip(user_val, movie_val, rating_val):
  val_user_rating.setdefault(u, []).append((m, r))

# Per-user RMSE between the item-based estimates and the validation ratings.
item_based_rmse = {}
for u in set(user_val):
  # Users that appear only in the validation file have no estimates: skip them.
  if u not in item_based_recommender_score:
    continue
  diff, n = 0, 0
  for m, r in val_user_rating[u]:
    # Movies that appear only in the validation file have no matrix row: skip them.
    if m not in movie_index:
      continue
    diff += math.pow((r - item_based_recommender_score[u][movie_index[m]]), 2)
    n += 1
  # Nothing could be evaluated for this user. Test the count, not the squared-error
  # sum: a sum of exactly 0 with n > 0 is a perfect prediction and must be recorded.
  if n == 0:
    continue
  item_based_rmse[u] = math.sqrt((diff / n))

print("userID 73의 item-based 기법 RMSE: ", item_based_rmse[73])

userID 73의 item-based 기법 RMSE:  0.7322648432687335


# Task 2) Matrix Factorization

### Rating 점수를 이용하여 Item-User Sparse Matrix 생성
### Item-User Sparse Matrix의 빈 자리 채우기

In [82]:
# Mean training rating of every movie.
movie_rating_li = {}
for m, r in zip(movie, rating):
  movie_rating_li.setdefault(m, []).append(r)

movie_avg_rating = {}
for m, rating_li in movie_rating_li.items():
  movie_avg_rating[m] = (sum(rating_li)/len(rating_li))


# Item-user matrix with the empty cells filled in: rows follow movie_set, columns are
# user ids 1..max(user_set) (user u in column u - 1). Every row starts out as the
# movie's mean rating, then the observed ratings overwrite their own cells.
row_fill = np.array([movie_avg_rating[m] for m in movie_set], dtype=float)
iu_sparse_mat_with_avg = np.empty((len(movie_set), max(user_set)))
iu_sparse_mat_with_avg[:] = row_fill[:, np.newaxis]
# One vectorized scatter replaces the per-movie DataFrame filter and per-cell lookup.
# NOTE(review): assumes userId values are integers >= 1 -- confirm against the dataset.
iu_sparse_mat_with_avg[movie.map(movie_index).to_numpy(), user.to_numpy() - 1] = rating.to_numpy()
print(iu_sparse_mat_with_avg.shape)


(7357, 671)


### Decomposing the rating matrix

In [83]:
# Reduced SVD of the mean-filled item-user matrix.
U, s, Vt = np.linalg.svd(iu_sparse_mat_with_avg, full_matrices=False)
print(U.shape, s.shape, Vt.shape)

# Keep only the K = 400 largest singular values and the matching singular vectors;
# the kept singular values are placed on the diagonal of a square matrix.
k = 400
Uk, sk, Vtk = U[:, :k], np.diag(s[:k]), Vt[:k, :]
print(Uk.shape, sk.shape, Vtk.shape)

(7357, 671) (671,) (671, 671)
(7357, 400) (400, 400) (400, 671)


### 추천 점수 계산

In [84]:
# Rank-k reconstruction Uk * sk * Vtk of the rating matrix; its entries are used as
# the matrix-factorization scores (rows = movies, columns = users).
Uksk = Uk @ sk
matfac_recommend_score = Uksk @ Vtk
print(matfac_recommend_score.shape)

(7357, 671)


### 계산한 추천 점수와 "ratings_val.csv"의 RMSE 계산

In [85]:
# Per-user RMSE of the matrix-factorization scores against the validation ratings.
user_val_set = set(user_val)  # still defined for any cell that reads it
# Users that actually have training ratings. The skip below must test against this
# set: testing against user_val_set can never fire, because the loop iterates over
# exactly those validation users.
train_user_set = set(user_set)

matfac_rmse = {}
for u in val_user_rating:
  # Skip users that appear in the validation file but not in the training file.
  if u not in train_user_set:
    continue
  diff, n = 0, 0
  for m, r in val_user_rating[u]:
    # Skip movies that appear only in the validation file (dict lookup instead of
    # scanning the movie_set list for every rating).
    if m not in movie_index:
      continue
    diff += math.pow((r - matfac_recommend_score[movie_index[m]][u - 1]), 2)
    n += 1
  # Nothing could be evaluated for this user. Test the count, not the squared-error
  # sum: a sum of exactly 0 with n > 0 is a perfect prediction and must be recorded.
  if n == 0:
    continue
  rmse = math.sqrt(diff / n)
  matfac_rmse[u] = rmse
print("userID 73의 matrix factorization 기법 RMSE: ", matfac_rmse[73])


userID 73의 matrix factorization 기법 RMSE:  0.7854786063906751


# Task 3) Optimization

In [86]:
# Task 3 experiment: same pipeline as Task 2 (per-movie mean fill + truncated SVD)
# with a different rank k and an optional weight on the fill value.
# Idea noted for this cell: fill empty cells with slightly less than the movie mean,
# and try k = 100, 200, 300, 500.

# Mean training rating of every movie.
w_movie_rating_li = {}
for m, r in zip(movie, rating):
  w_movie_rating_li.setdefault(m, []).append(r)

w_movie_avg_rating = {}
for m, w_rating_li in w_movie_rating_li.items():
  w_movie_avg_rating[m] = (sum(w_rating_li)/len(w_rating_li))


# Multiplier applied to the movie mean when filling empty cells. 1.0 is a plain mean
# fill; use a value below 1.0 to try the down-weighted fill described above.
fill_weight = 1.0

# Filled item-user matrix: rows follow movie_set, columns are user ids
# 1..max(user_set) (user u in column u - 1). Rows start as the weighted movie mean,
# then the observed ratings overwrite their own cells in one vectorized scatter.
# NOTE(review): assumes userId values are integers >= 1 -- confirm against the dataset.
w_row_fill = np.array([fill_weight * w_movie_avg_rating[m] for m in movie_set], dtype=float)
iu_sparse_mat_with_w_avg = np.empty((len(movie_set), max(user_set)))
iu_sparse_mat_with_w_avg[:] = w_row_fill[:, np.newaxis]
iu_sparse_mat_with_w_avg[movie.map(movie_index).to_numpy(), user.to_numpy() - 1] = rating.to_numpy()
# Report the shape of the matrix built in this cell (not the Task 2 matrix).
print(iu_sparse_mat_with_w_avg.shape)

wU, ws, wVt = np.linalg.svd(iu_sparse_mat_with_w_avg, full_matrices=False)

# Rank to keep; other values tried: 50, 100, 200, 300.
k = 100
Uk_opt = wU[:,:k]
sk_opt = np.diag(ws[:k])  # singular values placed on the diagonal of a square matrix
Vtk_opt = wVt[:k,:]

wUksk = np.matmul(Uk_opt, sk_opt)
optimized_recommend_score = np.matmul(wUksk, Vtk_opt)

# Per-user RMSE of the optimized scores against the validation ratings.
user_val_set = set(user_val)  # still defined for any cell that reads it
# Users that actually have training ratings. The skip below must test against this
# set: testing against user_val_set can never fire, because the loop iterates over
# exactly those validation users.
train_user_set = set(user_set)

optimized_rmse = {}
for u in val_user_rating:
  # Skip users that appear in the validation file but not in the training file.
  if u not in train_user_set:
    continue
  diff, n = 0, 0
  for m, r in val_user_rating[u]:
    # Skip movies that appear only in the validation file (dict lookup instead of
    # scanning the movie_set list for every rating).
    if m not in movie_index:
      continue
    diff += math.pow((r - optimized_recommend_score[movie_index[m]][u - 1]), 2)
    n += 1
  # Nothing could be evaluated for this user. Test the count, not the squared-error
  # sum: a sum of exactly 0 with n > 0 is a perfect prediction and must be recorded.
  if n == 0:
    continue
  rmse = math.sqrt(diff / n)
  optimized_rmse[u] = rmse

print("userID 73의 optimized 기법 RMSE: ", optimized_rmse[73])


(7357, 671)
userID 73의 optimized 기법 RMSE:  0.7846175249981066


In [87]:
# Mean of the per-user RMSE values for the two SVD-based variants.
total_matfac_rmse, total_optimized_rmse = 0, 0
for u in matfac_rmse:
  total_matfac_rmse += matfac_rmse[u]

for u in optimized_rmse:
  total_optimized_rmse += optimized_rmse[u]

# Each total is divided by the number of users in its own dictionary, so the means
# stay correct even if the two dictionaries ever cover different users.
print("optimized based RMSE : ", total_optimized_rmse/len(optimized_rmse))
print("matrix factorization based RMSE : ", total_matfac_rmse/len(matfac_rmse))

optimized based RMSE :  0.9519133010920591
matrix factorization based RMSE :  0.9527140378742626


# Output 파일 추출

In [88]:
# read input.txt 
def read_user_id():
    """Read the (userId, movieId) pairs to score from ``input.txt``.

    Each non-blank line of the file must hold exactly two comma-separated
    integers, ``userId,movieId``. Blank lines (for example a trailing empty line
    at the end of the file) are skipped instead of crashing the unpacking.

    Returns:
        A list of ``[userId, movieId]`` lists of ints, in file order.

    Raises:
        FileNotFoundError: if ``input.txt`` does not exist in the working directory.
        ValueError: if a non-blank line does not have exactly two fields, or a
            field is not an integer.
    """
    user_movie = []
    with open('input.txt', 'r') as f:
        for line in f:
            if not line.strip():
                continue
            u, m = line.split(',')
            # int() tolerates surrounding whitespace and the trailing newline.
            user_movie.append([int(u), int(m)])
    return user_movie

In [89]:
# write to output file output.txt
def write_output(prediction):
    """Write the predicted scores to ``output.txt``.

    Args:
        prediction: mapping from a ``(userId, movieId)`` key to a sequence whose
            first three entries are the scores to report.

    For every key, three lines ``userId,movieId,score`` are written, one per score
    in positions 0, 1 and 2, each score rounded and formatted to four decimals.
    The file is overwritten if it already exists.
    """
    with open('output.txt', 'w') as out_file:
        for u, m in prediction:
            scores = prediction[(u, m)]
            for position in range(3):
                formatted = '{:.4f}'.format(round(scores[position], 4))
                out_file.write(','.join([str(u), str(m), formatted]) + "\n")

In [90]:
def do(ids):
    """Collect the three predicted scores for every requested (userId, movieId) pair.

    Args:
        ids: iterable of ``(userId, movieId)`` pairs.

    Returns:
        A dict mapping ``(userId, movieId)`` to
        ``[item_based_score, matfac_score, optimized_score]``, read from the
        notebook-level score tables. The dict is also printed before returning.

    Raises:
        KeyError: if the user has no entry in ``item_based_recommender_score`` or
            the movie has no entry in ``movie_index``.
    """
    prediction = {}
    for u, m in ids:
        per_user_scores = item_based_recommender_score[u]
        row = movie_index[m]   # row of the movie in the score matrices
        col = u - 1            # the score matrices are indexed with userId - 1
        prediction[(u, m)] = [
            per_user_scores[row],
            matfac_recommend_score[row][col],
            optimized_recommend_score[row][col],
        ]

    print(prediction)
    return prediction

In [91]:
# Driver cell: read the requested (userId, movieId) pairs, look up their scores,
# and write them to output.txt.
user_ids = read_user_id()
print(user_ids)

#### TODO: replace with your implementation ####
# do() returns {(userId, movieId): [item-based, matrix-factorization, optimized]}
# and prints that dict itself, so the print below shows it a second time.
result = do(user_ids)
print(result)
# #### TODO end ####
# Persist the scores: three lines per pair, one for each method.
write_output(result)

[[1, 31], [2, 10], [3, 1235], [4, 10]]
{(1, 31): [1.8642451221205858, 2.4925056638276373, 3.161811060374272], (2, 10): [3.3855568366121793, 4.069587599108663, 3.567260434928069], (3, 1235): [3.3529568019107816, 3.8405112000937374, 3.6108746042864137], (4, 10): [4.274017935907572, 4.02104481622596, 3.8921667071878887]}
{(1, 31): [1.8642451221205858, 2.4925056638276373, 3.161811060374272], (2, 10): [3.3855568366121793, 4.069587599108663, 3.567260434928069], (3, 1235): [3.3529568019107816, 3.8405112000937374, 3.6108746042864137], (4, 10): [4.274017935907572, 4.02104481622596, 3.8921667071878887]}
