# Project 1 : Content-based Recommendation
## 2022-26772 장석운

In [441]:
# (Optional) install and import additional libraries here (numpy, pandas, etc.)

from collections import Counter, defaultdict
import pandas as pd
import numpy as np
import math
import time
from sklearn.metrics.pairwise import cosine_similarity

## Task 1. Genres를 이용한 movie representation

In [442]:
# A) movies_w_imgurl.csv에서 영화의 Genre 추출
mv_genre_df = pd.read_csv("./data/movies_w_imgurl.csv")
genres = mv_genre_df["genres"]

num_movie = len(genres) # 총 영화 수
genre_counter = Counter()
for genre in genres:
    glist = genre.split('|')
    genre_counter.update(glist)

# B) Genre의 IDF 계산
#    IDF : numpy.log10(전체 movie 개수 / 각 genre별 movie 개수)
genre_idf = {}
for genre in genre_counter:
    genre_idf[genre] = -1 * np.log10(num_movie / genre_counter[genre])
genre_idf = sorted(genre_idf.items())

print("length of genre_idf : ", len(genre_idf))
print("num of movies : ", num_movie)
print(genre_idf)

length of genre_idf :  20
num of movies :  9125
[('(no genres listed)', -2.704960368025206), ('Action', -0.7713043893676589), ('Adventure', -0.9121797000129033), ('Animation', -1.3099253499965757), ('Children', -1.1945643183694983), ('Comedy', -0.4397493403877204), ('Crime', -0.9188401879702872), ('Documentary', -1.2656276741949437), ('Drama', -0.3202486250869238), ('Fantasy', -1.1446551248042451), ('Film-Noir', -1.8363812321614266), ('Horror', -1.0172332797624717), ('IMAX', -1.7755414423109135), ('Musical', -1.3647366513029382), ('Mystery', -1.2254330435396654), ('Romance', -0.7713043893676589), ('Sci-Fi', -1.061507691539019), ('Thriller', -0.7224378798545897), ('War', -1.395566808876423), ('Western', -1.7349235914026493)]


In [443]:
# C) TF-IDF 계산
#    TF == 1
genre_tf_idf = []
for i, genre in enumerate(genres):
    glist = genre.split('|')
    tmp = []
    for ig in genre_idf:
        if ig[0] in glist: # 해당 영화의 장르에 따른 tf-idf 계산
            tmp.append(ig[1])
        else: 
            tmp.append(0)
    genre_tf_idf.append(tmp)

  
print("length of genre_tf_idf : ", len(genre_tf_idf))
for i in range(10):
  print(genre_tf_idf[i])

length of genre_tf_idf :  9125
[0, 0, -0.9121797000129033, -1.3099253499965757, -1.1945643183694983, -0.4397493403877204, 0, 0, 0, -1.1446551248042451, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, -0.9121797000129033, 0, -1.1945643183694983, 0, 0, 0, 0, -1.1446551248042451, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, -0.4397493403877204, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0.7713043893676589, 0, 0, 0, 0]
[0, 0, 0, 0, 0, -0.4397493403877204, 0, 0, -0.3202486250869238, 0, 0, 0, 0, 0, 0, -0.7713043893676589, 0, 0, 0, 0]
[0, 0, 0, 0, 0, -0.4397493403877204, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, -0.7713043893676589, 0, 0, 0, 0, -0.9188401879702872, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0.7224378798545897, 0, 0]
[0, 0, 0, 0, 0, -0.4397493403877204, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0.7713043893676589, 0, 0, 0, 0]
[0, 0, -0.9121797000129033, 0, -1.1945643183694983, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, -0.7713043893676589, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, -0.7713043893676589,

## Task 2. Tag를 이용한 movie representation 보완

In [444]:
# A) tags.csv에서 영화의 tag 추출
mv_tags_df = pd.read_csv("./data/tags.csv")
mvid_tags = mv_tags_df[mv_tags_df.columns[1:3]]
mvid = mv_tags_df["movieId"]
tag = mv_tags_df["tag"]

# 각 tag가 붙은 영화 수, len(tag_counter): 586, 
tag_counter = Counter()
for t in tag:
    r = t.split(', ')
    tag_counter.update(r)

# tag가 달린 영화 수 tmovie_len : 689
tmovie_len = len(set(mvid))
print(tmovie_len)

# B) tag 별 IDF 계산
#    IDF : numpy.log10(전체 movie 개수 / 각 tag에 있는 movie count)
tag_idf = Counter()
for t in tag_counter:
    tag_idf[t] = np.log10( tmovie_len / tag_counter[t])
print(tag_idf)

# tag가 없는 영화는 빈 배열을 append 하도록
no_tag_li = []
for _ in range(len(tag_idf)):
    no_tag_li.append(0)

689
Counter({"sandra 'boring' bullock": 2.8382192219076257, 'dentist': 2.8382192219076257, 'Cambodia': 2.8382192219076257, 'Russian': 2.8382192219076257, 'forgettable': 2.8382192219076257, 'short': 2.8382192219076257, 'dull story': 2.8382192219076257, 'powerpoint': 2.8382192219076257, 'activist': 2.8382192219076257, 'uganda': 2.8382192219076257, 'Ron Howard': 2.8382192219076257, 'b movie': 2.8382192219076257, 'comedt': 2.8382192219076257, 'intense': 2.8382192219076257, 'r:violence': 2.8382192219076257, 'tarantino': 2.8382192219076257, 'tragedy': 2.8382192219076257, 'Predictable': 2.8382192219076257, 'CHRISTIAN': 2.8382192219076257, 'bollywood': 2.8382192219076257, 'No progress': 2.8382192219076257, 'Too slow': 2.8382192219076257, 'Views': 2.8382192219076257, 'cult classic': 2.8382192219076257, 'jack nicholson': 2.8382192219076257, 'Stanley Kubrick': 2.8382192219076257, 'Mindfuck': 2.8382192219076257, 'intelligent': 2.8382192219076257, 'math': 2.8382192219076257, 'complicated': 2.838219

In [445]:
# C) TF-IDF 계산
#    TF는 한 영화에 대해 tag가 몇 번 등록되었는지에 따라 계산 (한 영화에 여러 명의 user가 같은 tag를 달 수 있음)
#    TF공식: n(d, t) / n(d)

# tag가 달린 영화에 어떤 tag가 있는지
movie_tag = {}
for mv in mvid:
    movie_tag[mv] = []

for i in range(len(tag)):
    movie_tag[mvid[i]] = movie_tag[mvid[i]] + tag[i].split(', ')

print(movie_tag)
print(len(movie_tag))

{339: ["sandra 'boring' bullock"], 1955: ['dentist'], 7478: ['Cambodia'], 32892: ['Russian'], 34162: ['forgettable'], 35957: ['short'], 37729: ['dull story'], 45950: ['powerpoint'], 100365: ['activist', 'documentary', 'uganda'], 150: ['Ron Howard', 'tom hanks'], 2174: ['music', 'weird', 'Micheal Keaton'], 8623: ['Steve Martin'], 107999: ['action', 'anime', 'kung fu'], 111624: ['drama', 'indie', 'love'], 130682: ['b movie', 'comedt', 'horror'], 1199: ['Trilogy of the Imagination', 'dystopia'], 2968: ['Gilliam', 'Trilogy of the Imagination'], 4467: ['Trilogy of the Imagination'], 4911: ['Gilliam'], 5909: ['Takashi Miike'], 47465: ['Gilliam'], 296: ['intense', 'r:violence', 'tarantino', 'dark comedy', 'Quentin Tarantino'], 4388: ['parody'], 1131: ['emotional', 'tragedy'], 64957: ['original plot', 'adapted from:book', 'Aging Disorder', 'Brad Pitt', 'cinematography', 'drama', 'original plot', 'slow parts', 'touching', 'Aging', 'Brad Pitt', 'original plot', 'philosophical', 'Brad Pitt', 'cin

In [446]:
# 영화 list
mv_list = mv_genre_df["movieId"]

# TF-IDF 계산
tag_tf_idf = []
mv_mvid_map = {}
idx = 0
for mv in mv_list:
    tmp = []
    if mv in movie_tag:
        for tag in tag_counter:
            tf = len([t for t in movie_tag[mv] if t == tag]) / len(movie_tag[mv]) # TF = (n(d, t) / n(d))
            if tag in movie_tag[mv]:
                tmp.append(tag_idf[tag] * tf)
            else:
                tmp.append(0)
    else:
        tmp = no_tag_li
    tag_tf_idf.append(tmp)
    
    mv_mvid_map[mv] = idx # movie id에 따른 tf-idf index
    idx += 1

print(len(tag_tf_idf))
print(len(tag_tf_idf[0]))
print(len(mv_mvid_map))
print(mv_mvid_map) # movieId와 tf-idf의 index mapping

9125
586
9125
{1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 32: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 48: 46, 49: 47, 50: 48, 52: 49, 53: 50, 54: 51, 55: 52, 57: 53, 58: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 68: 63, 69: 64, 70: 65, 71: 66, 72: 67, 73: 68, 74: 69, 76: 70, 77: 71, 78: 72, 79: 73, 80: 74, 81: 75, 82: 76, 83: 77, 84: 78, 85: 79, 86: 80, 87: 81, 88: 82, 89: 83, 92: 84, 93: 85, 94: 86, 95: 87, 96: 88, 97: 89, 98: 90, 99: 91, 100: 92, 101: 93, 102: 94, 103: 95, 104: 96, 105: 97, 107: 98, 108: 99, 110: 100, 111: 101, 112: 102, 113: 103, 114: 104, 116: 105, 117: 106, 118: 107, 119: 108, 121: 109, 122: 110, 123: 111, 124: 112, 125: 113, 126: 114, 129: 115, 130: 116, 131: 117, 132: 118, 135: 1

In [447]:
# 최종 movie representation
tf_idf = np.concatenate((np.array(genre_tf_idf), np.array(tag_tf_idf)), axis=1)
print(tf_idf.shape)

(9125, 606)


## Task 3. 두 movie 사이의 cosine similarity 계산

In [448]:
mv_cossim = np.zeros(shape=(tf_idf.shape[0], tf_idf.shape[0]))
st = time.time()
mv_cossim = cosine_similarity(tf_idf, tf_idf)
en = time.time()
print('shape', mv_cossim.shape)
# print("total execution time:", en - st , "sec") # in seconds
print("cossine similarity among movies : \n", mv_cossim[:5])
# print("cossine similarity among movies : \n", mv_cossim[5:10])

# def cos_sim(a, b):
#   return np.dot(a, b) / (np.linalg.norm(a) * (np.linalg.norm(b)))
# mv_cossim2 = np.zeros(shape=(tf_idf.shape[0], tf_idf.shape[0]))
# for i in range(tf_idf.shape[0]):
#   for j in range(tf_idf.shape[0]):
#     mv_cossim[i][j] = cos_sim(tf_idf[i], tf_idf[j])
# print(mv_cossim2[:10][:10])

shape (9125, 9125)
cossine similarity among movies : 
 [[1.         0.56825577 0.06551191 ... 0.         0.03904665 0.        ]
 [0.56825577 1.         0.         ... 0.         0.         0.        ]
 [0.06551191 0.         1.         ... 0.         0.14621366 0.        ]
 [0.0616256  0.         0.94067784 ... 0.         0.13753995 0.        ]
 [0.02025191 0.         0.07583509 ... 0.         0.04519951 0.        ]]


## Task 4. Content-based 추천 점수 계산

In [449]:
# mv_cossim로부터 user_sim 추출
ratings = pd.read_csv("./data/ratings.csv")
ratings.head()
user_rate = {}
for i in range(len(ratings["userId"])):
    user = ratings["userId"][i]
    mvid = ratings["movieId"][i]
    user_rating = ratings["rating"][i]
    if user in user_rate:
        user_rate[user].append([ mvid, user_rating])
    else:
        user_rate[user] = [ [ mvid, user_rating] ]

# print(user_rate)
# print(len(user_rate[1]))
# print(len(user_rate[2]))
# print(len(user_rate[3]))

for user in user_rate:
    user_rate[user] = np.array(user_rate[user]).T
print(user_rate[1])
print(user_rate[1].shape)
print("movies n : ", user_rate[1][0].shape) # user 1이 rating한 movie들
print("ratings n : ", user_rate[1][1].shape) # user 1이 rating한 movie의 rate들

[[3.100e+01 1.029e+03 1.061e+03 1.129e+03 1.172e+03 1.263e+03 1.287e+03
  1.293e+03 1.339e+03 1.343e+03 1.371e+03 1.405e+03 1.953e+03 2.105e+03
  2.150e+03 2.193e+03 2.294e+03 2.455e+03 2.968e+03 3.671e+03]
 [2.500e+00 3.000e+00 3.000e+00 2.000e+00 4.000e+00 2.000e+00 2.000e+00
  2.000e+00 3.500e+00 2.000e+00 2.500e+00 1.000e+00 4.000e+00 4.000e+00
  3.000e+00 2.000e+00 2.000e+00 2.500e+00 1.000e+00 3.000e+00]]
(2, 20)
movies n :  (20,)
ratings n :  (20,)


In [450]:
# mv_cossim로부터 user_sim 추출
user_sim = {}
# user_sim_sum = {}
for user in user_rate:
    mvids = np.array(user_rate[user][0])
    sim = []
    for mvid in mvids:
        sim.append(mv_cossim[mv_mvid_map[mvid]]) # 1 x 9125
    user_sim[user] = np.array(sim)
    # user_sim_sum[user] = np.sum(np.array(sim), axis = 0)

print("user sim shape : ", user_sim[1].shape) # user 1이 rating한 movie들의 다른 movie와의 mv_cossim
print("total users : ", len(user_sim))

# print(user_sim_sum[1][0].shape)
# print(user_sim_sum[1].shape) # user 1이 rating한 movie들 mv_cossim 합
# print(len(user_sim_sum))

user sim shape :  (20, 9125)
total users :  671


In [451]:
#user id의 전체 영화에 대한 추정 점수 계산
user_score = []
for user in user_rate:
    score = np.matmul(user_sim[user].T, user_rate[user][1]) / (np.sum(user_sim[user], axis = 0) + 1) # 전체 영화 추정 점수 계산
    user_score.append(score)

print("total user score shape : ", user_score[1].shape) # user의 movie별 score
print("total users : ", len(user_score))

total user score shape :  (9125,)
total users :  671


In [452]:
# read input.txt 
def read_user_id():
    with open('input.txt', 'r') as f:
        return [l.strip() for l in  f.readlines()]

In [453]:
# write to output file output.txt
def write_output(prediction):
    with open('output.txt', 'w') as f:
        for p in prediction:
            f.write(p + "\n")
            # for r in p:
                # f.write(r + "\n")

In [454]:
def do(ids):
    # test implementation
    # prediction = [['{},{},{}'.format(i, 5, 3.5)]*30 for i in ids]
    prediction = []
    for user in ids:
        score = user_score[int(user) - 1]
        mv_score_map = []
        for i, s in enumerate(score):
            mv_score_map.append((i, s))
        sorted_mv_score_map = sorted(mv_score_map, key=lambda x : x[1], reverse=True)   # score 기준으로 내림차순 정렬
        for i, s in sorted_mv_score_map[:30]:   # 상위 30개만 출력
            prediction.append('{},{},{}'.format(int(user), i, round(s, 4))) 
          
    print(prediction)
    return prediction

In [455]:
user_ids = read_user_id()
print(user_ids)
#### TODO: replace with your implementation ####
result = do(user_ids)
print(result)
#### TODO end ####
write_output(result)

['1', '10', '22', '36', '100']
['1,4620,2.239', '1,6761,2.239', '1,7472,2.239', '1,3430,2.2381', '1,3387,2.2371', '1,3994,2.2334', '1,841,2.233', '1,1065,2.233', '1,1874,2.233', '1,2179,2.233', '1,3584,2.233', '1,5549,2.233', '1,6785,2.233', '1,7804,2.233', '1,7964,2.233', '1,262,2.2215', '1,2435,2.2215', '1,3907,2.2215', '1,4391,2.2215', '1,4548,2.2215', '1,6864,2.2215', '1,6902,2.2215', '1,7382,2.2215', '1,7856,2.2215', '1,6997,2.2204', '1,5126,2.2187', '1,7422,2.2187', '1,465,2.2182', '1,281,2.2177', '1,650,2.2177', '10,972,3.5916', '10,2278,3.5916', '10,2369,3.5916', '10,2615,3.5916', '10,3065,3.5916', '10,3848,3.5916', '10,4182,3.5916', '10,4915,3.5916', '10,5246,3.5916', '10,5428,3.5916', '10,5463,3.5916', '10,5729,3.5916', '10,6330,3.5916', '10,6806,3.5916', '10,6972,3.5916', '10,7467,3.5916', '10,9120,3.5916', '10,2953,3.5829', '10,50,3.5765', '10,831,3.5765', '10,876,3.5765', '10,1056,3.5765', '10,1152,3.5765', '10,1286,3.5765', '10,1493,3.5765', '10,1661,3.5765', '10,2378,3.5