In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
from google.colab import drive
drive.mount('/content/drive')

path = "/content/drive/MyDrive/lecture/"
os.listdir(path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


['movies.csv',
 'ratings.csv',
 'tags.csv',
 'links.csv',
 'movie_user.gsheet',
 '제목 없는 문서.gdoc',
 '협업 필터링 - Item-based.ipynb',
 'movies_refined.csv',
 'ratings_refined.csv',
 '무비렌즈 데이터 전처리.ipynb',
 '협업 필터링 - User-based.ipynb',
 '협업_필터링_Item_based_ipynb의_사본.ipynb',
 'Jacquard.csv',
 'Jacquard(mean).csv',
 'user_means(x).ipynb',
 'user_means(x).ipynb의 사본',
 'mode.ipynb',
 'zero.ipynb',
 'euclidean_similarity_movies.csv',
 'euclidean_similarity.ipynb',
 'movie_user.csv',
 'Untitled0.ipynb',
 '협업 필터링 - User-based with dot production.ipynb',
 'Untitled0(x).ipynb',
 'euclidean_mode.ipynb',
 'Untitled1.ipynb',
 'euclidean_mean.ipynb',
 'user_based_cf_prediction.csv',
 'euclidean_knn.ipynb',
 'euclidean_knn.ipynb의 사본']

In [3]:
import pandas as pd
import numpy as np

In [4]:
ratings = pd.read_csv(path + "ratings_refined.csv", usecols=['userId', 'movieId', 'rating'])

In [5]:
movies = pd.read_csv(path + "movies_refined.csv", usecols=['movieId', 'title'])

In [6]:
df = pd.merge(ratings, movies, on='movieId', how='left')

In [7]:
df.columns[df.isna().any()].tolist()

[]

In [8]:
df[df['title'].isnull()]

Unnamed: 0,userId,movieId,rating,title


In [9]:
movie_user = df.pivot_table(values='rating', index='title', columns='userId')
movie_user.to_csv('movie_user.csv')
movie_user

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71,,,,,,,,,,,...,,,,,,,,,,4.0
'Hellboy': The Seeds of Creation,,,,,,,,,,,...,,,,,,,,,,
'Round Midnight,,,,,,,,,,,...,,,,,,,,,,
'Salem's Lot,,,,,,,,,,,...,,,,,,,,,,
'Til There Was You,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eXistenZ,,,,,,,,,,,...,,,5.0,,,,,4.5,,
xXx,,,,,,,,,1.0,,...,,,,,,,,3.5,,2.0
xXx: State of the Union,,,,,,,,,,,...,,,,,,,,,,1.5
¡Three Amigos!,4.0,,,,,,,,,,...,,,,,,,,,,


In [10]:
movie_user.shape

(9413, 610)

In [15]:
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

# KNNImputer 객체 생성
knn_imputer = KNNImputer(n_neighbors=3)

# fit 메서드를 사용하여 결측값을 대체할 패턴 학습
knn_imputer.fit(movie_user)

# transform 메서드를 사용하여 결측값 대체
movie_user_tmp = knn_imputer.transform(movie_user)

# 결과를 데이터프레임으로 변환
movie_user_tmp = pd.DataFrame(movie_user_tmp, columns=movie_user.columns)

# SVD를 사용하여 차원 축소
n_components = 50  # 적절한 차원을 선택 (예: 50)
svd = TruncatedSVD(n_components=n_components, random_state=42)
movie_user_svd = svd.fit_transform(movie_user_tmp)


# 유사도 행렬 계산
euclidean_similarity_matrix = 1 / (1 + euclidean_distances(movie_user_tmp, movie_user_tmp))

# 유사도 행렬 출력
print("Similarity Matrix:")
print(euclidean_similarity_matrix)

Similarity Matrix:
[[1.         0.0538241  0.05339786 ... 0.03862417 0.0437352  0.04893329]
 [0.0538241  1.         0.05559721 ... 0.03513691 0.03982953 0.05340852]
 [0.05339786 0.05559721 1.         ... 0.03813216 0.0447353  0.04800531]
 ...
 [0.03862417 0.03513691 0.03813216 ... 1.         0.04749006 0.03931914]
 [0.0437352  0.03982953 0.0447353  ... 0.04749006 1.         0.0429358 ]
 [0.04893329 0.05340852 0.04800531 ... 0.03931914 0.0429358  1.        ]]


In [16]:
euclidean_similarity_matrix

array([[1.        , 0.0538241 , 0.05339786, ..., 0.03862417, 0.0437352 ,
        0.04893329],
       [0.0538241 , 1.        , 0.05559721, ..., 0.03513691, 0.03982953,
        0.05340852],
       [0.05339786, 0.05559721, 1.        , ..., 0.03813216, 0.0447353 ,
        0.04800531],
       ...,
       [0.03862417, 0.03513691, 0.03813216, ..., 1.        , 0.04749006,
        0.03931914],
       [0.0437352 , 0.03982953, 0.0447353 , ..., 0.04749006, 1.        ,
        0.0429358 ],
       [0.04893329, 0.05340852, 0.04800531, ..., 0.03931914, 0.0429358 ,
        1.        ]])

In [17]:
movie_titles = movie_user.index
movie_titles

Index([''71', ''Hellboy': The Seeds of Creation', ''Round Midnight',
       ''Salem's Lot', ''Til There Was You', ''Tis the Season for Love',
       ''burbs, The', ''night Mother', '(500) Days of Summer',
       '*batteries not included',
       ...
       'Zulu', '[REC]', '[REC]²', '[REC]³ 3 Génesis',
       'anohana: The Flower We Saw That Day - The Movie', 'eXistenZ', 'xXx',
       'xXx: State of the Union', '¡Three Amigos!',
       'À nous la liberté (Freedom for Us)'],
      dtype='object', name='title', length=9413)

In [18]:
movie_similarity = pd.DataFrame(euclidean_similarity_matrix,
                                index=movie_titles, columns=movie_titles)
print(movie_similarity.shape)
movie_similarity

(9413, 9413)


title,'71,'Hellboy': The Seeds of Creation,'Round Midnight,'Salem's Lot,'Til There Was You,'Tis the Season for Love,"'burbs, The",'night Mother,(500) Days of Summer,*batteries not included,...,Zulu,[REC],[REC]²,[REC]³ 3 Génesis,anohana: The Flower We Saw That Day - The Movie,eXistenZ,xXx,xXx: State of the Union,¡Three Amigos!,À nous la liberté (Freedom for Us)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71,1.000000,0.053824,0.053398,0.052298,0.047820,0.047609,0.042627,0.049373,0.048295,0.050488,...,0.048060,0.048931,0.047781,0.044909,0.056197,0.050168,0.040014,0.038624,0.043735,0.048933
'Hellboy': The Seeds of Creation,0.053824,1.000000,0.055597,0.055977,0.053440,0.052408,0.038179,0.048386,0.044627,0.049086,...,0.044199,0.043429,0.042754,0.040542,0.053425,0.046695,0.037790,0.035137,0.039830,0.053409
'Round Midnight,0.053398,0.055597,1.000000,0.052382,0.050330,0.046056,0.043690,0.053026,0.046636,0.049354,...,0.047383,0.046704,0.046310,0.044612,0.054570,0.046815,0.041756,0.038132,0.044735,0.048005
'Salem's Lot,0.052298,0.055977,0.052382,1.000000,0.061955,0.052919,0.043549,0.050879,0.049939,0.053845,...,0.049291,0.048709,0.048314,0.047230,0.063290,0.049388,0.045404,0.041083,0.046882,0.052135
'Til There Was You,0.047820,0.053440,0.050330,0.061955,1.000000,0.047458,0.041054,0.052201,0.045801,0.051952,...,0.047227,0.049531,0.045836,0.044406,0.054185,0.048198,0.042322,0.039440,0.043686,0.050144
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eXistenZ,0.050168,0.046695,0.046815,0.049388,0.048198,0.044927,0.045149,0.047592,0.051869,0.052566,...,0.051479,0.049365,0.045856,0.044516,0.053850,1.000000,0.042845,0.040907,0.047403,0.045694
xXx,0.040014,0.037790,0.041756,0.045404,0.042322,0.038301,0.048159,0.045521,0.046125,0.047438,...,0.049512,0.045201,0.047347,0.049175,0.049234,0.042845,1.000000,0.050393,0.050742,0.041202
xXx: State of the Union,0.038624,0.035137,0.038132,0.041083,0.039440,0.036163,0.048426,0.044563,0.041711,0.043976,...,0.045468,0.044211,0.045145,0.049477,0.045479,0.040907,0.050393,1.000000,0.047490,0.039319
¡Three Amigos!,0.043735,0.039830,0.044735,0.046882,0.043686,0.038720,0.052047,0.047428,0.049346,0.050632,...,0.048922,0.048874,0.046425,0.048090,0.051941,0.047403,0.050742,0.047490,1.000000,0.042936


In [19]:
movie_similarity = pd.DataFrame(euclidean_similarity_matrix,
                                index=movie_titles, columns=movie_titles)

# 샘플 영화에 대한 유사한 영화 찾기
sample_movie = "'71"  # 여기에 샘플 영화 제목을 넣어주세요
similar_movies = movie_similarity[sample_movie].sort_values(ascending=False)[1:10]

# 결과 출력
print(similar_movies)

title
Stake Land                                                                             1.0
From Beyond                                                                            1.0
Perfect Getaway, A                                                                     1.0
Youth of the Beast (Yaju no seishun)                                                   1.0
Down Terrace                                                                           1.0
Devils on the Doorstep (Guizi lai le)                                                  1.0
Reality                                                                                1.0
Deadly Outlaw: Rekka (a.k.a. Violent Fire) (Jitsuroku Andô Noboru kyôdô-den: Rekka)    1.0
Room for Romeo Brass, A                                                                1.0
Name: '71, dtype: float64


In [20]:
from tqdm.notebook import tqdm

# 예측을 위한 함수
def modeling(similarity_matrix, data):
    # 데이터 프레임 초기화 및 사용자 및 영화 정보 추출
    df_pred_all = pd.DataFrame()
    user_ids = sorted(data['userId'].unique())
    all_movies = similarity_matrix.index
    n_movies = len(all_movies)

    for user_id in tqdm(user_ids):
        idx = data[data['userId'] == user_id].index

        # 현재 사용자가 이미 시청한 영화들에 대한 유사도 행렬 추출
        watched_movie_ids = data.loc[idx, 'title'].tolist()
        sub_sim_mat = similarity_matrix.loc[watched_movie_ids]
        sub_sim_mat = sub_sim_mat.T.to_numpy()

        # 유사도에 대한 정규화 항 계산
        sim_N = np.sum(sub_sim_mat, axis=1) + 1  # 1을 더하는 이유 => 유사도의 합이 0이 되는 경우를 방지

        # 현재 사용자가 시청한 영화에 대한 평점 가져오기
        watched_movie_ratings = data.loc[idx, 'rating']
        watched_movie_ratings = np.array(watched_movie_ratings.tolist()).reshape(-1, 1)

        # 가중 평균을 사용하여 평점 예측
        pred_y = np.matmul(sub_sim_mat, watched_movie_ratings).flatten() / sim_N

        user_id_list = [user_id] * n_movies
        cur_pred = pd.DataFrame(zip(user_id_list, all_movies, pred_y),
                                columns=['userId', 'title', 'pred_rating'])

        # 결과 기록
        df_pred_all = pd.concat([df_pred_all, cur_pred], axis=0)

    return df_pred_all

# 예시 사용
df_pred_all = modeling(movie_similarity, df)


  0%|          | 0/610 [00:00<?, ?it/s]

In [21]:
df_pred_all

Unnamed: 0,userId,title,pred_rating
0,1,'71,4.008426
1,1,'Hellboy': The Seeds of Creation,4.008407
2,1,'Round Midnight,4.005082
3,1,'Salem's Lot,4.018014
4,1,'Til There Was You,4.006536
...,...,...,...
9408,610,eXistenZ,3.657706
9409,610,xXx,3.557313
9410,610,xXx: State of the Union,3.525197
9411,610,¡Three Amigos!,3.607663


In [22]:
def get_unwatched_movies(user_id, watched_movies, all_movies):
    unwatched_movies = set(all_movies) - set(watched_movies)
    num_unwatched_movies = len(unwatched_movies)
    return list(unwatched_movies), num_unwatched_movies

# 예시 사용
user_id_to_check = 1  # 사용자 ID를 적절히 설정
watched_movies_by_user = df[df['userId'] == user_id_to_check]['title'].tolist()
all_movies_list = df['title'].unique().tolist()

unwatched_movies_list, num_unwatched_movies = get_unwatched_movies(user_id_to_check, watched_movies_by_user, all_movies_list)

# 처음 10개의 영화만 출력
print(f"총 {num_unwatched_movies}개의 영화가 남아있습니다.")
print(f"사용자 {user_id_to_check}가 아직 보지 않은 영화 목록:")
for movie in unwatched_movies_list[:10]:
    print(movie)

총 9184개의 영화가 남아있습니다.
사용자 1가 아직 보지 않은 영화 목록:
Shadow World
Maltese Falcon, The
Substitute, The
G.I. Jane
My Afternoons with Margueritte (La tête en friche)
Freddy Got Fingered
Passion of Joan of Arc, The (Passion de Jeanne d'Arc, La)
His Secret Life (a.k.a. Ignorant Fairies, The) (Fate ignoranti, Le)
Going Clear: Scientology and the Prison of Belief
Reign of Assassins


In [23]:
def recommend_top_movies_for_user(user_id, df_user, df_pred, top_n=10):
    # 사용자가 이미 본 영화 목록 가져오기
    watched_movies_by_user = df_user[df_user['userId'] == user_id]['title'].tolist()

    # 모든 영화 목록 가져오기
    all_movies_list = df_user['title'].unique().tolist()

    # 사용자가 아직 보지 않은 영화 목록 가져오기
    unwatched_movies_list, _ = get_unwatched_movies(user_id, watched_movies_by_user, all_movies_list)

    # 사용자가 아직 보지 않은 영화 중에서 pred_rating이 높은 상위 N개 영화 추천
    user_pred_data = df_pred[df_pred['userId'] == user_id]
    unwatched_movies_pred = user_pred_data[user_pred_data['title'].isin(unwatched_movies_list)]

    # 평점이 높은 상위 N개 영화 추천
    top_movies = unwatched_movies_pred.sort_values(by='pred_rating', ascending=False).head(top_n)

    return top_movies[['title', 'pred_rating']].values.tolist()

# 사용자가 아직 보지 않은 영화 중에서 pred_rating이 높은 상위 10개 추천 받기
user_id_to_recommend = 1  # 사용자 ID를 적절히 설정
recommended_movies = recommend_top_movies_for_user(user_id_to_recommend, df, df_pred_all, top_n=10)

# 결과 출력
print(f"사용자 {user_id_to_recommend}에게 추천되는 영화:")
for movie, pred_rating in recommended_movies:
    print(f"{movie} - 예측 평점: {pred_rating}")

사용자 1에게 추천되는 영화:
Raise Your Voice - 예측 평점: 4.067771848832606
My Blueberry Nights - 예측 평점: 4.065388054387649
Cold Creek Manor - 예측 평점: 4.0624469625744535
Little Ashes - 예측 평점: 4.0624469625744535
Come See the Paradise - 예측 평점: 4.047955730846759
Young Victoria, The - 예측 평점: 4.041035880433417
Betting on Zero - 예측 평점: 4.040754350581963
Kizumonogatari II: Passionate Blood - 예측 평점: 4.040754350581963
Kizumonogatari Part 1: Tekketsu - 예측 평점: 4.040754350581963
Love Live! The School Idol Movie - 예측 평점: 4.040754350581963


In [24]:
def recommend_top_movies_for_user_input(user_id, df_user, df_pred, top_n=10):
    # 사용자가 이미 본 영화 목록 가져오기
    watched_movies_by_user = df_user[df_user['userId'] == user_id]['title'].tolist()

    # 모든 영화 목록 가져오기
    all_movies_list = df_user['title'].unique().tolist()

    # 사용자가 아직 보지 않은 영화 목록 가져오기
    unwatched_movies_list, _ = get_unwatched_movies(user_id, watched_movies_by_user, all_movies_list)

    # 사용자가 아직 보지 않은 영화 중에서 pred_rating이 높은 상위 N개 영화 추천
    user_pred_data = df_pred[df_pred['userId'] == user_id]
    unwatched_movies_pred = user_pred_data[user_pred_data['title'].isin(unwatched_movies_list)]

    # 평점이 높은 상위 N개 영화 추천
    top_movies = unwatched_movies_pred.sort_values(by='pred_rating', ascending=False).head(top_n)

    return top_movies[['title', 'pred_rating']].values.tolist()

# 사용자로부터 입력 받기
user_id_to_recommend = int(input("사용자 ID를 입력하세요: "))

# 사용자가 아직 보지 않은 영화 중에서 pred_rating이 높은 상위 10개 추천 받기
recommended_movies = recommend_top_movies_for_user_input(user_id_to_recommend, df, df_pred_all, top_n=10)

# 결과 출력
print(f"사용자 {user_id_to_recommend}에게 추천되는 영화:")
for movie, pred_rating in recommended_movies:
    print(f"{movie} - 예측 평점: {pred_rating}")

사용자 ID를 입력하세요: 77
사용자 77에게 추천되는 영화:
Raise Your Voice - 예측 평점: 2.4011953561700015
My Blueberry Nights - 예측 평점: 2.3953321022340166
Little Ashes - 예측 평점: 2.3850405345245176
Cold Creek Manor - 예측 평점: 2.3850405345245176
Come See the Paradise - 예측 평점: 2.369200391866499
Betting on Zero - 예측 평점: 2.362587679147894
Unedited Footage of a Bear - 예측 평점: 2.362587679147894
Kizumonogatari Part 1: Tekketsu - 예측 평점: 2.362587679147894
Love Live! The School Idol Movie - 예측 평점: 2.362587679147894
Black Butler: Book of the Atlantic - 예측 평점: 2.362587679147894


In [25]:
from sklearn.model_selection import train_test_split

_, test_data = train_test_split(df[['userId', 'title', 'rating']],
                test_size=20000, random_state=1234, stratify=df['userId'])
test_data

Unnamed: 0,userId,title,rating
72039,464,Escape from L.A.,3.0
90528,590,Carlito's Way,4.0
20360,135,"Nightmare Before Christmas, The",4.0
42826,288,"Naked Gun 2 1/2: The Smell of Fear, The",4.0
37923,258,Guardians of the Galaxy 2,3.5
...,...,...,...
94958,599,Thor: Ragnarok,3.0
63609,414,Revenge of the Nerds II: Nerds in Paradise,2.0
68727,448,"Wild Bunch, The",5.0
38371,263,Gosford Park,4.5


In [26]:
test_data = pd.merge(test_data, df_pred_all, on=['userId', 'title'], how='left')
test_data

Unnamed: 0,userId,title,rating,pred_rating
0,464,Escape from L.A.,3.0,3.088615
1,590,Carlito's Way,4.0,3.323911
2,135,"Nightmare Before Christmas, The",4.0,3.427994
3,288,"Naked Gun 2 1/2: The Smell of Fear, The",4.0,3.116605
4,258,Guardians of the Galaxy 2,3.5,2.673186
...,...,...,...,...
19995,599,Thor: Ragnarok,3.0,2.668943
19996,414,Revenge of the Nerds II: Nerds in Paradise,2.0,3.212832
19997,448,"Wild Bunch, The",5.0,2.913815
19998,263,Gosford Park,4.5,3.474994


In [27]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

true_y = np.array(test_data['rating'])
pred_y = np.array(test_data['pred_rating'])

mae = mean_absolute_error(y_true=true_y, y_pred=pred_y)
mse = mean_squared_error(y_true=true_y, y_pred=pred_y)
rmse = np.sqrt(mse)

print(f"MAE  : {str(round(mae, 2))}")
print(f"MSE  : {str(round(mse, 2))}")
print(f"RMSE : {str(round(rmse, 2))}")

MAE  : 0.77
MSE  : 0.89
RMSE : 0.95


In [28]:
# 모델이 추천한 영화 개수
n_recommends = sum(1 * (pred_y > 4.0))
n_recommends

267

In [None]:
n_movies = df['title'].nunique()

In [None]:
# Coverage
n_recommends / n_movies

In [None]:
def get_precision(true_y, pred_y, threshold):
    trues = 1 * (true_y >= threshold)
    n_trues = sum(trues)

    pred_trues = 1 * (pred_y >= threshold)

    true_positive = sum(trues + pred_trues == 2)

    precision = true_positive / n_trues

    return precision

In [None]:
get_precision(true_y, pred_y, 3)

In [None]:
df_pred_all.to_csv(path + "user_based_cf_prediction.csv", index=False)