<a href="https://colab.research.google.com/github/rjsdn2308/machine-learning-practice/blob/main/2505221_movie_recommandation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Movie Lens Dataset Recommandation

600명의 사용자가 9,000편의 영화에 10만 건의 평점과 3,600건의 태그를 적용한 데이터셋

- rating (이번 추천 알고리즘에 사용)
- movies
- links
- tags

## 라이브러리 불러오기

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
import numpy as np

## 데이터 불러오기 및 확인

In [4]:
# 평점
ratings = pd.read_csv('/content/drive/MyDrive/2025-1/ml/machine-learning-practice/ratings.csv')
ratings_with_datatime = ratings.copy()
ratings_with_datatime['date'] = pd.to_datetime(ratings_with_datatime['timestamp'], unit='s')
ratings_with_datatime.head()

Unnamed: 0,userId,movieId,rating,timestamp,date
0,1,1,4.0,964982703,2000-07-30 18:45:03
1,1,3,4.0,964981247,2000-07-30 18:20:47
2,1,6,4.0,964982224,2000-07-30 18:37:04
3,1,47,5.0,964983815,2000-07-30 19:03:35
4,1,50,5.0,964982931,2000-07-30 18:48:51


In [6]:
tags = pd.read_csv('/content/drive/MyDrive/2025-1/ml/machine-learning-practice/tags.csv')
tags_with_datatime = tags.copy()
tags_with_datatime['date'] = pd.to_datetime(tags_with_datatime['timestamp'], unit='s', errors='coerce')
tags_with_datatime.head()

Unnamed: 0,userId,movieId,tag,timestamp,date
0,2,60756,funny,1445714994,2015-10-24 19:29:54
1,2,60756,Highly quotable,1445714996,2015-10-24 19:29:56
2,2,60756,will ferrell,1445714992,2015-10-24 19:29:52
3,2,89774,Boxing story,1445715207,2015-10-24 19:33:27
4,2,89774,MMA,1445715200,2015-10-24 19:33:20


In [7]:
links = pd.read_csv('/content/drive/MyDrive/2025-1/ml/machine-learning-practice/links.csv')
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [8]:
movies = pd.read_csv('/content/drive/MyDrive/2025-1/ml/machine-learning-practice/movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [18]:
#movieId = 60756 이거나 89774인 movies data 출력
movie_id = [60756, 89774]
movies[movies['movieId'].isin(movie_id)]

Unnamed: 0,movieId,title,genres
6801,60756,Step Brothers (2008),Comedy
7697,89774,Warrior (2011),Drama


In [10]:
train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)
train_data

Unnamed: 0,userId,movieId,rating,timestamp
80568,509,7347,3.0,1435994597
50582,326,71462,4.0,1322252335
8344,57,2115,3.0,965798155
99603,610,1127,4.0,1479544102
71701,462,2409,2.0,1174438249
...,...,...,...,...
6265,42,4005,4.0,996259059
54886,364,141,4.0,869443367
76820,480,6867,4.0,1179163171
860,6,981,3.0,845556567


In [11]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
# 사용자-아이템 행렬 생성
train_matrix = train_data.pivot(index='userId', columns='movieId', values='rating')
test_matrix = test_data.pivot(index='userId', columns='movieId', values='rating')
train_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,191005,193565,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


### 결측값 처리
- 평점이 없는 경우 0으로 처리

In [13]:
train_matrix = train_matrix.fillna(0)
test_matrix = test_matrix.fillna(0)
train_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,191005,193565,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,2.5,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 사용자 기반 협업 필터링

In [14]:
# 사용자 유사도
user_similarity = cosine_similarity(train_matrix)  # 사용자 간 코사인 유사도 계산
user_similarity = pd.DataFrame(user_similarity, index=train_matrix.index, columns=train_matrix.index)
user_similarity

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.016314,0.049021,0.165799,0.123392,0.118556,0.112563,0.142135,0.056088,0.012906,...,0.070901,0.152097,0.187324,0.067264,0.151517,0.139042,0.198771,0.232811,0.112174,0.143902
2,0.016314,1.000000,0.000000,0.004627,0.000000,0.013391,0.029067,0.032754,0.000000,0.080739,...,0.170123,0.020395,0.014415,0.000000,0.000000,0.019846,0.016076,0.055610,0.032404,0.075810
3,0.049021,0.000000,1.000000,0.000000,0.005770,0.004833,0.000000,0.005911,0.000000,0.000000,...,0.006401,0.005889,0.015344,0.000000,0.012783,0.008884,0.004642,0.009433,0.000000,0.031309
4,0.165799,0.004627,0.000000,1.000000,0.133565,0.090914,0.094497,0.050417,0.000000,0.021991,...,0.075828,0.090252,0.241155,0.054366,0.081585,0.162277,0.083074,0.107276,0.026720,0.068325
5,0.123392,0.000000,0.005770,0.133565,1.000000,0.238812,0.071386,0.393773,0.000000,0.006245,...,0.050523,0.343953,0.101064,0.159651,0.111464,0.086797,0.073278,0.097040,0.205395,0.053090
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.139042,0.019846,0.008884,0.162277,0.086797,0.086447,0.137372,0.080967,0.053366,0.061267,...,0.136437,0.077495,0.244189,0.061137,0.132016,1.000000,0.120745,0.224829,0.064349,0.159929
607,0.198771,0.016076,0.004642,0.083074,0.073278,0.135438,0.171735,0.159539,0.014172,0.012561,...,0.093158,0.158940,0.156456,0.101872,0.083353,0.120745,1.000000,0.208673,0.096324,0.097743
608,0.232811,0.055610,0.009433,0.107276,0.097040,0.136393,0.238417,0.155110,0.091135,0.051562,...,0.134926,0.141069,0.188459,0.111872,0.154623,0.224829,0.208673,1.000000,0.110371,0.260886
609,0.112174,0.032404,0.000000,0.026720,0.205395,0.181736,0.052096,0.439794,0.000000,0.028483,...,0.028450,0.306228,0.055558,0.181878,0.093744,0.064349,0.096324,0.110371,1.000000,0.057971


In [15]:
def predict_user_based_cf(user_id, movie_id):
    # 유사도와 평점을 기반으로 예측
    if movie_id not in train_matrix.columns:
        return 0  # 영화가 훈련 데이터에 없으면 0 반환
    similar_users = user_similarity[user_id]
    ratings = train_matrix[movie_id]
    weighted_sum = np.dot(similar_users, ratings)
    similarity_sum = np.sum(similar_users[ratings > 0])
    if similarity_sum == 0:
        return 0
    return weighted_sum / similarity_sum

In [16]:
user_based_predictions = []
for user_id, movie_id, rating in test_data[['userId', 'movieId', 'rating']].values:
    pred = predict_user_based_cf(user_id, movie_id)
    user_based_predictions.append(pred)

In [19]:
from re import T
train_matrix[movie_id[1]]
# 이중에서 값이 0이 아닌것만 출력
series = train_matrix[movie_id[1]][train_matrix[movie_id[1]] > 0]
series

Unnamed: 0_level_0,89774
userId,Unnamed: 1_level_1
50,2.5
65,4.0
68,3.0
103,4.5
105,4.0
159,4.0
222,2.5
351,4.0
495,5.0


user_id 103번과 495는 복싱영화의 평점을 높게 평가

In [20]:
user_id = 103 # 사용자 ID 103번
pred_movie_id = movie_id[0]  # 영화 ID 60756
predicted_rating = predict_user_based_cf(user_id, pred_movie_id)

print(f"{user_id}번 사용자의 영화번호 {pred_movie_id}번의 평점 예측: {predicted_rating:.2f}")

103번 사용자의 영화번호 60756번의 평점 예측: 3.58


In [21]:
user_based_rmse = np.sqrt(mean_squared_error(test_data['rating'], user_based_predictions))
print(f"사용자 기반 협업 필터링 RMSE: {user_based_rmse:.4f}")

사용자 기반 협업 필터링 RMSE: 1.1745


## 아이템 기반 협업 필터링

In [22]:
item_similarity = cosine_similarity(train_matrix.T)  # 아이템 간 코사인 유사도 계산
item_similarity = pd.DataFrame(item_similarity, index=train_matrix.columns, columns=train_matrix.columns)
item_similarity

movieId,1,2,3,4,5,6,7,8,9,10,...,191005,193565,193571,193573,193579,193581,193583,193585,193587,193609
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.362258,0.191738,0.000000,0.234845,0.278680,0.184020,0.128930,0.140521,0.329772,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.362258,1.000000,0.185543,0.092463,0.218363,0.228204,0.194008,0.160969,0.058837,0.349398,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.191738,0.185543,1.000000,0.122155,0.223768,0.177748,0.276536,0.246393,0.194060,0.235465,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.092463,0.122155,1.000000,0.035394,0.000000,0.208622,0.189295,0.000000,0.080288,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.234845,0.218363,0.223768,0.035394,1.000000,0.209460,0.328209,0.271967,0.193847,0.180153,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
193583,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
193585,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
193587,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [23]:
def predict_item_based(user_id, movie_id):
    # 유사도와 평점을 기반으로 예측
    if movie_id not in train_matrix.columns:
        return 0  # 영화가 훈련 데이터에 없으면 0 반환
    similar_items = item_similarity[movie_id]
    user_ratings = train_matrix.loc[user_id]
    weighted_sum = np.dot(similar_items, user_ratings)
    similarity_sum = np.sum(similar_items[user_ratings > 0])
    if similarity_sum == 0:
        return 0
    return weighted_sum / similarity_sum

In [24]:
train_matrix.loc[user_id]

Unnamed: 0_level_0,103
movieId,Unnamed: 1_level_1
1,4.0
2,4.0
3,0.0
4,0.0
5,0.0
...,...
193581,0.0
193583,0.0
193585,0.0
193587,0.0


In [25]:
item_based_predictions = []
for user_id, movie_id, rating in test_data[['userId', 'movieId', 'rating']].values:
    pred = predict_item_based(user_id, pred_movie_id)
    item_based_predictions.append(pred)

In [26]:
print(f"{user_id}번 사용자의 영화번호 {pred_movie_id}번의 평점 예측: {predicted_rating:.2f}")

567.0번 사용자의 영화번호 60756번의 평점 예측: 3.58


In [27]:
item_based_rmse = np.sqrt(mean_squared_error(test_data['rating'], item_based_predictions))
print(f"아이템 기반 협업 필터링 RMSE: {item_based_rmse:.4f}")

아이템 기반 협업 필터링 RMSE: 0.9564


## 성능 비교

In [28]:
print(f"사용자 기반 협업 필터링 RMSE: {user_based_rmse:.4f}")
print(f"아이템 기반 협업 필터링 RMSE: {item_based_rmse:.4f}")

사용자 기반 협업 필터링 RMSE: 1.1745
아이템 기반 협업 필터링 RMSE: 0.9564


확인문제 1 : 60756번 영화를 본 사용자 중에 1명을 찾아서 해당 사용자의 89774번 영화의 평점을 예측하세요

확인문제 2 : 위 성능 비교 했을 때 사용자 기반과 아이템 기반 중 더 좋은 성능을 나타내는 방법을 쓰세요
- 사용자 기반보다 아이템 기반이 더 평점 예측이 더 정확하다고 볼 수 있습니다. 그 이유는 아이템 기반 협업 필터링이 사용자 기반 협업 필터링보다 평점 예측에서 더 정확하기 때문입니다.
