# Project: Movielens 영화 추천
- Recommendation System
- Collaborative Filtering
- Matrix Factorization

In [1]:
import numpy as np
import pandas as pd
import os

from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares

## 1. 데이터 준비 및 전처리

In [2]:
# Read the ratings file ('::'-separated) with explicitly supplied column names.
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
rating_file_path = './data/ml-1m/ratings.dat'
ratings = pd.read_csv(
    rating_file_path,
    sep='::',
    names=ratings_cols,
    engine='python',
    encoding='ISO-8859-1',
)
# Row count before any filtering; a later cell reports how much data remains.
orginal_data_size = ratings.shape[0]
ratings.head()

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
# Drop the timestamp column; keep only the user, movie and rating columns.
ratings = ratings[["user_id", "movie_id", "ratings"]]

In [4]:
# Keep only the rows whose rating is 3 or higher.
is_liked = ratings['ratings'] >= 3
ratings = ratings[is_liked]
filtered_data_size = len(ratings)

# Report how many rows survived the filter, in absolute and relative terms.
remaining_ratio = filtered_data_size / orginal_data_size
print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {remaining_ratio:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [5]:
# Rename the 'ratings' column to 'counts'.
# Rebind the result instead of using inplace=True: `ratings` was produced by a
# boolean filter, and in-place edits on such derived frames can trigger pandas'
# SettingWithCopyWarning.
ratings = ratings.rename(columns={'ratings': 'counts'})
ratings['counts']

0          5
1          3
2          3
3          4
4          5
          ..
1000203    3
1000205    5
1000206    5
1000207    4
1000208    4
Name: counts, Length: 836478, dtype: int64

In [6]:
# Load the movie metadata so that titles can be shown alongside the ratings.
cols = ['movie_id', 'title', 'genre']
movie_file_path = './data/ml-1m/movies.dat'
movies = pd.read_csv(
    movie_file_path,
    sep='::',
    names=cols,
    engine='python',
    encoding='ISO-8859-1',
)
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


## 2. 데이터 탐색

### ratings에 있는 unique한 영화 개수

In [7]:
ratings["movie_id"].nunique()

3628

### ratings에 있는 unique한 사용자 수

In [8]:
ratings["user_id"].nunique()

6039

### 가장 인기 있는 영화 30개 (인기순)

In [9]:
# Join the movie metadata with the ratings on 'movie_id' (the only column the
# two frames share), then keep just the columns used from here on.
merged = pd.merge(movies, ratings, how='inner', on='movie_id')
data = merged[["user_id", "title", "counts"]]

In [10]:
# Count the rows (one per remaining rating) for each title, then show the
# 30 titles with the most ratings.
movie_count = data.groupby("title")["user_id"].count()
movie_count.sort_values(ascending=False).head(30)

title
American Beauty (1999)                                   3211
Star Wars: Episode IV - A New Hope (1977)                2910
Star Wars: Episode V - The Empire Strikes Back (1980)    2885
Star Wars: Episode VI - Return of the Jedi (1983)        2716
Saving Private Ryan (1998)                               2561
Terminator 2: Judgment Day (1991)                        2509
Silence of the Lambs, The (1991)                         2498
Raiders of the Lost Ark (1981)                           2473
Back to the Future (1985)                                2460
Matrix, The (1999)                                       2434
Jurassic Park (1993)                                     2413
Sixth Sense, The (1999)                                  2385
Fargo (1996)                                             2371
Braveheart (1995)                                        2314
Men in Black (1997)                                      2297
Schindler's List (1993)                                  2257
Pr

In [11]:
# Summary statistics of how many titles each user has in `data`
# (only ratings of 3 or higher are still present at this point).
user_count = data.groupby("user_id")["title"].count()
user_count.describe()

count    6039.000000
mean      138.512668
std       156.241599
min         1.000000
25%        38.000000
50%        81.000000
75%       177.000000
max      1968.000000
Name: title, dtype: float64

## 3. 모델 검증을 위한 사용자 초기 정보 세팅
- 내가 선호하는 영화 5가지를 골라 `data`에 추가

In [12]:
# Seed the dataset with five favourite movies for an extra user "soyeong",
# so the trained model can be checked against a known taste profile.
my_favorite = [
    "Toy Story (1995)",
    "Life Is Beautiful (La Vita è bella) (1997)",
    "Truman Show, The (1998)",
    "Toy Story 2 (1999)",
    "Lion King, The (1994)",
]
my_watchlist = pd.DataFrame({
    "user_id": ["soyeong"] * len(my_favorite),
    "title": my_favorite,
    "counts": [5] * len(my_favorite),
})

# Only add the rows once, so re-running the cell does not duplicate them.
if not data["user_id"].isin(["soyeong"]).any():
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    # ignore_index stays at its default so the row labels are exactly the ones
    # DataFrame.append used to produce.
    data = pd.concat([data, my_watchlist])

data.tail(10)

Unnamed: 0,user_id,title,counts
836473,5682,"Contender, The (2000)",3
836474,5812,"Contender, The (2000)",4
836475,5831,"Contender, The (2000)",3
836476,5837,"Contender, The (2000)",4
836477,5998,"Contender, The (2000)",4
0,soyeong,Toy Story (1995),5
1,soyeong,Life Is Beautiful (La Vita è bella) (1997),5
2,soyeong,"Truman Show, The (1998)",5
3,soyeong,Toy Story 2 (1999),5
4,soyeong,"Lion King, The (1994)",5


### 모델에 활용하기 위한 전처리

In [13]:
# Collect the distinct users and movie titles.
user_unique = data['user_id'].unique()
title_unique = data['title'].unique()

# Give every user / title a consecutive integer index starting at 0.
user_to_idx = dict(zip(user_unique, range(len(user_unique))))
title_to_idx = dict(zip(title_unique, range(len(title_unique))))

# Spot-check that the lookups work.
print(user_to_idx['soyeong'])
print(title_to_idx['Toy Story (1995)'])

6039
0


In [14]:
# Replace raw user ids with their integer index. A value missing from the
# mapping becomes NaN and is dropped, so a shorter result signals a failure.
temp_user_data = data['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) != len(data):
    print('user_id column indexing Fail!!')
else:
    print('user_id column indexing OK!!')
    data['user_id'] = temp_user_data

# Same procedure for the movie titles.
temp_title_data = data['title'].map(title_to_idx.get).dropna()
if len(temp_title_data) != len(data):
    print('title column indexing Fail!!')
else:
    print('title column indexing OK!!')
    data['title'] = temp_title_data

data

user_id column indexing OK!!
title column indexing OK!!


Unnamed: 0,user_id,title,counts
0,0,0,5
1,1,0,4
2,2,0,4
3,3,0,5
4,4,0,5
...,...,...,...
0,6039,0,5
1,6039,2084,5
2,6039,1514,5
3,6039,2845,5


## 4. CSR matrix

In [15]:
# Matrix dimensions: one row per distinct user, one column per distinct title.
num_user = data['user_id'].nunique()
num_title = data['title'].nunique()

# Build the user x title sparse matrix; the 'counts' column supplies the values.
row_idx, col_idx = data['user_id'], data['title']
csr_data = csr_matrix((data['counts'], (row_idx, col_idx)), shape=(num_user, num_title))
csr_data

<6040x3628 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

## 5. MF 모델 학습
- ALS (Alternating Least Squares) 사용

In [16]:
# Thread-related environment settings applied before training.
# NOTE(review): these are set after numpy was imported in the first cell;
# BLAS libraries commonly read such variables at load time, so confirm they
# actually take effect here.
os.environ.update({
    'OPENBLAS_NUM_THREADS': '1',
    'KMP_DUPLICATE_LIB_OK': 'True',
    'MKL_NUM_THREADS': '1',
})

In [17]:
# Declare the ALS model.
als_model = AlternatingLeastSquares(
    factors=300,
    regularization=0.01,
    use_gpu=False,
    iterations=30,
    dtype=np.float32,
)

In [18]:
# The ALS model takes an (item x user) matrix as input, so transpose the
# (user x item) matrix built above.
csr_data_transpose = csr_data.transpose()
csr_data_transpose

<3628x6040 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [19]:
# Train the model on the transposed (item x user) matrix.
als_model.fit(csr_data_transpose)

  0%|          | 0/30 [00:00<?, ?it/s]

## 6. 나의 선호도 파악

In [20]:
# Integer indices of the test user and of one of her favourite movies.
soyeong = user_to_idx["soyeong"]
toy_story = title_to_idx["Toy Story (1995)"]

# Their learned latent vectors, taken from the trained model.
soyeong_vector = als_model.user_factors[soyeong]
toy_story_vector = als_model.item_factors[toy_story]

In [21]:
soyeong_vector

array([ 4.95051235e-01, -1.28993496e-01, -3.05350304e-01,  1.84229463e-01,
       -4.71477598e-01,  1.79344997e-01, -6.03165589e-02, -9.10841972e-02,
       -3.04145068e-01,  2.07532924e-02,  1.04689442e-01,  1.01477087e-01,
        7.17327371e-02, -1.08596407e-01,  2.36286148e-01,  3.64758134e-01,
        6.62290275e-01, -4.03824925e-01,  9.14487697e-04,  1.62008554e-01,
        8.61241102e-01,  2.28964344e-01, -1.91724643e-01, -4.36233610e-01,
       -8.38745892e-01, -3.02895516e-01, -1.23656057e-01,  8.70488808e-02,
       -1.17123574e-01, -5.69056332e-01, -1.84489533e-01,  4.41144407e-01,
        2.20572472e-01, -4.44786876e-01,  7.74650514e-01, -1.67753369e-01,
       -1.20355375e-01,  5.91404885e-02, -2.22167507e-01,  3.81161004e-01,
        2.38925088e-02, -5.05977750e-01, -7.69670978e-02, -1.86990470e-01,
       -5.22318073e-02, -3.83317977e-01,  4.50285137e-01,  4.88729104e-02,
        3.21333408e-02,  1.52075425e-01,  2.58306175e-01,  4.29145154e-03,
       -2.77757078e-01,  

In [22]:
toy_story_vector

array([ 2.52168681e-02, -1.58746541e-03, -1.19246291e-02,  2.26705987e-02,
        6.04182307e-04,  3.25270859e-03,  1.23828491e-02, -1.44051742e-02,
        1.31014157e-02, -2.21344884e-02,  1.06206127e-02,  2.24387683e-02,
       -2.43929075e-03,  1.67114718e-03,  1.24537572e-02,  2.98927855e-02,
        3.31712849e-02,  8.11032671e-03,  3.95060629e-02, -9.86091327e-03,
        2.91005224e-02,  3.64965089e-02,  1.21319694e-02,  2.91423197e-03,
       -4.08412814e-02, -2.71479487e-02, -4.45987750e-03,  2.02973224e-02,
       -6.81105210e-03, -3.89305428e-02, -1.94082875e-03,  1.08393263e-02,
        1.32187214e-02, -3.47698443e-02,  1.58060566e-02, -1.27871362e-02,
        1.17232082e-02,  4.22341051e-03,  4.41566750e-04,  1.93621293e-02,
       -1.12591428e-03, -9.56149772e-03,  5.37085580e-03,  8.41210131e-03,
       -1.01212040e-02, -2.42170338e-02, -2.05530250e-03,  1.60018429e-02,
       -7.66364392e-03, -1.64587423e-02,  1.58251394e-02, -5.87527780e-03,
       -4.30398062e-03,  

In [23]:
np.dot(soyeong_vector, toy_story_vector)

0.8949164

## 7. 내가 좋아하는 영화와 비슷한 영화 추천 받기
- 옛날 영화를 잘 몰라서 `movies` 데이터프레임에서 장르까지 가져와서 비교해보았다...

In [24]:
# Reverse lookup: integer index -> movie title.
idx_to_title = {idx: title for title, idx in title_to_idx.items()}


def get_similar_movie(movie_name: str, n: int = 10):
    """Return the titles of the movies most similar to ``movie_name``.

    Args:
        movie_name: Exact movie title; must be a key of ``title_to_idx``.
        n: Number of titles to return. Defaults to 10, the length of the
            lists this function returned before the parameter existed.

    Returns:
        A list of title strings, in the order produced by
        ``als_model.similar_items``.

    Raises:
        KeyError: If ``movie_name`` is not a key of ``title_to_idx``.
    """
    movie_id = title_to_idx[movie_name]
    similar_movie = als_model.similar_items(movie_id, N=n)
    # The first element of each result is the item index; map it back to a title.
    return [idx_to_title[hit[0]] for hit in similar_movie]

In [41]:
get_similar_movie("Toy Story 2 (1999)")

['Toy Story 2 (1999)',
 'Toy Story (1995)',
 "Bug's Life, A (1998)",
 'Tigger Movie, The (2000)',
 'Screwed (2000)',
 'Careful (1992)',
 'Big Bully (1996)',
 'Amityville Curse, The (1990)',
 'Paris, France (1993)',
 'Air Bud: Golden Receiver (1998)']

In [36]:
movies.loc[movies.title.isin(get_similar_movie("Toy Story 2 (1999)"))]

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
74,75,Big Bully (1996),Comedy|Drama
555,559,"Paris, France (1993)",Comedy
741,751,Careful (1992),Comedy
1308,1328,"Amityville Curse, The (1990)",Horror
2083,2152,Air Bud: Golden Receiver (1998),Children's|Comedy
2286,2355,"Bug's Life, A (1998)",Animation|Children's|Comedy
3045,3114,Toy Story 2 (1999),Animation|Children's|Comedy
3218,3287,"Tigger Movie, The (2000)",Animation|Children's
3527,3596,Screwed (2000),Comedy


In [42]:
get_similar_movie("Toy Story 2 (1999)")

['Toy Story 2 (1999)',
 'Toy Story (1995)',
 "Bug's Life, A (1998)",
 'Tigger Movie, The (2000)',
 'Screwed (2000)',
 'Careful (1992)',
 'Big Bully (1996)',
 'Amityville Curse, The (1990)',
 'Paris, France (1993)',
 'Air Bud: Golden Receiver (1998)']

In [35]:
movies.loc[movies.title.isin(get_similar_movie("Toy Story 2 (1999)"))]

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
74,75,Big Bully (1996),Comedy|Drama
555,559,"Paris, France (1993)",Comedy
741,751,Careful (1992),Comedy
1308,1328,"Amityville Curse, The (1990)",Horror
2083,2152,Air Bud: Golden Receiver (1998),Children's|Comedy
2286,2355,"Bug's Life, A (1998)",Animation|Children's|Comedy
3045,3114,Toy Story 2 (1999),Animation|Children's|Comedy
3218,3287,"Tigger Movie, The (2000)",Animation|Children's
3527,3596,Screwed (2000),Comedy


In [43]:
get_similar_movie("Life Is Beautiful (La Vita è bella) (1997)")

['Life Is Beautiful (La Vita è bella) (1997)',
 'Other Side of Sunday, The (Søndagsengler) (1996)',
 'Misérables, Les (1998)',
 'Mary Reilly (1996)',
 'Commandments (1997)',
 'Before the Rain (Pred dozhdot) (1994)',
 'Very Thought of You, The (1998)',
 'Bitter Sugar (Azucar Amargo) (1996)',
 'Snows of Kilimanjaro, The (1952)',
 'Nosferatu a Venezia (1986)']

In [37]:
movies.loc[movies.title.isin(get_similar_movie("Life Is Beautiful (La Vita è bella) (1997)"))]

Unnamed: 0,movie_id,title,genre
90,92,Mary Reilly (1996),Drama|Thriller
212,214,Before the Rain (Pred dozhdot) (1994),Drama
1044,1058,Bitter Sugar (Azucar Amargo) (1996),Drama
1328,1349,Nosferatu a Venezia (1986),Horror
1485,1520,Commandments (1997),Romance
1804,1873,"Misérables, Les (1998)",Drama
2255,2324,Life Is Beautiful (La Vita è bella) (1997),Comedy|Drama
2765,2834,"Very Thought of You, The (1998)",Comedy|Romance
3138,3207,"Snows of Kilimanjaro, The (1952)",Adventure
3747,3817,"Other Side of Sunday, The (Søndagsengler) (1996)",Comedy|Drama


In [44]:
get_similar_movie("Truman Show, The (1998)")

['Truman Show, The (1998)',
 'Fried Green Tomatoes (1991)',
 'Apollo 13 (1995)',
 "Mr. Holland's Opus (1995)",
 'Maybe, Maybe Not (Bewegte Mann, Der) (1994)',
 'Jamaica Inn (1939)',
 'Lady of Burlesque (1943)',
 'Ashes of Time (1994)',
 "Gone Fishin' (1997)",
 "Enfer, L' (1994)"]

In [40]:
movies.loc[movies.title.isin(get_similar_movie("Truman Show, The (1998)"))]

Unnamed: 0,movie_id,title,genre
61,62,Mr. Holland's Opus (1995),Drama
148,150,Apollo 13 (1995),Drama
261,264,"Enfer, L' (1994)",Drama
747,757,Ashes of Time (1994),Drama
849,860,"Maybe, Maybe Not (Bewegte Mann, Der) (1994)",Comedy
859,870,Gone Fishin' (1997),Comedy
946,958,Lady of Burlesque (1943),Comedy|Mystery
1251,1271,Fried Green Tomatoes (1991),Drama
1636,1682,"Truman Show, The (1998)",Drama
2138,2207,Jamaica Inn (1939),Drama


In [45]:
get_similar_movie("Lion King, The (1994)")

['Lion King, The (1994)',
 'Beauty and the Beast (1991)',
 'Aladdin (1992)',
 'Anastasia (1997)',
 'Hunchback of Notre Dame, The (1996)',
 'Mulan (1998)',
 'Antz (1998)',
 'Cinderella (1950)',
 'Tarzan (1999)',
 'Snow White and the Seven Dwarfs (1937)']

In [39]:
movies.loc[movies.title.isin(get_similar_movie("Lion King, The (1994)"))]

Unnamed: 0,movie_id,title,genre
360,364,"Lion King, The (1994)",Animation|Children's|Musical
584,588,Aladdin (1992),Animation|Children's|Comedy|Musical
590,594,Snow White and the Seven Dwarfs (1937),Animation|Children's|Musical
591,595,Beauty and the Beast (1991),Animation|Children's|Musical
773,783,"Hunchback of Notre Dame, The (1996)",Animation|Children's|Musical
1009,1022,Cinderella (1950),Animation|Children's|Musical
1642,1688,Anastasia (1997),Animation|Children's|Musical
1838,1907,Mulan (1998),Animation|Children's
2225,2294,Antz (1998),Animation|Children's
2618,2687,Tarzan (1999),Animation|Children's


- 대체로 비슷한 장르의 영화가 유사도가 높게 나온 것을 확인할 수 있다.
- 시리즈 영화인 토이스토리의 경우 당연하게도 서로가 유사도가 높게 나온 것을 확인할 수 있다.
- 라이온킹의 경우 다른 디즈니 영화가 유사도가 높게 나왔다는 점이 인상 깊다.

## 8. 내가 가장 좋아할 만한 영화들 추천 받기

In [30]:
# Ask the model for 20 recommendations for "soyeong", excluding titles she
# already has in the interaction matrix, and print each title with its score.
user = user_to_idx["soyeong"]
movie_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=True)
for item_idx, score in movie_recommended:
    print(f"{idx_to_title[item_idx]}: {score:.3f}")

Beauty and the Beast (1991): 0.341
Aladdin (1992): 0.322
Good Will Hunting (1997): 0.263
Bug's Life, A (1998): 0.243
Fried Green Tomatoes (1991): 0.183
Iron Giant, The (1999): 0.174
Antz (1998): 0.169
Apollo 13 (1995): 0.165
Three Kings (1999): 0.165
Tarzan (1999): 0.163
Mulan (1998): 0.151
Planet of the Apes (1968): 0.150
Ghostbusters (1984): 0.148
Babe (1995): 0.146
Home Alone (1990): 0.145
Star Wars: Episode I - The Phantom Menace (1999): 0.143
Star Wars: Episode IV - A New Hope (1977): 0.140
Full Monty, The (1997): 0.139
Forrest Gump (1994): 0.134
Quiz Show (1994): 0.133


- 디즈니 영화 포함 애니메이션이 많이 보이는 것 같다. 아무래도 토이스토리1, 2, 라이온킹이 애니메이션이다보니 애니메이션을 많이 추천해준 것 같다.
- 대체로 내가 좋아한다고 넣어 놓은 영화와 비슷한 특성을 가진 영화들이 추천된 것을 확인할 수 있다.

## 회고

- 다양한 값으로 실험한 결과, `factors`나 `iterations`를 너무 키우면 내적은 1에 가까워지지만, 비슷한 영화를 추천받을 때 제대로 추천된 것인지 판단하기 어려워지는 문제가 있다는 것을 발견했다. (예를 들면 토이스토리에 청불 영화를 추천한다든가...) 따라서 적절한 `factors` 값과 `iterations` 값을 설정할 필요가 있다.
- 데이터셋이 오래돼서 그런지 잘 모르는 영화가 더 많아서 결과 분석이 너무 어려웠다. 사실 5개 영화 선정하는 것부터 어려웠다...
- 더 최신 데이터셋으로 분석을 한다면 아는 영화가 많아서 좀 더 결과 분석을 잘 할 수 있을 것 같다는 생각이 들었다.