## 1. 데이터 준비 및 전처리

In [1]:
import pandas as pd
import os

# Load the ratings file from the ml-1m data directory ('::'-separated, column
# names supplied here because the file is read without a header row).
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
rating_file_path = os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings = pd.read_csv(
    rating_file_path,
    sep='::',
    names=ratings_cols,
    engine='python',  # a multi-character separator needs the Python parser engine
    encoding="ISO-8859-1",
)
# Row count before any filtering; compared against the filtered size afterwards.
orginal_data_size = ratings.shape[0]
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [2]:
# Keep only ratings of 3 or higher, then report how much data survived.
ratings = ratings.loc[ratings['rating'] >= 3]
filtered_data_size = ratings.shape[0]

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [3]:
# From here on the rating value is stored under the column name 'count'.
# Access it with ratings['count']: attribute access (ratings.count) would
# resolve to the DataFrame.count method instead of the column.
ratings = ratings.rename(columns={'rating': 'count'})
ratings['count']

0          5
1          3
2          3
3          4
4          5
          ..
1000203    3
1000205    5
1000206    5
1000207    4
1000208    4
Name: count, Length: 836478, dtype: int64

In [4]:
# Load the movie metadata file from the same ml-1m directory; it uses the
# same '::' separator and encoding as the ratings file.
cols = ['movie_id', 'title', 'genre']
movie_file_path = os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
movies = pd.read_csv(
    movie_file_path,
    sep='::',
    names=cols,
    engine='python',
    encoding='ISO-8859-1',
)
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Attach title and genre to every rating row by looking up movie_id in
# `movies`; a left join keeps every rating row and its original index.
ratings = ratings.join(
    movies.set_index('movie_id'),
    on='movie_id',
    how='left',
)
ratings.head()

Unnamed: 0,user_id,movie_id,count,timestamp,title,genre
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance
3,1,3408,4,978300275,Erin Brockovich (2000),Drama
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy


In [6]:
# timestamp and genre are not used by the recommender below, so drop them.
ratings = ratings.drop(['timestamp', 'genre'], axis=1)
ratings.head()

Unnamed: 0,user_id,movie_id,count,title
0,1,1193,5,One Flew Over the Cuckoo's Nest (1975)
1,1,661,3,James and the Giant Peach (1996)
2,1,914,3,My Fair Lady (1964)
3,1,3408,4,Erin Brockovich (2000)
4,1,2355,5,"Bug's Life, A (1998)"


## 2. 데이터 분석

In [7]:
# Report the number of distinct movies, titles and users left after filtering.
for label, column in (
    ('# of movie_id: ', 'movie_id'),
    ('# of title   : ', 'title'),
    ('# of user_id : ', 'user_id'),
):
    print(label, ratings[column].nunique())

# of movie_id:  3628
# of title   :  3628
# of user_id :  6039


In [8]:
# Number of rating rows per title, then the 30 most-rated titles.
movie_count = ratings.groupby(by='title')['user_id'].count()
movie_count.sort_values(ascending=False).iloc[:30]

title
American Beauty (1999)                                   3211
Star Wars: Episode IV - A New Hope (1977)                2910
Star Wars: Episode V - The Empire Strikes Back (1980)    2885
Star Wars: Episode VI - Return of the Jedi (1983)        2716
Saving Private Ryan (1998)                               2561
Terminator 2: Judgment Day (1991)                        2509
Silence of the Lambs, The (1991)                         2498
Raiders of the Lost Ark (1981)                           2473
Back to the Future (1985)                                2460
Matrix, The (1999)                                       2434
Jurassic Park (1993)                                     2413
Sixth Sense, The (1999)                                  2385
Fargo (1996)                                             2371
Braveheart (1995)                                        2314
Men in Black (1997)                                      2297
Schindler's List (1993)                                  2257
Pr

In [9]:
# Number of rated movies per user, summarised with descriptive statistics.
user_count = ratings.groupby(by='user_id')['movie_id'].count()
user_count.describe()

count    6039.000000
mean      138.512668
std       156.241599
min         1.000000
25%        38.000000
50%        81.000000
75%       177.000000
max      1968.000000
Name: movie_id, dtype: float64

## 3. 선호하는 영화 추가

In [10]:
# Case-insensitive literal search for 'story' in the movie titles, used to
# find the movie_id of candidate favourites.
movies.loc[
    movies['title'].str.lower().str.contains('story', regex=False)
]

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
124,126,"NeverEnding Story III, The (1994)",Adventure|Children's|Fantasy
292,295,"Pyromaniac's Love Story, A (1995)",Comedy|Romance
833,844,"Story of Xinghua, The (1993)",Drama
865,876,Police Story 4: Project S (Chao ji ji hua) (1993),Action
886,898,"Philadelphia Story, The (1940)",Comedy|Romance
1124,1140,Entertaining Angels: The Dorothy Day Story (1996),Drama
1608,1654,FairyTale: A True Story (1997),Children's|Drama|Fantasy
1728,1787,Paralyzing Fear: The Story of Polio in America...,Documentary
1782,1851,Leather Jacket Love Story (1997),Drama|Romance


In [11]:
my_favorite_id = [1, 318, 527, 1210, 2571]

# Resolve every favourite movie_id to its title in one lookup.  Selecting
# with .loc raises KeyError right away for an id that is absent from `movies`,
# rather than yielding a title list shorter than the id list and failing
# later with a length-mismatch error when the DataFrame is built.
my_favorite_title = movies.set_index('movie_id')['title'].loc[my_favorite_id].tolist()

# Assume Jeesu watched each favourite movie 7 times.
# The repeat count is derived from the id list so the list can be edited
# without also having to update hard-coded lengths.
num_favorites = len(my_favorite_id)
my_movielist = pd.DataFrame({
    'user_id': ['Jeesu'] * num_favorites,
    'movie_id': my_favorite_id,
    'count': [7] * num_favorites,
    'title': my_favorite_title,
})
my_movielist

Unnamed: 0,user_id,movie_id,count,title
0,Jeesu,1,7,Toy Story (1995)
1,Jeesu,318,7,"Shawshank Redemption, The (1994)"
2,Jeesu,527,7,Schindler's List (1993)
3,Jeesu,1210,7,Star Wars: Episode VI - Return of the Jedi (1983)
4,Jeesu,2571,7,"Matrix, The (1999)"


In [12]:
# Add Jeesu's rows only if they are not present yet, so re-running this cell
# does not duplicate them.
if not ratings['user_id'].isin(['Jeesu']).any():
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat with ignore_index=True produces the same renumbered result.
    ratings = pd.concat([ratings, my_movielist], ignore_index=True)

ratings.tail(10)

Unnamed: 0,user_id,movie_id,count,title
836473,6040,1090,3,Platoon (1986)
836474,6040,1094,5,"Crying Game, The (1992)"
836475,6040,562,5,Welcome to the Dollhouse (1995)
836476,6040,1096,4,Sophie's Choice (1982)
836477,6040,1097,4,E.T. the Extra-Terrestrial (1982)
836478,Jeesu,1,7,Toy Story (1995)
836479,Jeesu,318,7,"Shawshank Redemption, The (1994)"
836480,Jeesu,527,7,Schindler's List (1993)
836481,Jeesu,1210,7,Star Wars: Episode VI - Return of the Jedi (1983)
836482,Jeesu,2571,7,"Matrix, The (1999)"


## 4. CSR matrix 생성

In [13]:
# Distinct users and distinct movie titles present in the ratings.
user_unique = ratings['user_id'].unique()
movie_unique = ratings['title'].unique()

# Map every distinct value to its 0-based position in the unique array;
# these positions become the row/column indices of the sparse matrix.
user_to_idx = dict(zip(user_unique, range(len(user_unique))))
movie_to_idx = dict(zip(movie_unique, range(len(movie_unique))))

In [14]:
# Replace raw user ids by their integer index.  A value missing from the
# mapping yields None, which dropna() removes, so a length mismatch means
# the mapping was incomplete and the column is left untouched.
temp_user_data = ratings['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) != len(ratings):
    print('user_id column indexing Fail!!')
else:
    print('user_id column indexing OK!!')
    ratings['user_id'] = temp_user_data

# Same procedure for the title column.
temp_movie_data = ratings['title'].map(movie_to_idx.get).dropna()
if len(temp_movie_data) != len(ratings):
    print('title column indexing Fail!!')
else:
    print('title column indexing OK!!')
    ratings['title'] = temp_movie_data

ratings

user_id column indexing OK!!
title column indexing OK!!


Unnamed: 0,user_id,movie_id,count,title
0,0,1193,5,0
1,0,661,3,1
2,0,914,3,2
3,0,3408,4,3
4,0,2355,5,4
...,...,...,...,...
836478,6039,1,7,40
836479,6039,318,7,157
836480,6039,527,7,23
836481,6039,1210,7,64


In [15]:
from scipy.sparse import csr_matrix

# Matrix dimensions: one row per distinct user index, one column per
# distinct title index.
num_movie = ratings['title'].nunique()
num_user = ratings['user_id'].nunique()

# Sparse user x movie matrix whose stored values are the 'count' column.
csr_data = csr_matrix(
    (ratings['count'], (ratings['user_id'], ratings['title'])),
    shape=(num_user, num_movie),
)
csr_data

<6040x3628 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

## 5. 모델 설계 및 훈련

In [16]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

# Environment variables for the numeric backends: limit OpenBLAS and MKL to
# one thread each and set KMP_DUPLICATE_LIB_OK.
# NOTE(review): numpy and implicit are imported just above, before these
# are set - confirm the libraries still pick the values up at this point.
os.environ.update({
    'OPENBLAS_NUM_THREADS': '1',
    'KMP_DUPLICATE_LIB_OK': 'True',
    'MKL_NUM_THREADS': '1',
})

In [17]:
# ALS model configuration: 100 latent factors, regularization 0.01,
# 15 training iterations, CPU only, float32 factors.
als_model = AlternatingLeastSquares(
    factors=100,
    regularization=0.01,
    use_gpu=False,
    iterations=15,
    dtype=np.float32,
)

In [18]:
# Transposed copy of the user x movie matrix (movie x user), used for training.
csr_data_transpose = csr_data.transpose()
csr_data_transpose

<3628x6040 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [19]:
# Train the ALS model on the transposed (movie x user) matrix.
# NOTE(review): the orientation fit() expects depends on the installed
# implicit version - confirm against the version in use.
als_model.fit(csr_data_transpose)

  0%|          | 0/15 [00:00<?, ?it/s]

## 6. 훈련된 모델이 예측한 나의 선호도

In [20]:
# Indices of the user 'Jeesu' and of the movie 'Matrix, The (1999)' ...
Jeesu = user_to_idx['Jeesu']
matrix = movie_to_idx['Matrix, The (1999)']
# ... and their learned factor vectors from the trained model.
Jeesu_vector = als_model.user_factors[Jeesu]
matrix_vector = als_model.item_factors[matrix]

In [21]:
# Display the learned factor vector for the user 'Jeesu'.
Jeesu_vector

array([ 0.22239348, -0.3168195 , -0.26046538,  0.6734212 , -0.24431805,
        0.21579967, -1.1013181 ,  0.43396166, -0.33799222, -0.6691148 ,
       -0.19867535, -0.04319663, -1.4126056 , -0.37903595, -0.23075736,
       -0.27348232,  0.5447795 , -1.2467895 ,  0.44403863,  1.241405  ,
        0.31537205, -0.68186474, -0.61624056, -0.47387293, -0.38442814,
       -0.38533205, -0.85754913,  1.0545585 ,  0.38465205, -0.25560248,
       -0.11596093,  0.23253345,  0.972476  ,  0.01864186,  1.0975071 ,
        0.9185473 ,  0.24222507, -0.09363838, -0.7929164 , -0.81402844,
       -0.57889515,  0.48364404,  1.1451944 , -0.63406277, -0.75636846,
        0.31936118, -0.3234471 , -0.07210778,  0.76734936, -0.19731219,
       -1.2607124 ,  0.43144655, -0.37367713,  0.10500027, -0.01447367,
       -0.4948515 ,  0.94411033,  0.44077197,  0.14724547,  1.0377406 ,
       -0.4994526 ,  0.6547329 ,  0.37054935,  0.97640914, -0.8976363 ,
        0.05117136,  0.12144016, -0.80120486,  0.4468796 ,  0.12

In [22]:
# Display the learned factor vector for 'Matrix, The (1999)'.
matrix_vector

array([ 0.01483576,  0.00797235, -0.03874092,  0.00855712,  0.00871295,
        0.02584242, -0.00437622,  0.01051394, -0.00843159,  0.0036249 ,
        0.01684712,  0.00701199, -0.01947048, -0.0016732 , -0.00522568,
        0.01257288,  0.02094661, -0.02168507,  0.01551374,  0.03099568,
        0.02035817, -0.01826327, -0.0085566 , -0.0010863 , -0.01480625,
       -0.00182447, -0.01189582,  0.02059247,  0.00320689, -0.0194292 ,
       -0.00890909,  0.0140198 ,  0.00461244,  0.00821592,  0.02708092,
        0.00792141,  0.02263794, -0.0052562 , -0.0020156 , -0.01354913,
       -0.01951911,  0.00238318,  0.02964512, -0.00414809, -0.01049211,
        0.00152277,  0.01631021,  0.01628448,  0.03671789,  0.01335617,
       -0.01944734,  0.00433741,  0.02753021,  0.0241715 , -0.00945017,
       -0.01353536,  0.02207344,  0.04262766,  0.01495815,  0.00245395,
        0.02691688, -0.00806302,  0.02580137, -0.00182537, -0.02134087,
       -0.02420945,  0.01068325,  0.0119254 ,  0.01359615,  0.01

In [23]:
# Predicted preference of Jeesu for 'Matrix, The (1999)': the dot product of
# the user factor vector and the item factor vector.
Jeesu_vector.dot(matrix_vector)

0.56084496

In [24]:
# Same score for 'Platoon (1986)', a movie that is not in Jeesu's favourites list.
Platoon = movie_to_idx['Platoon (1986)']
Platoon_vector = als_model.item_factors[Platoon]
Jeesu_vector.dot(Platoon_vector)

-0.0076178843

## 7. 내가 좋아하는 영화와 비슷한 영화 추천

In [25]:
# Look up the index of the favourite title.  Despite its name, `movie_id`
# holds the index from movie_to_idx, not the movie_id column of the dataset.
favorite_movie = 'Matrix, The (1999)'
movie_id = movie_to_idx[favorite_movie]
# Ask the trained model for 15 similar items; the output below shows them as
# (index, score) pairs.
similar_movie = als_model.similar_items(movie_id, N=15)
similar_movie 

[(124, 1.0000001),
 (92, 0.7565316),
 (62, 0.6778978),
 (141, 0.5965325),
 (200, 0.5646434),
 (145, 0.55691904),
 (375, 0.5445282),
 (107, 0.53447616),
 (175, 0.49980676),
 (317, 0.46242967),
 (75, 0.44864395),
 (44, 0.43025798),
 (117, 0.40098858),
 (3053, 0.37123764),
 (372, 0.37039033)]

In [26]:
# Reverse lookup: matrix index -> movie title.
idx_to_movie = dict(zip(movie_to_idx.values(), movie_to_idx.keys()))
# Translate the (index, score) pairs of the similar items into titles.
[idx_to_movie[movie_idx] for movie_idx, _score in similar_movie]

['Matrix, The (1999)',
 'Terminator 2: Judgment Day (1991)',
 'Total Recall (1990)',
 'Fugitive, The (1993)',
 'Terminator, The (1984)',
 'Fifth Element, The (1997)',
 'Face/Off (1997)',
 'Jurassic Park (1993)',
 'Men in Black (1997)',
 'Twelve Monkeys (1995)',
 'Hunt for Red October, The (1990)',
 'Star Wars: Episode IV - A New Hope (1977)',
 'Star Wars: Episode V - The Empire Strikes Back (1980)',
 'Sunchaser, The (1996)',
 'X-Men (2000)']

In [27]:
def get_similar_movie(movie_title: str, n: int = 15) -> list:
    """Return the titles of the movies most similar to ``movie_title``.

    Args:
        movie_title: Exact title string used as a key of ``movie_to_idx``,
            e.g. ``'Matrix, The (1999)'``.
        n: Number of similar items to request from the model. Defaults to
            15, the value that was previously hard-coded.

    Returns:
        A list of title strings in the order returned by
        ``als_model.similar_items``.

    Raises:
        KeyError: If ``movie_title`` is not a key of ``movie_to_idx``.
    """
    # This is the index from movie_to_idx, not the dataset's movie_id.
    movie_idx = movie_to_idx[movie_title]
    similar = als_model.similar_items(movie_idx, N=n)
    # Each entry is indexed as entry[0] == item index; map it back to a title.
    return [idx_to_movie[entry[0]] for entry in similar]

In [28]:
# Titles of the movies the model considers similar to 'Matrix, The (1999)'.
get_similar_movie('Matrix, The (1999)')

['Matrix, The (1999)',
 'Terminator 2: Judgment Day (1991)',
 'Total Recall (1990)',
 'Fugitive, The (1993)',
 'Terminator, The (1984)',
 'Fifth Element, The (1997)',
 'Face/Off (1997)',
 'Jurassic Park (1993)',
 'Men in Black (1997)',
 'Twelve Monkeys (1995)',
 'Hunt for Red October, The (1990)',
 'Star Wars: Episode IV - A New Hope (1977)',
 'Star Wars: Episode V - The Empire Strikes Back (1980)',
 'Sunchaser, The (1996)',
 'X-Men (2000)']

In [29]:
# Same query as the previous cell; the output is identical.
get_similar_movie('Matrix, The (1999)')

['Matrix, The (1999)',
 'Terminator 2: Judgment Day (1991)',
 'Total Recall (1990)',
 'Fugitive, The (1993)',
 'Terminator, The (1984)',
 'Fifth Element, The (1997)',
 'Face/Off (1997)',
 'Jurassic Park (1993)',
 'Men in Black (1997)',
 'Twelve Monkeys (1995)',
 'Hunt for Red October, The (1990)',
 'Star Wars: Episode IV - A New Hope (1977)',
 'Star Wars: Episode V - The Empire Strikes Back (1980)',
 'Sunchaser, The (1996)',
 'X-Men (2000)']

## 8. 내가 좋아할 만한 영화 추천

In [30]:
# Row index of the user 'Jeesu' in the user x movie matrix.
user = user_to_idx['Jeesu']

# Top-15 recommendations for that user, excluding movies already in their row.
movie_recommended = als_model.recommend(
    user,
    csr_data,
    N=15,
    filter_already_liked_items=True,
)
movie_recommended

[(48, 0.66096216),
 (121, 0.6216024),
 (117, 0.6206075),
 (50, 0.5479185),
 (44, 0.537216),
 (160, 0.42927063),
 (92, 0.40269148),
 (222, 0.3799078),
 (4, 0.3746876),
 (322, 0.3708324),
 (248, 0.34858036),
 (51, 0.34565386),
 (87, 0.34548852),
 (22, 0.33489493),
 (99, 0.32964498)]

In [31]:
# Translate the recommended (index, score) pairs into movie titles.
[idx_to_movie[movie_idx] for movie_idx, _score in movie_recommended]

['Saving Private Ryan (1998)',
 'Silence of the Lambs, The (1991)',
 'Star Wars: Episode V - The Empire Strikes Back (1980)',
 'Toy Story 2 (1999)',
 'Star Wars: Episode IV - A New Hope (1977)',
 'Forrest Gump (1994)',
 'Terminator 2: Judgment Day (1991)',
 'Pulp Fiction (1994)',
 "Bug's Life, A (1998)",
 'Babe (1995)',
 'Good Will Hunting (1997)',
 'Fargo (1996)',
 'Braveheart (1995)',
 'Back to the Future (1985)',
 'American Beauty (1999)']

In [32]:
# Ask the model how the user's existing movies relate to one candidate
# title ('Jurassic Park (1993)').
recommended = movie_to_idx['Jurassic Park (1993)']
explain = als_model.explain(user, csr_data, itemid=recommended)

# explain[1] holds (item index, score) pairs; show them with titles.
[(idx_to_movie[item_idx], weight) for item_idx, weight in explain[1]]

[('Matrix, The (1999)', 0.21786752456832997),
 ("Schindler's List (1993)", 0.017816973511818737),
 ('Star Wars: Episode VI - Return of the Jedi (1983)', 0.012754411769061443),
 ('Shawshank Redemption, The (1994)', -0.030702013975787052),
 ('Toy Story (1995)', -0.03437061507623886)]

# 회고
추천 시스템이 어떤 과정을 통해 작동하는지 살펴보는 데 의미를 두었습니다.<br>
다음 깃허브를 참고하여 해당 노드의 과제를 작성하였습니다.<br>
https://github.com/LilPark/AIFFEL_project/tree/main/Exploration_09<br>