### Load Data

In [1]:
import pandas as pd
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares
import numpy as np
import os

#### 1. 영화별 평점이 나와있는 ratings.dat 불러오기

In [2]:

# Read ratings.dat ('::'-separated); column names are supplied explicitly.
ratings_cols = ['user_id','movie_id','ratings','timestamp']
rating_file_path = os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings = pd.read_csv(
    rating_file_path,
    sep='::',
    names=ratings_cols,
    engine='python',
    encoding='ISO-8859-1',
)
# Remember the unfiltered row count so the filtering step can report how much survives.
original_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


#### 2. 평점을 영화 재생횟수로 고려하기 
- 평점 3점 이상(= 3회 이상 시청한 것으로 간주)인 데이터만 남기기
- column name 을 ratings 에서 counts 로 바꾸기

In [3]:
# Keep only the rows whose rating is 3 or higher, then report how much data is left.
keep_mask = ratings['ratings'] >= 3
ratings = ratings[keep_mask]
filtered_data_size = len(ratings)
remaining_ratio = filtered_data_size/original_data_size

print(f'original_data_size: {original_data_size},filtered_data_size: {filtered_data_size}')
print(f'ratio of remaining Data is {remaining_ratio:.2%}')

original_data_size: 1000209,filtered_data_size: 836478
ratio of remaining Data is 83.63%


In [4]:
# Rename by reassignment instead of inplace=True: `ratings` is a boolean-filtered
# slice of the loaded frame, and in-place mutation of such a slice can raise
# SettingWithCopyWarning. Reassigning yields a new, independent frame.
ratings = ratings.rename(columns={'ratings':'counts'})

In [5]:
# Display the renamed column to confirm that 'counts' now exists.
ratings['counts']

0          5
1          3
2          3
3          4
4          5
          ..
1000203    3
1000205    5
1000206    5
1000207    4
1000208    4
Name: counts, Length: 836478, dtype: int64

#### 3. 영화의 제목을 확인 할 수 있는 movies.dat 불러오기

In [6]:

# Read movies.dat ('::'-separated); column names are supplied explicitly.
cols = ['movie_id','title','genre']
movie_file_path = os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
movies = pd.read_csv(
    movie_file_path,
    sep='::',
    names=cols,
    engine='python',
    encoding='ISO-8859-1',
)
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


#### 4. movie_id 를 기준으로 두 dataframe 합치기
- 합친 후 필요한 column 만 가져오기

In [7]:
# Attach title/genre to every rating row by joining on the shared movie_id key.
merged = ratings.merge(movies, on='movie_id')
merged.head()

Unnamed: 0,user_id,movie_id,counts,timestamp,title,genre
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama


In [8]:
# Only these three columns are used from here on.
using_cols = ['user_id','title','counts']
merged = merged.loc[:, using_cols]
merged.head(10)

Unnamed: 0,user_id,title,counts
0,1,One Flew Over the Cuckoo's Nest (1975),5
1,2,One Flew Over the Cuckoo's Nest (1975),5
2,12,One Flew Over the Cuckoo's Nest (1975),4
3,15,One Flew Over the Cuckoo's Nest (1975),4
4,17,One Flew Over the Cuckoo's Nest (1975),5
5,18,One Flew Over the Cuckoo's Nest (1975),4
6,19,One Flew Over the Cuckoo's Nest (1975),5
7,24,One Flew Over the Cuckoo's Nest (1975),5
8,28,One Flew Over the Cuckoo's Nest (1975),3
9,33,One Flew Over the Cuckoo's Nest (1975),5


### Data Analysis 
#### 1. 데이터 확인하기 
- user_id/title 개수 확인 
- 영화별로 시청한 user 확인
- user 별 시청한 영화 개수, 평균개수 등 확인

In [9]:
# Number of distinct users remaining after filtering.
merged['user_id'].nunique()

6039

In [10]:
# Number of distinct movie titles remaining after filtering.
merged['title'].nunique()

3628

In [11]:
# Rows per title, i.e. how many user ratings each movie kept after filtering.
movies_count = (
    merged
    .groupby('title')['user_id']
    .count()
)
# Show the 30 most-rated titles.
(
    movies_count
    .sort_values(ascending=False)
    .head(30)
)

title
American Beauty (1999)                                   3211
Star Wars: Episode IV - A New Hope (1977)                2910
Star Wars: Episode V - The Empire Strikes Back (1980)    2885
Star Wars: Episode VI - Return of the Jedi (1983)        2716
Saving Private Ryan (1998)                               2561
Terminator 2: Judgment Day (1991)                        2509
Silence of the Lambs, The (1991)                         2498
Raiders of the Lost Ark (1981)                           2473
Back to the Future (1985)                                2460
Matrix, The (1999)                                       2434
Jurassic Park (1993)                                     2413
Sixth Sense, The (1999)                                  2385
Fargo (1996)                                             2371
Braveheart (1995)                                        2314
Men in Black (1997)                                      2297
Schindler's List (1993)                                  2257
Pr

In [12]:
# Rows per user, i.e. how many titles each user kept after filtering.
# NOTE(review): this rebinds `movies_count`, which the previous cell used for
# per-title counts; a distinct name would avoid stale-state confusion on re-runs.
movies_count = merged.groupby('user_id')['title'].count()
# Summary statistics (mean, quartiles, max) of titles per user.
movies_count.describe()

count    6039.000000
mean      138.512668
std       156.241599
min         1.000000
25%        38.000000
50%        81.000000
75%       177.000000
max      1968.000000
Name: title, dtype: float64

#### 2. 내가 좋아하는 영화 5 개 data 에 추가하기 

In [13]:
# Five favourite titles to inject as a synthetic user so the model can recommend for them.
fav_movies = ['Usual Suspects, The (1995)','Toy Story (1995)','Ghostbusters (1984)','Jurassic Park (1993)','Back to the Future (1985)']

# One row per favourite, each with the maximum count (5). Sizes follow
# len(fav_movies) so editing the list cannot desynchronise the column lengths.
fav_movielist = pd.DataFrame({
    'user_id': ['Jaewon'] * len(fav_movies),
    'title': fav_movies,
    'counts': [5] * len(fav_movies),
})

# Guard against re-running the cell: only add the rows if 'Jaewon' is not present yet.
if not merged['user_id'].isin(['Jaewon']).any():
  # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
  # ignore_index=True gives the new rows fresh labels instead of duplicating 0..4.
  merged = pd.concat([merged, fav_movielist], ignore_index=True)

merged.tail(10)

Unnamed: 0,user_id,title,counts
836473,5851,One Little Indian (1973),5
836474,5854,Slaughterhouse (1987),4
836475,5854,"Promise, The (Versprechen, Das) (1994)",3
836476,5938,"Five Wives, Three Secretaries and Me (1998)",4
836477,5948,Identification of a Woman (Identificazione di ...,5
0,Jaewon,"Usual Suspects, The (1995)",5
1,Jaewon,Toy Story (1995),5
2,Jaewon,Ghostbusters (1984),5
3,Jaewon,Jurassic Park (1993),5
4,Jaewon,Back to the Future (1985),5


In [14]:
# Distinct users / titles in first-appearance order.
user_unique = merged['user_id'].unique()
movie_unique = merged['title'].unique()

# Map every original identifier to a dense 0-based integer index.
user_to_idx = dict(zip(user_unique, range(len(user_unique))))
movie_to_idx = dict(zip(movie_unique, range(len(movie_unique))))

In [15]:
# Spot-check the mappings: the injected user and one favourite title must both resolve.
print(user_to_idx['Jaewon'])
print(movie_to_idx['Back to the Future (1985)'])

6039
22


#### 3. CSR Matrix 만들기
- 숫자로 indexing 
- csr matrix import 해서 matrix 만들기 

In [16]:
# Replace raw user ids with their dense integer index.
# .get yields None for unknown keys; dropna() removes those rows, so a length
# mismatch means at least one id could not be mapped.
temp_user_data = merged['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(merged):
  print('user_id column indexing done!')
  merged['user_id'] = temp_user_data
else:
  # Message previously read 'user_is' (typo), which made failures hard to search for.
  print('user_id column indexing failed!')

# Same procedure for movie titles.
temp_movie_data = merged['title'].map(movie_to_idx.get).dropna()
if len(temp_movie_data) == len(merged):
  print('title column indexing done!')
  merged['title'] = temp_movie_data
else:
  print('title column indexing failed!')

merged

user_id column indexing done!
title column indexing done!


Unnamed: 0,user_id,title,counts
0,0,0,5
1,1,0,5
2,2,0,4
3,3,0,4
4,4,0,5
...,...,...,...
0,6039,233,5
1,6039,40,5
2,6039,243,5
3,6039,107,5


In [17]:
from scipy.sparse import csr_matrix

# Matrix dimensions: one row per distinct user index, one column per distinct title index.
num_user = merged['user_id'].nunique()
num_movie = merged['title'].nunique()

# Build from values plus (row, column) coordinates.
interaction_coords = (merged.user_id, merged.title)
csr_data = csr_matrix((merged.counts, interaction_coords), shape=(num_user, num_movie))
csr_data

<6040x3628 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

### Train Model 
#### 1. ALS(Alternating Least Squares) 모델 만들기

In [18]:
# Limit BLAS/MKL to a single thread and allow duplicate OpenMP runtimes.
# The variable names must be spelled exactly: the previous 'OPENBLABS_NUM_THREADS'
# and 'KMP_DUPLICATE_LIB)OK' were typos, so those two settings never took effect.
# `os` and AlternatingLeastSquares are already imported in the first cell.
# NOTE(review): these variables are typically read when the numerical runtime is
# loaded; confirm whether this cell should run before the numpy/implicit imports.
os.environ['OPENBLAS_NUM_THREADS'] = '1'
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
os.environ['MKL_NUM_THREADS'] = '1'

In [19]:

# ALS model configuration: 100 latent factors, 20 iterations, CPU only.
als_model = AlternatingLeastSquares(
    factors=100,
    regularization=0.01,
    use_gpu=False,
    iterations=20,
    dtype=np.float32,
)

In [20]:
# Flip the user x item matrix to item x user for training.
csr_data_transpose = csr_data.transpose()
csr_data_transpose

<3628x6040 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

#### 2. model 훈련하기 

In [21]:
# Train on the transposed (item x user) matrix.
# NOTE(review): which orientation fit() expects depends on the installed
# implicit version — confirm before upgrading the library.
als_model.fit(csr_data_transpose)

  0%|          | 0/20 [00:00<?, ?it/s]

#### 3. 훈련된 모델이 예측한 나의 선호도 
- 선호하는 5 가지 영화중 하나
- 새로운 영화

In [22]:
# Resolve the dense indices of the injected user and one of their favourite titles.
jaewon = user_to_idx['Jaewon']
back_to_the_future = movie_to_idx['Back to the Future (1985)']

# Pull the learned latent vectors for that user and that item.
jaewon_vector = als_model.user_factors[jaewon]
back_to_the_future_vector = als_model.item_factors[back_to_the_future]

In [23]:
# Inspect the learned latent vector for the injected user.
jaewon_vector

array([ 1.0039401 ,  0.15477599,  0.8115068 , -0.05312523,  0.06368515,
        0.31912154, -0.21733977, -0.31361791,  0.78007114, -0.48762318,
        0.1568063 ,  0.5083266 ,  0.01336254, -0.14063902, -0.55118644,
        0.2245064 , -0.29520997,  0.42702883, -0.5916421 , -0.00941172,
        0.7875748 ,  0.29752547, -0.09580057,  0.23621115,  0.31500491,
        0.2990652 , -0.5361543 ,  0.43162662, -0.31439376,  0.06487536,
       -0.21490678,  0.3116193 , -0.32982013,  0.46165133,  0.50567454,
       -0.27988636, -0.9332422 ,  0.47494745,  0.5174764 , -0.02913843,
        0.63854545, -0.28704312, -0.13979822, -0.67018163, -0.6479481 ,
       -0.59460217,  0.35669482,  0.02071229, -0.6233336 ,  0.5618634 ,
        0.87823254, -0.74978137, -0.53195506, -0.4214916 ,  0.29653928,
        0.20780286, -0.08009017, -0.56844175,  0.03161869,  0.42029092,
        0.7911956 , -0.62884146, -1.0259118 , -0.6140199 ,  0.1883449 ,
        0.07481588, -0.5637767 , -0.05777462,  0.6807662 , -0.59

In [24]:
# Inspect the learned latent vector for 'Back to the Future (1985)'.
back_to_the_future_vector

array([ 0.04758621,  0.0122603 ,  0.01506807,  0.00964897,  0.01927093,
        0.02216825,  0.02149219,  0.01494613,  0.03002862, -0.00352669,
        0.01439498,  0.01747299,  0.00171465, -0.01200676, -0.02643624,
        0.01660401, -0.01265233, -0.02126778, -0.05035632, -0.00248125,
        0.03242754,  0.00635158,  0.00986439, -0.01364972,  0.01711367,
        0.02874185,  0.00745932,  0.00749963, -0.00029279,  0.01824832,
       -0.00825911,  0.0119293 , -0.02774065,  0.02236015,  0.01290641,
        0.01972909,  0.01248378,  0.02412734,  0.00962327, -0.01145486,
        0.03956484, -0.04121414,  0.01342345,  0.00238173, -0.00519035,
        0.00917786,  0.02108221,  0.00266344,  0.01174145,  0.00503183,
        0.04831843, -0.01667726, -0.01428853, -0.02398839,  0.01723446,
        0.00016732,  0.01137581, -0.03009753,  0.01434244,  0.06763165,
        0.01673939, -0.02924855, -0.00626529, -0.00578025,  0.00732535,
        0.01130605, -0.00319356,  0.02095054,  0.01593   , -0.01

In [25]:

# Dot product of the user and item vectors: the model's preference score for a known favourite.
np.dot(jaewon_vector, back_to_the_future_vector)

0.5841144

In [26]:
# Same score for a title that is not in the injected favourites list.
# NOTE(review): the variable is spelled `man_in_black` although the title is 'Men in Black'.
man_in_black = movie_to_idx['Men in Black (1997)']
man_in_black_vector = als_model.item_factors[man_in_black]
np.dot(jaewon_vector,man_in_black_vector)

0.33695018

#### 4. 내가 좋아하는 영화 와 비슷한 영화 추천 받기

In [29]:
# Ask the model for the 15 items most similar to one favourite title.
favorite_title = 'Toy Story (1995)'
title_id = movie_to_idx[favorite_title]
similar_title = als_model.similar_items(title_id, N=15)
# Displayed as (item index, similarity score) pairs.
similar_title

[(40, 1.0),
 (50, 0.805633),
 (322, 0.6440299),
 (4, 0.59782094),
 (33, 0.5503109),
 (110, 0.49356866),
 (330, 0.45245227),
 (10, 0.4128165),
 (20, 0.40009832),
 (255, 0.39400917),
 (126, 0.3596125),
 (160, 0.34530348),
 (34, 0.323835),
 (478, 0.3073842),
 (32, 0.30479684)]

In [31]:

# Invert the title -> index mapping so model output can be turned back into titles.
idx_to_title = dict(zip(movie_to_idx.values(), movie_to_idx.keys()))
[idx_to_title[movie_idx] for movie_idx, _score in similar_title]

['Toy Story (1995)',
 'Toy Story 2 (1999)',
 'Babe (1995)',
 "Bug's Life, A (1998)",
 'Aladdin (1992)',
 'Groundhog Day (1993)',
 'Lion King, The (1994)',
 'Beauty and the Beast (1991)',
 'Pleasantville (1998)',
 "There's Something About Mary (1998)",
 'Shakespeare in Love (1998)',
 'Forrest Gump (1994)',
 'Mulan (1998)',
 "Wayne's World (1992)",
 'Hercules (1997)']

In [32]:
# Wrap the lookup above in a function so it can be repeated for other titles.
def get_similar_title(title_name: str, n: int = 10) -> list:
    """Return the titles the trained ALS model considers most similar to `title_name`.

    Args:
        title_name: Exact movie title as it appears in `movie_to_idx`.
        n: Number of similar items to request from the model. Defaults to 10,
            the value the library used when no N was passed.

    Returns:
        A list of title strings, ordered as returned by `als_model.similar_items`.

    Raises:
        KeyError: If `title_name` is not a key of `movie_to_idx`.
    """
    title_id = movie_to_idx[title_name]
    neighbours = als_model.similar_items(title_id, N=n)
    # Each entry is an (item index, similarity score) pair; only the index is needed.
    return [idx_to_title[movie_idx] for movie_idx, _score in neighbours]

In [33]:
# Titles similar to another favourite.
get_similar_title('Back to the Future (1985)')

['Back to the Future (1985)',
 "Ferris Bueller's Day Off (1986)",
 'When Harry Met Sally... (1989)',
 'Ghostbusters (1984)',
 'Big (1988)',
 'Back to the Future Part II (1989)',
 'Bull Durham (1988)',
 'Fish Called Wanda, A (1988)',
 'Star Wars: Episode V - The Empire Strikes Back (1980)',
 'Cocoon (1985)']

In [34]:
# Titles similar to another favourite.
get_similar_title('Jurassic Park (1993)')

['Jurassic Park (1993)',
 'Men in Black (1997)',
 'Terminator 2: Judgment Day (1991)',
 'Total Recall (1990)',
 'Matrix, The (1999)',
 'Lost World: Jurassic Park, The (1997)',
 'Schlafes Bruder (Brother of Sleep) (1995)',
 'Braveheart (1995)',
 'Fifth Element, The (1997)',
 'Independence Day (ID4) (1996)']

#### 5. 내가 좋아할 만한 영화 추천 받기

In [36]:
user = user_to_idx['Jaewon']
# recommend() takes the user x item CSR matrix (not the transposed one used for fit).
movie_recommended = als_model.recommend(
    user,
    csr_data,
    N=20,
    filter_already_liked_items=True,
)
movie_recommended

[(38, 0.48053926),
 (50, 0.4567163),
 (124, 0.37991336),
 (92, 0.37378192),
 (224, 0.35160357),
 (175, 0.33695018),
 (4, 0.31004956),
 (15, 0.30960327),
 (13, 0.2966922),
 (110, 0.28251863),
 (121, 0.27358285),
 (5, 0.2646121),
 (51, 0.26310164),
 (201, 0.2587632),
 (160, 0.25363958),
 (19, 0.25144356),
 (488, 0.24932365),
 (126, 0.2435907),
 (169, 0.23674837),
 (99, 0.23549685)]

In [37]:
# Translate the recommended item indices back into titles.
[idx_to_title[movie_idx] for movie_idx, _score in movie_recommended]

['Sixth Sense, The (1999)',
 'Toy Story 2 (1999)',
 'Matrix, The (1999)',
 'Terminator 2: Judgment Day (1991)',
 'L.A. Confidential (1997)',
 'Men in Black (1997)',
 "Bug's Life, A (1998)",
 'Airplane! (1980)',
 "Ferris Bueller's Day Off (1986)",
 'Groundhog Day (1993)',
 'Silence of the Lambs, The (1991)',
 'Princess Bride, The (1987)',
 'Fargo (1996)',
 'Who Framed Roger Rabbit? (1988)',
 'Forrest Gump (1994)',
 'Big (1988)',
 'When Harry Met Sally... (1989)',
 'Shakespeare in Love (1998)',
 'Fish Called Wanda, A (1988)',
 'American Beauty (1999)']

In [40]:
# Ask the model to explain its score for one title for this user.
toy_story = movie_to_idx['Toy Story (1995)']
explain = als_model.explain(user, csr_data, itemid=toy_story)

In [41]:
# How much each title contributed to this recommendation.
[(idx_to_title[movie_idx], contribution) for movie_idx, contribution in explain[1]]

[('Toy Story (1995)', 0.42294844277524163),
 ('Back to the Future (1985)', 0.03952927358861908),
 ('Ghostbusters (1984)', 0.019123383438083143),
 ('Usual Suspects, The (1995)', 0.007162553006364999),
 ('Jurassic Park (1993)', -0.028203235941336113)]