In [1]:
import numpy as np
import scipy
import implicit
import pandas as pd
import os

print(np.__version__)
print(scipy.__version__)
print(implicit.__version__)

1.21.4
1.7.1
0.4.8


In [2]:
import os
rating_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding = "ISO-8859-1")
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
# 3점 이상만 남깁니다.
ratings = ratings[ratings['ratings']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [4]:
# ratings 컬럼의 이름을 counts로 바꿉니다.
ratings.rename(columns={'ratings':'counts'}, inplace=True)
ratings['counts'].head()

0    5
1    3
2    3
3    4
4    5
Name: counts, dtype: int64

In [5]:
# 영화 제목을 보기 위해 메타 데이터를 읽어옵니다.
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


## 분석 시작

In [6]:
using_col = ['user_id', 'movie_id', 'counts']
data = ratings[using_col]
data.head(5)

Unnamed: 0,user_id,movie_id,counts
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


1. ratings에 있는 유니크한 영화 개수
2. ratings에 있는 유니크한 사용자 수
3. 가장 인기 있는 영화 30개(인기순)

In [7]:
# 유저 수
print("User num: ", data['user_id'].nunique())
# 영화 수
print("Movie num: ", data['movie_id'].nunique())
# 평점은 어차피 3,4,5
print("Rating: ", data['counts'].nunique())

User num:  6039
Movie num:  3628
Rating:  3


In [8]:
import copy
data_new = data.copy()
data_new['view'] = data_new['movie_id'].value_counts()
data_new = data_new.sort_values(by=['view','counts'],  ascending=False)
# 가장 유저가 많이 본 무비 아이디 정렬해서 pop_movie에 저장 그리고 평점도
data_new

Unnamed: 0,user_id,movie_id,counts,view
2858,23,1258,4,3211.0
260,5,1535,4,2910.0
1196,10,2046,4,2885.0
2028,18,2739,5,2561.0
589,8,377,4,2509.0
...,...,...,...,...
1000174,6040,2745,3,
1000180,6040,1036,3,
1000197,6040,2020,3,
1000198,6040,2021,3,


In [9]:
pop_movie_df = pd.merge(data_new, movies, left_on="movie_id", right_on="movie_id", how="left")
pop_movie_df.head(30)

Unnamed: 0,user_id,movie_id,counts,view,title,genre
0,23,1258,4,3211.0,"Shining, The (1980)",Horror
1,5,1535,4,2910.0,Love! Valour! Compassion! (1997),Drama|Romance
2,10,2046,4,2885.0,Flight of the Navigator (1986),Adventure|Children's|Sci-Fi
3,18,2739,5,2561.0,"Color Purple, The (1985)",Drama
4,8,377,4,2509.0,Speed (1994),Action|Romance|Thriller
5,8,2712,3,2498.0,Eyes Wide Shut (1999),Drama
6,10,1247,3,2473.0,"Graduate, The (1967)",Drama|Romance
7,22,223,4,2434.0,Clerks (1994),Comedy
8,6,1569,4,2413.0,My Best Friend's Wedding (1997),Comedy|Romance
9,22,2005,4,2385.0,"Goonies, The (1985)",Adventure|Children's|Fantasy


In [10]:
pop_movie_df['title'] = pop_movie_df['title'].str.lower()
pop_movie_df['genre'] = pop_movie_df['genre'].str.lower()
pop_movie_df.head(5)

Unnamed: 0,user_id,movie_id,counts,view,title,genre
0,23,1258,4,3211.0,"shining, the (1980)",horror
1,5,1535,4,2910.0,love! valour! compassion! (1997),drama|romance
2,10,2046,4,2885.0,flight of the navigator (1986),adventure|children's|sci-fi
3,18,2739,5,2561.0,"color purple, the (1985)",drama
4,8,377,4,2509.0,speed (1994),action|romance|thriller


In [11]:
pop_movie_df[pop_movie_df['title'].str.contains('lady and the')]
pop_movie_df[pop_movie_df['title'].str.contains('cinderella')]
pop_movie_df[pop_movie_df['title'].str.contains('snow')]

my_favor_id = [2080, 1022, 594]
my_favor_movie = []
for i in my_favor_id:
    condition = pop_movie_df.movie_id==i
    movie = pop_movie_df.loc[condition].title.iloc[0]
    my_favor_movie.append(movie)
my_playlist = pd.DataFrame({'user_id':['jennie97']*3, 'movie_id': my_favor_id, 'counts': [8]*3})

if not data.isin({'user_id':['jennie97']})['user_id'].any():
    data = data.append(my_playlist)

data.tail(8)
print(my_favor_movie)

['lady and the tramp (1955)', 'cinderella (1950)', 'snow white and the seven dwarfs (1937)']


## CSR matrix를 직접 만들어 봅시다.

In [12]:
data = pd.merge(data, movies, left_on="movie_id", right_on="movie_id", how="left")
data['title'] = data['title'].str.lower()

# 고유한 유저, 영화를 찾아내는 코드
user_unique = data['user_id'].unique()
print(user_unique)
movie_unique = data['title'].unique()
print(movie_unique)


[1 2 3 ... 6039 6040 'jennie97']
["one flew over the cuckoo's nest (1975)"
 'james and the giant peach (1996)' 'my fair lady (1964)' ...
 'promise, the (versprechen, das) (1994)'
 'five wives, three secretaries and me (1998)'
 'identification of a woman (identificazione di una donna) (1982)']


In [13]:
user_to_idx = {v:k for k,v in enumerate(user_unique)}
movie_to_idx = {v:k for k,v in enumerate(movie_unique)}
movie_to_idx

{"one flew over the cuckoo's nest (1975)": 0,
 'james and the giant peach (1996)': 1,
 'my fair lady (1964)': 2,
 'erin brockovich (2000)': 3,
 "bug's life, a (1998)": 4,
 'princess bride, the (1987)': 5,
 'ben-hur (1959)': 6,
 'christmas story, a (1983)': 7,
 'snow white and the seven dwarfs (1937)': 8,
 'wizard of oz, the (1939)': 9,
 'beauty and the beast (1991)': 10,
 'gigi (1958)': 11,
 'miracle on 34th street (1947)': 12,
 "ferris bueller's day off (1986)": 13,
 'sound of music, the (1965)': 14,
 'airplane! (1980)': 15,
 'tarzan (1999)': 16,
 'bambi (1942)': 17,
 'awakenings (1990)': 18,
 'big (1988)': 19,
 'pleasantville (1998)': 20,
 'wallace & gromit: the best of aardman animation (1996)': 21,
 'back to the future (1985)': 22,
 "schindler's list (1993)": 23,
 'meet joe black (1998)': 24,
 'pocahontas (1995)': 25,
 'e.t. the extra-terrestrial (1982)': 26,
 'titanic (1997)': 27,
 'ponette (1996)': 28,
 'close shave, a (1995)': 29,
 'antz (1998)': 30,
 'girl, interrupted (1999)':

In [14]:
# user_to_idx.get을 통해 user_id 칼럼의 모든 값을 인덱싱한 series 구하기
temp_user_data = data['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(data):
    print("user_id column indexing is completed!")
    data['user_id'] = temp_user_data
else:
    print("user_id column indexing failed!")

user_id column indexing is completed!


In [15]:
temp_movie_data = data['title'].map(movie_to_idx.get).dropna()
if len(temp_movie_data) == len(data):
    print('movie_id column indexing is completed!')
    data['movie_id'] = temp_movie_data
else:
    print('movie_id column indexing failed!')

movie_id column indexing is completed!


In [16]:
data.head(6)

Unnamed: 0,user_id,movie_id,counts,title,genre
0,0,0,5,one flew over the cuckoo's nest (1975),Drama
1,0,1,3,james and the giant peach (1996),Animation|Children's|Musical
2,0,2,3,my fair lady (1964),Musical|Romance
3,0,3,4,erin brockovich (2000),Drama
4,0,4,5,"bug's life, a (1998)",Animation|Children's|Comedy
5,0,5,3,"princess bride, the (1987)",Action|Adventure|Comedy|Romance


In [19]:
from scipy.sparse import csr_matrix

num_user = data['user_id'].nunique()
print(num_user)
num_movie = data['title'].unique()
print(num_movie)

csr_data = csr_matrix((data.counts, (data.user_id, data.movie_id)))
csr_data

6040
["one flew over the cuckoo's nest (1975)"
 'james and the giant peach (1996)' 'my fair lady (1964)' ...
 'promise, the (versprechen, das) (1994)'
 'five wives, three secretaries and me (1998)'
 'identification of a woman (identificazione di una donna) (1982)']


<6040x3628 sparse matrix of type '<class 'numpy.int64'>'
	with 836481 stored elements in Compressed Sparse Row format>

In [20]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

# implicit 라이브러리에서 권장하고 있는 부분입니다. 학습 내용과는 무관합니다.
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [21]:
als_model = AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=False, iterations=20, dtype=np.float32)

In [22]:
csr_data_t = csr_data.T
csr_data_t.shape

(3628, 6040)

In [23]:
als_model.fit(csr_data_t)

  0%|          | 0/20 [00:00<?, ?it/s]

In [25]:
jennie, movie = user_to_idx['jennie97'], movie_to_idx[my_favor_movie[0]]
jennie_vector, movie_vector = als_model.user_factors[jennie], als_model.item_factors[movie]
print("나의 선호도:", round(np.dot(jennie_vector, movie_vector),2))

나의 선호도: 0.6


## 내가 좋아하는 영화와 비슷한 영화 추천 받기

In [26]:
want_movie = "10 things i hate about you (1999)"
want_movie_id = movie_to_idx[want_movie]
similar_movie = als_model.similar_items(want_movie_id, N=15)
similar_movie

idx_to_movie = {v:k for k,v in movie_to_idx.items()}
idx_to_movie

{0: "one flew over the cuckoo's nest (1975)",
 1: 'james and the giant peach (1996)',
 2: 'my fair lady (1964)',
 3: 'erin brockovich (2000)',
 4: "bug's life, a (1998)",
 5: 'princess bride, the (1987)',
 6: 'ben-hur (1959)',
 7: 'christmas story, a (1983)',
 8: 'snow white and the seven dwarfs (1937)',
 9: 'wizard of oz, the (1939)',
 10: 'beauty and the beast (1991)',
 11: 'gigi (1958)',
 12: 'miracle on 34th street (1947)',
 13: "ferris bueller's day off (1986)",
 14: 'sound of music, the (1965)',
 15: 'airplane! (1980)',
 16: 'tarzan (1999)',
 17: 'bambi (1942)',
 18: 'awakenings (1990)',
 19: 'big (1988)',
 20: 'pleasantville (1998)',
 21: 'wallace & gromit: the best of aardman animation (1996)',
 22: 'back to the future (1985)',
 23: "schindler's list (1993)",
 24: 'meet joe black (1998)',
 25: 'pocahontas (1995)',
 26: 'e.t. the extra-terrestrial (1982)',
 27: 'titanic (1997)',
 28: 'ponette (1996)',
 29: 'close shave, a (1995)',
 30: 'antz (1998)',
 31: 'girl, interrupted (199

In [37]:
def get_similar_movie(movie_name: str):
    movie_id = movie_to_idx[movie_name]
    similar_movie = als_model.similar_items(movie_id)
    recommend = []
    for i in similar_movie:
        recommend.append(idx_to_movie[i[0]])
    return print("What artists do I like? ===> ", recommend[1:3])

In [38]:
get_similar_movie("10 things i hate about you (1999)")

What artists do I like? ===>  ["she's all that (1999)", 'never been kissed (1999)']


In [53]:
user = user_to_idx['jennie97']
movie_recommend = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=True)
foru = []
for i in movie_recommend:
    foru.append(idx_to_movie[i[0]])
print("This is for U! ==>", foru[1:4])

This is for U! ==> ['jungle book, the (1967)', 'sleeping beauty (1959)', 'peter pan (1953)']
