# MovieLens data load

In [249]:
import os
import pandas as pd
# Load the ratings table.
# os.path.expanduser('~') is used instead of os.getenv('HOME') + '...': getenv
# returns None when HOME is unset, and None + str raises a TypeError.
rating_file_path = os.path.join(
    os.path.expanduser('~'), 'aiffel', 'recommendata_iu', 'data', 'ml-1m', 'ratings.dat'
)
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
# '::' is a multi-character separator, so the python parsing engine is requested explicitly.
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python')
# Row count before any filtering; a later cell divides by it to report how much
# data remains once ratings below 3 are dropped. (The misspelled name is kept
# because that later cell refers to it.)
orginal_data_size = len(ratings)
ratings = ratings.drop(columns=['timestamp'])  # the timestamp column is not used

ratings.head()


Unnamed: 0,user_id,movie_id,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [261]:
# Number of distinct movies present in ratings.
# NOTE(review): this cell's execution count (261) is higher than that of the
# cells below it, so the saved output may not match a top-to-bottom run — confirm.
ratings['movie_id'].unique().size

3628

In [262]:
# Number of distinct users present in ratings.
# NOTE(review): this cell's execution count (262) is higher than that of the
# cells below it, so the saved output may not match a top-to-bottom run — confirm.
ratings['user_id'].unique().size

6040

In [250]:
# Load the users table (users.dat).
# os.path.expanduser('~') also works when HOME is unset, where
# os.getenv('HOME') + '...' would fail with a TypeError.
users_file_path = os.path.join(
    os.path.expanduser('~'), 'aiffel', 'recommendata_iu', 'data', 'ml-1m', 'users.dat'
)
users_cols = ['user_id', 'sex', 'age', 'job', 'code']
users = pd.read_csv(users_file_path, sep='::', names=users_cols, engine='python')
users = users.drop(columns=['code'])

# Add my own profile to users as user 6041.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is
# its replacement. The index is deliberately left as-is, so the added row keeps
# the label 0 it had in the one-row frame.
my_profile = pd.DataFrame({"user_id": [6041], "sex": ['M'], "age": [25], "job": [1]})
users = pd.concat([users, my_profile])
users.tail(10)

Unnamed: 0,user_id,sex,age,job
6031,6032,M,45,7
6032,6033,M,50,13
6033,6034,M,25,14
6034,6035,F,25,1
6035,6036,F,25,15
6036,6037,F,45,1
6037,6038,F,56,1
6038,6039,F,45,0
6039,6040,M,25,6
0,6041,M,25,1


In [251]:
# Read the movie metadata so that titles can be looked up from movie ids.
# os.path.expanduser('~') also works when HOME is unset, where
# os.getenv('HOME') + '...' would fail with a TypeError.
movie_file_path = os.path.join(
    os.path.expanduser('~'), 'aiffel', 'recommendata_iu', 'data', 'ml-1m', 'movies.dat'
)
cols = ['movie_id', 'title', 'genre']
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python')

movies.tail(10)

Unnamed: 0,movie_id,title,genre
3873,3943,Bamboozled (2000),Comedy
3874,3944,Bootmen (2000),Comedy|Drama
3875,3945,Digimon: The Movie (2000),Adventure|Animation|Children's
3876,3946,Get Carter (2000),Action|Drama|Thriller
3877,3947,Get Carter (1971),Thriller
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama
3882,3952,"Contender, The (2000)",Drama|Thriller


In [252]:
# Keep only the rows whose rating is 3 or higher.
ratings = ratings[ratings['rating'].ge(3)]
filtered_data_size = ratings.shape[0]

# Report how many rows survived, in absolute terms and as a share of the original.
print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [253]:
# Rename the rating column to count, the name the later cells read.
# The result is reassigned instead of using inplace=True: inplace brings no
# benefit here and reassignment avoids mutating a frame obtained by filtering.
ratings = ratings.rename(columns={'rating': 'count'})

In [254]:

def find_movie_title(str):
    """Return the title of the movie whose ``movie_id`` equals ``str``.

    Args:
        str: Movie id to look up in the module-level ``movies`` frame. The
            name shadows the builtin; it is kept so that existing keyword
            callers keep working.

    Returns:
        The ``title`` value of the first matching row of ``movies``.

    Raises:
        IndexError: If no row of ``movies`` has that ``movie_id``.
    """
    temp = movies[movies['movie_id'] == str]
    if temp.empty:
        # Same exception type the bare iloc[0] produced, but naming the missing id.
        raise IndexError(f"movie_id {str!r} not found in movies")
    return temp.iloc[0]['title']

def find_movie_id(str):
    """Return the ``movie_id`` of the movie whose ``title`` equals ``str``.

    Args:
        str: Exact title to look up in the module-level ``movies`` frame. The
            name shadows the builtin; it is kept so that existing keyword
            callers keep working.

    Returns:
        The ``movie_id`` value of the first matching row of ``movies``.

    Raises:
        IndexError: If no row of ``movies`` has that ``title``.
    """
    temp = movies[movies['title'] == str]
    if temp.empty:
        # Same exception type the bare iloc[0] produced, but naming the missing title.
        raise IndexError(f"title {str!r} not found in movies")
    return temp.iloc[0]['movie_id']
   
    


In [255]:
# Movies I like. The titles must be spelled exactly as they appear in the dataset.
my_favorite = ['Tommy Boy (1995)' , 'While You Were Sleeping (1995)' ,'Bullets Over Broadway (1994)' ,'Four Weddings and a Funeral (1994)' ,'Speechless (1994)']
my_favorite_list = [find_movie_id(title) for title in my_favorite]
print(my_favorite_list)

# Assume user 6041 (me) gave each of these movies a score of 5.
# The row count follows the favourites list instead of a hard-coded 5.
my_user_id = 6041
my_playlist = pd.DataFrame({
    'user_id': [my_user_id] * len(my_favorite_list),
    'movie_id': my_favorite_list,
    'count': [5] * len(my_favorite_list),
})

# Add the rows only when user 6041 is not in ratings yet, so re-running this
# cell does not insert them a second time. The id is compared as an integer:
# the earlier isin check against the string '6041' depends on how pandas
# coerces mixed types and may never match.
if not (ratings['user_id'] == my_user_id).any():
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    ratings = pd.concat([ratings, my_playlist])

ratings.tail(10)       # check that the rows were added


[333, 339, 348, 357, 378]


Unnamed: 0,user_id,movie_id,count
1000203,6040,1090,3
1000205,6040,1094,5
1000206,6040,562,5
1000207,6040,1096,4
1000208,6040,1097,4
0,6041,333,5
1,6041,339,5
2,6041,348,5
3,6041,357,5
4,6041,378,5


In [256]:
# Exercise: build the user x movie matrix from the ratings frame.
from scipy.sparse import csr_matrix

num_user = ratings['user_id'].nunique()
num_movie = ratings['movie_id'].nunique()

# (values, (rows, cols)) form: count supplies the stored values, while the
# user_id and movie_id columns are used directly as row and column positions.
csr_data = csr_matrix(
    (ratings['count'], (ratings['user_id'], ratings['movie_id']))
)
csr_data


<6042x3953 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

In [257]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

# Settings recommended by the implicit library; unrelated to the modelling itself.
os.environ.update({
    'OPENBLAS_NUM_THREADS': '1',
    'KMP_DUPLICATE_LIB_OK': 'True',
    'MKL_NUM_THREADS': '1',
})

# Declare the implicit AlternatingLeastSquares model.
als_model = AlternatingLeastSquares(
    factors=100,
    regularization=0.01,
    use_gpu=False,
    iterations=15,
    dtype=np.float32,
)

# The model is fitted on an item x user matrix, so the user x item matrix is transposed first.
csr_data_transpose = csr_data.T

# Train the model.
als_model.fit(csr_data_transpose)

  0%|          | 0/15 [00:00<?, ?it/s]

# 영화 추천 함수화

In [258]:
def get_similar_movie(movie_name: str):
    """Return the titles of the movies the ALS model reports as similar to ``movie_name``.

    The title is turned into a movie id with ``find_movie_id`` and handed to
    ``als_model.similar_items``. The first element of every returned entry is
    then mapped back to a title with ``find_movie_title``.
    """
    neighbours = als_model.similar_items(find_movie_id(movie_name))
    titles = []
    for entry in neighbours:
        # entry[0] is passed to find_movie_title, i.e. it is treated as a movie id.
        titles.append(find_movie_title(entry[0]))
    return titles


# Example: list the movies the model considers closest to Titanic.
get_similar_movie(movie_name='Titanic (1997)')


['Titanic (1997)',
 "Mr. Holland's Opus (1995)",
 'Jerry Maguire (1996)',
 'Apollo 13 (1995)',
 'Ever After: A Cinderella Story (1998)',
 'Held Up (2000)',
 'Truman Show, The (1998)',
 'Bridges of Madison County, The (1995)',
 'Sliding Doors (1998)',
 "You've Got Mail (1998)"]

# 유저에게 영화 추천 함수화

In [265]:
def movie_recommend(user_id, n=20):
    """Return the titles of the movies the ALS model recommends for ``user_id``.

    Args:
        user_id: User identifier handed to ``als_model.recommend``.
        n: Number of recommendations to request. Defaults to 20, the value
            that used to be hard-coded.

    Returns:
        A list of titles, one per entry returned by the model, in the model's order.
    """
    # recommend takes the user x item CSR matrix; movies the user already rated are filtered out.
    movie_recommended = als_model.recommend(user_id, csr_data, N=n, filter_already_liked_items=True)
    # The first element of each entry is passed to find_movie_title, i.e. treated as a movie id.
    return [find_movie_title(entry[0]) for entry in movie_recommended]


# Example: recommendations for user 6041, the id given to my own ratings above.
movie_recommend(user_id=6041)

['L.A. Story (1991)',
 'Much Ado About Nothing (1993)',
 'Dave (1993)',
 'Pretty Woman (1990)',
 'Sleepless in Seattle (1993)',
 'Clueless (1995)',
 "Muriel's Wedding (1994)",
 'Notting Hill (1999)',
 'Strictly Ballroom (1992)',
 "My Best Friend's Wedding (1997)",
 'Defending Your Life (1991)',
 'I.Q. (1994)',
 'Wedding Singer, The (1998)',
 'So I Married an Axe Murderer (1993)',
 'Truth About Cats & Dogs, The (1996)',
 'Waking Ned Devine (1998)',
 'Mighty Aphrodite (1995)',
 'Groundhog Day (1993)',
 'Sabrina (1995)',
 'Tin Cup (1996)']