## 아이템 기반 협업 필터링
    https://grouplens.org/datasets/movielens/latest/ 사이트에서 ml-latest-small.zip 파일을 내려받는다
    해당 파일은 십만개의 평점(ratings) 정보를 가지고 있다.
    
    협업 필터링은 사용자기반 / 아이템기반으로 분류된다. 이중에서 아이템 기반으로 협엽 필터링을 구현한다
    협업필터링 기반의 영화 추천을 위해서는 영화의 평점을 매긴 사용자-영화평점 행렬 데이타가 필요하다
    이를 위해서 Grouplens 사이트에서 만든 MoviesLens 데이타셋을 이용해 보겠다.

## 1. DataLoading and Data PreProcessiong

In [14]:
import pandas as pd
import numpy as np

movies = pd.read_csv('../data/movies.csv') 
ratings = pd.read_csv('../data/ratings_1.csv')
print(movies.shape)
print(ratings.shape)

(9742, 3)
(100836, 4)


In [15]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


ratings.csv 은 사용자별로 영화에 대한 평점을 매긴 데이타셋.<br>
평점은 최소 0.5~최대 5점 사이이며 0.5단위로 평점이 부여된다.<br>
timestamp는 현재로서는 아무의미 없다.

In [16]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [17]:
# 필요한 컬럼만으로 다시 ratings를 만듦
ratings = ratings[['userId', 'movieId', 'rating']]

In [18]:
# 사용자를 row로, 모든 영화정보를 컬럼으로 구성한 데이타셋을 다시 만들자..pivot_table사용
# q1. 결과를 확인하면 NaN값이 많이 보입니다. 이값은 평점을 입력하지 않아서 생기는 사용자 값입니다.
# q2. 컬럼명에 movieid값이 숫자로 입력됨..가독성이 떨어짐...영화title로 변경할 필요성.

ratings_matrix = ratings.pivot_table('rating', index='userId', columns='movieId')
ratings_matrix.head(10)


movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
6,,4.0,5.0,3.0,5.0,4.0,4.0,3.0,,3.0,...,,,,,,,,,,
7,4.5,,,,,,,,,,...,,,,,,,,,,
8,,4.0,,,,,,,,2.0,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,


In [30]:
# 작성 1. title 컬럼을 얻기 위해 movies 와 조인 수행
rating_movies = pd.merge(ratings, movies, on = 'movieId')
rating_movies

# columns='title' 로 title 컬럼으로 pivot 수행. 
ratings_matrix = rating_movies.pivot_table('rating', index='userId', columns='title')
ratings_matrix

# ratings_matrix = # 작성 3. NaN 값을 모두 0 으로 변환
ratings_matrix.fillna(0, inplace=True)

# ratings_matrix.head(3)
ratings_matrix.head(3)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 2. 영화와 영화들 간 유사도 산출 - Cosine Similarity

#### 이제 사용자와 변환된 영화 평점 데이타 행렬 데이타셋을 이용해서 영화간 유사도 특정 해보자
    유사도는 코사인 유사도를 사용할 것이며 사이킷런의 cosine_similarity() 함수를 사용한다
    하지만, ratings_matrix 데이타셋에 cosine_similarity() 함수를 적용하면 영화간 유사도를 측정할 수 없다
    cosine_similarity() 함수는 기준이 되는 행과 다른 행을 비교하여 유사도를 측정하는 함수이기에 
    지금과 같은 행렬 구조는 userid가 기준인 행 레벨 데이타 이므로 사용자간의 유사도가 만들어진다.
    어떻게 해결해야 할까

In [32]:
# 작성 4. 행과 열을 바꿈
ratings_matrix_T = rating_movies.pivot_table(values='rating', index='title', columns='userId', )
ratings_matrix_T.fillna(0, inplace=True)
ratings_matrix_T.head(3)

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
from sklearn.metrics.pairwise import cosine_similarity

# 작성 5. cosine_similarity함수를 이용해서 영화간의 유사도를 측정
item_sim = cosine_similarity(ratings_matrix_T)
item_sim

# 작성 6. cosine_similarity() 로 반환된 넘파이 행렬을 영화명을 매핑하여 DataFrame으로 변환
item_sim_df = pd.DataFrame(data = item_sim, index = ratings_matrix_T.index, columns = ratings_matrix_T.index)

print(item_sim_df.shape)
item_sim_df.head(3)


(9719, 9719)


title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.141653,0.0,...,0.0,0.342055,0.543305,0.707107,0.0,0.0,0.139431,0.327327,0.0,0.0
'Hellboy': The Seeds of Creation (2004),0.0,1.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.707107,1.0,0.0,0.0,0.0,0.176777,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
# 작성 7. Godfather와 유사도가 높은 상위 6개 영화를 추려보자
item_sim_df.loc['Godfather, The (1972)'].sort_values(ascending=False)[:6]

title
Godfather, The (1972)                        1.000000
Godfather: Part II, The (1974)               0.821773
Goodfellas (1990)                            0.664841
One Flew Over the Cuckoo's Nest (1975)       0.620536
Star Wars: Episode IV - A New Hope (1977)    0.595317
Fargo (1996)                                 0.588614
Name: Godfather, The (1972), dtype: float64

In [38]:
# 작성 8. Inception (2010)과 유사도가 높은 상위 6개 영화를 추려보자. 이때 자신의 유사도는 제외
item_sim_df.loc['Inception (2010)'].sort_values(ascending=False)[1:6]

title
Dark Knight, The (2008)          0.727263
Inglourious Basterds (2009)      0.646103
Shutter Island (2010)            0.617736
Dark Knight Rises, The (2012)    0.617504
Fight Club (1999)                0.615417
Name: Inception (2010), dtype: float64

## 3. 아이템 기반 인접 이웃 협업 필터링으로 개인화된 영화 추천

#### 앞에서 만든 아이템기반의 영화 유사도는 모든 사용자의 평점을 기준으로 영화간의 유사도를 생성했다
    하지만 이는 개인적인 취향을 반영하지 않고 영화간의 유사도만 가지고 추천한 것이다.
    이번에는 영화유사도 데이타를 이용해서 협업 필터링을 이용해 개인에게 최적화된 영화를 추천하는 알고리즘
    을 구현해 보자, 개인화된 영화 추천의 가장 큰 특징은 개인이 아직 관람하지 않은 영화를 추천할수 있다.
    아직 관람하지 않은 영화에 대해서 아이템 유사도와 기존에 관람한 영화의 평점 데이타를 기반으로 해서
    새롭게 모든 영화의 예측평점을 계산한 후 높은 예측 평점을 가진 영화를 추천하는 방식이다.
    

In [None]:
'''
앞에서 나온 결과
영화간의 유사도 - item_sim_df
각 개인별 영화평점 - ratings_matrix
2개의 변수를 계속 활용해서 사용자별 최적화된 영화 평점 스코어를 예측하는 함수를 작성한다.
함수명 predict_rating()
인자값은 사용자-영화 평점 행렬, 영화간 유사도 행렬 입력받고

predict_rating의 결과 [사용자별 영화예측 평점]
= 사용자 u의 모든 영화에 대한 실제 평점과 영화 i의 다른 모든 영화와의 코사인 유사도를 곱(dot)한 값을
  정규화를 위해 

'''

영화간의 유사도 - item_sim_df
각 개인별 영화평점 - ratings_matrix
2개의 변수를 계속 활용해서 사용자별 최적화된 영화 평점 스코어를 예측하는 함수를 작성한다.
함수명 predict_rating()
인자값은 사용자-영화 평점 행렬, 영화간 유사도 행렬 입력받고

사용자별 영화예측 평점 $$ R_{u,i} $$   는 사용자 u에 대한 실제 평점과 영화 i의 다른 모든 영화와의 
코사인 유사도를 곱(dot)한 값을 정규화를 위해  $$ {\sum{^N (|S_{i,N}|) }} $$ 로 나눈것을 의미한다.

$$
R_{u,i}={\sum{^N (S_{i,N} * R_{u,N})}   / \sum{^N (|S_{i,N}|)}}
$$


In [39]:
def predict_rating(ratings_arr, item_sim_arr):
    ratings_pred = ratings_arr.dot(item_sim_arr)/ np.array([np.abs(item_sim_arr).sum(axis=1)])
    return ratings_pred

In [40]:
# predict_rating 함수를 이용해서 개인화된 예측 평점을 구해보자.
predict_rating(ratings_matrix.values,item_sim_df.values)
predict_rating

<function __main__.predict_rating(ratings_arr, item_sim_arr)>

In [None]:
'''
예측평점이 사용자별 영화의 실제 평점과 영화의 코사인 유사도를 dot한 값이기 때문에 기존에 영화를 
관람하지 않아 0에 해당했던 실제 영화 평점이 예측에서는 값이 부여되는 경우가 많이 발생한다.
또한 예측평점이 실제 평점에 비해 작을수도 있다. 이는 내적결과를 코사인 유사도 백터합으로 나누었기
때문에 발생하는 현상이다.
'''

In [41]:
ratings_pred = predict_rating(ratings_matrix.values , item_sim_df.values)
ratings_pred_matrix = pd.DataFrame(data=ratings_pred, index= ratings_matrix.index,
                                   columns = ratings_matrix.columns)
ratings_pred_matrix.head(3)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.070345,0.577855,0.321696,0.227055,0.206958,0.194615,0.249883,0.102542,0.157084,0.178197,...,0.113608,0.181738,0.133962,0.128574,0.006179,0.21207,0.192921,0.136024,0.292955,0.720347
2,0.01826,0.042744,0.018861,0.0,0.0,0.035995,0.013413,0.002314,0.032213,0.014863,...,0.01564,0.020855,0.020119,0.015745,0.049983,0.014876,0.021616,0.024528,0.017563,0.0
3,0.011884,0.030279,0.064437,0.003762,0.003749,0.002722,0.014625,0.002085,0.005666,0.006272,...,0.006923,0.011665,0.0118,0.012225,0.0,0.008194,0.007017,0.009229,0.01042,0.084501
