In [1]:
# 콘텐츠 기반 필터링
# 협업 필터링

# - 최근점 이웃(사용자) 기반 협업필터링 - 사용자 평점 매트릭스
#   - 사용자 기반
#   - 아이템 기반

# - 잠재요인 협업 필터링 - 행렬 변환

In [2]:
# 사용자 평점
# 명시적 : 영화감상, 상품평점, 좋아요, 리뷰..
# 암시적 : 조회, 구매, 방문수, 머문 시간..

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
movies = pd.read_csv('data_files/movie.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
ratings = pd.read_csv('data_files/rating.csv')

In [7]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000263 entries, 0 to 20000262
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  object 
dtypes: float64(1), int64(2), object(1)
memory usage: 610.4+ MB


## 사용자 기반

In [8]:
merge_df = pd.merge(movies, ratings, on='movieId')

In [9]:
merge_df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3,4.0,1999-12-11 13:36:47
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6,5.0,1997-03-13 17:50:52
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,4.0,1996-06-05 13:37:51
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10,4.0,1999-11-25 02:44:47
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,4.5,2009-01-02 01:13:41


In [10]:
merge_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20000263 entries, 0 to 20000262
Data columns (total 6 columns):
 #   Column     Dtype  
---  ------     -----  
 0   movieId    int64  
 1   title      object 
 2   genres     object 
 3   userId     int64  
 4   rating     float64
 5   timestamp  object 
dtypes: float64(1), int64(2), object(3)
memory usage: 1.0+ GB


In [11]:
ratings = merge_df[['movieId', 'userId', 'rating']]

In [12]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20000263 entries, 0 to 20000262
Data columns (total 3 columns):
 #   Column   Dtype  
---  ------   -----  
 0   movieId  int64  
 1   userId   int64  
 2   rating   float64
dtypes: float64(1), int64(2)
memory usage: 610.4 MB


In [13]:
ratings['movieId'] = ratings['movieId'].astype('int16')
ratings['userId'] = ratings['userId'].astype('int16')
ratings['rating'] = ratings['rating'].astype('int16')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings['movieId'] = ratings['movieId'].astype('int16')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings['userId'] = ratings['userId'].astype('int16')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings['rating'] = ratings['rating'].astype('int16')


In [14]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20000263 entries, 0 to 20000262
Data columns (total 3 columns):
 #   Column   Dtype
---  ------   -----
 0   movieId  int16
 1   userId   int16
 2   rating   int16
dtypes: int16(3)
memory usage: 267.0 MB


In [15]:
ratings_matrix = ratings.pivot_table(index='userId', columns='movieId', values = 'rating')

In [16]:
ratings_matrix.shape

(65536, 24139)

In [17]:
import pickle

In [18]:
with open('ratings_matrix.pickle', 'wb') as f:
    pickle.dump(ratings_matrix, f)

In [19]:
with open('ratings_matrix.pickle', 'rb')  as f:
    ratings_matrix = pickle.load(f)

In [20]:
ratings_matrix.fillna(0, inplace=True)
ratings_matrix.head()

movieId,-32768,-32766,-32759,-32757,-32755,-32753,-32749,-32748,-32747,-32744,...,32739,32741,32743,32748,32750,32752,32754,32758,32760,32762
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-32768,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-32767,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-32766,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-32765,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-32764,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# 고객님과 유사한 사용자가 다음의 콘텐츠도 규매 하였습니다.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
user_sim = cosine_similarity(ratings_matrix) # row 간에 cosine 유사도를 계산
user_sim

In [None]:
user_sim_sort = user_sim.argsort(axis=1)[:, ::-1]
user_sim_sort

In [None]:
user_sim_sort[10][1:11] # 유사도가 높은 사용자 상위 10

In [None]:
df.sort_values(ascending=False)

In [None]:
user_sim.sort

In [None]:
def sim_user_based_recomm(user_sim, ratings_matrix, user_id, n_top=10):
    user_sim_sort = user_sim[user_id].argsort()[::-1][:n_top * 5]
    df = ratings_matrix.iloc[user_sim.sort[n_top*3][1, 11]]
    return df.sum().sort_values(acsending=False)[:n_top]

In [None]:
sim_user_based_recomm(user_sim, ratings_matrix, 50, n_top=10)

## 아이템 기반 이웃 협업 필터링..

In [None]:
ratings_matrix.T

In [None]:
# 사용자 평점이 기간 영화들 간의 유사도 계산

In [None]:
item_sim = cosine_similarity(ratings_matrix)

In [None]:
item_sim.shape

In [None]:
pd.DataFrame(item_sim, index = ratings_matrix.columns, columns = ratings_matrix.columns)

In [None]:
# 사용자 평점 기준으로 유사한 

In [None]:
item_sim_df['Godfather, The (1972)' ].sort_values(ascending=False)[:10]

In [None]:
item_sim_df["Schindler's List (1993)"].sort_values(ascending=False)[1:10]

In [None]:
# 영화의 유사도와 사용자 평점을 함계 고려한 영화 추천

In [None]:
ratings_matrix.head()

In [None]:
# 영화의 유사도와 