In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container {width:100% !important;}</style>"))

# Collaborative Filtering

### 내가 이 상품을 사면 만족스러울 것 같아? 아니면 안만족스러울 것 같아? 를 예측하는 알고리즘

In [2]:
import numpy as np
import pandas as pd

## Load Dataset

In [3]:
ratings_url = 'https://bit.ly/dsml-01-ratings'

data = pd.read_csv(ratings_url)

data.head()

Unnamed: 0,사람,책,평점
0,민지,백설공주,5.0
1,민지,신데렐라,4.0
2,민지,어린왕자,1.0
3,민지,흥부전,3.0
4,현우,노인과바다,3.0


In [4]:
ratings = pd.pivot_table(data, index = '사람', columns = '책', values = '평점')

ratings

책,노인과바다,백설공주,신데렐라,어린왕자,콩쥐팥쥐,흥부전
사람,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
민수,3.0,4.0,4.0,3.0,4.0,
민지,,5.0,4.0,1.0,,3.0
지민,4.0,1.0,,5.0,2.0,3.0
지연,5.0,,3.0,4.0,3.0,3.0
현우,3.0,,2.0,,1.0,2.0


## 코사인 유사도 계산하기

In [5]:
u = np.array([3, 4, 3])
v = np.array([3, 2, 4])

uvdot = (u * v).sum()

norm1 = (u ** 2).sum()
norm2 = (v ** 2).sum()

score = uvdot / np.sqrt(norm1 * norm2)

score

0.9235481451827989

In [6]:
# 평점을 안남긴 경우가 있는 경우

u = np.array([np.nan, 4, 3])
v = np.array([3, 2, np.nan])

# np.isfinite()= ~np.isnan()
# u와 v 모두 True인 경우만 살림
mask = np.isfinite(u) & np.isfinite(v)

u = u[mask]
u

array([4.])

In [7]:
# mask를 적용하고 계산해야 정확함
# 모듈화

def get_cosine_similarity(u, v):
    mask = np.isfinite(u) & np.isfinite(v)

    u = u[mask]
    v = v[mask]

    uvdot = (u * v).sum()

    norm1 = (u ** 2).sum()
    norm2 = (v ** 2).sum()

    score = uvdot / np.sqrt(norm1 * norm2)

    return score

u = np.array([np.nan, 4, 3])
v = np.array([3, 2, 4])

get_cosine_similarity(u, v)

0.8944271909999159

In [8]:
u = ratings.loc['민수']
v = ratings.loc['민지']

get_cosine_similarity(u, v)

0.9398272507881658

In [9]:
# 모든 사람에 대한 경우 구하기

from itertools import product

index_combinations = list(product(ratings.index, repeat = 2))

index_combinations

[('민수', '민수'),
 ('민수', '민지'),
 ('민수', '지민'),
 ('민수', '지연'),
 ('민수', '현우'),
 ('민지', '민수'),
 ('민지', '민지'),
 ('민지', '지민'),
 ('민지', '지연'),
 ('민지', '현우'),
 ('지민', '민수'),
 ('지민', '민지'),
 ('지민', '지민'),
 ('지민', '지연'),
 ('지민', '현우'),
 ('지연', '민수'),
 ('지연', '민지'),
 ('지연', '지민'),
 ('지연', '지연'),
 ('지연', '현우'),
 ('현우', '민수'),
 ('현우', '민지'),
 ('현우', '지민'),
 ('현우', '지연'),
 ('현우', '현우')]

In [10]:
from itertools import product

# 모듈화
def get_cosine_similarity_table(ratings):
    index_combinations = list(product(ratings.index, repeat = 2))

    similarity_list = []

    for uname, vname in index_combinations:
        # 이렇게 해도 됌
        # u, v = ratings.loc[uname], ratings.loc[vname]
        u = ratings.loc[uname]
        v = ratings.loc[vname]

        score = get_cosine_similarity(u, v)

        #{}는 dictionary
        similarity = {
            'u': uname,
            'v': vname,
            'score': score,
        }

        similarity_list.append(similarity)

    similarity_list = pd.DataFrame(similarity_list)

    similarity_table = pd.pivot_table(similarity_list, index = 'u', columns = 'v', values = 'score')

    return similarity_table

similarity_table = get_cosine_similarity_table(ratings)

similarity_table

v,민수,민지,지민,지연,현우
u,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
민수,1.0,0.939827,0.813206,0.938986,0.876523
민지,0.939827,1.0,0.542857,0.840841,0.989949
지민,0.813206,0.542857,1.0,0.974406,0.992583
지연,0.938986,0.840841,0.974406,1.0,0.980581
현우,0.876523,0.989949,0.992583,0.980581,1.0


## 평점 예측하기

In [11]:
def predict_rating(user_name, book_name):
    # 민지의 노인과바다의 평점을 예측하는 것이므로 민지는 빼야함, 어차피 Nan값
    # 자기자신은 제거
    # similarity도 민지는 1.0이 나옴
    neighbors_ratings = ratings[book_name].drop(index = user_name)
    neighbors_similarity = similarity_table[user_name].drop(index = user_name)

    nominator = (neighbors_ratings * neighbors_similarity).sum()
    denominator = neighbors_similarity.sum()

    score = nominator / denominator
    return score

predict_rating('민지', '노인과바다')

3.671361398092429

In [12]:
predict_rating('민수', '흥부전')

2.7543750620420546

In [13]:
# 모든 유저와 모든 상품에 대해

def predict_rating_table(ratings):
    rating_combinations = list(product(ratings.index, ratings.columns))

    rating_list = []

    for user_name, book_name in rating_combinations:
        score = predict_rating(user_name, book_name)
        rating_predict = {
            'user': user_name,
            'book': book_name,
            'score': score
        }

        rating_list.append(rating_predict)

    rating_list = pd.DataFrame(rating_list)

    rating_table = pd.pivot_table(rating_list, index = 'user', columns = 'book', values = 'score')

    rating_table = rating_table[ratings.isnull()]

    return rating_table

rating_table = predict_rating_table(ratings)
rating_table

book,노인과바다,백설공주,신데렐라,어린왕자,콩쥐팥쥐,흥부전
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
민수,,,,,,2.754375
민지,3.671361,,,,2.522275,
지민,,,3.109381,,,
지연,,2.392236,,,,
현우,,2.46076,,3.256756,,


## 실제 사용해보기

### Case 1. 지금 민지에게 가장 추천하고 싶은 책은?

In [14]:
def predict_book(user_name, k = 1):
    prediction_list = rating_table.loc[user_name].sort_values(ascending = False)
    prediction_list = prediction_list.head(k).index

    return prediction_list

predict_book('민지')

Index(['노인과바다'], dtype='object', name='book')

In [15]:
predict_book('현우', k = 2)

Index(['어린왕자', '백설공주'], dtype='object', name='book')

### Case 2. 지금 백설공주 책에 가장 관심 있을 것 같은 사용자는?

In [16]:
def predict_user(book_name, k = 1):
    prediction_list = rating_table[book_name].sort_values(ascending = False)
    
    prediction_list = prediction_list.head(k).index
    
    return prediction_list

predict_user('백설공주', k = 2)

Index(['현우', '지연'], dtype='object', name='user')

In [17]:
predict_user('노인과바다')

Index(['민지'], dtype='object', name='user')

## User_based가 있고 Item_based가 있는데 
## 보통 Item_based를 많이 씀
* User는 이용자가 많으면 많을수록 몇백만까지 늘어나지만


* Item의 개수는 즉, 상품의 개수는 늘어나는 데 한계가 있음

## 보통 User based면 Cosine, Item based면 Pearson 씀

## Cosine Similarity와 Pearson Correlation Coefficient 둘 다 써보고 좋은거 쓰기

## baseline보정이 평균보정보다 항상 좋은건 아님 이거 역시 마찬가지로

## 두 개 다 써보고 판단