## Ch22. Recommender Systems

In [48]:
import math, random
from collections import defaultdict, Counter
from linear_algebra import dot

In [49]:
# Sample data set: one list of interest names per user.  A user's id is
# simply their index in this list (e.g. users_interests[0] is user 0).
users_interests = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"]
]

### 인지도를 활용한 추천
인기 있는 것을 추천하는 방법

In [50]:
# Count how many users list each interest, then rank the
# (interest, count) pairs from most to least common.
popular_interests = Counter(
    topic
    for topics in users_interests
    for topic in topics
).most_common()

# Naive recommendation: simply show whatever is popular overall.
print(popular_interests)

[('Python', 4), ('R', 4), ('Big Data', 3), ('probability', 3), ('Java', 3), ('regression', 3), ('HBase', 3), ('statistics', 3), ('scikit-learn', 2), ('C++', 2), ('MongoDB', 2), ('libsvm', 2), ('neural networks', 2), ('pandas', 2), ('Hadoop', 2), ('artificial intelligence', 2), ('machine learning', 2), ('deep learning', 2), ('Cassandra', 2), ('Postgres', 2), ('statsmodels', 2), ('support vector machines', 1), ('programming languages', 1), ('theory', 1), ('databases', 1), ('Spark', 1), ('MapReduce', 1), ('mathematics', 1), ('decision trees', 1), ('Mahout', 1), ('numpy', 1), ('MySQL', 1), ('NoSQL', 1), ('Storm', 1), ('scipy', 1), ('Haskell', 1)]


In [51]:
# Recommend, in overall popularity order, the interests the user has not
# already selected.
def most_popular_new_interests(user_interests, max_results=5):
    """Return up to ``max_results`` popular interests missing from the user.

    Walks ``popular_interests`` in its existing order and keeps every
    ``(interest, frequency)`` pair whose interest is not in
    ``user_interests``; the first ``max_results`` survivors are returned.
    """
    unseen = []
    for interest, frequency in popular_interests:
        if interest in user_interests:
            # The user already has this one; nothing new to suggest.
            continue
        unseen.append((interest, frequency))
    return unseen[:max_results]

In [52]:
# Interests of user 1
print (users_interests[1])
# Popularity-ranked recommendations that exclude user 1's own interests
print (most_popular_new_interests(users_interests[1], 5))
print()
# Interests of user 3
print (users_interests[3])
# Popularity-ranked recommendations that exclude user 3's own interests
print (most_popular_new_interests(users_interests[3], 5))

['NoSQL', 'MongoDB', 'Cassandra', 'HBase', 'Postgres']
[('Python', 4), ('R', 4), ('Big Data', 3), ('probability', 3), ('Java', 3)]

['R', 'Python', 'statistics', 'regression', 'probability']
[('Big Data', 3), ('Java', 3), ('HBase', 3), ('scikit-learn', 2), ('C++', 2)]


'많은 사람들이 파이썬에 관심을 가지고 있으니 당신도 좀 관심을 가져봐' 라는 접근은 마케팅적으로 그리 좋은 방법은 아니다.
관심사에 대한 데이터가 어느 정도 축적되었다면 조금 더 개인화된 추천을 어떻게 할 수 있을지 생각해 보는 게 좋다.

### 사용자 기반 협업 필터링
특정 사용자 A와 유사한 다른 사용자 B를 찾은 후, B의 관심사를 A에게 추천하는 방법<br />
그러기 위해서는 먼저 사용자 간 유사도를 정의할 수 있어야 한다.<br />
여기에서는 코사인 유사도(cosine similarity)라는 지표를 사용한다.

In [53]:
#
# user-based filtering
#

def cosine_similarity(v, w):
    """Return the cosine of the angle between vectors ``v`` and ``w``.

    Computed as ``dot(v, w) / sqrt(dot(v, v) * dot(w, w))``.

    If either vector has zero magnitude the angle is undefined and the
    denominator would be 0; in that case 0.0 ("no similarity") is returned
    instead of raising ``ZeroDivisionError``.
    """
    squared_norm_product = dot(v, v) * dot(w, w)
    if squared_norm_product == 0:
        # At least one all-zero vector: avoid dividing by zero.
        return 0.0
    return dot(v, w) / math.sqrt(squared_norm_product)

코사인 유사도는 벡터 v, w 사이의 '각도'를 잰다.<br />
완전히 같은 방향 = 1<br />
직교(공통 성분이 없음) = 0<br />
정반대 방향 = -1

In [54]:
# Sorted list of every distinct interest.  An interest's position in this
# list is its index in the 0/1 interest vectors built from it.
unique_interests = sorted({
    topic
    for topics in users_interests
    for topic in topics
})

# The full list of interests
print(unique_interests)

['Big Data', 'C++', 'Cassandra', 'HBase', 'Hadoop', 'Haskell', 'Java', 'Mahout', 'MapReduce', 'MongoDB', 'MySQL', 'NoSQL', 'Postgres', 'Python', 'R', 'Spark', 'Storm', 'artificial intelligence', 'databases', 'decision trees', 'deep learning', 'libsvm', 'machine learning', 'mathematics', 'neural networks', 'numpy', 'pandas', 'probability', 'programming languages', 'regression', 'scikit-learn', 'scipy', 'statistics', 'statsmodels', 'support vector machines', 'theory']


In [55]:
def make_user_interest_vector(user_interests):
    """Return a 0/1 vector encoding the given list of interests.

    The i-th element of the result is 1 if ``unique_interests[i]`` is in
    ``user_interests`` and 0 otherwise, so the vector has exactly one entry
    per entry of ``unique_interests``.
    """
    return [1 if interest in user_interests else 0
            for interest in unique_interests]

# User-interest matrix: row i is the 0/1 interest vector of user i.
user_interest_matrix = [make_user_interest_vector(interests)
                        for interests in users_interests]

# Pairwise user similarities: user_similarities[i][j] is the cosine
# similarity between the interest vectors of user i and user j.
user_similarities = [
    [cosine_similarity(row_vector, column_vector)
     for column_vector in user_interest_matrix]
    for row_vector in user_interest_matrix
]

print(user_interest_matrix)
print()
print(user_similarities)

[[1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0,

In [56]:
def most_similar_users_to(user_id):
    """Return ``(other_user_id, similarity)`` pairs, most similar first.

    Reads row ``user_id`` of ``user_similarities`` and keeps every other
    user whose similarity is strictly greater than zero.
    """
    candidates = []
    for other_user_id, similarity in enumerate(user_similarities[user_id]):
        if other_user_id == user_id:
            # A user is never their own neighbour.
            continue
        if not similarity > 0:
            # Nothing in common with this user.
            continue
        candidates.append((other_user_id, similarity))

    # Highest similarity first; the sort is stable, so ties keep id order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return candidates

# Using the similar users found above, score each interest by adding up the
# similarities of every similar user who has that interest.
def user_based_suggestions(user_id, include_current_interests=False):
    """Return ``(interest, weight)`` suggestions, highest weight first.

    Each interest's weight is the sum of the similarities of the users
    returned by ``most_similar_users_to(user_id)`` who list that interest.
    Unless ``include_current_interests`` is true, interests already in
    ``users_interests[user_id]`` are left out of the result.
    """
    # Accumulate similarity per interest over all similar users.
    scores = defaultdict(float)
    for neighbour_id, similarity in most_similar_users_to(user_id):
        for interest in users_interests[neighbour_id]:
            scores[interest] += similarity

    # Turn the totals into a list ordered by descending weight.
    ranked = list(scores.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)

    if include_current_interests:
        return ranked

    # Otherwise drop whatever the user has already marked as an interest.
    fresh = []
    for interest, weight in ranked:
        if interest not in users_interests[user_id]:
            fresh.append((interest, weight))
    return fresh

In [57]:
# Users most similar to user 0, most similar first
print (most_similar_users_to(0))
print ()
# User-based recommendations for user 0 (excluding current interests)
print (user_based_suggestions(0, False))

[(9, 0.5669467095138409), (1, 0.3380617018914066), (8, 0.1889822365046136), (13, 0.1690308509457033), (5, 0.1543033499620919)]

[('MapReduce', 0.5669467095138409), ('MongoDB', 0.50709255283711), ('Postgres', 0.50709255283711), ('NoSQL', 0.3380617018914066), ('artificial intelligence', 0.1889822365046136), ('neural networks', 0.1889822365046136), ('deep learning', 0.1889822365046136), ('databases', 0.1690308509457033), ('MySQL', 0.1690308509457033), ('programming languages', 0.1543033499620919), ('Haskell', 0.1543033499620919), ('C++', 0.1543033499620919), ('Python', 0.1543033499620919), ('R', 0.1543033499620919)]


### 상품 기반 협업 필터링
관심사 자체에 대한 유사도를 구하는 방법<br />
사용자-관심사 행렬의 전치행렬(transpose)을 구한다. 이 행렬은 관심사가 행, 사용자가 열이 된다.<br />
관심사에 특정 사용자가 관심을 가지면 1, 관심을 가지지 않으면 0

In [58]:
#
# Item-Based Collaborative Filtering
#

# Transpose of the user-interest matrix: row j belongs to
# unique_interests[j] and holds, for each user in order, that user's 0/1
# flag for the interest.
interest_user_matrix = [
    [user_row[column] for user_row in user_interest_matrix]
    for column in range(len(unique_interests))
]

# Pairwise interest similarities: interest_similarities[i][j] is the cosine
# similarity between the user vectors of interests i and j.
interest_similarities = [
    [cosine_similarity(row_vector, column_vector)
     for column_vector in interest_user_matrix]
    for row_vector in interest_user_matrix
]

print(interest_user_matrix)
print()
print(interest_similarities)

[[1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0], [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0], [0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0], [0, 0, 0, 0,

In [59]:
def most_similar_interests_to(interest_id):
    """Return ``(interest_name, similarity)`` pairs, most similar first.

    Reads row ``interest_id`` of ``interest_similarities`` and keeps every
    other interest whose similarity is strictly greater than zero, reporting
    it by its name from ``unique_interests``.
    """
    matches = []
    for other_id, similarity in enumerate(interest_similarities[interest_id]):
        if other_id == interest_id:
            # Skip the interest itself.
            continue
        if not similarity > 0:
            continue
        matches.append((unique_interests[other_id], similarity))

    # Highest similarity first; stable, so ties keep index order.
    matches.sort(key=lambda pair: pair[1], reverse=True)
    return matches

def item_based_suggestions(user_id, include_current_interests=False):
    """Return ``(interest, weight)`` suggestions, highest weight first.

    For every interest flagged 1 in ``user_interest_matrix[user_id]``, the
    similarities reported by ``most_similar_interests_to`` are added to the
    corresponding interests' weights.  Unless ``include_current_interests``
    is true, interests already in ``users_interests[user_id]`` are left out.
    """
    # Add up the similarities of interests related to the user's own.
    scores = defaultdict(float)
    for interest_id, flag in enumerate(user_interest_matrix[user_id]):
        if flag != 1:
            continue
        for interest, similarity in most_similar_interests_to(interest_id):
            scores[interest] += similarity

    # Order by descending weight.
    ranked = list(scores.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)

    if include_current_interests:
        return ranked

    fresh = []
    for interest, weight in ranked:
        if interest not in users_interests[user_id]:
            fresh.append((interest, weight))
    return fresh

In [60]:
# Interests most similar to interest 0, most similar first
print (most_similar_interests_to(0))
print ()
# Item-based recommendations for user 0 (excluding current interests)
print (item_based_suggestions(0, False))

[('Hadoop', 0.8164965809277261), ('Java', 0.6666666666666666), ('MapReduce', 0.5773502691896258), ('Spark', 0.5773502691896258), ('Storm', 0.5773502691896258), ('Cassandra', 0.4082482904638631), ('artificial intelligence', 0.4082482904638631), ('deep learning', 0.4082482904638631), ('neural networks', 0.4082482904638631), ('HBase', 0.3333333333333333)]

[('MapReduce', 1.861807319565799), ('MongoDB', 1.3164965809277263), ('Postgres', 1.3164965809277263), ('NoSQL', 1.2844570503761732), ('databases', 0.5773502691896258), ('programming languages', 0.5773502691896258), ('Haskell', 0.5773502691896258), ('MySQL', 0.5773502691896258), ('deep learning', 0.4082482904638631), ('artificial intelligence', 0.4082482904638631), ('C++', 0.4082482904638631), ('neural networks', 0.4082482904638631), ('Python', 0.2886751345948129), ('R', 0.2886751345948129)]


In [61]:
# Summary run of the popularity-based recommender.
print("Popular Interests")
print(popular_interests)
print()

# Same interest lists as users_interests[1] and users_interests[3]; the
# default max_results=5 applies.
print("Most Popular New Interests")
print("already like:", ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"])
print(most_popular_new_interests(["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"]))
print()
print("already like:", ["R", "Python", "statistics", "regression", "probability"])
print(most_popular_new_interests(["R", "Python", "statistics", "regression", "probability"]))
print()

Popular Interests
[('Python', 4), ('R', 4), ('Big Data', 3), ('probability', 3), ('Java', 3), ('regression', 3), ('HBase', 3), ('statistics', 3), ('scikit-learn', 2), ('C++', 2), ('MongoDB', 2), ('libsvm', 2), ('neural networks', 2), ('pandas', 2), ('Hadoop', 2), ('artificial intelligence', 2), ('machine learning', 2), ('deep learning', 2), ('Cassandra', 2), ('Postgres', 2), ('statsmodels', 2), ('support vector machines', 1), ('programming languages', 1), ('theory', 1), ('databases', 1), ('Spark', 1), ('MapReduce', 1), ('mathematics', 1), ('decision trees', 1), ('Mahout', 1), ('numpy', 1), ('MySQL', 1), ('NoSQL', 1), ('Storm', 1), ('scipy', 1), ('Haskell', 1)]

Most Popular New Interests
already like: ['NoSQL', 'MongoDB', 'Cassandra', 'HBase', 'Postgres']
[('Python', 4), ('R', 4), ('Big Data', 3), ('probability', 3), ('Java', 3)]

already like: ['R', 'Python', 'statistics', 'regression', 'probability']
[('Big Data', 3), ('Java', 3), ('HBase', 3), ('scikit-learn', 2), ('C++', 2)]



In [62]:
# Summary run of the user-based recommender for user 0.
print("User based similarity")
print("most similar to 0")
print(most_similar_users_to(0))

print("Suggestions for 0")
print(user_based_suggestions(0))
print()

# Summary run of the item-based recommender; interest id 0 is the first
# entry of unique_interests ('Big Data' for this data set).
print("Item based similarity")
print("most similar to 'Big Data'")
print(most_similar_interests_to(0))
print()

print("suggestions for user 0")
print(item_based_suggestions(0))

User based similarity
most similar to 0
[(9, 0.5669467095138409), (1, 0.3380617018914066), (8, 0.1889822365046136), (13, 0.1690308509457033), (5, 0.1543033499620919)]
Suggestions for 0
[('MapReduce', 0.5669467095138409), ('MongoDB', 0.50709255283711), ('Postgres', 0.50709255283711), ('NoSQL', 0.3380617018914066), ('artificial intelligence', 0.1889822365046136), ('neural networks', 0.1889822365046136), ('deep learning', 0.1889822365046136), ('databases', 0.1690308509457033), ('MySQL', 0.1690308509457033), ('programming languages', 0.1543033499620919), ('Haskell', 0.1543033499620919), ('C++', 0.1543033499620919), ('Python', 0.1543033499620919), ('R', 0.1543033499620919)]

Item based similarity
most similar to 'Big Data'
[('Hadoop', 0.8164965809277261), ('Java', 0.6666666666666666), ('MapReduce', 0.5773502691896258), ('Spark', 0.5773502691896258), ('Storm', 0.5773502691896258), ('Cassandra', 0.4082482904638631), ('artificial intelligence', 0.4082482904638631), ('deep learning', 0.40824829

### 더 공부해 보고 싶다면
Crab 추천 시스템 프레임워크 : http://muricoca.github.io/crab/<br />
추천 시스템 툴킷 : https://turi.com/learn/userguide/recommender/using-trained-models.html<br />
넷플릭스 추천 시스템 공모전 : http://www.netflixprize.com/