# VASILY課題

In [1]:
import os
import numpy as np
import pandas as pd
import math

In [2]:
DATA_DIR = os.path.join(os.path.dirname(os.path.abspath("__file__")), "../data")

今回扱っているデータの基本情報

In [3]:
ALL_USERS = 943
ALL_ITEMS = 1682
ALL_RATINGS = 100000

映画の評価データの読み込み

In [4]:
movie_table = os.path.join(DATA_DIR, "movie_table.csv")
movie_data = pd.read_csv(movie_table, index_col=0)
movie_data.head()

Unnamed: 0,movie1,movie2,movie3,movie4,movie5,movie6,movie7,movie8,movie9,movie10,...,movie1673,movie1674,movie1675,movie1676,movie1677,movie1678,movie1679,movie1680,movie1681,movie1682
user1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
user2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
user3,,,,,,,,,,,...,,,,,,,,,,
user4,,,,,,,,,,,...,,,,,,,,,,
user5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


## ピアソンの相関係数を計算する

ピアソンの相関係数を用いて、類似度を計算する。

* ユーザ$a$とユーザ$x$が共通に評価した映画の集合を$Y_{ax}$
* ユーザ$a$ の映画$y$への評価値を$r_{ay}$

とすると、ユーザ$a$とユーザ$x$のピアソン相関係数$\rho_{ax}$は

$$
    \rho_{ax} = \frac{ \sum_{y \in Y_{ax}}^{} (r_{ay} - \bar{r}'_a)(r_{xy} - \bar{r}'_x) }{ \sqrt{ \sum_{y \in Y_{ax}}^{} (r_{ay} - \bar{r}'_a)^2 } \sqrt{ \sum_{y \in Y_{ax}}^{} (r_{xy} - \bar{r}'_x)^2}}
$$

ここで、

$$
    \bar{r}'_a = \frac{\sum_{y \in Y_{ax}}^{} r_{xy}}{|Y_{ax}|}
$$

である。

In [5]:
def sim_pearson(user1, user2):
    
    # 同じユーザ同士は0を返すようにする
    if user1.equals(user2):
        return 0.0

    #usre1, user2両者が評価した映画を取得する
    both_rated =[]
    for i in range(1, ALL_ITEMS+1):
        movie_name = "movie%d" % i
        
        if not(math.isnan(user1[movie_name]) or math.isnan(user2[movie_name])):
            both_rated.append(movie_name)

#     # 両者が評価した映画を表示
#     for movie in both_rated:
#         print(movie)

    number_of_ragins = len(both_rated)

    if number_of_ragins == 0:
        return 0.0

    # user1とuser2の映画の評価値の平均を計算
#     user1_array = [user1[movie] for movie_name in both_rated]
#     user2_array = [user2[movie] for movie_name in both_rated]
#     print("user1 array type: %s, user2 array type: %s" % (type(user1_array), type(user2_array)))

#     for num in user1_array:
#         print("num is %s" % type(num))

#     user1_sum = sum(user1_array)
#     user2_sum = sum(user2_array)

#     user1_mean = user1_sum / number_of_ragins
#     user2_mean = user2_sum / number_of_ragins
    user1_mean = sum([float(user1[movie]) for movie in both_rated]) / number_of_ragins
    user2_mean = sum([float(user2[movie]) for movie in both_rated]) / number_of_ragins

#     print("user1 mean: %.1f" % user1_mean)
#     print("user2 mean: %.1f" % user2_mean)

    # 分子の計算
    numer = float(sum([(user1[movie] - user1_mean)*(user2[movie] - user2_mean) for movie in both_rated]))
#     for movie in both_rated:
#         numer += (user1[movie] - user1_mean) * (user2[movie] - user2_mean)
#     print("numer type: %s" % type(numer))
#     print("numer: %.1f" % numer)
    
    # 分母の計算
    denom1 = math.sqrt(sum([(user1[movie] - user1_mean)**2 for movie in both_rated]))
    denom2 = math.sqrt(sum([(user1[movie] - user2_mean)**2 for movie in both_rated]))
#     print("denom1 type: %s, denom2 type: %s" % (type(denom1), type(denom2)))
#     print("denom1: %.1f denom2: %.1f" % (denom1, denom2))
#     print("denom1 * denom2: %.1f" % denom1*denom2)
    
    try:
        pearson_corr = numer / (denom1 * denom2)
    except ZeroDivisionError as e:
        pearson_corr = 0

    return pearson_corr

In [6]:
user1 = movie_data.ix['user1']
user2 = movie_data.ix['user2']
print("pearson corr = %.1f" % sim_pearson(user1, user2))

pearson corr = 0.1


## レコメンドアルゴリズム

ユーザ$a$に対して、未評価の映画$y$の評価値の推定を行い、スコアが高いものをレコメンドする。

* 映画$y$を評価済みのユーザの集合を$X_y$

とすると、ユーザ$a$の映画$y$への評価の推定値$\hat{r}_{ay}$は、

$$
    \hat{r}_{ay} = \frac{\sum_{x \in X_y}^{} \rho_{ax}(r_{xy} - \bar{r}'_x)}{\sum_{x \in X_y}^{} |\rho_{ax}|}
$$

ここで、

$$
    \bar{r}_x = \frac{ \sum_{y \in Y_x}^{} r_{xy}}{|Y_x|}
$$

である。

In [7]:
def calc_average(user):

    d = 0
    for movie in user:
        d += movie
    
    n = [x for x in user if not(math.isnan(x))]
    d = len(n)
    sum_n = sum(n)
    
    return sum_n/d

In [8]:
def calc_recom_score(movie_data, target_user, target_movie):
    
    # レコメンドを行うユーザのデータを抽出
    target_user = movie_data.ix[target_user]

    # target_movieを評価しているユーザを取得する
    rated_target_movie = []
    for i in range(1, ALL_USERS+1):
        user_name = "user%d" % i
        user_data = movie_data.ix[user_name]
        
        # 取り出したuser_dataがtarget_movieを評価しているかどうか判定
        if not(math.isnan(user_data[target_movie])):
            rated_target_movie.append(user_name)
            
#     # target_movieを評価したユーザを表示
#     for user in rated_target_movie:
#         print(user)
  
    sum1 = 0
    sum2 = 0
    for user in rated_target_movie:
        
        user_data = movie_data.ix[user]
        pc = sim_pearson(target_user, user_data)
        
        calc = pc * (user_data[target_movie] - calc_average(user_data))
        sum1 += calc

        abs_pc = math.fabs(pc)
        sum2 += abs_pc

    recom = calc_average(target_user) + sum1 / sum2
    return recom

今回、ユーザ2に対してレコメンドを行ってみる。まずはじめにmovie2の推薦度を計算してみる。

In [9]:
target_user_name = 'user2'
target_movie = 'movie2'
recom = calc_recom_score(movie_data, target_user_name, target_movie)
print("%s: %s recom rating is %.1f" % (target_user_name, target_movie, recom))

user2: movie2 recom rating is 3.5


In [10]:
recom_rating = {}
target_user = movie_data.ix[target_user_name]

for i in range(1, ALL_ITEMS+1):
    target_movie = "movie%d" % i
    
    if math.isnan(target_user[target_movie]):
        recom = calc_recom_score(movie_data, target_user_name, target_movie)
        print("%s: %s predict ratings %.1f" % (target_user_name, target_movie, recom))
        
        recom_rating[target_movie] = recom

user2: movie2 predict ratings 3.5
user2: movie3 predict ratings 3.3
user2: movie4 predict ratings 3.7
user2: movie5 predict ratings 3.5
user2: movie6 predict ratings 4.1
user2: movie7 predict ratings 4.0
user2: movie8 predict ratings 4.0
user2: movie9 predict ratings 4.1
user2: movie11 predict ratings 3.9
user2: movie12 predict ratings 4.4
user2: movie15 predict ratings 3.8
user2: movie16 predict ratings 3.8
user2: movie17 predict ratings 3.4
user2: movie18 predict ratings 2.9
user2: movie20 predict ratings 3.9
user2: movie21 predict ratings 3.2
user2: movie22 predict ratings 4.1
user2: movie23 predict ratings 4.2
user2: movie24 predict ratings 3.6
user2: movie26 predict ratings 3.5
user2: movie27 predict ratings 3.4
user2: movie28 predict ratings 4.0
user2: movie29 predict ratings 2.9
user2: movie30 predict ratings 4.0
user2: movie31 predict ratings 3.7
user2: movie32 predict ratings 3.9
user2: movie33 predict ratings 3.6
user2: movie34 predict ratings 2.1
user2: movie35 predict ratin

In [11]:
print("These are the recommended movies to you:")

count = 0
for movie, score in sorted(recom_rating.items(), key=lambda x:x[1], reverse=True):
    if count == 10:
        break
    
    print("%s: predict rating %.1f" % (movie, score))
    count += 1

These are the recommended movies to you:
movie1621: predict rating 6.5
movie1678: predict rating 5.8
movie814: predict rating 5.6
movie1554: predict rating 5.3
movie1643: predict rating 5.3
movie1467: predict rating 5.2
movie1599: predict rating 5.1
movie1629: predict rating 5.1
movie1642: predict rating 5.1
movie1653: predict rating 5.0


## 参考

* [推薦システム 解説・講義資料](http://www.kamishima.net/jp/kaisetsu/)
* [推薦システムのアルゴリズム - 第9章 メモリベース型協調フィルタリング -](http://www.kamishima.net/archive/recsysdoc.pdf)
* [協調フィルタリングを利用した推薦システム構築](http://www.slideshare.net/masayuki1986/recommendation-ml)