# VASILY 課題

In [2]:
import os
import numpy as np
import pandas as pd
import math
import time

In [3]:
# Directory that holds the ml-100k data set. Note that "__file__" is passed as a
# literal string (not the module variable), so realpath() resolves it against the
# current working directory and dirname() then strips that last component again
# (presumably a workaround for notebooks not defining __file__ - TODO confirm).
_base_dir = os.path.dirname(os.path.realpath("__file__"))
DATA_DIR = os.path.join(_base_dir, "ml-100k")

今回扱っているデータの基本情報

In [4]:
# Basic facts about the data set used here: user count, item (movie) count and
# total number of ratings.
ALL_USERS, ALL_ITEMS, ALL_RATINGS = 943, 1682, 100000

縦軸：ユーザ、横軸：映画の表に各々映画に対してユーザが評価した値が格納されているmovie_table.csvを読み込む

In [5]:
# Load the rating table; the first CSV column is used as the row index.
movie_table_file = "movie_table.csv"
movie_data = pd.read_csv(movie_table_file, index_col=0)

# Preview the first five rows.
movie_data.head(n=5)

Unnamed: 0,movie1,movie2,movie3,movie4,movie5,movie6,movie7,movie8,movie9,movie10,...,movie1673,movie1674,movie1675,movie1676,movie1677,movie1678,movie1679,movie1680,movie1681,movie1682
user1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
user2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
user3,,,,,,,,,,,...,,,,,,,,,,
user4,,,,,,,,,,,...,,,,,,,,,,
user5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


## ピアソン相関係数を計算する

ピアソン相関係数を計算して、相関が高いものをレコメンドするようにする。ピアソン相関係数$\rho_P$は

$$
    \rho_P = \frac{E_{X,Y}[(X-\mu_X)(Y - \mu_Y)]}{\sqrt{E_X[(X - \mu_X)^2]}\sqrt{E_Y[(Y - \mu_Y)^2]}}
$$

ただし、

$$
    \mu_X = E_X[X], \mu_Y = E_Y[Y]
$$

である。

以下のような式を実装する。

* ユーザ $s$ による映画 $i$ の評価を $p_{s,i}$、ユーザ $t$ による映画 $i$ の評価を $p_{t, i}$
* ユーザ $s$ の映画の評価の平均を $E[P_s]$、ユーザ $t$ の映画の評価の平均を $E[P_t]$
* ユーザ$s$, $t$間のピアソン相関係数 $Sim_{s,t}$

$$
    Sim_{s,t} = \frac{\sum_{i=1}^{n}(p_{s,i} - E[P_s])(p_{t,i} - E[P_t])}{\sqrt{\sum_{i=1}^{n}(p_{s,i} - E[P_s])^2} \sqrt{{\sum_{i=1}^{n}(p_{t,i} - E[P_t])^2}}}
$$

ユーザの映画評価の平均を計算する

In [6]:
# Select each user's row as a Series by label. `.loc` is used instead of `.ix`,
# which was deprecated in pandas 0.20 and removed in pandas 1.0.
user1 = movie_data.loc['user1']
user1_mean = user1.mean()  # Series.mean() skips NaN entries by default
print("user1 average rating = %.1f" % user1_mean)

user2 = movie_data.loc['user2']
user2_mean = user2.mean()
print("user2 average rating = %.1f" % user2_mean)

print(type(user1))

user1 average rating = 3.6
user2 average rating = 3.7
<class 'pandas.core.series.Series'>


In [7]:
# What type does a single extracted rating have?
movie1_rating = user1["movie1"]
print(movie1_rating)
print(type(movie1_rating))  # appears to be numpy.float64

5.0
<class 'numpy.float64'>


In [8]:
# Numerator of the Pearson correlation: the sum, over movies rated by both users,
# of the product of each user's deviation from their own mean rating.
# NOTE: the accumulator is named `sum` (shadowing the builtin) because a later
# cell reads the result under that name.
sum = 0
for i in range(1, ALL_ITEMS+1):
    movie_name = "movie%d"% i
    rating1 = user1[movie_name]
    rating2 = user2[movie_name]

    # Skip movies that at least one of the two users has not rated.
    if math.isnan(rating1) or math.isnan(rating2):
        continue

    print("user1[%s] = %f, user2[%s] = %f" %(movie_name, rating1, movie_name, rating2))

    # Product of the two deviations for this movie.
    calc = (rating1 - user1_mean)*(rating2 - user2_mean)
    print("now movie_num = %s, calc = %f" % (movie_name, calc))

    # Accumulate the result.
    sum += calc

print("sum = %f" % sum)

user1[movie1] = 5.000000, user2[movie1] = 4.000000
now movie_num = movie1, calc = 0.403463
user1[movie10] = 3.000000, user2[movie10] = 2.000000
now movie_num = movie10, calc = 1.043406
user1[movie13] = 5.000000, user2[movie13] = 4.000000
now movie_num = movie13, calc = 0.403463
user1[movie14] = 5.000000, user2[movie14] = 4.000000
now movie_num = movie14, calc = 0.403463
user1[movie19] = 5.000000, user2[movie19] = 3.000000
now movie_num = movie19, calc = -0.986243
user1[movie25] = 4.000000, user2[movie25] = 4.000000
now movie_num = movie25, calc = 0.113140
user1[movie50] = 5.000000, user2[movie50] = 5.000000
now movie_num = movie50, calc = 1.793169
user1[movie100] = 5.000000, user2[movie100] = 5.000000
now movie_num = movie100, calc = 1.793169
user1[movie111] = 5.000000, user2[movie111] = 4.000000
now movie_num = movie111, calc = 0.403463
user1[movie127] = 5.000000, user2[movie127] = 5.000000
now movie_num = movie127, calc = 1.793169
user1[movie237] = 2.000000, user2[movie237] = 4.00000

In [9]:
# Denominator terms of the Pearson correlation: for each user, the sum of squared
# deviations from their mean rating, taken over movies rated by both users.
sos1, sos2 = 0, 0
for i in range(1, ALL_ITEMS+1):
    movie_name = "movie%d" % i
    rating1 = user1[movie_name]
    rating2 = user2[movie_name]

    # Skip movies that at least one of the two users has not rated.
    if math.isnan(rating1) or math.isnan(rating2):
        continue

    v1 = (rating1 - user1_mean)**2
    v2 = (rating2 - user2_mean)**2
    print("%s: v1 = %f, v2 = %f" % (movie_name, v1, v2))

    # Accumulate the sums of squares.
    sos1 += v1
    sos2 += v2

sqrt1 = math.sqrt(sos1)
sqrt2 = math.sqrt(sos2)
print("sos1 = %.1f, sos2 = %.1f" % (sos1, sos2))
print("sqrt1 = %.1f, sqrt2 = %.1f" % (sqrt1, sqrt2))

movie1: v1 = 1.931282, v2 = 0.084287
movie10: v1 = 0.372459, v2 = 2.922997
movie13: v1 = 1.931282, v2 = 0.084287
movie14: v1 = 1.931282, v2 = 0.084287
movie19: v1 = 1.931282, v2 = 0.503642
movie25: v1 = 0.151871, v2 = 0.084287
movie50: v1 = 1.931282, v2 = 1.664932
movie100: v1 = 1.931282, v2 = 1.664932
movie111: v1 = 1.931282, v2 = 0.084287
movie127: v1 = 1.931282, v2 = 1.664932
movie237: v1 = 2.593047, v2 = 0.084287
movie242: v1 = 1.931282, v2 = 1.664932
movie251: v1 = 0.151871, v2 = 1.664932
movie255: v1 = 2.593047, v2 = 0.084287
movie257: v1 = 0.151871, v2 = 0.084287
movie258: v1 = 1.931282, v2 = 0.503642
movie269: v1 = 1.931282, v2 = 0.084287
movie272: v1 = 0.372459, v2 = 1.664932
sos1 = 27.6, sos2 = 14.7
sqrt1 = 5.3, sqrt2 = 3.8


In [10]:
# Combine the numerator with the product of the two square roots.
denominator = sqrt1 * sqrt2
pearson_correlation = sum / denominator
print("pearson correlation = %f" % pearson_correlation)

pearson correlation = 0.360871


In [11]:
def sim_pearson(user1, user2):
    """Return the Pearson-style similarity between two users' rating vectors.

    Deviations are taken from each user's mean over *all* of their own ratings,
    while the sums run only over the items that both users have rated.

    Parameters
    ----------
    user1, user2 : pandas.Series
        Ratings indexed by item label, with NaN marking unrated items. Both
        Series are expected to share the same index (e.g. two rows of the same
        rating table).

    Returns
    -------
    float
        The similarity, or 0.0 when the two Series are identical, when the
        users share at most one rated item, or when the denominator is zero.
    """
    # Identical vectors are deliberately scored 0 so that a user is never
    # reported as their own nearest neighbour.
    if user1.equals(user2):
        return 0.0

    # Items rated by both users.
    common = user1.notna() & user2.notna()

    # With one or zero common items there is nothing to correlate.
    if common.sum() <= 1:
        return 0.0

    dev1 = user1[common] - user1.mean()
    dev2 = user2[common] - user2.mean()

    denominator = math.sqrt((dev1 ** 2).sum()) * math.sqrt((dev2 ** 2).sum())

    # The ratings are numpy floats, so dividing by zero would produce NaN/inf
    # (with a RuntimeWarning) rather than raise ZeroDivisionError; check
    # explicitly instead of relying on an exception handler.
    if denominator == 0:
        return 0.0

    return float((dev1 * dev2).sum() / denominator)

In [12]:
# Compare a user with themselves: sim_pearson returns 0 for identical inputs.
pc = sim_pearson(user1, user1)
print("pearson corr = %f" % pc)

pearson corr = 0.000000


*target_user*に似ているユーザを探し出す。

In [13]:
# Score every user against the target user. `.loc` is used for the label lookup
# because `.ix` was deprecated in pandas 0.20 and removed in pandas 1.0.
target_user = movie_data.loc['user2']
score_dict = {}
best_score = {}

for i in range(1, ALL_USERS+1):
    user_name = "user%d" % i

    user = movie_data.loc[user_name]
    pc = sim_pearson(target_user, user)

    score_dict[user_name] = pc

# Show the ten most similar users (highest score first) and remember them.
ranked_scores = sorted(score_dict.items(), key=lambda x: x[1], reverse=True)
for user, score in ranked_scores[:10]:
    print("%s: %f" % (user, score))
    best_score[user] = score

user289: 1.000000
user369: 1.000000
user114: 0.998487
user187: 0.955900
user310: 0.932889
user776: 0.931879
user96: 0.923220
user766: 0.915926
user267: 0.914423
user142: 0.907980


In [14]:
# Put two users' rows side by side for comparison. `.loc` replaces `.ix`, which
# was removed in pandas 1.0.
compare_user = movie_data.loc[['user2','user525']]
compare_user.head()

Unnamed: 0,movie1,movie2,movie3,movie4,movie5,movie6,movie7,movie8,movie9,movie10,...,movie1673,movie1674,movie1675,movie1676,movie1677,movie1678,movie1679,movie1680,movie1681,movie1682
user2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
user525,4.0,,,,,,3.0,,,,...,,,,,,,,,,


In [15]:
# Dump the user -> score mapping collected for the top-scoring users.
print(best_score)

{'user187': 0.95590026658782035, 'user369': 0.99999999999999978, 'user267': 0.91442320216832707, 'user142': 0.9079795061529079, 'user96': 0.92322022669018877, 'user289': 1.0, 'user114': 0.9984871757166689, 'user766': 0.91592615251797538, 'user310': 0.93288855735508525, 'user776': 0.93187918046317408}


## レコメンドアルゴリズム

評価済み映画の類似度による重み付け和を正規化したものをレコメンドスコアの指標とする。

以下のような式を実装する。

* ユーザ $s$ とユーザ $t$ とのピアソン相関係数 $Sim_{s, t}$
* ユーザ $t$ が映画 $i$ を評価した時の評価値 $p_{t, i}$
* ユーザ $s$ とユーザ $t$ が共通に評価した映画の集合 $i_{s,t}$
* $i_{s,t}$ 上での $t$ の評価値の平均 $\bar{r'_t}$
* ユーザ $s$ の評価値の平均 $\bar{r_s}$
* ユーザ $s$ が 映画 $i$ を評価した時の評価値の推定値 $Recom_{s,i}$

$$
    Recom_{s,i} = \bar{r_s} + \frac{\sum_{t} (p_{t,i} - \bar{r'_t})Sim_{s,t}}{\sum_{t} |Sim_{s,t}|}
$$

1. *target_user*と各々のユーザに対して類似度を求める（ピアソン相関係数を利用する）
2. 映画1から映画$i$までの評価についてそれぞれ類似度を掛けて重み付けを行う。

    $$
        WS(Weighted Score) = \sum_{j=1}^{m} Sim_{target\_user, j} p_{j, i}
    $$

3. 類似性スコアの合計を出す。

    $$
        TS(Total Sim) = \sum_{j=1}^{m} Sim_{target\_user, j}
    $$

4. 正規化を行う。

    $$
        Norm(Normalize) = \frac{WS}{TS}
    $$

5. $Norm$の値が一番大きい映画についてレコメンドを行う。

まず、今回ターゲットとなる*target_user*のデータを抽出してみる。

In [16]:
# Select the user to generate recommendations for. `.loc` replaces `.ix`, which
# was removed in pandas 1.0.
target_user_name = 'user2'
target_user = movie_data.loc[target_user_name]
target_user.head(10)

movie1     4.0
movie2     NaN
movie3     NaN
movie4     NaN
movie5     NaN
movie6     NaN
movie7     NaN
movie8     NaN
movie9     NaN
movie10    2.0
Name: user2, dtype: float64

*target_user*の*movie2*への推定評価値$Recom_{target\_user, movie2}$を求めてみる。

まず相関係数$\rho$を求める。*movie2*を評価済みのユーザ間で相関係数を求める必要がある。

In [17]:
target_movie = 'movie2'

# Print the similarity between the target user and every user who rated
# target_movie. `.loc` replaces `.ix`, which was removed in pandas 1.0.
for i in range(1, ALL_USERS+1):
    user_name = "user%d" % i
    user_data = movie_data.loc[user_name]

    # Only users who have rated target_movie are of interest.
    if math.isnan(user_data[target_movie]):
        continue

    pc = sim_pearson(target_user, user_data)
    print("target_user = %s, user_data = %s, sim_pearson = %.1f" % (target_user_name, user_name, pc))

target_user = user2, user_data = user1, sim_pearson = 0.4
target_user = user2, user_data = user5, sim_pearson = 0.8
target_user = user2, user_data = user13, sim_pearson = 0.4
target_user = user2, user_data = user22, sim_pearson = 0.6
target_user = user2, user_data = user30, sim_pearson = 0.1
target_user = user2, user_data = user42, sim_pearson = -0.1
target_user = user2, user_data = user49, sim_pearson = 0.2
target_user = user2, user_data = user64, sim_pearson = 0.1
target_user = user2, user_data = user72, sim_pearson = 0.1
target_user = user2, user_data = user83, sim_pearson = -0.1
target_user = user2, user_data = user87, sim_pearson = 0.3
target_user = user2, user_data = user92, sim_pearson = 0.3
target_user = user2, user_data = user95, sim_pearson = 0.5
target_user = user2, user_data = user102, sim_pearson = 0.3
target_user = user2, user_data = user110, sim_pearson = -0.1
target_user = user2, user_data = user130, sim_pearson = -0.0
target_user = user2, user_data = user178, sim_pears

*target_user*の全評価済み映画上の平均評価値を求める。

In [18]:
# Mean of the target user's ratings (Series.mean() skips NaN by default).
target_user_mean = target_user.mean()
print("%s mean rating = %.1f" % (target_user_name, target_user_mean))

user2 mean rating = 3.7


*user2*の*movie2*に対する推定評価値を計算する。

In [19]:
target_movie = "movie2"

# Collect every user who rated target_movie together with their similarity to the
# target user. sim_pearson is computed once per rater and reused for both the
# numerator and the denominator. `.loc` replaces `.ix` (removed in pandas 1.0).
rater_sims = []
for i in range(1, ALL_USERS+1):
    user_name = "user%d" % i
    user_data = movie_data.loc[user_name]

    if not math.isnan(user_data[target_movie]):
        rater_sims.append((user_name, user_data, sim_pearson(target_user, user_data)))

# Numerator: similarity-weighted deviation of each rater's score from their mean.
sum1 = 0
print("[Now processing numerator...]")
for user_name, user_data, pc in rater_sims:
    print("target_user = %s, user_data = %s, sim_pearson = %.1f" % (target_user_name, user_name, pc))

    calc = pc * (user_data[target_movie] - user_data.mean())
    print("calc = %.1f * (%d - %.1f) = %.1f" % (pc, user_data[target_movie], user_data.mean(), calc))

    sum1 += calc

# Denominator: sum of the absolute similarities.
sum2 = 0
print("[Now processing denominator...]")
for user_name, user_data, pc in rater_sims:
    abs_pc = math.fabs(pc)
    print("target_user = %s, user_data = %s, |sim_pearson| = %.1f" % (target_user_name, user_name, abs_pc))
    sum2 += abs_pc

# Without any weighted neighbour there is no adjustment to apply, so fall back to
# the target user's own mean instead of dividing by zero.
if sum2 == 0:
    recom = target_user_mean
else:
    recom = target_user_mean + sum1 / sum2
print("%s recom rating is %.1f" % (target_movie, recom))

[Now processing numerator...]
target_user = user2, user_data = user1, sim_pearson = 0.4
calc = 0.4 * (3 - 3.6) = -0.2
target_user = user2, user_data = user5, sim_pearson = 0.8
calc = 0.8 * (3 - 2.9) = 0.1
target_user = user2, user_data = user13, sim_pearson = 0.4
calc = 0.4 * (3 - 3.1) = -0.0
target_user = user2, user_data = user22, sim_pearson = 0.6
calc = 0.6 * (2 - 3.4) = -0.8
target_user = user2, user_data = user30, sim_pearson = 0.1
calc = 0.1 * (3 - 3.8) = -0.1
target_user = user2, user_data = user42, sim_pearson = -0.1
calc = -0.1 * (5 - 3.7) = -0.1
target_user = user2, user_data = user49, sim_pearson = 0.2
calc = 0.2 * (1 - 2.7) = -0.3
target_user = user2, user_data = user64, sim_pearson = 0.1
calc = 0.1 * (3 - 3.6) = -0.1
target_user = user2, user_data = user72, sim_pearson = 0.1
calc = 0.1 * (3 - 3.8) = -0.1
target_user = user2, user_data = user83, sim_pearson = -0.1
calc = -0.1 * (4 - 3.4) = -0.1
target_user = user2, user_data = user87, sim_pearson = 0.3
calc = 0.3 * (4 - 3.

In [20]:
# Print the estimated rating once more, without the per-user trace.
print("%s recom rating is %.1f" % (target_movie, recom))

movie2 recom rating is 3.4


*target_user*がまだ評価していない映画を列挙してみる。

In [21]:
# List every movie for which the target user has no rating (NaN in the table).
for i in range(1, ALL_ITEMS+1):
    movie_name = "movie%d" % i

    # Already rated: nothing to report.
    if not math.isnan(target_user[movie_name]):
        continue

    print("%s is unrated %s" % (target_user_name, movie_name))

user2 is unrated movie2
user2 is unrated movie3
user2 is unrated movie4
user2 is unrated movie5
user2 is unrated movie6
user2 is unrated movie7
user2 is unrated movie8
user2 is unrated movie9
user2 is unrated movie11
user2 is unrated movie12
user2 is unrated movie15
user2 is unrated movie16
user2 is unrated movie17
user2 is unrated movie18
user2 is unrated movie20
user2 is unrated movie21
user2 is unrated movie22
user2 is unrated movie23
user2 is unrated movie24
user2 is unrated movie26
user2 is unrated movie27
user2 is unrated movie28
user2 is unrated movie29
user2 is unrated movie30
user2 is unrated movie31
user2 is unrated movie32
user2 is unrated movie33
user2 is unrated movie34
user2 is unrated movie35
user2 is unrated movie36
user2 is unrated movie37
user2 is unrated movie38
user2 is unrated movie39
user2 is unrated movie40
user2 is unrated movie41
user2 is unrated movie42
user2 is unrated movie43
user2 is unrated movie44
user2 is unrated movie45
user2 is unrated movie46
user2 is

レコメンドスコアを計算する関数を定義。

In [33]:
def calc_recom_score(movie_data, target_user_name, target_movie):
    """Estimate the rating `target_user_name` would give to `target_movie`.

    The estimate is the target user's mean rating plus the similarity-weighted
    average of how far each rater's score for `target_movie` deviates from that
    rater's own mean rating, with sim_pearson() providing the weights.

    Parameters
    ----------
    movie_data : pandas.DataFrame
        Rating table with one row per user and one column per movie; NaN marks
        an unrated movie.
    target_user_name : str
        Row label of the user to predict for.
    target_movie : str
        Column label of the movie to predict.

    Returns
    -------
    float
        The estimated rating. When no rater contributes a non-zero similarity
        (including when nobody rated the movie), the target user's mean rating
        is returned instead of dividing by zero.
    """
    # `.loc` replaces `.ix`, which was removed in pandas 1.0.
    target_user = movie_data.loc[target_user_name]

    # Row labels of every user who rated target_movie, in table order.
    raters = movie_data.index[movie_data[target_movie].notna()]

    weighted_deviation_sum = 0.0
    abs_similarity_sum = 0.0
    for user_name in raters:
        user_data = movie_data.loc[user_name]

        # Compute the similarity once and use it for both sums.
        pc = sim_pearson(target_user, user_data)
        if math.isnan(pc):
            # An undefined similarity carries no information; skip it so it
            # cannot turn the whole estimate into NaN.
            continue

        weighted_deviation_sum += pc * (user_data[target_movie] - user_data.mean())
        abs_similarity_sum += math.fabs(pc)

    target_mean = target_user.mean()
    if abs_similarity_sum == 0:
        return target_mean

    return target_mean + weighted_deviation_sum / abs_similarity_sum

In [32]:
# Estimate a single (user, movie) rating with calc_recom_score.
target_user_name = "user2"
target_movie = "movie2"
recom = calc_recom_score(movie_data, target_user_name, target_movie)

print("%s: %s recom rating is %.1f" % (target_user_name, target_movie, recom))

[Now processing numerator...]
[Now processing denominator...]
user2: movie2 recom rating is 3.4


In [38]:
# Estimate a rating for every movie the target user has not rated yet and time
# the whole run. `.loc` replaces `.ix`, which was removed in pandas 1.0.
target_user_name = "user2"
target_user = movie_data.loc[target_user_name]

start = time.time()

for i in range(1, ALL_ITEMS+1):
    target_movie = "movie%d" % i

    # Movies that already have a rating need no estimate.
    if not math.isnan(target_user[target_movie]):
        continue

    recom = calc_recom_score(movie_data, target_user_name, target_movie)
    print("%s: %s recom rating is %.1f" % (target_user_name, target_movie, recom))

end = time.time() - start
print("processing time: %.1f [sec]" % end)

user2: movie2 recom rating is 3.4
user2: movie3 recom rating is 3.3
user2: movie4 recom rating is 3.7
user2: movie5 recom rating is 3.5
user2: movie6 recom rating is 4.1
user2: movie7 recom rating is 3.9
user2: movie8 recom rating is 4.0
user2: movie9 recom rating is 4.1
user2: movie11 recom rating is 3.9
user2: movie12 recom rating is 4.4
user2: movie15 recom rating is 3.8
user2: movie16 recom rating is 3.8
user2: movie17 recom rating is 3.5
user2: movie18 recom rating is 2.9
user2: movie20 recom rating is 3.8
user2: movie21 recom rating is 3.1
user2: movie22 recom rating is 4.2
user2: movie23 recom rating is 4.2
user2: movie24 recom rating is 3.7
user2: movie26 recom rating is 3.6
user2: movie27 recom rating is 3.4
user2: movie28 recom rating is 4.0
user2: movie29 recom rating is 2.9
user2: movie30 recom rating is 4.0
user2: movie31 recom rating is 3.7
user2: movie32 recom rating is 3.8
user2: movie33 recom rating is 3.6
user2: movie34 recom rating is 2.1
user2: movie35 recom rating 

## 参考

* [推薦システム 解説・講義資料](http://www.kamishima.net/jp/kaisetsu/)
* [推薦システムのアルゴリズム - 第9章 メモリベース型協調フィルタリング -](http://www.kamishima.net/archive/recsysdoc.pdf)
* [協調フィルタリングを利用した推薦システム構築](http://www.slideshare.net/masayuki1986/recommendation-ml)