# VASILY 課題

In [39]:
import os
import numpy as np
import pandas as pd
import math
import time

In [18]:
DATA_DIR = os.path.join(os.path.dirname(os.path.realpath("__file__")),"ml-100k")

今回扱っているデータの基本情報

In [19]:
ALL_USERS = 943
ALL_ITEMS = 1682
ALL_RATINGS = 100000

縦軸：ユーザ、横軸：映画の表に各々映画に対してユーザが評価した値が格納されているmovie_table.csvを読み込む

In [20]:
movie_table_file = "movie_table.csv"
movie_data = pd.read_csv(movie_table_file, index_col=0)
movie_data.head()

Unnamed: 0,movie1,movie2,movie3,movie4,movie5,movie6,movie7,movie8,movie9,movie10,...,movie1673,movie1674,movie1675,movie1676,movie1677,movie1678,movie1679,movie1680,movie1681,movie1682
user1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
user2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
user3,,,,,,,,,,,...,,,,,,,,,,
user4,,,,,,,,,,,...,,,,,,,,,,
user5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


## ピアソン相関係数を計算する

ピアソン相関係数を計算して、相関が高いものをレコメンドするようにする。ピアソン相関係数$\rho_P$は

$$
    \rho_P = \frac{E_{X,Y}[(X-\mu_X)(Y - \mu_Y)]}{\sqrt{E_X[(X - \mu_X)^2]}\sqrt{E_Y[(Y - \mu_Y)^2]}}
$$

ただし、

$$
    \mu_X = E_X[X], \mu_Y = E_Y[Y]
$$

である。

以下のような式を実装する。

* ユーザ $s$ による映画 $i$ の評価を $p_{s,i}$、ユーザ $t$ による映画 $i$ の評価を $p_{t, i}$
* ユーザ $s$ の映画の評価の平均を $E[P_s]$、ユーザ $t$ の映画の評価の平均を $E[P_t]$
* ユーザ$s$, $t$間のピアソン相関係数 $Sim_{s,t}$

$$
    Sim_{s,t} = \frac{\sum_{i=1}^{n}(p_{s,i} - E[P_s])(p_{t,i} - E[P_t])}{\sqrt{\sum_{i=1}^{n}(p_{s,i} - E[P_s])^2} \sqrt{{\sum_{i=1}^{n}(p_{t,i} - E[P_t])^2}}}
$$

ユーザの映画評価の平均を計算する

In [21]:
user1 = movie_data.ix['user1']
user1_mean = user1.mean()
print("user1 average rating = %.1f" % user1_mean)

user2 = movie_data.ix['user2']
user2_mean = user2.mean()
print("user2 average rating = %.1f" % user2_mean)

print(type(user1))

user1 average rating = 3.6
user2 average rating = 3.7
<class 'pandas.core.series.Series'>


In [22]:
# 取り出したデータって型は何？
print(user1["movie1"])
print(type(user1["movie1"])) # numpy.float64みたい

5.0
<class 'numpy.float64'>


In [23]:
sum = 0
for i in range(1, ALL_ITEMS+1):
    movie_name = "movie%d"% i
    
    if not(math.isnan(user1[movie_name]) or math.isnan(user2[movie_name])):
        
        print("user1[%s] = %f, user2[%s] = %f" %(movie_name, user1[movie_name], movie_name, user2[movie_name]))
        
        # 計算を行う
        calc = (user1[movie_name] - user1_mean)*(user2[movie_name] - user2_mean)
        
        print("now movie_num = %s, calc = %f" % (movie_name, calc))
        
        # 計算結果を足しこむ
        sum += calc

print("sum = %f" % sum)

user1[movie1] = 5.000000, user2[movie1] = 4.000000
now movie_num = movie1, calc = 0.403463
user1[movie10] = 3.000000, user2[movie10] = 2.000000
now movie_num = movie10, calc = 1.043406
user1[movie13] = 5.000000, user2[movie13] = 4.000000
now movie_num = movie13, calc = 0.403463
user1[movie14] = 5.000000, user2[movie14] = 4.000000
now movie_num = movie14, calc = 0.403463
user1[movie19] = 5.000000, user2[movie19] = 3.000000
now movie_num = movie19, calc = -0.986243
user1[movie25] = 4.000000, user2[movie25] = 4.000000
now movie_num = movie25, calc = 0.113140
user1[movie50] = 5.000000, user2[movie50] = 5.000000
now movie_num = movie50, calc = 1.793169
user1[movie100] = 5.000000, user2[movie100] = 5.000000
now movie_num = movie100, calc = 1.793169
user1[movie111] = 5.000000, user2[movie111] = 4.000000
now movie_num = movie111, calc = 0.403463
user1[movie127] = 5.000000, user2[movie127] = 5.000000
now movie_num = movie127, calc = 1.793169
user1[movie237] = 2.000000, user2[movie237] = 4.00000

In [24]:
sos1 = 0; sos2 = 0
for i in range(1, ALL_ITEMS+1):
    movie_name = "movie%d" % i
    
    if not(math.isnan(user1[movie_name]) or math.isnan(user2[movie_name])):
        v1 = (user1[movie_name] - user1_mean)**2
        v2 = (user2[movie_name] - user2_mean)**2
        print("%s: v1 = %f, v2 = %f" % (movie_name, v1, v2))
        
        # sum of square(二乗和)を計算
        sos1 = sos1 + v1
        sos2 = sos2 + v2

sqrt1 = math.sqrt(sos1)
sqrt2 = math.sqrt(sos2)
print("sos1 = %.1f, sos2 = %.1f" % (sos1, sos2))
print("sqrt1 = %.1f, sqrt2 = %.1f" % (sqrt1, sqrt2))

movie1: v1 = 1.931282, v2 = 0.084287
movie10: v1 = 0.372459, v2 = 2.922997
movie13: v1 = 1.931282, v2 = 0.084287
movie14: v1 = 1.931282, v2 = 0.084287
movie19: v1 = 1.931282, v2 = 0.503642
movie25: v1 = 0.151871, v2 = 0.084287
movie50: v1 = 1.931282, v2 = 1.664932
movie100: v1 = 1.931282, v2 = 1.664932
movie111: v1 = 1.931282, v2 = 0.084287
movie127: v1 = 1.931282, v2 = 1.664932
movie237: v1 = 2.593047, v2 = 0.084287
movie242: v1 = 1.931282, v2 = 1.664932
movie251: v1 = 0.151871, v2 = 1.664932
movie255: v1 = 2.593047, v2 = 0.084287
movie257: v1 = 0.151871, v2 = 0.084287
movie258: v1 = 1.931282, v2 = 0.503642
movie269: v1 = 1.931282, v2 = 0.084287
movie272: v1 = 0.372459, v2 = 1.664932
sos1 = 27.6, sos2 = 14.7
sqrt1 = 5.3, sqrt2 = 3.8


In [25]:
pearson_correlation = sum / (sqrt1 * sqrt2)
print("pearson correlation = %f" % (pearson_correlation))

pearson correlation = 0.360871


In [26]:
def sim_pearson(user1, user2):

    # 同じもの同士は0を返すようにする
    if user1.equals(user2):
        return 0.0
    
    user1_mean = user1.mean()
    user2_mean = user2.mean()

    sum = 0
    for i in range(1, ALL_ITEMS+1):
        movie_name = "movie%d" % i
        
        if not(math.isnan(user1[movie_name]) or math.isnan(user2[movie_name])):
            calc_ = (user1[movie_name] - user1_mean)*(user2[movie_name] - user2_mean)
            sum += calc_
    
    sos1 = 0; sos2 = 0
    for i in range(1, ALL_ITEMS+1):
        movie_name = "movie%d" % i
        
        if not(math.isnan(user1[movie_name]) or math.isnan(user2[movie_name])):
            v1 = (user1[movie_name] - user1_mean)**2
            v2 = (user2[movie_name] - user2_mean)**2

            sos1 = sos1 + v1; sos2 = sos2 + v2
            
    sqrt1 = math.sqrt(sos1)
    sqrt2 = math.sqrt(sos2)
    
    try:
        pearson_corr = sum / (sqrt1 * sqrt2)
    except ZeroDivisionError as e:
        pearson_corr = 0
    
    return pearson_corr            

In [27]:
pc = sim_pearson(user1, user1)
print("pearson corr = %f" % pc)

pearson corr = 0.000000


*target_user*に似ているユーザを探し出す。

In [28]:
target_user = movie_data.ix['user2']
score_dict = {}
best_score = {}

for i in range(1, ALL_USERS+1):
    user_name = "user%d" % i
    # print("Now compare %s" % user_name)
    
    user = movie_data.ix[user_name]
    pc = sim_pearson(target_user, user)
    
    score_dict[user_name] = pc

# 似ているユーザ上位10人を表示してみる
count = 0
for user, score in sorted(score_dict.items(), key=lambda x:x[1], reverse=True):
    if count == 10:
        break
    print("%s: %f" % (user, score))
    best_score[user] = score
    count += 1

user522: 1.000000
user289: 1.000000
user366: 1.000000
user98: 1.000000
user778: 1.000000
user51: 1.000000
user700: 1.000000
user912: 1.000000
user369: 1.000000
user114: 0.998487


In [29]:
compare_user = movie_data.ix[['user2','user525']]
compare_user.head()

Unnamed: 0,movie1,movie2,movie3,movie4,movie5,movie6,movie7,movie8,movie9,movie10,...,movie1673,movie1674,movie1675,movie1676,movie1677,movie1678,movie1679,movie1680,movie1681,movie1682
user2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
user525,4.0,,,,,,3.0,,,,...,,,,,,,,,,


In [30]:
print(best_score)

{'user51': 1.0, 'user778': 1.0, 'user369': 0.99999999999999978, 'user700': 1.0, 'user114': 0.9984871757166689, 'user289': 1.0, 'user98': 1.0, 'user912': 1.0, 'user366': 1.0, 'user522': 1.0}


## レコメンドアルゴリズム

評価済み映画の類似度による重み付け和を正規化したものをレコメンドスコアの指標とする。

以下のような式を実装する。

* ユーザ $s$ とユーザ $t$ とのピアソン相関係数 $Sim_{s, t}$
* ユーザ $t$ が映画 $i$ を評価した時の評価値 $p_{t, i}$
* ユーザ $s$ とユーザ $t$ が共通に評価した映画の集合 $i_{s,t}$
* $i_{s,t}$ 上での $t$ の評価値の平均 $\bar{r'_s}$
* ユーザ $s$ の評価値の平均 $\bar{r_s}$
* ユーザ $s$ が 映画 $i$ を評価した時の評価値の推定値 $Recom_{s,i}$

$$
    Recom_{s,i} = \frac{\sum_{i=1}^{n} (p_{t,i} - \bar{r'_t})Sim_{s,t}}{\sum_{i=1}^{n} |Sim_{s,t}|}
$$

1. *target_user*と各々のユーザに対して類似度を求める（ピアソン相関係数を利用する）
2. 映画1から映画$i$までの評価についてそれぞれ類似度を掛けて重み付けを行う。

    $$
        WS(Weighted Score) = \sum_{i=1}^{n} \sum_{j=1}^{m} Sim_{target\_user, j} p_{j, i}
    $$

3. 類似性スコアの合計を出す。

    $$
        TS(Total Sim) = \sum_{j=1}^{m} Sim_{target\_user, j}
    $$

4. 正規化を行う。

    $$
        Norm(Normalize) = \frac{WS}{TS}
    $$

5. $Norm$の値が一番大きい映画についてレコメンドを行う。

In [38]:
target_user = movie_data.ix['user2']

start = time.time()

sum1 = 0
for i in range(1, ALL_USERS+1):
    user_name = "user%d" % i
    
    user = movie_data.ix[user_name]
    
    for j in range(1, ALL_ITEMS+1):
        movie_name = "movie%d" % j

        if not(math.isnan(user[movie_name])):
            pc = sim_pearson(target_user, user)
            sum1 += user[movie_name] * pc

sum2 = 0
for i in range(1, ALL_USERS+1):
    user_name = "user%d" % i
    user = movie_data.ix[user_name]
    
    pc = sim_pearson(target_user, user)
    sum2 += math.fabs(pc)

recom = sum1 / sum2

end = time.time() - start
print("processing time: %f" % end)

KeyboardInterrupt: 