In [8]:
import hashlib

def myHash(s):
    if s == None:
        return 0
    else:
        return int(hashlib.sha1(s.encode("utf-8")).hexdigest(), 16) % (10 ** 9)

In [1]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext

# create the Spark Session
spark = SparkSession.builder.appName("spark").getOrCreate()

# create the Spark Context
sc = spark.sparkContext



In [10]:
####################################
# [0] = book unique id -> 1
# [1:980] = genre vector -> 980
# [981] = author -> 1
# 하나의 book profile 크기 = 982
####################################
import time
import pickle
with open("../data/ForBookMatrix.pkl", "rb") as f:
    ForBookMatrix = pickle.load(f)
print(ForBookMatrix.shape)

import numpy as np
with open("../data/foregin-all-genres-dic.pkl", "rb") as f:
    genres_dic = pickle.load(f)
print(len(genres_dic))

(52478, 982)
980


In [9]:
print(ForBookMatrix[10,0])
print(ForBookMatrix[0,980])
print(ForBookMatrix[0,981])
print(myHash("Suzanne Collins"))

11
0
647226322
647226322


# genre만 받았을 경우

### 일반적인 content based search

In [31]:
from numpy.linalg import norm

def cos_similarity(x :np.array, y :np.array):
    return ( (np.dot(x,y))/(norm(x)*norm(y)) )

In [32]:
def linear_search(Matrix, userProfile, topNum, genre_dim):
    similarity = map(
        lambda bookProfile: \
            (int(bookProfile[0]),cos_similarity(bookProfile[1:genre_dim+1], userProfile[0,1:]))
            ,Matrix)
    best = sorted(similarity, key=lambda t: -t[1])[:topNum]
    return [t[0] for t in best]
    # return [t for t in best]


In [52]:
genres = ["Young Adult", "Fiction", "Dystopia", "Fantasy", "Science Fiction", "Romance", "Adventure", "Teen"]

# 0을 남겨둠
userProfile = np.zeros((1,len(genres_dic)+1))
for genre in genres:
    userProfile[0,genres_dic[genre]] = 1.

for i in range(len(userProfile[0])):
    if userProfile[0,i] == 1.0:
        print(i, end=", ")

1, 2, 3, 4, 13, 17, 35, 57, 

In [53]:
start = time.time()
print(linear_search(ForBookMatrix, userProfile, 100)[:10])
print("="*100)
print(time.time()-start,"(second)")

[0, 21, 151, 184, 221, 237, 326, 335, 345, 622]
0.711101770401001 (second)


In [35]:
iterNum = 10
sizes = np.random.randint(low=1, high=10, size=iterNum)
addTime = 0
for i, size in enumerate(sizes):
    r = np.random.randint(low=1, high=982, size=size)
    userProfile = np.zeros((1,len(genres_dic)+1))
    userProfile[0,r] = 1.0
    start = time.time()
    best = linear_search(ForBookMatrix, userProfile, 100)
    addTime += time.time()-start
    print(f"iter{i+1}|",best[:10])

print("="*100)
print("average time:",addTime/iterNum)

iter1| [20007, 23623, 23074, 26142, 31168, 41034, 50739, 28610, 32964, 33087]
iter2| [17743, 30142, 22460, 33532, 42656, 21327, 25743, 34137, 35788, 37774]
iter3| [37791, 50834, 35650, 49004, 31373, 33844, 34212, 35918, 47468, 17897]
iter4| [20007, 46612, 34512, 41034, 42960, 34165, 20626, 21481, 21986, 44197]
iter5| [43724, 4926, 11277, 14824, 20600, 23260, 25915, 35181, 35524, 36741]
iter6| [36109, 43830, 50043, 42573, 32176, 33357, 41592, 47010, 12130, 24093]
iter7| [15396, 20921, 33048, 39232, 44424, 44674, 44751, 44792, 47160, 15027]
iter8| [28384, 32578, 44049, 32740, 42743, 13086, 27128, 30729, 37069, 6806]
iter9| [17928, 18198, 20391, 21004, 25099, 26952, 51995, 13544, 19476, 28309]
iter10| [40854, 38595, 3170, 4805, 14402, 14900, 23735, 2303, 2435, 4032]
average time: 0.6346384286880493


### LSH를 통한 content based search

In [45]:
# k == the number of "row"
# L == the number of "band"
DIMENSION = 982

def create_function(dimensions, thresholds):
    def f(v):
        boolarray = [v[dimensions[i]] >= thresholds[i] for i in range(len(dimensions))]
        return "".join(map(str, map(int, boolarray)))
    return f


def create_functions(k, L, num_dimensions = DIMENSION, min=0, max=1):
    functions = []
    for i in range(L):
        dimensions = np.random.randint(low = 0,
                                        high = num_dimensions,
                                        size = k)
        thresholds = np.random.randint(low = min,
                                        high = max+1,
                                        size = k)
        functions.append(create_function(dimensions, thresholds))
    return functions


# 하나의 shingling에 대한 hash_vector 추출
def hash_vector(functions, v):
    return np.array([f(v) for f in functions])


def hash_data(functions, A):
    # v(shape:(400,)) is a each row of A(matrix)(shape:(59499, 400))
    return np.array(list(map(lambda v: hash_vector(functions, v), A)))


# 하나의 band라도 맞으면 filtering됨.
def get_candidates(hashed_A, hashed_point):
    return filter(lambda i: any(hashed_point == hashed_A[i]), range(len(hashed_A)))


# return == (hash functions, signature matrix of A)
def lsh_setup(A, k = 20, L = 70):
    functions = create_functions(k = k, L = L)
    hashed_A = hash_data(functions, A)
    return (functions, hashed_A)


def lsh_search(A, hashed_A, functions, userProfile, topNum = 10):
    # A == KorBookProfile
    # hashed_A == signature matrix of KorBookProfile
    # 원래 데이터 vector에서 hash-vector 추출
    hashed_userProfile = hash_vector(functions, userProfile[0])
    candidate_row_nums = get_candidates(hashed_A, hashed_userProfile)
    similarity = map(lambda r: (A[r,0], cos_similarity(A[r,1:DIMENSION+1], userProfile[0,1:])), candidate_row_nums)
    best = sorted(similarity, key=lambda t: -t[1])[:topNum]
    return [int(t[0]) for t in best]
    # return [t for t in best]

In [73]:
A = ForBookMatrix[:,1:DIMENSION+1]
print(A.shape)
funcs, hashed_A = lsh_setup(A, k=1, L=1)
print(hashed_A.shape)

(47835, 982)
(47835, 1)


In [74]:
genres = ["Young Adult", "Fiction", "Dystopia", "Fantasy", "Science Fiction", "Romance", "Adventure", "Teen"]

# 0을 남겨둠
userProfile = np.zeros((1,len(genres_dic)+1))
for genre in genres:
    userProfile[0,genres_dic[genre]] = 1.

for i in range(len(userProfile[0])):
    if userProfile[0,i] == 1.0:
        print(i, end=", ")

1, 2, 3, 4, 13, 17, 35, 57, 

In [75]:
start = time.time()
print(lsh_search(ForBookMatrix, hashed_A, funcs, userProfile, 100)[:10])
print("="*100)
print(time.time()-start,"(second)")

[0, 21, 151, 184, 221, 237, 326, 335, 345, 622]
0.7970468997955322 (second)


In [76]:
x = np.array(lsh_search(ForBookMatrix, hashed_A, funcs, userProfile, 100))
y = np.array(linear_search(ForBookMatrix, userProfile, 100))
print("similarity:",cos_similarity(x, y))

similarity: 1.0000000000000002


In [77]:
iterNum = 10
sizes = np.random.randint(low=1, high=10, size=iterNum)
addTime = 0
# avgErr = 0
for i, size in enumerate(sizes):
    r = np.random.randint(low=1, high=DIMENSION, size=size)
    userProfile = np.zeros((1,len(genres_dic)+1))
    userProfile[0,r] = 1.0
    start = time.time()
    best = lsh_search(ForBookMatrix, hashed_A, funcs, userProfile, 100)
    addTime += time.time()-start
    x = np.array(best)
    y = np.array(linear_search(ForBookMatrix, userProfile, 100))
    print(f"iter{i+1}|",best[:10])
    print(f"iter{i+1}| similarity: ",cos_similarity(x, y))
    print("-"*50)

print("="*50)
print("average time:",addTime/iterNum)

iter1| [18450, 27316, 30991, 48795, 29765, 34776, 18496, 19702, 42149, 31626]
iter1| similarity:  1.0
--------------------------------------------------
iter2| [32282, 35115, 39141, 42333, 45656, 51947, 33115, 52104, 2915, 5059]
iter2| similarity:  0.9999999999999999
--------------------------------------------------
iter3| [18821, 18888, 19023, 19122, 29748, 51185, 15593, 17833, 19210, 19489]
iter3| similarity:  1.0000000000000002
--------------------------------------------------
iter4| [36503, 3364, 50963, 3328, 18336, 22010, 25339, 29610, 32059, 34061]
iter4| similarity:  1.0
--------------------------------------------------
iter5| [19762, 18945, 26132, 39512, 41639, 43736, 50049, 52083, 4393, 22216]
iter5| similarity:  1.0
--------------------------------------------------
iter6| [21700, 33956, 42167, 18568, 23712, 29334, 29529, 30433, 18975, 20407]
iter6| similarity:  0.9999999999999999
--------------------------------------------------
iter7| [35320, 38955, 22533, 41491, 42194,

# 02

In [81]:
from numpy.linalg import norm

def mySimilarity(x :np.array, user :np.array):
    if(any(x[0]==user[0])):
        return 0
    elif(any(x[-1]==user[2])):
        return 1.3*((np.dot(x[1:-1],user[1]))/(norm(x[1:-1])*norm((user[1]>0))))
    else:
        return ( (np.dot(x[1:-1],user[1]))/(norm(x[1:-1])*norm((user[1]>0))) )

def my_search(KorBookMatrix, userProfile, topNum):
    similarity = map(
        lambda bookProfile: \
            (int(bookProfile[0]),mySimilarity(bookProfile, userProfile))
            ,KorBookMatrix)
    best = sorted(similarity, key=lambda t: -t[1])[:topNum]
    return [t[0] for t in best]
    # return [t for t in best]


In [83]:
# test setting
genres = ["Young Adult", "Fiction", "Dystopia", "Fantasy", "Science Fiction", "Romance", "Adventure", "Teen"]
data = [
    {
        "title":"The Hunger Games",
        "author":"Suzanne Collins",
        "id":0,
        "genres":["Young Adult", "Fiction", "Dystopia", "Fantasy", "Science Fiction", "Romance", "Adventure", "Teen", "Post Apocalyptic", "Action"]
    },
]

# 0을 남겨둠
genreVec = np.zeros((len(genres_dic)))
for genre in genres:
    genreVec[genres_dic[genre]] = 1.

idVec = []
authorVec = []
for one in data:
    for genre in one["genres"]:
        genreVec[genres_dic[genre]-1] += 1.
    idVec.append(one["id"])
    authorVec.append(myHash(one["author"]))

userProfile = [idVec, genreVec, authorVec]

In [84]:
start = time.time()
print(my_search(ForBookMatrix, userProfile, 50)[:10])
print("="*100)
print(time.time()-start,"(second)")

[221, 326, 5240, 21, 151, 345, 1602, 4493, 4696, 12765]
1.5366299152374268 (second)
