In [1]:
import sys
sys.version

'3.6.0 |Anaconda 4.3.1 (64-bit)| (default, Dec 23 2016, 11:57:41) [MSC v.1900 64 bit (AMD64)]'

In [24]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from scipy.spatial.distance import cosine

In [55]:
import re
def getWords(text):
    splitted_text = text.split()
    new_splitted_text = []
    for item in splitted_text:
        if len(item) == 1:
            item = item.lower()
        new_splitted_text.append(item)
    new_text = " ".join(new_splitted_text)
    new_text = re.sub("\.", " ", new_text)
    new_text = re.sub("_", "", new_text)
    new_text = re.sub("(?<= [A-Z]{1}) +((?=[A-Z] )|(?=[A-Z]$))", "", new_text)
    all_words = re.compile('\w+').findall(new_text)
    return [words.lower() for words in all_words if len(words)>1]

def get_word_score(model, word):
    try:
        score = model.wv[word]
    except:
        score = np.repeat(np.nan, repeats=VECTOR_DIM)
    return score

def get_title_score(title, model):
    title_parsed = getWords(title)
    mean_score = np.nanmean([get_word_score(model, i) for i in title_parsed], axis = 0) #nanmean to jak na.rm=TRUE w R
    return mean_score

def calculate_mean_cos_similarity(input_col):
    n = input_col.shape[0]
    calc_col = np.array(input_col)
    similarity_sum = 0
    counter = 0
    for i in range(n):
        for j in range(i, n):
            similarity_sum+= cosine(calc_col[i], calc_col[j])
            counter+=1
            
    return similarity_sum/counter

In [9]:
train_data = pd.read_csv("titles_books.csv")
test_data = pd.read_csv("test_data.csv")
train_titles = [getWords(title) for title in train_data.title]

In [60]:
VECTOR_DIM = 64

In [61]:
model = Word2Vec(train_titles, size=VECTOR_DIM, min_count=10)
test_data['score'] = test_data.title.apply(func=get_title_score, model=model)

In [62]:
test_data[["author", "book_title", "score"]].groupby(("author", "book_title")).agg(lambda x: calculate_mean_cos_similarity(x))

Unnamed: 0_level_0,Unnamed: 1_level_0,score
author,book_title,Unnamed: 2_level_1
Canavan,Anioł Burz,0.020167
Canavan,Gildia Magów,0.012206
Canavan,Królowa Zdrajców,0.014668
Canavan,Misja Ambasadora,0.016124
Canavan,Nowicjuszka,0.022936
Canavan,Wielki Mistrz,0.015329
Paolini,Brisingr,0.056605
Paolini,Dziedzictwo,0.062073
Paolini,Eragon,0.139743
Paolini,Najstarszy,0.038812


In [63]:
test_data[["author",  "score"]].groupby(("author")).agg(lambda x: calculate_mean_cos_similarity(x))

Unnamed: 0_level_0,score
author,Unnamed: 1_level_1
Canavan,0.028009
Paolini,0.083185
Pratchett,0.015569
Rowling,0.005471
Sapkowski,0.008545
Tolkien,0.022098


### Clustering

In [64]:
from sklearn.cluster import KMeans

In [65]:
n_clusters = test_data.book_title.unique().shape[0]

In [67]:
km = KMeans(n_clusters=n_clusters)

In [75]:
km.fit(np.array([i for i in test_data.score]))

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=25, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [77]:
test_data['cluster'] = km.predict(np.array([i for i in test_data.score]))

In [84]:
test_data.tail(100)

Unnamed: 0.1,Unnamed: 0,title,author,book_title,score,cluster
359,6851388803,Nowicjuszka - Trudi Canavan - twarda 2803,Canavan,Nowicjuszka,"[-0.0729020177387, 0.854988880455, 0.658814564...",0
360,6852023895,CANAVAN - GILDIA + NOWICJUSZKA + WIELKI - Tryl...,Canavan,Nowicjuszka,"[-0.00282413, 0.90076, 0.673447, 0.279323, 0.2...",13
361,6852251296,Trudi Canavan NOWICJUSZKA cz.1 i 2 Tw.Opra.,Canavan,Nowicjuszka,"[-0.0694624032825, 0.859984174371, 0.679221215...",0
362,5930344095,Misja Ambasadora+ Łotr+ Królowa zdrajców Canavan,Canavan,Królowa Zdrajców,"[-0.0484326, 0.759889, 0.540533, 0.254925, 0.2...",13
363,6684064303,KRÓLOWA ZDRAJCÓW T. Canavan egz. powystawowy,Canavan,Królowa Zdrajców,"[-0.0866238, 0.728537, 0.529533, 0.236191, 0.1...",13
364,6690273767,KRÓLOWA ZDRAJCÓW Trudi CANAVAN,Canavan,Królowa Zdrajców,"[-0.101578, 1.14772, 0.894204, 0.465335, 0.258...",14
365,6709812157,KRÓLOWA ZDRAJCÓW - TRUDI CANAVAN,Canavan,Królowa Zdrajców,"[-0.101578, 1.14772, 0.894204, 0.465335, 0.258...",14
366,6748309335,KRÓLOWA ZDRAJCÓW - TRUDI CANAVAN,Canavan,Królowa Zdrajców,"[-0.101578, 1.14772, 0.894204, 0.465335, 0.258...",14
367,6751590932,T. Canavan - Królowa zdrajców,Canavan,Królowa Zdrajców,"[-0.099186, 1.04822, 0.765453, 0.36363, 0.2823...",14
368,6767406618,Królowa zdrajców [Canavan Trudi],Canavan,Królowa Zdrajców,"[-0.101578, 1.14772, 0.894204, 0.465335, 0.258...",14


In [83]:
getWords("ZBROJNI TOM 2 TERRY PRATCHETT TWARDA")

['zbrojni', 'tom', 'terry', 'pratchett', 'twarda']