In [1]:
import pandas as pd
import numpy as np

In [2]:
from gensim.models import Word2Vec



In [16]:
from sklearn.manifold import TSNE
from sklearn.manifold import t_sne

#### Data loading

In [17]:
# Read the titles CSV (path is relative to the notebook's working directory).
data = pd.read_csv("titles_books.csv")
# Preview ten random rows; no random_state is passed, so the sample changes on every run.
data.sample(10)

Unnamed: 0.1,Unnamed: 0,title
35311,6841910099,Mroczna wieża VI. Pieśń Susannah - Stephen King
17132,6725405812,WYBRANKA KUSZIELA Jacqueline Carey 2005
19102,6747185996,HUMOR I MĄDROŚĆ ŚWIATA DYSKU - TERRY PRATCHETT...
40892,6848215173,Cassandra Clare - Mechaniczny anioł [NOWA]
9107,6395685494,"Zoroaster, Gwiazdy umierają w milczeniu, R. Dę..."
11822,6601149479,BURSZTYN I POPIÓL tom 1 Mroczny uczeń Weis
14310,6680141537,GEORGE R.R. MARTIN - NAWAŁNICA MIECZY STAL I Ś...
41841,6849261587,Romans Wszechczasów T 8 - Chmielewska Joanna
29933,6829569828,Feist ADEPT MAGII
11466,6596742477,J .D. HORN ŹRÓDŁO WIDZMY Z SAVANNAH 2


In [6]:
import re

# Compiled once instead of on every call; the raw string avoids the invalid
# "\w" escape-sequence warning that a plain string literal triggers.
_WORD_PATTERN = re.compile(r'\w+')

def getWords(text):
    """Split a title into lower-cased word tokens.

    Dots are removed first (so "R.R." becomes "rr"), then runs of word
    characters are extracted and tokens of a single character are dropped.

    Parameters
    ----------
    text : str
        Raw title. Any non-string value (e.g. the NaN pandas uses for a
        missing cell, or None) is treated as an empty title.

    Returns
    -------
    list of str
        Lower-cased tokens longer than one character, in order of appearance.
    """
    if not isinstance(text, str):
        # Missing titles arrive as float NaN / None and have no .replace().
        return []
    tokens = _WORD_PATTERN.findall(text.replace(".", ""))
    return [token.lower() for token in tokens if len(token) > 1]

In [7]:
# Tokenise every title; the result is a list of token lists, one per row of `data`.
titles = list(map(getWords, data.title))

#### Model training

In [28]:
# Dimensionality of the word vectors; passed to Word2Vec as `size` below.
VECTOR_DIM = 100

In [8]:
# Build a Word2Vec model from the tokenised titles; only the vector size is set explicitly.
# NOTE(review): `size` is the pre-4.0 gensim keyword (gensim 4 renamed it to `vector_size`) — confirm the installed version.
model = Word2Vec(titles, size=VECTOR_DIM)

In [9]:
# Query the 20 nearest neighbours of "tolkien" through the KeyedVectors (`model.wv`):
# the method of the same name on the model object is a deprecated alias, and
# going through `wv` matches how get_word_score reads vectors.
model.wv.similar_by_word(word="tolkien", topn=20)

[('hobbit', 0.9957756996154785),
 ('jrr', 0.9954607486724854),
 ('hurina', 0.9934357404708862),
 ('silmarillion', 0.9924434423446655),
 ('kullervo', 0.9885507822036743),
 ('tomie', 0.9864453077316284),
 ('powrotem', 0.98615962266922),
 ('sibley', 0.9847725033760071),
 ('skibniewska', 0.9844139218330383),
 ('pierścienia', 0.9834655523300171),
 ('dwie', 0.9830611348152161),
 ('kropka', 0.9809067249298096),
 ('niedokończone', 0.9801709055900574),
 ('pierścieni', 0.9795058965682983),
 ('trygław', 0.9794532656669617),
 ('kropce', 0.9793313145637512),
 ('władca', 0.9787853956222534),
 ('czyli', 0.9782322645187378),
 ('niezwykła', 0.977992594242096),
 ('toma', 0.9758894443511963)]

In [47]:
def get_word_score(model, word):
    """Return the embedding of `word`, or an all-NaN vector if it is unknown.

    Parameters
    ----------
    model : object exposing `wv`
        Trained model whose `wv` supports `wv[word]` lookup and has a
        `vector_size` attribute (e.g. a gensim Word2Vec model).
    word : str
        Token to look up.

    Returns
    -------
    numpy.ndarray
        The word vector, or a NaN-filled vector of the model's own
        dimensionality when the word is not in the vocabulary, so that
        callers can average with `np.nanmean` and skip unknown words.
    """
    try:
        return model.wv[word]
    except KeyError:
        # Only the "out of vocabulary" case is expected here; any other error
        # (including KeyboardInterrupt) must propagate instead of being
        # silently turned into NaNs. The placeholder length is taken from the
        # model itself so it always matches the real vectors.
        return np.full(model.wv.vector_size, np.nan)

def get_title_score(model, title):
    """Embed a title as the mean of the vectors of its known words.

    The title is tokenised with `getWords`; each token is looked up with
    `get_word_score`, and the vectors are averaged component-wise while
    ignoring NaN entries (out-of-vocabulary tokens).

    Parameters
    ----------
    model : object exposing `wv`
        Trained model passed through to `get_word_score`.
    title : str
        Raw title text.

    Returns
    -------
    numpy.ndarray
        One vector per title. If the title yields no tokens, an all-NaN
        vector of length `model.wv.vector_size` is returned so that every
        title has the same shape. If every token is unknown the result is
        also all-NaN (numpy emits a "Mean of empty slice" RuntimeWarning).
    """
    tokens = getWords(title)
    if not tokens:
        # np.nanmean([]) would collapse to a scalar NaN, which cannot be
        # stacked with the regular fixed-length title vectors.
        return np.full(model.wv.vector_size, np.nan)
    word_vectors = np.vstack([get_word_score(model, token) for token in tokens])
    # nanmean is like na.rm=TRUE in R: NaN placeholders are skipped.
    return np.nanmean(word_vectors, axis=0)

In [35]:
# Sanity check: embed two titles and compare their vectors.
v1 = get_title_score(model, "Tolkien - Władca pierścieni - Trylogia")
v2 = get_title_score(model, "NAUKA ŚWIATA DYSKU I - Terry Pratchett")
# Squared Euclidean distance between the two title vectors.
np.sum((v1-v2)**2)

73.810577

In [32]:
# Embed every title with the trained model; the list follows the row order of `data`.
vectorized_titles = list(map(lambda title: get_title_score(model, title), data.title))

In [33]:
# Show only the first few title vectors; echoing the whole list prints one
# array per title and bloats the notebook.
vectorized_titles[:3]

[array([-0.1946907 , -0.16222398, -0.10319182, -0.09339006,  0.13864772,
        -0.01069478, -0.33829049, -0.17100036,  0.07659489,  0.16675675,
         0.05623475,  0.08468229, -0.24688641, -0.02592919, -0.03777951,
         0.13747667,  0.06990512,  0.27506932,  0.37132437,  0.19710468,
        -0.36179687,  0.1057469 , -0.06857828,  0.28104637, -0.20767892,
        -0.18430691, -0.05704453, -0.19200987,  0.07081492, -0.069886  ,
         0.05408499, -0.1828124 ,  0.07046911,  0.14331242, -0.13305595,
        -0.00138262, -0.06471096,  0.0461946 ,  0.04585995,  0.01848481,
         0.04109328,  0.05250003,  0.10861211,  0.16122509,  0.07124811,
         0.18695768, -0.12275385,  0.08631537, -0.01639856,  0.00779069,
        -0.07523089, -0.12146421, -0.14432396, -0.20220626, -0.05173947,
        -0.148416  , -0.04975614,  0.02712826, -0.05641775,  0.06593676,
         0.20389815, -0.17378749, -0.10733493,  0.04032679, -0.3815569 ,
         0.22788328, -0.14473848, -0.08698911, -0.3

#### T-SNE visualizations

In [19]:
# Use the public TSNE class rather than reaching through the `t_sne`
# submodule, which is an implementation detail of scikit-learn and is not
# available under that name in later releases. Default settings are kept.
tsne_transformer = TSNE()

In [None]:
# TSNE needs the data matrix: calling fit() with no arguments raises TypeError.
# Keep only titles that have a complete embedding, because t-SNE rejects NaN
# input (titles with no known words are all-NaN, and an empty title may be a
# scalar NaN). `has_embedding` is aligned with the rows of `data`, so the
# projected points can be mapped back to their titles.
has_embedding = np.array(
    [np.ndim(vec) == 1 and not np.isnan(vec).any() for vec in vectorized_titles]
)
title_matrix = np.vstack(
    [vec for vec, keep in zip(vectorized_titles, has_embedding) if keep]
)
# NOTE(review): t-SNE on the full title set may be slow; consider sampling rows first.
tsne_embedding = tsne_transformer.fit_transform(title_matrix)
tsne_embedding.shape