In [1]:
import sys
# Record the interpreter version this notebook was run with.
print(sys.version)

3.5.2 |Anaconda custom (64-bit)| (default, Jul  2 2016, 17:53:06) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
from gensim.models import Word2Vec

In [4]:
from sklearn.manifold import t_sne

In [5]:
from MulticoreTSNE import MulticoreTSNE

#### Data loading

In [6]:
import re
def getWords(text, stopwords):
    """Tokenise a title into lower-cased words without stopwords or 1-char tokens.

    Processing order:
      1. Split on whitespace and lower-case every one-character token. The
         merging regex in step 3 only looks at ``A-Z``, so these stand-alone
         letters can no longer be merged by it.
      2. Replace every ``.`` with a space and delete every ``_``.
      3. Delete the run of spaces between a single capital letter that is
         preceded by a space and a following capital letter that is itself
         followed by a space or the end of the text
         (``"X. A. B."`` becomes ``"X  AB "`` after steps 2-3).
      4. Extract runs of word characters, drop one-character runs, lower-case
         the rest and drop anything found in ``stopwords``.

    Parameters
    ----------
    text : str
        Raw title.
    stopwords : container
        Anything supporting ``in`` (list, set, NumPy array, ...) holding
        lower-case words to discard.

    Returns
    -------
    list of str
        Lower-cased tokens in their order of appearance.
    """
    tokens = [tok.lower() if len(tok) == 1 else tok for tok in text.split()]
    new_text = " ".join(tokens)
    new_text = new_text.replace(".", " ")
    new_text = new_text.replace("_", "")
    # Raw strings: "\w" / "\." in a plain literal are invalid escape sequences.
    new_text = re.sub(r"(?<= [A-Z]{1}) +((?=[A-Z] )|(?=[A-Z]$))", "", new_text)
    all_words = re.findall(r"\w+", new_text)
    # The length test is done on the un-lowered token, as before.
    return [word.lower() for word in all_words
            if len(word) > 1 and word.lower() not in stopwords]

In [7]:
# Load the stopword list as a flat 1-D array taken from the first CSV column.
# `DataFrame.as_matrix()` no longer exists in pandas >= 1.0, and resizing the
# resulting array in place fails when the array does not own its data;
# `.iloc[:, 0].values` gives the same result for a one-column file on both old
# and new pandas.
# NOTE(review): read_csv treats the first row as a header — confirm that
# stopwords.csv really starts with a header line, otherwise one word is lost.
stopwords = pd.read_csv("stopwords.csv").iloc[:, 0].values

In [8]:
# Training titles are tokenised for Word2Vec; test titles are kept raw.
train_data = pd.read_csv("titles_books.csv")
test_data = pd.read_csv("test_data.csv")
train_titles = [getWords(title, stopwords) for title in train_data.title]
test_titles = list(test_data.title)

# Number consecutive runs of identical `book_title`: the label starts at 0 and
# increases by one every time the value differs from the previous row.
# Comparing the column with a shifted copy of itself marks the first row of
# each run (the very first row compares against NaN, hence True), so the
# running sum minus one is the run index. Unlike the explicit loop this also
# works for an empty frame.
starts_new_run = test_data.book_title != test_data.book_title.shift()
labels = (starts_new_run.cumsum() - 1).tolist()
test_data['label'] = labels

#### Model training

In [9]:
# Dimensionality of the word vectors; passed as `size=` when the Word2Vec model is built.
VECTOR_DIM = 32

In [10]:
# Fit Word2Vec on the tokenised training titles. min_count=0 keeps every
# token in the vocabulary, however rarely it occurs.
model = Word2Vec(
    sentences=train_titles,
    size=VECTOR_DIM,
    min_count=0,
)

In [11]:
# Sanity check: the 20 nearest neighbours of one vocabulary word.
# Queried through `model.wv`, the same accessor get_word_score uses; the
# model-level `similar_by_word` shortcut is deprecated in gensim.
model.wv.similar_by_word("zbrojni", topn=20)

[('regiment', 0.9942148327827454),
 ('ruch', 0.9931845664978027),
 ('ruchome', 0.993017315864563),
 ('para', 0.9927998781204224),
 ('jugulum', 0.9924675226211548),
 ('ciekawe', 0.9924342632293701),
 ('kosiarz', 0.99152672290802),
 ('mort', 0.9905078411102295),
 ('potworny', 0.9900221824645996),
 ('elefant', 0.9899853467941284),
 ('wolni', 0.989794135093689),
 ('pratchett', 0.9893941879272461),
 ('cohen', 0.9890822172164917),
 ('czarodzicielstwo', 0.9878342747688293),
 ('carpe', 0.9864664077758789),
 ('ciut', 0.9860356450080872),
 ('dysku', 0.9855395555496216),
 ('piramidy', 0.9842520952224731),
 ('maskarada', 0.9842439293861389),
 ('straż', 0.9841561317443848)]

In [12]:
def get_word_score(model, word):
    """Return the embedding of ``word``, or an all-NaN vector if it is unknown.

    Parameters
    ----------
    model : object
        Trained model exposing ``model.wv[word]`` and ``model.vector_size``
        (e.g. a gensim ``Word2Vec`` instance).
    word : str
        Token to look up.

    Returns
    -------
    numpy.ndarray
        ``model.wv[word]`` when the lookup succeeds; otherwise a 1-D array of
        ``model.vector_size`` NaNs, so callers can average with ``np.nanmean``.
    """
    try:
        return model.wv[word]
    except KeyError:
        # Only a failed vocabulary lookup is a missing word; any other error
        # is a real problem and must propagate. The filler length comes from
        # the model itself so it always matches the real vectors.
        return np.repeat(np.nan, repeats=model.vector_size)

def get_title_score(model, title, stopwords):
    """Embed a title as the mean of the vectors of its tokens.

    The title is tokenised with ``getWords``; each token is looked up with
    ``get_word_score`` and the per-dimension mean is taken with
    ``np.nanmean``, so all-NaN rows (unknown words) are ignored.

    Parameters
    ----------
    model : object
        Trained model accepted by ``get_word_score``; must expose
        ``vector_size``.
    title : str
        Raw title.
    stopwords : container
        Stopword collection forwarded to ``getWords``.

    Returns
    -------
    numpy.ndarray
        1-D mean vector. It is all-NaN when the title yields no tokens, or
        when none of its tokens has a vector (NumPy emits a RuntimeWarning in
        the latter case).
    """
    title_parsed = getWords(title, stopwords)
    if not title_parsed:
        # np.nanmean([]) returns a scalar nan, which cannot be stacked with
        # the vectors of other titles; return a vector of the right length.
        return np.repeat(np.nan, repeats=model.vector_size)
    # nanmean works like na.rm=TRUE in R
    mean_score = np.nanmean([get_word_score(model, i) for i in title_parsed], axis=0)
    return mean_score

In [13]:
# Squared Euclidean distance between the mean embeddings of two example titles.
v1 = get_title_score(model, "Tolkien - Władca pierścieni - Trylogia", stopwords)
v2 = get_title_score(model, "NAUKA ŚWIATA DYSKU I - Terry Pratchett", stopwords)
np.square(v1 - v2).sum()

76.281982

In [14]:
# Embed every test title with get_title_score and stack the results.
# The same step for the training titles is left disabled:
#vectorized_titles = np.array([get_title_score(model, title, stopwords) for title in train_data.title])
vectorized_test_titles = np.array(
    [get_title_score(model, raw_title, stopwords) for raw_title in test_data.title]
)

In [15]:
#vectorized_titles_sample = vectorized_titles[np.random.choice(vectorized_titles.shape[0], size = 20000, replace = False)]

#### t-SNE visualizations

In [16]:
# t-SNE reducer with perplexity 25, run on 4 jobs with progress output enabled.
tsne_transformer = MulticoreTSNE(perplexity=25, n_jobs=4, verbose=1)

In [17]:
# Project the title embeddings with t-SNE; the input is cast to float64 first.
# NOTE(review): titles whose words are all out of vocabulary produce all-NaN
# rows upstream — confirm MulticoreTSNE tolerates NaN input or filter them out.
tnsed_titles = tsne_transformer.fit_transform(vectorized_test_titles.astype(np.float64))

In [37]:
from bokeh.plotting import figure, output_file, show, ColumnDataSource
from bokeh.models import HoverTool, CategoricalColorMapper
from bokeh.palettes import inferno, Category20, Category20c, Category20b, Category10

# Write the interactive scatter plot to a standalone HTML file.
output_file("bokeh_vis/test_data_bokeh_vis.html")

# Current Bokeh releases only accept string factors in CategoricalColorMapper,
# so the integer labels are converted once and the same strings are used for
# both the data column and the factor list.
label_strings = test_data.label.astype(str)
label_factors = label_strings.unique().tolist()

# Repeat the 10-colour palette often enough that every label gets a colour,
# however many labels there are (colours repeat every 10 labels).
base_palette = list(Category10[10])
label_palette = base_palette * (len(label_factors) // len(base_palette) + 1)

source = ColumnDataSource(
        data=dict(
            x=tnsed_titles[:, 0],
            y=tnsed_titles[:, 1],
            desc=test_titles,
            label=label_strings.tolist()
        )
    )
color_mapper = CategoricalColorMapper(factors=label_factors, palette=label_palette)

# Tooltip shown when hovering over a point: row index, coordinates, raw title, label.
hover = HoverTool(
        tooltips=[
            ("index", "$index"),
            ("(x,y)", "($x, $y)"),
            ("desc", "@desc"),
            ("label", "@label")
        ]
    )

# `width`/`height` match the spelling used by the other figure in this
# notebook and are the names newer Bokeh releases expect.
p = figure(width=600, height=600, tools=[hover],
           title="Mouse over the dots")

# One small dot per test title, coloured by its label.
p.circle('x', 'y', size=2, source=source, color={'field': 'label', 'transform': color_mapper})

show(p)

INFO:bokeh.core.state:Session output file 'bokeh_vis/test_data_bokeh_vis.html' already exists, will be overwritten.


In [19]:
from bokeh.io import show
from bokeh.models import ColumnDataSource, CategoricalColorMapper
from bokeh.palettes import RdBu3
from bokeh.plotting import figure

# Minimal CategoricalColorMapper demo on toy data: six circles whose fill
# colour is chosen by a 'hi'/'lo' label.
# NOTE(review): this cell rebinds `source`, `color_mapper` and `p`, names that
# the t-SNE plot cell also assigns — whichever cell ran last owns them.
# NOTE(review): no output_file() is called in this cell, so show(p) presumably
# writes to whatever Bokeh output target is currently active — confirm it
# cannot overwrite the t-SNE HTML file.
source = ColumnDataSource(dict(
    x=[1, 2, 3, 4, 5, 6],
    y=[2, 1, 2, 1, 2, 1],
    label=['hi', 'lo', 'hi', 'lo', 'hi', 'lo']
))
# 'hi' is mapped to RdBu3[2] and 'lo' to RdBu3[0].
color_mapper = CategoricalColorMapper(factors=['hi', 'lo'], palette=[RdBu3[2], RdBu3[0]])

p = figure(x_range=(0, 7), y_range=(0, 3), height=300, tools='save')
# NOTE(review): newer Bokeh releases spell the legend argument
# `legend_field=` / `legend_label=` — confirm against the installed version.
p.circle(
    x='x', y='y', radius=0.5, source=source,
    fill_color={'field': 'label', 'transform': color_mapper},
    legend='label'
)
show(p)