In [1]:
import sys
# Print the interpreter version string so the environment used for this run is
# recorded alongside the results.
print(sys.version)

3.5.2 |Anaconda custom (64-bit)| (default, Jul  2 2016, 17:53:06) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
from gensim.models import Word2Vec

In [4]:
from sklearn.manifold import t_sne

In [5]:
from MulticoreTSNE import MulticoreTSNE

#### Data loading

In [93]:
import re
def getWords(text, stopwords):
    """Tokenise a title into lower-cased words.

    Processing steps, in order:

    1. Split on whitespace and lower-case every stand-alone single character.
    2. Replace each ``.`` with a space and delete every ``_``.
    3. Remove the spaces between consecutive single upper-case letters, as
       long as the first of them is preceded by a space
       (``"Hobbit J R R  Tolkien"`` becomes ``"Hobbit JRR  Tolkien"``).
    4. Extract ``\\w+`` runs, keep those longer than one character whose
       lower-cased form is not in ``stopwords``, and lower-case them.

    Parameters
    ----------
    text : str
        Raw title.
    stopwords : container of str
        Anything supporting the ``in`` operator (list, set, 1-D array, ...).

    Returns
    -------
    list of str
        Lower-cased tokens in their original order.
    """
    pieces = [piece.lower() if len(piece) == 1 else piece for piece in text.split()]
    cleaned = " ".join(pieces).replace(".", " ").replace("_", "")
    # Raw string: the pattern contains no Python-level escapes, and raw literals
    # avoid the invalid-escape warnings non-raw regex strings trigger on newer
    # interpreters.
    cleaned = re.sub(r"(?<= [A-Z]{1}) +((?=[A-Z] )|(?=[A-Z]$))", "", cleaned)
    candidates = re.findall(r"\w+", cleaned)
    # The length check is done on the original token, before lower-casing.
    return [word.lower() for word in candidates
            if len(word) > 1 and word.lower() not in stopwords]


def _adjacent_pairs(words):
    """Join each pair of neighbouring words, alphabetically sorted, with a space."""
    return [" ".join(sorted(words[i:i + 2])) for i in range(len(words) - 1)]


def getBigrams(text, stopwords):
    """Return order-insensitive bigrams of the tokens produced by ``getWords``.

    Each pair of neighbouring tokens is sorted alphabetically before joining,
    so ``"zeta alpha"`` and ``"alpha zeta"`` yield the same bigram.

    Parameters
    ----------
    text : str
        Raw title.
    stopwords : container of str
        Passed through to ``getWords``.

    Returns
    -------
    list of str
        One bigram per adjacent token pair; empty when fewer than two tokens.
    """
    return _adjacent_pairs(getWords(text, stopwords))


def getTerms(test, stopwords):
    """Return the tokens of ``test`` followed by their bigrams.

    Equivalent to ``getWords(test, stopwords) + getBigrams(test, stopwords)``
    but tokenises the text only once.
    """
    words = getWords(test, stopwords)
    return words + _adjacent_pairs(words)

In [94]:
# Load the stop-word list as a flat 1-D array.
# `.values` is used instead of `DataFrame.as_matrix()`, which newer pandas
# releases no longer provide, and `ravel()` replaces the in-place `resize`.
# NOTE(review): read_csv treats the first line as a header -- confirm the file
# has one, otherwise the first stop word is dropped.
stopwords = pd.read_csv("stopwords.csv").values.ravel()

In [160]:
# Raw inputs: one frame of training titles, one frame of test titles.
train_data = pd.read_csv("titles_books.csv")
test_data = pd.read_csv("test_data.csv")

# Every training title becomes a list of tokens; test titles are kept verbatim.
train_titles = [getWords(title, stopwords) for title in train_data.title]
test_titles = list(test_data.title)

# Number runs of consecutive rows sharing the same `book_title`: the label
# starts at 0 and increases by one each time the title differs from the
# previous row. Comparing against the shifted column marks the first row as a
# change, hence the `- 1`. Unlike a row-by-row loop this also works on an
# empty frame.
title_changed = test_data.book_title != test_data.book_title.shift()
labels = (title_changed.cumsum() - 1).tolist()
test_data['label'] = labels

#### Model training

In [161]:
# Length of the word vectors; passed as `size` to Word2Vec when the model is
# trained below.
VECTOR_DIM = 32

In [162]:
# Train word embeddings on the tokenised training titles.
# min_count=0 keeps every word type, however rare.
model = Word2Vec(
    train_titles,
    size=VECTOR_DIM,
    min_count=0,
)

INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 48454 words, keeping 9014 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20000, processed 95485 words, keeping 11817 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #30000, processed 140148 words, keeping 14264 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #40000, processed 186623 words, keeping 16607 word types
INFO:gensim.models.word2vec:collected 17484 word types from a corpus of 207117 raw words and 44357 sentences
INFO:gensim.models.word2vec:Loading a fresh vocabulary
INFO:gensim.models.word2vec:min_count=0 retains 17484 unique words (100% of original 17484, drops 0)
INFO:gensim.models.word2vec:min_count=0 leaves 207117 word corpus (100% of original 207117, drops 0)
INFO:gensim.models.word2vec:deleting the raw coun

In [164]:
# Sanity check: nearest neighbours of a known word. The query goes through
# `model.wv`, the same accessor used for vector lookups elsewhere in this
# notebook, rather than through the model object itself.
model.wv.similar_by_word(word="władca", topn=20)

[('pierścienia', 0.9816210865974426),
 ('hobbit', 0.9810181856155396),
 ('kropce', 0.9804226756095886),
 ('pierscieni', 0.9801980257034302),
 ('skibniewska', 0.9798970222473145),
 ('jrr', 0.9785947799682617),
 ('tolkien', 0.9776356220245361),
 ('kropka', 0.9765268564224243),
 ('powrotem', 0.9752150177955627),
 ('yeskov', 0.9746350049972534),
 ('silmarillion', 0.9732321500778198),
 ('pierścieni', 0.9731463193893433),
 ('nuda', 0.971091628074646),
 ('dwie', 0.9690154194831848),
 ('mórz', 0.968645453453064),
 ('dżil', 0.9684089422225952),
 ('podlesia', 0.9673435688018799),
 ('rudy', 0.9656370878219604),
 ('tam', 0.964540421962738),
 ('czyli', 0.9625658392906189)]

In [165]:
def get_word_score(model, word):
    """Return the embedding vector of ``word``.

    Parameters
    ----------
    model : object
        Trained model exposing ``wv`` (indexable by word) and ``vector_size``.
    word : str
        Token to look up.

    Returns
    -------
    numpy.ndarray
        ``model.wv[word]`` when the word is known. When the lookup raises
        ``KeyError`` a vector of ``model.vector_size`` NaNs is returned, so
        callers can average with ``np.nanmean`` and ignore unknown words.

    Only ``KeyError`` is treated as "unknown word"; any other exception is a
    real problem and propagates. The fallback length is taken from the model
    itself, so it always matches the vectors the model returns.
    """
    try:
        return model.wv[word]
    except KeyError:
        return np.full(model.vector_size, np.nan)

def get_title_score(model, title, stopwords):
    """Embed a title as the mean of its word vectors.

    Parameters
    ----------
    model : object
        Trained model; forwarded to ``get_word_score`` and read for
        ``vector_size``.
    title : str
        Raw title, tokenised with ``getWords``.
    stopwords : container of str
        Forwarded to ``getWords``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the token vectors, skipping NaN entries. A title
        that yields no tokens returns a vector of ``model.vector_size`` NaNs
        instead of a scalar NaN, so results for many titles still stack into a
        rectangular array.
    """
    title_parsed = getWords(title, stopwords)
    if not title_parsed:
        return np.full(model.vector_size, np.nan)
    word_vectors = [get_word_score(model, word) for word in title_parsed]
    # nanmean ignores NaN entries, like na.rm=TRUE in R
    return np.nanmean(word_vectors, axis=0)

In [166]:
# Squared Euclidean distance between the embeddings of two unrelated titles.
v1 = get_title_score(
    model, "Tolkien - Władca pierścieni - Trylogia", stopwords
)
v2 = get_title_score(
    model, "NAUKA ŚWIATA DYSKU I - Terry Pratchett", stopwords
)
np.sum(np.square(v1 - v2))

84.6343

In [167]:
#vectorized_titles = np.array([get_title_score(model, title, stopwords) for title in train_data.title])
# One embedding per test title, collected into a single array.
vectorized_test_titles = np.array([
    get_title_score(model, raw_title, stopwords)
    for raw_title in test_data.title
])

In [51]:
#vectorized_titles_sample = vectorized_titles[np.random.choice(vectorized_titles.shape[0], size = 20000, replace = False)]

#### t-SNE visualizations

In [168]:
# t-SNE settings for the projection of the title embeddings.
tsne_transformer = MulticoreTSNE(
    verbose=1,
    n_jobs=4,
    perplexity=20,
    n_iter=2000,
    angle=0.5,
)

In [169]:
# Project the test-title embeddings; the input is cast to float64 first.
tnsed_titles = tsne_transformer.fit_transform(
    vectorized_test_titles.astype(np.float64)
)

In [170]:
from bokeh.plotting import figure, output_file, show, ColumnDataSource
from bokeh.models import HoverTool, CategoricalColorMapper
from bokeh.palettes import inferno, Category20, Category20c, Category20b, Category10

output_file("bokeh_vis/test_data_bokeh_vis.html")

# The colour mapper matches the `label` column against `factors`. Both are
# converted to strings so the integer group ids are handled as categories.
point_labels = [str(label) for label in test_data.label]
label_factors = [str(label) for label in test_data.label.unique()]

# Repeat the 10-colour palette often enough to cover every factor, so no group
# falls outside the palette regardless of how many groups there are.
base_palette = list(Category10[10])
palette_repeats = max(1, -(-len(label_factors) // len(base_palette)))
label_palette = base_palette * palette_repeats

source = ColumnDataSource(
        data=dict(
            x=tnsed_titles[:, 0],
            y=tnsed_titles[:, 1],
            desc=test_titles,
            label=point_labels
        )
    )
color_mapper = CategoricalColorMapper(factors=label_factors, palette=label_palette)

# Tooltip shown for the point under the cursor.
hover = HoverTool(
        tooltips=[
            ("index", "$index"),
            ("(x,y)", "($x, $y)"),
            ("desc", "@desc"),
            ("label", "@label")
        ]
    )

# NOTE(review): newer Bokeh releases name these arguments `width`/`height`;
# switch if the installed version rejects `plot_width`/`plot_height`.
p = figure(plot_width=600, plot_height=600, tools=[hover],
           title="Mouse over the dots")

# One dot per test title, coloured by its group label.
p.circle('x', 'y', size=2, source=source, color={'field': 'label', 'transform': color_mapper})

show(p)

INFO:bokeh.core.state:Session output file 'bokeh_vis/test_data_bokeh_vis.html' already exists, will be overwritten.


In [19]:
from bokeh.io import show
from bokeh.models import ColumnDataSource, CategoricalColorMapper
from bokeh.palettes import RdBu3
from bokeh.plotting import figure

# Minimal categorical colour-mapping example: six circles alternating between
# the 'hi' and 'lo' categories, each category drawn in its own colour.
source = ColumnDataSource(data={
    'x': [1, 2, 3, 4, 5, 6],
    'y': [2, 1, 2, 1, 2, 1],
    'label': ['hi', 'lo', 'hi', 'lo', 'hi', 'lo'],
})
color_mapper = CategoricalColorMapper(
    factors=['hi', 'lo'],
    palette=[RdBu3[2], RdBu3[0]],
)

p = figure(x_range=(0, 7), y_range=(0, 3), height=300, tools='save')
p.circle(
    source=source,
    x='x',
    y='y',
    radius=0.5,
    legend='label',
    fill_color={'field': 'label', 'transform': color_mapper},
)
show(p)