In [2]:
# Show the interpreter version this notebook is executed with (rendered as the cell output).
import sys
sys.version

'3.5.2 |Anaconda custom (64-bit)| (default, Jul  2 2016, 17:53:06) \n[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]'

In [3]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from scipy.spatial.distance import cosine, euclidean

In [3]:
import re
# Patterns are compiled once and written as raw strings, so the backslashes are
# regex escapes rather than (invalid) Python string escapes.
_DOT_RE = re.compile(r"\.")
_UNDERSCORE_RE = re.compile(r"_")
# A run of spaces sitting between two single capital letters, where the first
# capital is preceded by a space and the second is followed by a space or by
# the end of the string.
_INITIALS_GAP_RE = re.compile(r"(?<= [A-Z]{1}) +((?=[A-Z] )|(?=[A-Z]$))")
_WORD_RE = re.compile(r"\w+")


def getWords(text):
    """Split a title into a list of lowercase word tokens.

    Steps, in order:
      1. Split on whitespace and lowercase every one-character token.
      2. Replace each dot with a space and delete underscores.
      3. Delete the spaces matched by ``_INITIALS_GAP_RE`` so adjacent single
         capitals are glued together (e.g. ``"J.R.R. Tolkien"`` yields
         ``"rr"`` and ``"tolkien"``).
      4. Extract ``\\w+`` runs, lowercase them and drop one-character tokens.

    Parameters
    ----------
    text : str
        Raw title text.

    Returns
    -------
    list of str
        Lowercased tokens longer than one character; empty for empty input.
    """
    tokens = [tok.lower() if len(tok) == 1 else tok for tok in text.split()]
    cleaned = " ".join(tokens)
    cleaned = _DOT_RE.sub(" ", cleaned)
    cleaned = _UNDERSCORE_RE.sub("", cleaned)
    cleaned = _INITIALS_GAP_RE.sub("", cleaned)
    return [word.lower() for word in _WORD_RE.findall(cleaned) if len(word) > 1]

def get_word_score(model, word):
    """Return the embedding of ``word``, or an all-NaN vector if it is unknown.

    Parameters
    ----------
    model : object
        Anything exposing ``model.wv[word]`` lookups (here a gensim Word2Vec).
    word : str
        Token to look up.

    Returns
    -------
    numpy.ndarray
        ``model.wv[word]`` when the lookup succeeds; otherwise a 1-D array of
        NaN. Its length comes from ``model.wv.vector_size``, then
        ``model.vector_size``, and only as a last resort from the notebook
        global ``VECTOR_DIM``.

    Notes
    -----
    Only ``KeyError`` (missing key) is treated as "unknown word"; any other
    exception propagates instead of being silently converted to NaN.
    """
    try:
        return model.wv[word]
    except KeyError:
        # Size the placeholder from the model itself so it cannot drift from a
        # stale or not-yet-defined notebook global.
        dim = getattr(model.wv, "vector_size", None) or getattr(model, "vector_size", None)
        if dim is None:
            dim = VECTOR_DIM  # legacy fallback: global set elsewhere in the notebook
        return np.full(dim, np.nan)

def get_title_score(title, model):
    """Collapse a title into one vector by averaging its word vectors.

    The title is tokenised with ``getWords``; each token is looked up with
    ``get_word_score`` and the resulting vectors are averaged along axis 0.

    Parameters
    ----------
    title : str
        Raw title text.
    model : object
        Embedding model passed through to ``get_word_score``.

    Returns
    -------
    The element-wise ``np.nanmean`` of the token vectors.
    """
    word_vectors = []
    for token in getWords(title):
        word_vectors.append(get_word_score(model, token))
    # nanmean skips NaN entries -- the counterpart of na.rm=TRUE in R
    return np.nanmean(word_vectors, axis=0)

def calculate_mean_cos_similarity(input_col):
    """Average ``scipy.spatial.distance.cosine`` over all pairs ``(i, j)``, ``i <= j``.

    Parameters
    ----------
    input_col : sequence of vectors
        Anything ``np.array`` can turn into an array whose first axis indexes
        the vectors (a Series of arrays, a 2-D array, a list of lists, ...).

    Returns
    -------
    float
        Sum of the pairwise values divided by the number of pairs, or
        ``nan`` when ``input_col`` is empty.

    Notes
    -----
    * SciPy's ``cosine`` is the cosine *distance* (``1 - similarity``), so
      despite the function name smaller values mean more similar vectors.
    * Each vector is also paired with itself (``j`` starts at ``i``), which
      adds a zero-distance term per vector and keeps the single-vector case
      well defined.
      NOTE(review): confirm that both points above are intended.
    """
    vectors = np.array(input_col)
    n = vectors.shape[0]
    if n == 0:
        # No pairs to average; avoid dividing by zero.
        return np.nan

    distance_sum = 0
    pair_count = 0
    for i in range(n):
        for j in range(i, n):
            distance_sum += cosine(vectors[i], vectors[j])
            pair_count += 1

    return distance_sum / pair_count

In [4]:
train_data = pd.read_csv("titles_books.csv")
test_data = pd.read_csv("test_data.csv")
train_titles = [getWords(title) for title in train_data.title]

# Ground-truth labels: consecutive rows sharing the same book_title get the
# same integer id, starting at 0 and increasing by one each time book_title
# changes from one row to the next. Comparing against the shifted column is
# positional, so it does not depend on the frame having a default index.
title_changed = test_data.book_title != test_data.book_title.shift()
labels = (title_changed.cumsum() - 1).tolist()
test_data['label'] = labels

# Evaluation subset: all rows belonging to 10 label ids drawn from 0..24.
# A dedicated seeded generator makes the subset reproducible across kernel
# restarts, and replace=False guarantees 10 *distinct* ids (sampling with
# replacement could silently return fewer).
# NOTE(review): the range 25 is hard-coded -- presumably the test set has at
# least 25 books; confirm against the data.
SAMPLE_SEED = 0
sampled_labels = np.random.RandomState(SAMPLE_SEED).choice(25, 10, replace=False)
sample_test_data = test_data.loc[test_data.label.isin(sampled_labels)].copy()

### Clustering and w2v grid search

In [5]:
# Empty float64 table that collects the grid-search results: the two
# hyper-parameters, then homogeneity / completeness / v-measure and cluster
# counts for the full test set ("test") and the sampled subset ("smtest").
RESULT_COLUMNS = [
    'vec_dim', 'bandwith',
    'h_test', 'c_test', 'v_test',
    'h_smtest', 'c_smtest', 'v_smtest',
    'n_clust_test', 'n_clust_smtest',
]
results_df = pd.DataFrame(columns=RESULT_COLUMNS, dtype=np.float64)

In [None]:
from sklearn.cluster import MeanShift
from sklearn import metrics
# Grid search over the embedding size and the MeanShift bandwidth.
# Rows are collected in a plain list and turned into a DataFrame once at the
# end: DataFrame.append returns a *new* frame, so calling it without
# re-assigning the result leaves results_df empty.
grid_records = []

# The loop variable must be the global name VECTOR_DIM because get_word_score
# reads that global when it builds its NaN placeholder vector.
for VECTOR_DIM in [32,50,65,80,100]:
    model = Word2Vec(train_titles, size=VECTOR_DIM, min_count=0)
    for frame in (train_data, test_data, sample_test_data):
        frame['score'] = frame.title.apply(func=get_title_score, model=model)

    # The title vectors depend only on the embedding model, so stack them
    # once per VECTOR_DIM rather than once per bandwidth.
    train_matrix = np.array(train_data.score.tolist())
    test_matrix = np.array(test_data.score.tolist())
    sample_matrix = np.array(sample_test_data.score.tolist())

    # NOTE(review): np.arange with a float step may or may not include a value
    # near the upper bound because of rounding; consider np.linspace if the
    # exact set of bandwidths matters.
    for bandwith in np.arange(0.4, 1.6, 0.2):
        km = MeanShift(bandwidth=bandwith, n_jobs=8)
        km.fit(train_matrix)
        test_data['cluster'] = km.predict(test_matrix)
        sample_test_data['cluster'] = km.predict(sample_matrix)

        # One call yields homogeneity, completeness and v-measure together.
        h_test, c_test, v_test = metrics.homogeneity_completeness_v_measure(
            test_data.label.tolist(), test_data.cluster.tolist())
        h_smtest, c_smtest, v_smtest = metrics.homogeneity_completeness_v_measure(
            sample_test_data.label.tolist(), sample_test_data.cluster.tolist())

        grid_records.append({
            'vec_dim' : VECTOR_DIM,
            'bandwith' : bandwith,
            'h_test' : h_test,
            'c_test' : c_test,
            'v_test' : v_test,
            'h_smtest' : h_smtest,
            'c_smtest' : c_smtest,
            'v_smtest' : v_smtest,
            'n_clust_test' : test_data.cluster.nunique(),
            'n_clust_smtest' : sample_test_data.cluster.nunique()
        })

# Keep the column order and float64 dtype declared when results_df was created.
results_df = pd.DataFrame(grid_records, columns=list(results_df.columns)).astype(np.float64)
results_df.to_csv("grid_search_results.csv")

In [377]:
# Embedding size for the Word2Vec model trained in the next cell.
# get_word_score also reads this global when building its NaN placeholder vector.
VECTOR_DIM = 64

In [378]:
# Train Word2Vec on the tokenised training titles, then attach an averaged
# title vector ('score') to the training, test and sampled-test frames.
model = Word2Vec(train_titles, size=VECTOR_DIM, min_count=0)
for frame in (train_data, test_data, sample_test_data):
    frame['score'] = frame.title.apply(func=get_title_score, model=model)

In [28]:
#test_data[["author", "book_title", "score"]].groupby(("author", "book_title")).agg(lambda x: calculate_mean_cos_similarity(x))

In [29]:
#test_data[["author",  "score"]].groupby(("author")).agg(lambda x: calculate_mean_cos_similarity(x))

### Clustering

In [373]:
from sklearn.cluster import MeanShift

In [379]:
# Number of distinct book titles in the test set (not passed to MeanShift below).
n_clusters = len(test_data.book_title.unique())

# Fit MeanShift on the training title vectors, then assign every test title
# (full set and sampled subset) to a fitted cluster.
km = MeanShift(bandwidth=1, n_jobs=7)
km.fit(np.array(train_data.score.tolist()))
test_data['cluster'] = km.predict(np.array(test_data.score.tolist()))
sample_test_data['cluster'] = km.predict(np.array(sample_test_data.score.tolist()))

In [380]:
# Shape of the distinct-cluster array: sampled subset first, then the full test set.
for frame in (sample_test_data, test_data):
    print(frame.cluster.unique().shape)

(19,)
(43,)


In [381]:
from sklearn import metrics
#print(metrics.adjusted_rand_score(test_data.label.tolist(), test_data.cluster.tolist()))
#print(metrics.adjusted_mutual_info_score(test_data.label.tolist(), test_data.cluster.tolist()))
# Homogeneity, completeness and v-measure, printed in that order for the full
# test set and then for the sampled subset.
for frame in (test_data, sample_test_data):
    true_labels = frame.label.tolist()
    predicted = frame.cluster.tolist()
    for scorer in (metrics.homogeneity_score,
                   metrics.completeness_score,
                   metrics.v_measure_score):
        print(scorer(true_labels, predicted))

0.598780484539
0.780779077942
0.677774685955
0.732633729111
0.624081322282
0.67401482123
