In [56]:
import csv
import os
import re
import sys
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Dropout, Input, Flatten, Activation
from keras.layers import Conv1D, MaxPooling1D, Embedding, GlobalAvgPool1D, GlobalMaxPooling1D
from keras.models import Model, Sequential

from scipy import spatial

from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

module_path = os.path.abspath(os.path.join('../helpers/'))
if module_path not in sys.path:
    sys.path.append(module_path)

# my stuff in the helpers/ directory
import embeddings_helper, files_helper, texts_helper, metrics_helper, tags_helper

In [2]:
# Fixed seed so that Restart & Run All reproduces the same run.
# 232 is the value the original random draw (np.random.randint(1, 1000))
# produced for the recorded output below; change it deliberately if a
# different seed is wanted, rather than re-drawing it on every execution.
SEED = 232
SEED

232

In [3]:
# Seed NumPy's global random number generator with the notebook-wide SEED.
np.random.seed(seed=SEED)

In [4]:
# Load the corpus through the project helper; it returns a (texts, labels) pair.
# NOTE(review): presumably parallel sequences (labels[k] holds the tags of
# texts[k]) -- confirm against files_helper.
texts, labels = files_helper.read_stackoverflow_sample_small_stanford_tokenized()

In [5]:
# --- Experiment configuration -------------------------------------------

# Vocabulary cap: passed to the Keras Tokenizer (num_words) and to the
# TfidfVectorizer (max_features) further down.
MAX_NB_WORDS = 200000

# Documents are cut to this many tokens before being re-joined into text.
MAX_SEQUENCE_LENGTH = 1000

VALIDATION_SPLIT = 0.2

# Threshold handed to tags_helper.truncate_labels.
LABELS_MIN_DOC_COUNT = 10

BATCH_SIZE = 32
EMBEDDING_DIM = 100
NUM_EPOCHS = 10

# I will perform tokenization myself
# NOTE(review): the Tokenizer cell below passes its own literal filter string
# instead of this constant -- confirm which of the two is intended.
TOKENIZER_FILTERS = ''

In [6]:
# Fit a Keras Tokenizer on the whole corpus, with the vocabulary capped via
# num_words=MAX_NB_WORDS.
# NOTE(review): the filter string is given literally here; the
# TOKENIZER_FILTERS constant from the configuration cell is not used --
# confirm which of the two is intended.
tokenizer = Tokenizer(
    num_words=MAX_NB_WORDS,
    filters='\'!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
)
tokenizer.fit_on_texts(texts)

# one list of word indices per input text
sequences = tokenizer.texts_to_sequences(texts)

# lookup tables in both directions:
#   word_index:          word => word_index_position
#   inverse_word_index:  word_index_position => word
word_index = tokenizer.word_index
inverse_word_index = texts_helper.build_inverse_word_index(word_index)

In [7]:
# Reduce the label lists using LABELS_MIN_DOC_COUNT as the threshold.
# NOTE(review): presumably drops tags that occur in fewer than
# LABELS_MIN_DOC_COUNT documents -- confirm against tags_helper.truncate_labels.
truncated_labels = tags_helper.truncate_labels(labels,LABELS_MIN_DOC_COUNT)

In [8]:
# Encode the per-document tag lists as a binary indicator matrix
# (one row per document, one column per tag; column order is lb.classes_).
lb = preprocessing.MultiLabelBinarizer()
binary_labels = lb.fit_transform(truncated_labels)

In [9]:
# Rebuild every document as a space-joined string of words, keeping only its
# first MAX_SEQUENCE_LENGTH tokens. Word indices are mapped back to words
# through inverse_word_index.
tokenized_texts = [
    " ".join(inverse_word_index[word_id] for word_id in word_ids[:MAX_SEQUENCE_LENGTH])
    for word_ids in sequences
]

In [10]:
# Extracting IDF weights to weight the embeddings.
#
# NOTE: the vectorizer is fitted on ALL documents, so the IDF statistics are
# computed over the full corpus -- presumably what the original "snooping"
# remark refers to (any held-out documents influence the weights too).
vect = TfidfVectorizer(max_features=MAX_NB_WORDS).fit(tokenized_texts)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use whichever this installation offers.
if hasattr(vect, "get_feature_names_out"):
    feature_names = list(vect.get_feature_names_out())
else:
    feature_names = vect.get_feature_names()

# idf[k] is the IDF weight of feature_names[k]
idf = vect.idf_

# word => word IDF
idf_index = dict(zip(feature_names, idf))

In [11]:
# TF-IDF representation of the corpus, produced by the vectorizer fitted above
# (one row per document, one column per vocabulary term).
document_vectors = vect.transform(tokenized_texts)

In [29]:
# Number of TF-IDF features, i.e. the column count of the document matrix.
NUM_WORDS = document_vectors.shape[1]

In [12]:
# Two lookup tables built by the project helpers from the binarized labels
# and the tag names in lb.classes_.

# document_id => [tag1, tag2, tag3, ...]
document_tag_index = tags_helper.get_document_tag_index(binary_labels,lb.classes_)

# tag => [document_id1, document_id2, document_id3, ...]
tag_document_index = tags_helper.get_tag_document_index(binary_labels, lb.classes_)


In [18]:
# Spot check: show one re-joined document next to the tags recorded for it.
i = 2
tokenized_texts[i],document_tag_index[i]

('use jekyll with gulp i read about gulp and was quite taken by the philosophy i want to try it out for myself but i am running into a little problem i am used to using jekyll with grunt and i have no idea how to get jekyll to play nice with gulp i ve come across this article which suggests there is no need for a jekyll plugin when using gulp unfortunately it does n t really explain how to go about it and the example it links to does n t help me much is there anyone who knows how to go about this',
 ['gulp'])

In [57]:
def cosine_similarity(a,b):
    """Return the cosine similarity between two 1-D vectors.

    Parameters
    ----------
    a, b : array-like
        Vectors of equal length.

    Returns
    -------
    float
        ``1 - scipy.spatial.distance.cosine(a, b)``, or ``0.0`` when either
        vector contains only zeros.
    """
    # A zero vector has no direction, so the cosine is undefined: the division
    # by a zero norm would produce NaN, which silently breaks sorting and the
    # ``>`` comparisons done on the similarities. Treat it as "no similarity".
    if not np.any(a) or not np.any(b):
        return 0.0
    return 1 - spatial.distance.cosine(a, b)

In [76]:
# Build one vector per tag as a weighted sum of the TF-IDF vectors of the
# documents carrying that tag, then rank all tags by cosine similarity to an
# example tag.

# tag => tag_vector
tag_vectors_index = dict()

for (tag, document_ids) in tag_document_index.items():

    tag_vector = np.zeros(NUM_WORDS)

    for document_id in document_ids:
        document_vector = document_vectors[document_id]
        num_tags = len(document_tag_index[document_id])

        # if this is the ONLY tag, it's probably a more representative document of this tag, than
        # if this tag were only one among many others
        weight = 1.0 / num_tags

        weighted_document_vector = document_vector * weight

        # document_vectors comes from TfidfVectorizer.transform (a SciPy sparse
        # matrix), so this sum is a dense 2-D row; it is flattened below.
        tag_vector = tag_vector + weighted_document_vector

    # always store a flat 1-D ndarray, whatever the accumulation produced
    tag_vectors_index[tag] = np.asarray(tag_vector).ravel()

# example
target_tag = 'sql'
target_vector = tag_vectors_index[target_tag]

# calculating the similarities
similarities = list()
# -np.inf instead of np.NINF: the NINF alias was removed in NumPy 2.0
curr_best = -np.inf
curr_tag = None

for (tag, tag_vector) in tag_vectors_index.items():
    sim = cosine_similarity(target_vector, tag_vector)

    similarities.append((tag, sim))

    # keep track of the most similar tag other than the target itself
    if sim > curr_best and tag != target_tag:
        curr_best = sim
        curr_tag = tag

# all tags, most similar first (the target tag itself comes out on top)
sorted(similarities, key=lambda t: t[1], reverse=True)

[('sql', 0.99999999999999989),
 ('sql-server', 0.92066484240003987),
 ('mysql', 0.88122304629851711),
 ('sql-server-2008', 0.86867382080168853),
 ('tsql', 0.83214121856961754),
 ('database', 0.82612639056565373),
 ('oracle', 0.80866489792659924),
 ('select', 0.80527466406363568),
 ('postgresql', 0.79784147611363498),
 ('join', 0.72783130833519483),
 ('sql-server-2005', 0.71709941947555611),
 ('database-design', 0.71210391157537078),
 ('ms-access', 0.70438804620122786),
 ('php', 0.70383914664182945),
 ('c#', 0.69550926038989724),
 ('sqlite', 0.68870695635508761),
 ('performance', 0.67312808777120259),
 ('sql-update', 0.66279524865636497),
 ('.net', 0.6614628643607362),
 ('sql-server-2012', 0.65288096924027605),
 ('sql-server-2008-r2', 0.65035294861613213),
 ('r', 0.63872714943003572),
 ('plsql', 0.63503168302679192),
 ('activerecord', 0.63439415601468041),
 ('entity-framework', 0.6322354388972139),
 ('ado.net', 0.62859853720096026),
 ('asp.net', 0.62089442530970707),
 ('javascript', 0.6