In [60]:
import csv
import os
import re
import sys
import numpy as np

from joblib import Parallel, delayed

from difflib import SequenceMatcher,get_close_matches

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Dropout, Input, Flatten, Activation
from keras.layers import Conv1D, MaxPooling1D, Embedding, GlobalAvgPool1D, GlobalMaxPooling1D
from keras.models import Model, Sequential

from numpy import linalg as LA

from scipy import spatial

from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

module_path = os.path.abspath(os.path.join('../helpers/'))
if module_path not in sys.path:
    sys.path.append(module_path)

# my stuff in the helpers/ directory
import embeddings_helper, files_helper, texts_helper, metrics_helper, tags_helper

In [2]:
SEED=np.random.randint(1,1000)
SEED

322

In [3]:
np.random.seed(SEED)

In [4]:
texts, labels = files_helper.read_stackoverflow_sample_stanford_tokenized("Medium-Small-Sample-Posts-Shuffled",ssd=True)

In [16]:
MAX_NB_WORDS = 200000
MAX_SEQUENCE_LENGTH = 1000
VALIDATION_SPLIT = 0.2
LABELS_MIN_DOC_COUNT = int(20)
BATCH_SIZE=32
EMBEDDING_DIM=100
NUM_EPOCHS=10
TOKENIZER_FILTERS='' # I will perform tokenization myself

In [17]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS,
                     filters=TOKENIZER_FILTERS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# word => word_index_position
word_index = tokenizer.word_index

# word_index_position => word
inverse_word_index = texts_helper.build_inverse_word_index(word_index)

In [18]:
truncated_labels = tags_helper.truncate_labels(labels,LABELS_MIN_DOC_COUNT)

In [19]:
lb = preprocessing.MultiLabelBinarizer()
binary_labels = lb.fit_transform(truncated_labels)

In [20]:
# make each document (sequence of word indices) be truncated to 
# MAX_SEQUENCE_LENGTH
tokenized_texts = []

for seq in sequences:
    truncated_seq = seq[:MAX_SEQUENCE_LENGTH]
    tokenized_txt = " ".join([inverse_word_index[idx] for idx in truncated_seq])
    tokenized_texts.append(tokenized_txt)

In [21]:
# extracting IDF weights to weight the embeddings

# snooping
vect = TfidfVectorizer(max_features=MAX_NB_WORDS).fit(tokenized_texts)

feature_names = vect.get_feature_names()
idf = vect.idf_

# word => word IDF
idf_index = dict(zip(vect.get_feature_names(), idf))

In [22]:
document_vectors = vect.transform(tokenized_texts)

In [23]:
NUM_WORDS = document_vectors[0].shape[1]

In [24]:
# document_id => [tag1, tag2, tag3, ...]
document_tag_index = tags_helper.get_document_tag_index(binary_labels,lb.classes_)

# tag => [document_id1, document_id2, document_id3, ...]
tag_document_index = tags_helper.get_tag_document_index(binary_labels, lb.classes_)


In [25]:
tag_vocabulary = lb.classes_

with open("tag-vocabulary.txt","w+") as f:
    for t in tag_vocabulary:
        f.write(t+"\n")

In [26]:
len(tag_vocabulary)

1704

In [27]:
i = 2
tokenized_texts[i],document_tag_index[i]

("use jekyll with gulp . i read about gulp and was quite taken by the philosophy . i want to try it out for myself but i am running into a little problem . i am used to using jekyll with grunt and i have no idea how to get jekyll to play nice with gulp . i 've come across this article which suggests there is no need for a jekyll plugin when using gulp . unfortunately it does n't really explain how to go about it and the example it links to does n't help me much . is there anyone who knows how to go about this ?",
 ['gulp', 'jekyll'])

In [28]:
def cosine_similarity(a,b):
    return 1 - spatial.distance.cosine(a, b)

In [29]:
# tag => tag_vector
tag_vectors_index = dict()

for (tag,document_ids) in tag_document_index.items():
    
    tag_vector = np.zeros(NUM_WORDS)
    
    for document_id in document_ids:
        document_vector = document_vectors[document_id]
        num_tags = len(document_tag_index[document_id])

        # if this is the ONLY tag, it's probably a more representative document of this tag, than
        # if this tag were only one among many others
        weight = 1.0 / num_tags

        weighted_document_vector = document_vector * weight
        
        tag_vector = tag_vector + weighted_document_vector
        
    tag_vectors_index[tag] = np.asarray(tag_vector).ravel()

In [30]:
normalized_tag_vectors_index = dict()    
    
## normalize vectors to make cosine similarity more accurate  
for (tag,tag_vector) in tag_vectors_index.items():    
    normalized_tag_vectors_index[tag] = LA.norm(tag_vector, 2)

In [31]:
def get_top_k_most_similar_tags(index, target_tag_name, k):  
    """
    index is a dict tag_name => tag_vector
    target_tag_name is a string
    k is an int
    """
    
    target_tag_vector = index[target_tag_name]

    # calculating the similarites
    similarities = list()
    curr_best = np.NINF
    curr_tag = None

    for (tag_name,tag_vector) in index.items():
        
        if np.array_equal(tag_vector,target_tag_vector):
            continue
        
        sim = cosine_similarity(target_tag_vector,tag_vector)
            
        similarities.append((tag_name,sim))

        if sim > curr_best and tag_name != target_tag_name:
            curr_best = sim
            curr_tag = tag_name

    sorted_similarities = sorted(similarities,key=lambda t: t[1],reverse=True)
    
    return sorted_similarities[:k]


get_top_k_most_similar_tags(tag_vectors_index,'sql',100)

[('sql-server', 0.9370739836546409),
 ('tsql', 0.92462096587922771),
 ('sql-server-2008', 0.91997820008966369),
 ('mysql', 0.8983386610514803),
 ('database', 0.84869609388628819),
 ('select', 0.84057214534804203),
 ('sql-server-2008-r2', 0.81775262284551742),
 ('sql-server-2005', 0.81424570682834618),
 ('oracle', 0.80746852609747466),
 ('join', 0.7936493006462727),
 ('postgresql', 0.79203118875169654),
 ('sql-server-2012', 0.78784279967233517),
 ('ms-access', 0.77625263105775677),
 ('database-design', 0.77551272980923158),
 ('sql-update', 0.74567027132454367),
 ('group-by', 0.72854527670431057),
 ('performance', 0.72502571287157458),
 ('table', 0.7218723524823093),
 ('c#', 0.72043108452731974),
 ('php', 0.70791632023608086),
 ('ms-access-2010', 0.69854138554199197),
 ('insert', 0.69695362941802697),
 ('sqlite', 0.69597116974864148),
 ('.net', 0.69254776097140713),
 ('plsql', 0.68797486480406222),
 ('oracle-sqldeveloper', 0.68587271854908649),
 ('indexing', 0.68242605619099572),
 ('linq

In [32]:
target_tags = ['sql-server-2008','sql','sql-server','mysql','database','postgresql','select','join','oracle','tsql','sqlite']

pairs = [ (a,b) for a in target_tags for b in target_tags if a != b ]

In [33]:
for (tag_a,tag_b) in pairs:
    tag_a_vector = tag_vectors_index[tag_a]
    tag_b_vector = tag_vectors_index[tag_b]
    
    sim = cosine_similarity(tag_a_vector,tag_b_vector)
    
    print("SIM '{}' '{}' => {}".format(tag_a,tag_b,sim))
    

SIM 'sql-server-2008' 'sql' => 0.9199782000896637
SIM 'sql-server-2008' 'sql-server' => 0.9532329987953024
SIM 'sql-server-2008' 'mysql' => 0.7936316426612687
SIM 'sql-server-2008' 'database' => 0.8011177491862693
SIM 'sql-server-2008' 'postgresql' => 0.7226837933469563
SIM 'sql-server-2008' 'select' => 0.7399624368939832
SIM 'sql-server-2008' 'join' => 0.6805063030486375
SIM 'sql-server-2008' 'oracle' => 0.7316741323857728
SIM 'sql-server-2008' 'tsql' => 0.9040922115728023
SIM 'sql-server-2008' 'sqlite' => 0.6419665645123165
SIM 'sql' 'sql-server-2008' => 0.9199782000896637
SIM 'sql' 'sql-server' => 0.9370739836546409
SIM 'sql' 'mysql' => 0.8983386610514803
SIM 'sql' 'database' => 0.8486960938862882
SIM 'sql' 'postgresql' => 0.7920311887516965
SIM 'sql' 'select' => 0.840572145348042
SIM 'sql' 'join' => 0.7936493006462727
SIM 'sql' 'oracle' => 0.8074685260974747
SIM 'sql' 'tsql' => 0.9246209658792277
SIM 'sql' 'sqlite' => 0.6959711697486415
SIM 'sql-server' 'sql-server-2008' => 0.95323

In [34]:
# if all other tags are, on average, more similar to A than to B, then A is probably more generic than B
# should it be normalized by the difference between A and B?
# will fail for tags that are below B?

sql_vector = tag_vectors_index['sql']
avg_diff = list()

for (tag_name,tag_vector) in tag_vectors_index.items():
    avg_diff.append(cosine_similarity(tag_a_vector,sql_vector))
    
np.mean(np.array(avg_diff))

0.6959711697486417

In [35]:
sql_vector = tag_vectors_index['sql-server']
avg_diff = list()

for (tag_name,tag_vector) in tag_vectors_index.items():
    avg_diff.append(cosine_similarity(tag_a_vector,sql_vector))
    
np.mean(np.array(avg_diff))

0.65604750202606543

In [36]:
sql_vector = tag_vectors_index['sql-server-2008']
avg_diff = list()

for (tag_name,tag_vector) in tag_vectors_index.items():
    avg_diff.append(cosine_similarity(tag_a_vector,sql_vector))
    
np.mean(np.array(avg_diff))

0.64196656451231626

In [37]:
def get_sim(tag_a,tag_b):
    
    sim = cosine_similarity(tag_vectors_index[tag_a],tag_vectors_index[tag_b])
    
    return (tag_a,tag_b,sim)

In [38]:
all_tags = [tag for (tag,_) in tag_vectors_index.items()]

In [39]:
sims = Parallel(n_jobs=-1)(delayed(get_sim)(a,b) for a in all_tags for b in all_tags)

In [40]:
sims[:10]    

[('upload', 'upload', 1.0),
 ('upload', 'ipython', 0.22451662982335407),
 ('upload', 'pointers', 0.30959361884230652),
 ('upload', 'integration', 0.31426980682401329),
 ('upload', 'rotation', 0.29392923884871036),
 ('upload', 'nullreferenceexception', 0.26462367410666787),
 ('upload', 'core', 0.30866974417652648),
 ('upload', 'openmp', 0.19071054765424855),
 ('upload', 'wpf', 0.444023313979934),
 ('upload', 'attachment', 0.37607389394291679)]

In [41]:
similarity_dict = dict()

for tag_a,tag_b, sim in sims:
    
    if similarity_dict.get(tag_a) is None:
        similarity_dict[tag_a] = [(tag_b,sim)]
    else:
        similarity_dict[tag_a].append((tag_b,sim))

In [42]:
similarity_dict["sql"]

[('upload', 0.40297176693768288),
 ('ipython', 0.29574517134694478),
 ('pointers', 0.42681883943644316),
 ('integration', 0.38886137536621479),
 ('rotation', 0.35703577724355018),
 ('nullreferenceexception', 0.34288808831830098),
 ('core', 0.35402130287889633),
 ('openmp', 0.24064488166142939),
 ('wpf', 0.57843124815896929),
 ('attachment', 0.29655493694372193),
 ('panel', 0.28028579143285726),
 ('clojure', 0.35009453009108027),
 ('forms-authentication', 0.45216474688088626),
 ('height', 0.34971162358767682),
 ('deserialization', 0.41443331298854147),
 ('nsdictionary', 0.34187508172237313),
 ('nsdate', 0.31623837305673597),
 ('integration-testing', 0.35133268037309784),
 ('devise', 0.32759538152446499),
 ('verilog', 0.25490778541490933),
 ('overlay', 0.27783152370844721),
 ('text-files', 0.4476227207541168),
 ('grand-central-dispatch', 0.38640152676433048),
 ('rdf', 0.37144132383356221),
 ('shared-memory', 0.28798872506472417),
 ('apple-push-notifications', 0.30091506247335043),
 ('ang

In [43]:
sorted(similarity_dict["sql"],key=lambda tpl : tpl[1],reverse=True)[1:]

[('sql-server', 0.9370739836546409),
 ('tsql', 0.92462096587922771),
 ('sql-server-2008', 0.91997820008966369),
 ('mysql', 0.8983386610514803),
 ('database', 0.84869609388628819),
 ('select', 0.84057214534804203),
 ('sql-server-2008-r2', 0.81775262284551742),
 ('sql-server-2005', 0.81424570682834618),
 ('oracle', 0.80746852609747466),
 ('join', 0.7936493006462727),
 ('postgresql', 0.79203118875169654),
 ('sql-server-2012', 0.78784279967233517),
 ('ms-access', 0.77625263105775677),
 ('database-design', 0.77551272980923158),
 ('sql-update', 0.74567027132454367),
 ('group-by', 0.72854527670431057),
 ('performance', 0.72502571287157458),
 ('table', 0.7218723524823093),
 ('c#', 0.72043108452731974),
 ('php', 0.70791632023608086),
 ('ms-access-2010', 0.69854138554199197),
 ('insert', 0.69695362941802697),
 ('sqlite', 0.69597116974864148),
 ('.net', 0.69254776097140713),
 ('plsql', 0.68797486480406222),
 ('oracle-sqldeveloper', 0.68587271854908649),
 ('indexing', 0.68242605619099572),
 ('linq

In [44]:
sorted_similarity_dict = dict()

for (tag, similarities_to_other_tags ) in similarity_dict.items():
    
    sorted_similarities = sorted(similarities_to_other_tags,key=lambda tpl: tpl[1],reverse=True)
    sorted_without_self = sorted_similarities[1:]
    
    sorted_similarity_dict[tag] = sorted_without_self

In [45]:
sorted_similarity_dict["sql"]

[('sql-server', 0.9370739836546409),
 ('tsql', 0.92462096587922771),
 ('sql-server-2008', 0.91997820008966369),
 ('mysql', 0.8983386610514803),
 ('database', 0.84869609388628819),
 ('select', 0.84057214534804203),
 ('sql-server-2008-r2', 0.81775262284551742),
 ('sql-server-2005', 0.81424570682834618),
 ('oracle', 0.80746852609747466),
 ('join', 0.7936493006462727),
 ('postgresql', 0.79203118875169654),
 ('sql-server-2012', 0.78784279967233517),
 ('ms-access', 0.77625263105775677),
 ('database-design', 0.77551272980923158),
 ('sql-update', 0.74567027132454367),
 ('group-by', 0.72854527670431057),
 ('performance', 0.72502571287157458),
 ('table', 0.7218723524823093),
 ('c#', 0.72043108452731974),
 ('php', 0.70791632023608086),
 ('ms-access-2010', 0.69854138554199197),
 ('insert', 0.69695362941802697),
 ('sqlite', 0.69597116974864148),
 ('.net', 0.69254776097140713),
 ('plsql', 0.68797486480406222),
 ('oracle-sqldeveloper', 0.68587271854908649),
 ('indexing', 0.68242605619099572),
 ('linq

In [61]:
def get_similar_sounding_tags(tag):
    return get_close_matches(tag,tag_vocabulary)
#     return [other for other in tag_vocabulary if SequenceMatcher(None, tag, other).ratio() > 0.6 ]
    

In [47]:
def evaluate_cobrinha(tag_a, tag_b, similarity_dict):
    
    similarities_with_tag_a = list()
    similarities_with_tag_b = list()
    
    for (source_tag, similarities_to_other_tags ) in similarity_dict.items():
        for target_tag, similarity in similarities_to_other_tags:
            
            if source_tag != tag_a and source_tag !=tag_b:
                if target_tag == tag_a:
                    similarities_with_tag_a.append(similarity)
                elif target_tag == tag_b:
                    similarities_with_tag_b.append(similarity)
                else:
                    # do nothing
                    pass
                
    avg_sim_wrt_tag_a = np.array(similarities_with_tag_a).mean()
    avg_sim_wrt_tag_b = np.array(similarities_with_tag_b).mean()
    
    try:
        vec_a = tag_vectors_index[tag_a]
    except KeyError:
        print("{} is not a valid tag. These are: {}".format(tag_a,get_similar_sounding_tags(tag_a)))
        return

    try:
        vec_b = tag_vectors_index[tag_b]
    except KeyError:
        print("{} is not a valid tag. These are: {}".format(tag_b,get_similar_sounding_tags(tag_b))) 
        return
    
    mutual_sim = cosine_similarity(vec_a,vec_b)
        
    return (avg_sim_wrt_tag_a-avg_sim_wrt_tag_b,mutual_sim)
    

In [62]:
evaluate_cobrinha('sql','sql-server',sorted_similarity_dict)

(0.0070364657824624754, 0.9370739836546409)

In [63]:
evaluate_cobrinha('sql-server','sql-server-2008',sorted_similarity_dict)

(0.0096664302260090484, 0.95323299879530243)

In [64]:
evaluate_cobrinha('python','python-3.x',sorted_similarity_dict)

(0.041269371847660163, 0.93972855644695297)

In [65]:
evaluate_cobrinha('python-2.7','python-3.x',sorted_similarity_dict)

(0.016245324836895059, 0.91624424444048602)

In [66]:
evaluate_cobrinha('database','oracle',sorted_similarity_dict)

(0.085023048172449212, 0.72110826342934398)

In [67]:
# hmmm... maybe because rails is a much more stronger child of something like "web-framework"?
evaluate_cobrinha('ruby','ruby-on-rails',sorted_similarity_dict)

(-0.00011694256793887137, 0.89300762439747983)

In [70]:
evaluate_cobrinha('frameworks','ruby-on-rails',sorted_similarity_dict)

(-0.027048323610780078, 0.57101892758135864)

In [54]:
evaluate_cobrinha('.net','.net-3.5',sorted_similarity_dict)

(0.11492253703993699, 0.78605767699041174)

In [55]:
# makes sense
evaluate_cobrinha('.net-3.5','.net-4.0',sorted_similarity_dict) 

(-0.055391072716755807, 0.70457752423543374)

In [56]:
evaluate_cobrinha('android','android-service',sorted_similarity_dict) 

(0.14693596711506279, 0.64258578351271844)

In [57]:
evaluate_cobrinha('asp.net','asp.net-mvc',sorted_similarity_dict) 

(0.030701877081268925, 0.83976220352492692)

In [58]:
evaluate_cobrinha('asp.net-mvc','asp.net-mvc-5',sorted_similarity_dict) 

(0.027337950233681518, 0.85647238125628344)

## what about unrelated stuff?

In [None]:
evaluate_cobrinha('java','python-2.7',sorted_similarity_dict) 

In [None]:
evaluate_cobrinha('ruby','python-2.7',sorted_similarity_dict) 

In [None]:
evaluate_cobrinha('database','python-2.7',sorted_similarity_dict) 