In [1]:
import csv
import os
import pickle
import re
import sys
import numpy as np

from joblib import Parallel, delayed

from difflib import SequenceMatcher,get_close_matches

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Dropout, Input, Flatten, Activation
from keras.layers import Conv1D, MaxPooling1D, Embedding, GlobalAvgPool1D, GlobalMaxPooling1D
from keras.models import Model, Sequential

from numpy import linalg as LA

from scipy import spatial

from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from tqdm import *

module_path = os.path.abspath(os.path.join('../helpers/'))
if module_path not in sys.path:
    sys.path.append(module_path)

# my stuff in the helpers/ directory
import embeddings_helper, files_helper, texts_helper, metrics_helper, tags_helper,cobrinha_helper

from cobrinha_helper import evaluate_cobrinha


Using TensorFlow backend.


In [2]:
SEED=np.random.randint(1,1000)
SEED

827

In [3]:
np.random.seed(SEED)

In [4]:
PICKLE_DIR_ROOT = "/media/felipe/SSD_VOLUME/auto-tagger/data/tag-hierarchy/"

In [6]:
texts, labels = files_helper.read_stackoverflow_sample_stanford_tokenized("Medium-Small-Sample-Posts-Shuffled",ssd=True)

In [7]:
MAX_NB_WORDS = 10000
MAX_SEQUENCE_LENGTH = 1000
VALIDATION_SPLIT = 0.2
LABELS_MIN_DOC_COUNT = int(20)
BATCH_SIZE=32
EMBEDDING_DIM=100
NUM_EPOCHS=10
TOKENIZER_FILTERS='' # I will perform tokenization myself

In [8]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS,
                     filters=TOKENIZER_FILTERS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# word => word_index_position
word_index = tokenizer.word_index

# word_index_position => word
inverse_word_index = texts_helper.build_inverse_word_index(word_index)

In [9]:
truncated_labels = tags_helper.truncate_labels(labels,LABELS_MIN_DOC_COUNT)

In [10]:
lb = preprocessing.MultiLabelBinarizer()
binary_labels = lb.fit_transform(truncated_labels)

In [11]:
# make each document (sequence of word indices) be truncated to 
# MAX_SEQUENCE_LENGTH
tokenized_texts = []

for seq in sequences:
    truncated_seq = seq[:MAX_SEQUENCE_LENGTH]
    tokenized_txt = " ".join([inverse_word_index[idx] for idx in truncated_seq])
    tokenized_texts.append(tokenized_txt)

In [12]:
# extracting IDF weights to weight the embeddings

# snooping
vect = TfidfVectorizer(max_features=MAX_NB_WORDS).fit(tokenized_texts)

feature_names = vect.get_feature_names()
idf = vect.idf_

# word => word IDF
idf_index = dict(zip(vect.get_feature_names(), idf))

In [13]:
document_vectors = vect.transform(tokenized_texts)

In [14]:
NUM_WORDS = document_vectors[0].shape[1]

In [15]:
# document_id => [tag1, tag2, tag3, ...]
document_tag_index = tags_helper.get_document_tag_index(binary_labels,lb.classes_)

# tag => [document_id1, document_id2, document_id3, ...]
tag_document_index = tags_helper.get_tag_document_index(binary_labels, lb.classes_)

### build tag_vocabulary

In [16]:
tag_vocabulary = lb.classes_

In [17]:
len(tag_vocabulary)

1704

In [18]:
pickle.dump(tag_vocabulary,open(PICKLE_DIR_ROOT+"/tag_vocabulary.p","wb"))

In [19]:
i = 2
tokenized_texts[i],document_tag_index[i]

("use with gulp . i read about gulp and was quite taken by the . i want to try it out for myself but i am running into a little problem . i am used to using with grunt and i have no idea how to get to play nice with gulp . i 've come across this article which suggests there is no need for a plugin when using gulp . unfortunately it does n't really explain how to go about it and the example it links to does n't help me much . is there anyone who knows how to go about this ?",
 ['gulp', 'jekyll'])

### build tag_vectors_index

> if a tag is the ONLY tag assigned to some document, that's probably a more representative document of this tag, than if this tag were only one among many others

In [20]:
# tag => tag_vector
tag_vectors_index = dict()

for (tag,document_ids) in tag_document_index.items():
    
    tag_vector = np.zeros(NUM_WORDS)
    
    for document_id in document_ids:
        document_vector = document_vectors[document_id]
        num_tags = len(document_tag_index[document_id])


        weight = 1.0 / num_tags

        weighted_document_vector = document_vector * weight
        
        tag_vector = tag_vector + weighted_document_vector
        
    tag_vectors_index[tag] = np.asarray(tag_vector).ravel()

In [21]:
pickle.dump(tag_vectors_index,open(PICKLE_DIR_ROOT+"/tag_vectors_index.p","wb"))

In [22]:
normalized_tag_vectors_index = dict()    

## THIS SUCKS.. WHY?
## normalize vectors to make cosine similarity more accurate  
for (tag,tag_vector) in tag_vectors_index.items():    
    normalized_tag_vectors_index[tag] = LA.norm(tag_vector, 2)

In [23]:
def _cosine_similarity(a,b):
    return 1 - spatial.distance.cosine(a, b)

In [24]:
target_tags = ['sql-server-2008','sql','sql-server','mysql','database','postgresql','select','join','oracle','tsql','sqlite']

pairs = [ (a,b) for a in target_tags for b in target_tags if a != b ]

In [25]:
for (tag_a,tag_b) in pairs:
    tag_a_vector = tag_vectors_index[tag_a]
    tag_b_vector = tag_vectors_index[tag_b]
    
    sim = _cosine_similarity(tag_a_vector,tag_b_vector)
    
    print("SIM '{}' '{}' => {}".format(tag_a,tag_b,sim))
    

SIM 'sql-server-2008' 'sql' => 0.9349862739794322
SIM 'sql-server-2008' 'sql-server' => 0.9660931871464769
SIM 'sql-server-2008' 'mysql' => 0.8154537999297654
SIM 'sql-server-2008' 'database' => 0.8156094260042039
SIM 'sql-server-2008' 'postgresql' => 0.7553912824948147
SIM 'sql-server-2008' 'select' => 0.7778990183850996
SIM 'sql-server-2008' 'join' => 0.7190125266357225
SIM 'sql-server-2008' 'oracle' => 0.7645262468727586
SIM 'sql-server-2008' 'tsql' => 0.9261999301139061
SIM 'sql-server-2008' 'sqlite' => 0.6563333385539288
SIM 'sql' 'sql-server-2008' => 0.9349862739794322
SIM 'sql' 'sql-server' => 0.9460848411594168
SIM 'sql' 'mysql' => 0.9066234957540791
SIM 'sql' 'database' => 0.8520170435797642
SIM 'sql' 'postgresql' => 0.8138918207844948
SIM 'sql' 'select' => 0.8654228616735832
SIM 'sql' 'join' => 0.8199437626551427
SIM 'sql' 'oracle' => 0.828475622543522
SIM 'sql' 'tsql' => 0.936445656417424
SIM 'sql' 'sqlite' => 0.7008841298527745
SIM 'sql-server' 'sql-server-2008' => 0.966093

if all other tags are, on average, more similar to A than to B, then A is probably more generic than B

should it be normalized by the difference between A and B?

will fail for tags that are below B?

In [26]:
sql_vector = tag_vectors_index['sql']
avg_diff = list()

for (tag_name,tag_vector) in tag_vectors_index.items():
    avg_diff.append(_cosine_similarity(tag_a_vector,sql_vector))
    
np.mean(np.array(avg_diff))

0.70088412985277426

In [27]:
sql_vector = tag_vectors_index['sql-server']
avg_diff = list()

for (tag_name,tag_vector) in tag_vectors_index.items():
    avg_diff.append(_cosine_similarity(tag_a_vector,sql_vector))
    
np.mean(np.array(avg_diff))

0.66694692822065349

In [28]:
sql_vector = tag_vectors_index['sql-server-2008']
avg_diff = list()

for (tag_name,tag_vector) in tag_vectors_index.items():
    avg_diff.append(_cosine_similarity(tag_a_vector,sql_vector))
    
np.mean(np.array(avg_diff))

0.65633333855392895

In [29]:
def get_sim(tag_a,tag_b):
    
    sim = _cosine_similarity(tag_vectors_index[tag_a],tag_vectors_index[tag_b])
    
    return (tag_a,tag_b,sim)

In [30]:
all_tags = [tag for (tag,_) in tag_vectors_index.items()]

In [31]:
sims = Parallel(n_jobs=-1)(delayed(get_sim)(a,b) for a in all_tags for b in all_tags)

In [32]:
sims[:10]

[('transform', 'transform', 1.0),
 ('transform', 'case', 0.26554840755418407),
 ('transform', 'jni', 0.3009183278459453),
 ('transform', 'floating-point', 0.336854480012164),
 ('transform', 'angular2', 0.3661205276052284),
 ('transform', 'jframe', 0.27573279740961776),
 ('transform', 'spinner', 0.20500842597293434),
 ('transform', 'time-series', 0.37992182618296111),
 ('transform', 'json.net', 0.28870913869046644),
 ('transform', 'database', 0.44774169494985283)]

In [33]:
pairwise_similarity_dict = dict()

for tag_a,tag_b, sim in sims:
    
    if pairwise_similarity_dict.get(tag_a) is None:
        pairwise_similarity_dict[tag_a] = [(tag_b,sim)]
    else:
        pairwise_similarity_dict[tag_a].append((tag_b,sim))

In [34]:
pairwise_similarity_dict["sql"][:20]

[('transform', 0.40855872521667036),
 ('case', 0.59262310587100175),
 ('jni', 0.37921904769952519),
 ('floating-point', 0.44944965001635828),
 ('angular2', 0.44004904096272757),
 ('jframe', 0.31217042383026083),
 ('spinner', 0.2528297156338789),
 ('time-series', 0.56773145215294152),
 ('json.net', 0.39438820786891104),
 ('database', 0.85201704357976416),
 ('avfoundation', 0.43377206014556335),
 ('character-encoding', 0.49789902540568343),
 ('reference', 0.53962198256955318),
 ('prestashop', 0.5149808316574368),
 ('callback', 0.44843055355845529),
 ('mp3', 0.27999504313708945),
 ('upgrade', 0.42153009925488338),
 ('mingw', 0.2637657774736184),
 ('sharepoint-2013', 0.34542164458363755),
 ('indexing', 0.70737882653316475)]

In [35]:
sorted(pairwise_similarity_dict["sql"],key=lambda tpl : tpl[1],reverse=True)[1:20]

[('sql-server', 0.94608484115941682),
 ('tsql', 0.93644565641742405),
 ('sql-server-2008', 0.93498627397943224),
 ('mysql', 0.90662349575407908),
 ('select', 0.86542286167358318),
 ('sql-server-2005', 0.85241837335393233),
 ('database', 0.85201704357976416),
 ('sql-server-2008-r2', 0.84318095989905328),
 ('oracle', 0.82847562254352203),
 ('sql-server-2012', 0.82132633406845212),
 ('join', 0.81994376265514268),
 ('postgresql', 0.81389182078449485),
 ('ms-access', 0.80221352074550467),
 ('database-design', 0.78301104261912458),
 ('sql-update', 0.7763878939714397),
 ('ms-access-2010', 0.77491446300201694),
 ('group-by', 0.77283602270618612),
 ('subquery', 0.76446255816377828),
 ('query-optimization', 0.74722746136273188)]

In [36]:
sorted_pairwise_similarity_dict = dict()

for (tag, similarities_to_other_tags) in pairwise_similarity_dict.items():
    
    sorted_similarities = sorted(similarities_to_other_tags,key=lambda tpl: tpl[1],reverse=True)
    sorted_without_self = sorted_similarities[1:]
    
    sorted_pairwise_similarity_dict[tag] = sorted_without_self

In [37]:
sorted_pairwise_similarity_dict["sql"][:20]

[('sql-server', 0.94608484115941682),
 ('tsql', 0.93644565641742405),
 ('sql-server-2008', 0.93498627397943224),
 ('mysql', 0.90662349575407908),
 ('select', 0.86542286167358318),
 ('sql-server-2005', 0.85241837335393233),
 ('database', 0.85201704357976416),
 ('sql-server-2008-r2', 0.84318095989905328),
 ('oracle', 0.82847562254352203),
 ('sql-server-2012', 0.82132633406845212),
 ('join', 0.81994376265514268),
 ('postgresql', 0.81389182078449485),
 ('ms-access', 0.80221352074550467),
 ('database-design', 0.78301104261912458),
 ('sql-update', 0.7763878939714397),
 ('ms-access-2010', 0.77491446300201694),
 ('group-by', 0.77283602270618612),
 ('subquery', 0.76446255816377828),
 ('query-optimization', 0.74722746136273188),
 ('oracle-sqldeveloper', 0.74190672404619762)]

In [39]:
pickle.dump(sorted_pairwise_similarity_dict,open(PICKLE_DIR_ROOT+"/sorted_pairwise_similarity_dict.p",'wb'))

### build global similarity index

In [40]:
global_similarity_index = dict()

for tag in tag_vocabulary:
    
    similarities_with_current_tag = list()
    
    for other_tag, similarity_to_other_tag in sorted_pairwise_similarity_dict[tag]:
        
        if other_tag != tag:
            similarities_with_current_tag.append(similarity_to_other_tag)

    global_avg_sim_wrt_tag = np.array(similarities_with_current_tag).mean()
    
    global_similarity_index[tag] = global_avg_sim_wrt_tag
            
pickle.dump(global_similarity_index,open(PICKLE_DIR_ROOT+"/global_similarity_index.p",'wb'))

In [41]:
# global_similarity_index

In [42]:
sorted(global_similarity_index.items(),key=lambda tpl: tpl[1],reverse=True)[:20]

[('c#', 0.52795499262682699),
 ('.net', 0.52236475935224813),
 ('web', 0.49697625672404389),
 ('c#-4.0', 0.492369770949807),
 ('performance', 0.49044844940532073),
 ('design', 0.48791340732320043),
 ('java', 0.48654773101891391),
 ('javascript', 0.48597898596666739),
 ('asp.net', 0.48340641191234252),
 ('security', 0.48224596054387231),
 ('php', 0.47972653441409041),
 ('user-interface', 0.47964352300825269),
 ('.net-4.0', 0.47751736281412316),
 ('html5', 0.47722780149518895),
 ('optimization', 0.47653380867752793),
 ('windows', 0.47546250693757391),
 ('ios', 0.47506348956805688),
 ('cocoa', 0.47290701874429175),
 ('cocoa-touch', 0.47098300553692912),
 ('winforms', 0.46932126814097019)]

In [43]:
evaluate_cobrinha('android','android-service',tag_vectors_index,sorted_pairwise_similarity_dict, global_similarity_index) 

(0.13860640703283789, 0.68468023488856755)

In [44]:
evaluate_cobrinha('asp.net','asp.net-mvc',tag_vectors_index,sorted_pairwise_similarity_dict, global_similarity_index) 

(0.034241028998139356, 0.84005500773754138)

In [45]:
evaluate_cobrinha('asp.net-mvc','asp.net-mvc-5',tag_vectors_index,sorted_pairwise_similarity_dict, global_similarity_index) 

(0.011887516293288714, 0.89472967540836057)

what about unrelated stuff

In [46]:
evaluate_cobrinha('java','arrays',tag_vectors_index,sorted_pairwise_similarity_dict, global_similarity_index) 

(0.11558216143824568, 0.62033735340071139)

In [47]:
evaluate_cobrinha('ruby','python-2.7',tag_vectors_index,sorted_pairwise_similarity_dict, global_similarity_index) 

(-0.012370486727575292, 0.6421273193667727)

In [48]:
evaluate_cobrinha('database','python-2.7',tag_vectors_index,sorted_pairwise_similarity_dict, global_similarity_index) 

(0.020251065340873897, 0.64969028290196595)

what's the mean and stddev of the similarity between all tags and each other?

In [49]:
running_avgs = []

for (tag, similarities_to_other_tags) in pairwise_similarity_dict.items():
    sims = [tpl[1] for tpl in similarities_to_other_tags]
    
    avg = np.array(sims).mean()
    
    running_avgs.append(avg)
    
np.array(running_avgs).mean(),np.array(running_avgs).std()

(0.31303772647830352, 0.064948327868787334)