In [1]:
import csv
import os
import pickle
import re
import sys
import numpy as np

from joblib import Parallel, delayed

from difflib import SequenceMatcher,get_close_matches

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Dropout, Input, Flatten, Activation
from keras.layers import Conv1D, MaxPooling1D, Embedding, GlobalAvgPool1D, GlobalMaxPooling1D
from keras.models import Model, Sequential

from numpy import linalg as LA

from scipy import spatial

from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from tqdm import *

module_path = os.path.abspath(os.path.join('../helpers/'))
if module_path not in sys.path:
    sys.path.append(module_path)

# my stuff in the helpers/ directory
import embeddings_helper, files_helper, texts_helper, metrics_helper, tags_helper

Using TensorFlow backend.


In [2]:
SEED=np.random.randint(1,1000)
SEED

46

In [15]:
PICKLE_DIR_ROOT = "/media/felipe/ssd_vol/auto-tagger/data/tag-hierarchy/"

In [3]:
np.random.seed(SEED)

In [4]:
texts, labels = files_helper.read_stackoverflow_sample_stanford_tokenized("Medium-Small-Sample-Posts-Shuffled",ssd=True)

In [5]:
MAX_NB_WORDS = 200000
MAX_SEQUENCE_LENGTH = 1000
VALIDATION_SPLIT = 0.2
LABELS_MIN_DOC_COUNT = int(20)
BATCH_SIZE=32
EMBEDDING_DIM=100
NUM_EPOCHS=10
TOKENIZER_FILTERS='' # I will perform tokenization myself

In [6]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS,
                     filters=TOKENIZER_FILTERS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# word => word_index_position
word_index = tokenizer.word_index

# word_index_position => word
inverse_word_index = texts_helper.build_inverse_word_index(word_index)

In [7]:
truncated_labels = tags_helper.truncate_labels(labels,LABELS_MIN_DOC_COUNT)

In [8]:
lb = preprocessing.MultiLabelBinarizer()
binary_labels = lb.fit_transform(truncated_labels)

In [9]:
# make each document (sequence of word indices) be truncated to 
# MAX_SEQUENCE_LENGTH
tokenized_texts = []

for seq in sequences:
    truncated_seq = seq[:MAX_SEQUENCE_LENGTH]
    tokenized_txt = " ".join([inverse_word_index[idx] for idx in truncated_seq])
    tokenized_texts.append(tokenized_txt)

In [10]:
# extracting IDF weights to weight the embeddings

# snooping
vect = TfidfVectorizer(max_features=MAX_NB_WORDS).fit(tokenized_texts)

feature_names = vect.get_feature_names()
idf = vect.idf_

# word => word IDF
idf_index = dict(zip(vect.get_feature_names(), idf))

In [11]:
document_vectors = vect.transform(tokenized_texts)

In [12]:
NUM_WORDS = document_vectors[0].shape[1]

In [13]:
# document_id => [tag1, tag2, tag3, ...]
document_tag_index = tags_helper.get_document_tag_index(binary_labels,lb.classes_)

# tag => [document_id1, document_id2, document_id3, ...]
tag_document_index = tags_helper.get_tag_document_index(binary_labels, lb.classes_)


In [16]:
tag_vocabulary = lb.classes_

with open("tag-vocabulary.txt","w+") as f:
    for t in tag_vocabulary:
        f.write(t+"\n")

In [17]:
len(tag_vocabulary)

1704

In [19]:
pickle.dump(tag_vocabulary,open(PICKLE_DIR_ROOT+"/tag_vocabulary.p","wb"))

In [20]:
i = 2
tokenized_texts[i],document_tag_index[i]

("use jekyll with gulp . i read about gulp and was quite taken by the philosophy . i want to try it out for myself but i am running into a little problem . i am used to using jekyll with grunt and i have no idea how to get jekyll to play nice with gulp . i 've come across this article which suggests there is no need for a jekyll plugin when using gulp . unfortunately it does n't really explain how to go about it and the example it links to does n't help me much . is there anyone who knows how to go about this ?",
 ['gulp', 'jekyll'])

### build tag_vectors_index

> if a tag is the ONLY tag assigned to some document, that's probably a more representative document of this tag, than if this tag were only one among many others


In [23]:
# tag => tag_vector
tag_vectors_index = dict()

for (tag,document_ids) in tag_document_index.items():
    
    tag_vector = np.zeros(NUM_WORDS)
    
    for document_id in document_ids:
        document_vector = document_vectors[document_id]
        num_tags = len(document_tag_index[document_id])


        weight = 1.0 / num_tags

        weighted_document_vector = document_vector * weight
        
        tag_vector = tag_vector + weighted_document_vector
        
    tag_vectors_index[tag] = np.asarray(tag_vector).ravel()

In [25]:
pickle.dump(tag_vectors_index,open(PICKLE_DIR_ROOT+"/tag_vectors_index.p","wb"))

In [None]:
normalized_tag_vectors_index = dict()    

## THIS SUCKS.. WHY?
## normalize vectors to make cosine similarity more accurate  
for (tag,tag_vector) in tag_vectors_index.items():    
    normalized_tag_vectors_index[tag] = LA.norm(tag_vector, 2)

In [27]:
def _cosine_similarity(a,b):
    return 1 - spatial.distance.cosine(a, b)

In [28]:
def get_top_k_most_similar_tags(index, target_tag_name, k):  
    """
    index is a dict tag_name => tag_vector
    target_tag_name is a string
    k is an int
    """
    
    target_tag_vector = index[target_tag_name]

    # calculating the similarites
    similarities = list()
    curr_best = np.NINF
    curr_tag = None

    for (tag_name,tag_vector) in index.items():
        
        if np.array_equal(tag_vector,target_tag_vector):
            continue
        
        sim = _cosine_similarity(target_tag_vector,tag_vector)
            
        similarities.append((tag_name,sim))

        if sim > curr_best and tag_name != target_tag_name:
            curr_best = sim
            curr_tag = tag_name

    sorted_similarities = sorted(similarities,key=lambda t: t[1],reverse=True)
    
    return sorted_similarities[:k]


get_top_k_most_similar_tags(tag_vectors_index,'sql',100)

[('sql-server', 0.93703034503959293),
 ('tsql', 0.92440745774580635),
 ('sql-server-2008', 0.92015670947657757),
 ('mysql', 0.89862102206324723),
 ('database', 0.8487521566074212),
 ('select', 0.84063604154888716),
 ('sql-server-2008-r2', 0.81810292951382668),
 ('sql-server-2005', 0.81591897559109283),
 ('oracle', 0.80774624424158692),
 ('join', 0.7938536047760949),
 ('postgresql', 0.79203190639447318),
 ('sql-server-2012', 0.79001724731051137),
 ('ms-access', 0.77631831110928884),
 ('database-design', 0.77566478462191046),
 ('sql-update', 0.74606329213516065),
 ('group-by', 0.72898994379296611),
 ('performance', 0.72494161845390259),
 ('table', 0.72224977521944933),
 ('c#', 0.72032846274865159),
 ('php', 0.70773411230704653),
 ('ms-access-2010', 0.70011478635504243),
 ('insert', 0.6973461830740928),
 ('sqlite', 0.69582320451358193),
 ('.net', 0.69250630860680473),
 ('plsql', 0.6879625383852408),
 ('oracle-sqldeveloper', 0.68359207021560398),
 ('relational-database', 0.6832148793208832

In [29]:
target_tags = ['sql-server-2008','sql','sql-server','mysql','database','postgresql','select','join','oracle','tsql','sqlite']

pairs = [ (a,b) for a in target_tags for b in target_tags if a != b ]

In [30]:
for (tag_a,tag_b) in pairs:
    tag_a_vector = tag_vectors_index[tag_a]
    tag_b_vector = tag_vectors_index[tag_b]
    
    sim = _cosine_similarity(tag_a_vector,tag_b_vector)
    
    print("SIM '{}' '{}' => {}".format(tag_a,tag_b,sim))
    

SIM 'sql-server-2008' 'sql' => 0.9201567094765776
SIM 'sql-server-2008' 'sql-server' => 0.9531474611720513
SIM 'sql-server-2008' 'mysql' => 0.7938121520184529
SIM 'sql-server-2008' 'database' => 0.8009814749649009
SIM 'sql-server-2008' 'postgresql' => 0.7227153532027295
SIM 'sql-server-2008' 'select' => 0.7402369188070147
SIM 'sql-server-2008' 'join' => 0.680867470877861
SIM 'sql-server-2008' 'oracle' => 0.7320556472069563
SIM 'sql-server-2008' 'tsql' => 0.9041758078206868
SIM 'sql-server-2008' 'sqlite' => 0.6418715563569742
SIM 'sql' 'sql-server-2008' => 0.9201567094765776
SIM 'sql' 'sql-server' => 0.9370303450395929
SIM 'sql' 'mysql' => 0.8986210220632472
SIM 'sql' 'database' => 0.8487521566074212
SIM 'sql' 'postgresql' => 0.7920319063944732
SIM 'sql' 'select' => 0.8406360415488872
SIM 'sql' 'join' => 0.7938536047760949
SIM 'sql' 'oracle' => 0.8077462442415869
SIM 'sql' 'tsql' => 0.9244074577458063
SIM 'sql' 'sqlite' => 0.6958232045135819
SIM 'sql-server' 'sql-server-2008' => 0.95314

In [31]:
# if all other tags are, on average, more similar to A than to B, then A is probably more generic than B
# should it be normalized by the difference between A and B?
# will fail for tags that are below B?

sql_vector = tag_vectors_index['sql']
avg_diff = list()

for (tag_name,tag_vector) in tag_vectors_index.items():
    avg_diff.append(_cosine_similarity(tag_a_vector,sql_vector))
    
np.mean(np.array(avg_diff))

0.69582320451358204

In [32]:
sql_vector = tag_vectors_index['sql-server']
avg_diff = list()

for (tag_name,tag_vector) in tag_vectors_index.items():
    avg_diff.append(_cosine_similarity(tag_a_vector,sql_vector))
    
np.mean(np.array(avg_diff))

0.65567830723544218

In [33]:
sql_vector = tag_vectors_index['sql-server-2008']
avg_diff = list()

for (tag_name,tag_vector) in tag_vectors_index.items():
    avg_diff.append(_cosine_similarity(tag_a_vector,sql_vector))
    
np.mean(np.array(avg_diff))

0.64187155635697402

In [34]:
def get_sim(tag_a,tag_b):
    
    sim = _cosine_similarity(tag_vectors_index[tag_a],tag_vectors_index[tag_b])
    
    return (tag_a,tag_b,sim)

In [35]:
all_tags = [tag for (tag,_) in tag_vectors_index.items()]

In [36]:
sims = Parallel(n_jobs=-1)(delayed(get_sim)(a,b) for a in all_tags for b in all_tags)

In [37]:
sims[:10]    

[('chef', 'chef', 0.99999999999999978),
 ('chef', 'copy', 0.19348469078990493),
 ('chef', 'asp.net-core-mvc', 0.20279433527826463),
 ('chef', 'activemq', 0.12183826687234578),
 ('chef', 'settings', 0.2048127142864814),
 ('chef', 'correlation', 0.1550043048174794),
 ('chef', 'parsing', 0.27052106046857638),
 ('chef', '.net', 0.3311040292547931),
 ('chef', 'web-services', 0.26112815665923461),
 ('chef', 'macros', 0.20204564828428029)]

In [38]:
similarity_dict = dict()

for tag_a,tag_b, sim in sims:
    
    if similarity_dict.get(tag_a) is None:
        similarity_dict[tag_a] = [(tag_b,sim)]
    else:
        similarity_dict[tag_a].append((tag_b,sim))

In [39]:
similarity_dict["sql"]

[('chef', 0.25387057099299037),
 ('copy', 0.41249325557683503),
 ('asp.net-core-mvc', 0.41571428991991399),
 ('activemq', 0.23036928883111651),
 ('settings', 0.40084457876129476),
 ('correlation', 0.36720435003522522),
 ('parsing', 0.6126317248290496),
 ('.net', 0.69250630860680473),
 ('web-services', 0.51962338014786602),
 ('macros', 0.45338616348502692),
 ('android-listview', 0.34955250390071058),
 ('mysqldump', 0.42643088116163996),
 ('condition', 0.4905533407884104),
 ('http-headers', 0.38833620735028862),
 ('fork', 0.35916141486487208),
 ('href', 0.39255199097508209),
 ('syntax-error', 0.41483813265650749),
 ('sap', 0.42512854653076626),
 ('operator-overloading', 0.26263732788042105),
 ('haml', 0.26461799640188599),
 ('prepared-statement', 0.45582772138420213),
 ('bootstrap-modal', 0.27265149540118361),
 ('apache-poi', 0.37053442665198555),
 ('xmlhttprequest', 0.42617443292736179),
 ('hex', 0.36992138041248923),
 ('couchdb', 0.3359684841721593),
 ('ios-simulator', 0.34344469144416

In [40]:
sorted(similarity_dict["sql"],key=lambda tpl : tpl[1],reverse=True)[1:]

[('sql-server', 0.93703034503959293),
 ('tsql', 0.92440745774580635),
 ('sql-server-2008', 0.92015670947657757),
 ('mysql', 0.89862102206324723),
 ('database', 0.8487521566074212),
 ('select', 0.84063604154888716),
 ('sql-server-2008-r2', 0.81810292951382668),
 ('sql-server-2005', 0.81591897559109283),
 ('oracle', 0.80774624424158692),
 ('join', 0.7938536047760949),
 ('postgresql', 0.79203190639447318),
 ('sql-server-2012', 0.79001724731051137),
 ('ms-access', 0.77631831110928884),
 ('database-design', 0.77566478462191046),
 ('sql-update', 0.74606329213516065),
 ('group-by', 0.72898994379296611),
 ('performance', 0.72494161845390259),
 ('table', 0.72224977521944933),
 ('c#', 0.72032846274865159),
 ('php', 0.70773411230704653),
 ('ms-access-2010', 0.70011478635504243),
 ('insert', 0.6973461830740928),
 ('sqlite', 0.69582320451358193),
 ('.net', 0.69250630860680473),
 ('plsql', 0.6879625383852408),
 ('oracle-sqldeveloper', 0.68359207021560398),
 ('relational-database', 0.6832148793208832

In [41]:
sorted_similarity_dict = dict()

for (tag, similarities_to_other_tags) in similarity_dict.items():
    
    sorted_similarities = sorted(similarities_to_other_tags,key=lambda tpl: tpl[1],reverse=True)
    sorted_without_self = sorted_similarities[1:]
    
    sorted_similarity_dict[tag] = sorted_without_self

In [42]:
sorted_similarity_dict["sql"]

[('sql-server', 0.93703034503959293),
 ('tsql', 0.92440745774580635),
 ('sql-server-2008', 0.92015670947657757),
 ('mysql', 0.89862102206324723),
 ('database', 0.8487521566074212),
 ('select', 0.84063604154888716),
 ('sql-server-2008-r2', 0.81810292951382668),
 ('sql-server-2005', 0.81591897559109283),
 ('oracle', 0.80774624424158692),
 ('join', 0.7938536047760949),
 ('postgresql', 0.79203190639447318),
 ('sql-server-2012', 0.79001724731051137),
 ('ms-access', 0.77631831110928884),
 ('database-design', 0.77566478462191046),
 ('sql-update', 0.74606329213516065),
 ('group-by', 0.72898994379296611),
 ('performance', 0.72494161845390259),
 ('table', 0.72224977521944933),
 ('c#', 0.72032846274865159),
 ('php', 0.70773411230704653),
 ('ms-access-2010', 0.70011478635504243),
 ('insert', 0.6973461830740928),
 ('sqlite', 0.69582320451358193),
 ('.net', 0.69250630860680473),
 ('plsql', 0.6879625383852408),
 ('oracle-sqldeveloper', 0.68359207021560398),
 ('relational-database', 0.6832148793208832

In [43]:
pickle.dump(sorted_similarity_dict,open(PICKLE_DIR_ROOT+"/sorted_similarity_dict",'wb'))

In [None]:
evaluate_cobrinha('android','android-service',sorted_similarity_dict) 

In [None]:
evaluate_cobrinha('asp.net','asp.net-mvc',sorted_similarity_dict) 

In [None]:
evaluate_cobrinha('asp.net-mvc','asp.net-mvc-5',sorted_similarity_dict) 

what about unrelated stuff?

In [None]:
evaluate_cobrinha('java','arrays',sorted_similarity_dict) 

In [None]:
evaluate_cobrinha('ruby','python-2.7',sorted_similarity_dict) 

In [None]:
evaluate_cobrinha('database','python-2.7',sorted_similarity_dict) 

what's the mean and stddev of the similarity between all tags and each other?

In [None]:
running_avgs = []

for (tag, similarities_to_other_tags) in similarity_dict.items():
    sims = [tpl[1] for tpl in similarities_to_other_tags]
    
    avg = np.array(sims).mean()
    
    running_avgs.append(avg)
    
np.array(running_avgs).mean(),np.array(running_avgs).std()

make_global_similarity_index

In [48]:
global_similarity_index = dict()

for tag in tag_vocabulary:
    
    similarities_with_current_tag = list()
    
    for other_tag, similarity_to_other_tag in similarity_dict[tag]:
        
        print(similarity_to_other_tag)
    
        break
    

0.171643144813
0.331104029255
0.230984071181
0.249372139041
0.291653519413
0.245099077688
0.176495387819
0.170091811657
0.208950406173
0.167207675468
0.131424907757
0.234793085391
0.211068210212
0.122989068135
0.197939714499
0.129368399231
0.213487585149
0.233066176978
0.2302871305
0.172081141891
0.121838266872
0.250512223215
0.182066854942
0.125039515694
0.134888607453
0.190924290676
0.176652542368
0.122606760102
0.225670994932
0.177366023626
0.140845614274
0.154128753081
0.16398726966
0.150125054547
0.218940567446
0.148710677374
0.101705987562
0.161711377844
0.267728502105
0.164471249384
0.169906251649
0.132175962457
0.249159703607
0.171243745948
0.268522515023
0.177023727696
0.258539496535
0.182240991282
0.161613904871
0.183070370939
0.109279175605
0.14914031455
0.11590047637
0.145474530152
0.115047688888
0.125405236988
0.141093305798
0.138816947698
0.170126449833
0.131224759203
0.147970751143
0.130643939236
0.151773176614
0.224654922618
0.137848119052
0.140103590555
0.170260833337


In [51]:
global_similarity_index = dict()

for tag in tag_vocabulary:
    
    similarities_with_current_tag = list()
    
    for other_tag, similarity_to_other_tag in sorted_similarity_dict[tag]:
        
        if other_tag != tag:
            similarities_with_current_tag.append(similarity_to_other_tag)

    global_avg_sim_wrt_tag = np.array(similarities_with_current_tag).mean()
    
    global_similarity_index[tag] = global_avg_sim_wrt_tag
            
pickle.dump(global_similarity_index,open(PICKLE_DIR_ROOT+"/global_similarity_index.p",'wb'))

In [52]:
global_similarity_index

{'chef': 0.18377650303013637,
 'copy': 0.28713301815951497,
 'asp.net-core-mvc': 0.30592487563028636,
 'android-sqlite': 0.2592977826583725,
 'settings': 0.30291255111165932,
 'correlation': 0.23684377134137219,
 'broadcastreceiver': 0.23965908569017558,
 '.net': 0.4906330688920813,
 'web-services': 0.38720301035483712,
 'macros': 0.3159139778513777,
 'android-listview': 0.26034912192886328,
 'mysqldump': 0.2274447158925734,
 'condition': 0.28561320770362125,
 'http-headers': 0.30292356154070788,
 'fork': 0.26301872830689471,
 'cygwin': 0.24209169148058571,
 'syntax-error': 0.25690885945404612,
 'sap': 0.24544761906092302,
 'operator-overloading': 0.195895761134879,
 'haml': 0.18526888231475214,
 'prepared-statement': 0.21830242452742146,
 'activemq': 0.17246582450740611,
 'flash': 0.32518218933702076,
 'xmlhttprequest': 0.3309561594302321,
 'hex': 0.25356523817156246,
 'couchdb': 0.22864188841897418,
 'ios-simulator': 0.27061972421066344,
 'parameter-passing': 0.33518226306922638,
 'a