In [1]:
import csv
import os
import pickle
import re
import sys
import numpy as np

from joblib import Parallel, delayed

from difflib import SequenceMatcher,get_close_matches

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Dropout, Input, Flatten, Activation
from keras.layers import Conv1D, MaxPooling1D, Embedding, GlobalAvgPool1D, GlobalMaxPooling1D
from keras.models import Model, Sequential

from numpy import linalg as LA

from scipy import spatial

from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from tqdm import *

module_path = os.path.abspath(os.path.join('../helpers/'))
if module_path not in sys.path:
    sys.path.append(module_path)

# my stuff in the helpers/ directory
import embeddings_helper, files_helper, texts_helper, metrics_helper, tags_helper,cobrinha_helper

from cobrinha_helper import get_metrics_for_tag_pair


Using TensorFlow backend.


In [2]:
SEED=np.random.randint(1,1000)
SEED

593

In [3]:
np.random.seed(SEED)

In [4]:
PICKLE_DIR_ROOT = "/media/felipe/SSD_VOLUME/auto-tagger/data/tag-hierarchy/"

In [5]:
texts, labels = files_helper.read_stackoverflow_sample_stanford_tokenized("Medium-Sample-Posts-Shuffled",ssd=True)

In [6]:
MAX_NB_WORDS = 8000
MAX_SEQUENCE_LENGTH = 1000
LABELS_MIN_DOC_PROPORTION = 0.0005 # only tags appearing in at least CORPUS_SIZE * LABELS_MIN_DOC_PROPORTION will be used
TOKENIZER_FILTERS='' # I will perform tokenization myself

In [7]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS,
                     filters=TOKENIZER_FILTERS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# word => word_index_position
word_index = tokenizer.word_index

# word_index_position => word
inverse_word_index = texts_helper.build_inverse_word_index(word_index)

In [25]:
# write a text-copy in the current directory too
with open('word-vocabulary.txt','w+') as f:
    for word,pos in sorted(word_index.items(),key=lambda tpl:tpl[1]):
        f.write(word+" => "+ str(pos)+"\n")

In [8]:
corpus_size = len(texts)
labels_min_doc_count = int(corpus_size * LABELS_MIN_DOC_PROPORTION)

corpus_size,labels_min_doc_count

(500000, 250)

In [9]:
truncated_labels = tags_helper.truncate_labels(labels,labels_min_doc_count)

In [10]:
lb = preprocessing.MultiLabelBinarizer()
binary_labels = lb.fit_transform(truncated_labels)

In [11]:
# make each document (sequence of word indices) be truncated to 
# MAX_SEQUENCE_LENGTH
tokenized_texts = []

for seq in sequences:
    truncated_seq = seq[:MAX_SEQUENCE_LENGTH]
    tokenized_txt = " ".join([inverse_word_index[idx] for idx in truncated_seq])
    tokenized_texts.append(tokenized_txt)

In [12]:
# extracting IDF weights to weight the embeddings

# snooping
vect = TfidfVectorizer(max_features=MAX_NB_WORDS).fit(tokenized_texts)

feature_names = vect.get_feature_names()
idf = vect.idf_

# word => word IDF
idf_index = dict(zip(vect.get_feature_names(), idf))

In [13]:
document_vectors = vect.transform(tokenized_texts)

In [14]:
NUM_WORDS = document_vectors[0].shape[1]
NUM_WORDS

7410

In [15]:
# document_id => [tag1, tag2, tag3, ...]
document_tag_index = tags_helper.get_document_tag_index(binary_labels,lb.classes_)

# tag => [document_id1, document_id2, document_id3, ...]
tag_document_index = tags_helper.get_tag_document_index(binary_labels, lb.classes_)

### build tag_vocabulary

In [16]:
tag_vocabulary = lb.classes_

In [17]:
len(tag_vocabulary)

737

In [55]:
"   ".join(sorted(tag_vocabulary))

'.htaccess   .net   .net-4.0   3d   access-vba   actionscript   actionscript-3   active-directory   activerecord   ado.net   ajax   algorithm   amazon-ec2   amazon-s3   amazon-web-services   android   android-actionbar   android-activity   android-asynctask   android-edittext   android-emulator   android-fragments   android-gradle   android-intent   android-layout   android-listview   android-ndk   android-studio   android-viewpager   angular-ui-router   angular2   angularjs   angularjs-directive   angularjs-scope   animation   annotations   ant   apache   apache-spark   api   applet   architecture   arduino   arm   arraylist   arrays   asp-classic   asp.net   asp.net-core   asp.net-mvc   asp.net-mvc-2   asp.net-mvc-3   asp.net-mvc-4   asp.net-mvc-5   asp.net-web-api   assembly   asynchronous   attributes   audio   authentication   autocomplete   autolayout   automation   awk   azure   backbone.js   background   bash   batch-file   beautifulsoup   binary   binding   bitmap   blackberry

In [19]:
pickle.dump(tag_vocabulary,open(PICKLE_DIR_ROOT+"/tag_vocabulary.p","wb"))

In [20]:
tag_frequency_index = dict()

for tag in tag_vocabulary:
    freq = len(tag_document_index[tag])   
    tag_frequency_index[tag] = freq

pickle.dump(tag_frequency_index,open(PICKLE_DIR_ROOT+"/tag_frequency_index.p","wb"))

In [21]:
# write a text-copy in the current directory too
with open('tag-frequencies.txt','w+') as f:
    for tag in tag_vocabulary:
        f.write(tag+" => "+ str(tag_frequency_index[tag])+"\n")

In [26]:
i = 1
tokenized_texts[i],document_tag_index[i]

("replacing asp datagrid control to custom jquery control ( telerik , or other library ) . we have a traditional aspx application . we are using so many asp server controls in our application which in ui look n feel . i have been looking into telerik and other asp custom control library . we are also looking for open source library which doesnt come with license . i want to ask how can we start replacing all the asp . net control to any good ui custom control library . as we want to change as much less our code . i need pointers to open source asp custom control ui library . and steps needed . in aspx code : in the server side we have a bind function private sub ( ) dim as data . ' bind data grid = / / function which is returning sets from database / / me . . datasource = me . . databind ( ) end sub and couple more handlers to handle data grid events like private sub ( byval source as system . object , byval e as system . web . ui . . ) handles .",
 ['asp.net', 'jquery', 'telerik', 'vb

### build tag_vectors_index

> if a tag is the ONLY tag assigned to some document, that's probably a more representative document of this tag, than if this tag were only one among many others

In [27]:
# tag => tag_vector
tag_vectors_index = dict()

for (tag,document_ids) in tqdm(tag_document_index.items()):
    
    tag_vector = np.zeros(NUM_WORDS)
    
    for document_id in document_ids:
        document_vector = document_vectors[document_id]
        num_tags = len(document_tag_index[document_id])

        weight = 1.0 / num_tags

        weighted_document_vector = document_vector * weight
        
        tag_vector = tag_vector + weighted_document_vector
        
    tag_vectors_index[tag] = np.asarray(tag_vector).ravel()

In [28]:
pickle.dump(tag_vectors_index,open(PICKLE_DIR_ROOT+"/tag_vectors_index.p","wb"))

In [None]:
normalized_tag_vectors_index = dict()    

## THIS SUCKS.. WHY?
## normalize vectors to make cosine similarity more accurate  
for (tag,tag_vector) in tag_vectors_index.items():    
    normalized_tag_vectors_index[tag] = LA.norm(tag_vector, 2)

In [29]:
def _cosine_similarity(a,b):
    return 1 - spatial.distance.cosine(a, b)

In [30]:
target_tags = ['sql-server-2008','sql','sql-server','mysql','database','postgresql','select','join','oracle','tsql','sqlite']

pairs = [ (a,b) for a in target_tags for b in target_tags if a != b ]

In [31]:
for (tag_a,tag_b) in pairs:
    tag_a_vector = tag_vectors_index[tag_a]
    tag_b_vector = tag_vectors_index[tag_b]
    
    sim = _cosine_similarity(tag_a_vector,tag_b_vector)
    
    print("SIM '{}' '{}' => {}".format(tag_a,tag_b,sim))
    

SIM 'sql-server-2008' 'sql' => 0.9291628095667054
SIM 'sql-server-2008' 'sql-server' => 0.9829380997991071
SIM 'sql-server-2008' 'mysql' => 0.8065604193626673
SIM 'sql-server-2008' 'database' => 0.8170611875289827
SIM 'sql-server-2008' 'postgresql' => 0.761087500951073
SIM 'sql-server-2008' 'select' => 0.785103881919789
SIM 'sql-server-2008' 'join' => 0.7130952681509377
SIM 'sql-server-2008' 'oracle' => 0.7671177042552559
SIM 'sql-server-2008' 'tsql' => 0.9443029838094574
SIM 'sql-server-2008' 'sqlite' => 0.6345147604149299
SIM 'sql' 'sql-server-2008' => 0.9291628095667054
SIM 'sql' 'sql-server' => 0.952545989316134
SIM 'sql' 'mysql' => 0.9112950896230164
SIM 'sql' 'database' => 0.858316269587468
SIM 'sql' 'postgresql' => 0.833948173237951
SIM 'sql' 'select' => 0.8978246507586888
SIM 'sql' 'join' => 0.8354713336785698
SIM 'sql' 'oracle' => 0.832804201032417
SIM 'sql' 'tsql' => 0.9619093170088324
SIM 'sql' 'sqlite' => 0.6884356896724918
SIM 'sql-server' 'sql-server-2008' => 0.9829380997

if all other tags are, on average, more similar to A than to B, then A is probably more generic than B

should it be normalized by the difference between A and B?

will fail for tags that are below B?

In [36]:
sql_vector = tag_vectors_index['database']
avg_diff = list()

for (tag_name,tag_vector) in tag_vectors_index.items():
    avg_diff.append(_cosine_similarity(tag_a_vector,sql_vector))
    
np.mean(np.array(avg_diff))

0.77451840819423257

In [37]:
sql_vector = tag_vectors_index['sql-server']
avg_diff = list()

for (tag_name,tag_vector) in tag_vectors_index.items():
    avg_diff.append(_cosine_similarity(tag_a_vector,sql_vector))
    
np.mean(np.array(avg_diff))

0.65910354936049864

In [34]:
sql_vector = tag_vectors_index['sql-server-2008']
avg_diff = list()

for (tag_name,tag_vector) in tag_vectors_index.items():
    avg_diff.append(_cosine_similarity(tag_a_vector,sql_vector))
    
np.mean(np.array(avg_diff))

0.63451476041493016

In [38]:
def get_sim(tag_a,tag_b):
    
    sim = _cosine_similarity(tag_vectors_index[tag_a],tag_vectors_index[tag_b])
    
    return (tag_a,tag_b,sim)

In [39]:
all_tags = [tag for (tag,_) in tag_vectors_index.items()]

In [40]:
sims = Parallel(n_jobs=-1)(delayed(get_sim)(a,b) for a in all_tags for b in all_tags)

In [41]:
sims[:10]

[('jackson', 'jackson', 0.99999999999999989),
 ('jackson', 'reflection', 0.51956802986866779),
 ('jackson', 'dll', 0.25866232577141124),
 ('jackson', 'architecture', 0.39364916033882613),
 ('jackson', 'loops', 0.40255028779815838),
 ('jackson', '.net-4.0', 0.44457963147019997),
 ('jackson', 'regex', 0.33665023568345054),
 ('jackson', 'jenkins', 0.21674803428873846),
 ('jackson', 'exception', 0.45280418173228476),
 ('jackson', 'ffmpeg', 0.19184276253419164)]

In [42]:
pairwise_similarity_dict = dict()

for tag_a,tag_b, sim in sims:
    
    if pairwise_similarity_dict.get(tag_a) is None:
        pairwise_similarity_dict[tag_a] = [(tag_b,sim)]
    else:
        pairwise_similarity_dict[tag_a].append((tag_b,sim))

In [43]:
pairwise_similarity_dict["sql"][:20]

[('jackson', 0.35431049596438946),
 ('reflection', 0.49434239867438867),
 ('dll', 0.38416226836820333),
 ('architecture', 0.63225685994859249),
 ('loops', 0.6531479366212255),
 ('.net-4.0', 0.63392357980771474),
 ('regex', 0.52056455884758845),
 ('jenkins', 0.32264793024061589),
 ('exception', 0.50973975484893086),
 ('ffmpeg', 0.30906535039361394),
 ('scroll', 0.44791580136997744),
 ('swift3', 0.46049914268799397),
 ('dynamic', 0.69017815088574042),
 ('static', 0.42291974401546251),
 ('websocket', 0.41043308933345335),
 ('swift', 0.5204105659640087),
 ('performance', 0.74585135587224227),
 ('stl', 0.42421013009556163),
 ('opengl-es', 0.43934795250101011),
 ('interface', 0.42281249846307989)]

In [44]:
sorted(pairwise_similarity_dict["sql"],key=lambda tpl : tpl[1],reverse=True)[1:20]

[('tsql', 0.96190931700883242),
 ('sql-server', 0.95254598931613399),
 ('sql-server-2008', 0.92916280956670538),
 ('mysql', 0.91129508962301642),
 ('select', 0.89782465075868878),
 ('sql-server-2012', 0.88684567432481809),
 ('sql-server-2008-r2', 0.88480977786474824),
 ('sql-server-2005', 0.88250364108234103),
 ('database', 0.85831626958746798),
 ('join', 0.83547133367856985),
 ('postgresql', 0.83394817323795101),
 ('oracle', 0.83280420103241704),
 ('ms-access', 0.81548975867485429),
 ('group-by', 0.80747757699783751),
 ('sql-update', 0.79760414291182635),
 ('database-design', 0.79697121542133098),
 ('oracle11g', 0.78241275665253496),
 ('table', 0.77902824083126654),
 ('linq-to-sql', 0.77501594490089354)]

In [45]:
sorted_pairwise_similarity_dict = dict()

for (tag, similarities_to_other_tags) in pairwise_similarity_dict.items():
    
    sorted_similarities = sorted(similarities_to_other_tags,key=lambda tpl: tpl[1],reverse=True)
    sorted_without_self = sorted_similarities[1:]
    
    sorted_pairwise_similarity_dict[tag] = sorted_without_self

In [46]:
sorted_pairwise_similarity_dict["sql"][:20]

[('tsql', 0.96190931700883242),
 ('sql-server', 0.95254598931613399),
 ('sql-server-2008', 0.92916280956670538),
 ('mysql', 0.91129508962301642),
 ('select', 0.89782465075868878),
 ('sql-server-2012', 0.88684567432481809),
 ('sql-server-2008-r2', 0.88480977786474824),
 ('sql-server-2005', 0.88250364108234103),
 ('database', 0.85831626958746798),
 ('join', 0.83547133367856985),
 ('postgresql', 0.83394817323795101),
 ('oracle', 0.83280420103241704),
 ('ms-access', 0.81548975867485429),
 ('group-by', 0.80747757699783751),
 ('sql-update', 0.79760414291182635),
 ('database-design', 0.79697121542133098),
 ('oracle11g', 0.78241275665253496),
 ('table', 0.77902824083126654),
 ('linq-to-sql', 0.77501594490089354),
 ('insert', 0.76291688550989623)]

In [47]:
pickle.dump(sorted_pairwise_similarity_dict,open(PICKLE_DIR_ROOT+"/sorted_pairwise_similarity_dict.p",'wb'))

what's the mean and stddev of the similarity between all tags and each other?

In [65]:
running_avgs = []

for (tag, similarities_to_other_tags) in pairwise_similarity_dict.items():
    sims = [tpl[1] for tpl in similarities_to_other_tags]
    
    avg = np.array(sims).mean()
    
    running_avgs.append(avg)
    
np.array(running_avgs).mean(),np.array(running_avgs).std()

(0.40836471509708244, 0.076821269009175339)