In [1]:
import csv
import os
import pickle
import re
import sys
import numpy as np

from joblib import Parallel, delayed

from difflib import SequenceMatcher,get_close_matches

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Dropout, Input, Flatten, Activation
from keras.layers import Conv1D, MaxPooling1D, Embedding, GlobalAvgPool1D, GlobalMaxPooling1D
from keras.models import Model, Sequential

from numpy import linalg as LA

from scipy import spatial

from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from tqdm import *

module_path = os.path.abspath(os.path.join('../helpers/'))
if module_path not in sys.path:
    sys.path.append(module_path)

# my stuff in the helpers/ directory
import embeddings_helper, files_helper, texts_helper, metrics_helper, tags_helper,cobrinha_helper

from cobrinha_helper import evaluate_cobrinha


Using TensorFlow backend.


In [2]:
SEED=np.random.randint(1,1000)
SEED

702

In [3]:
np.random.seed(SEED)

In [4]:
PICKLE_DIR_ROOT = "/media/felipe/SSD_VOLUME/auto-tagger/data/tag-hierarchy/"

In [5]:
texts, labels = files_helper.read_stackoverflow_sample_stanford_tokenized("Medium-Sample-Posts-Shuffled",ssd=True)

In [6]:
MAX_NB_WORDS = 5000
MAX_SEQUENCE_LENGTH = 1000
LABELS_MIN_DOC_PROPORTION = 0.0006 # only tags appearing in at least CORPUS_SIZE * LABELS_MIN_DOC_PROPORTION will be used
TOKENIZER_FILTERS='' # I will perform tokenization myself

In [7]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS,
                     filters=TOKENIZER_FILTERS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# word => word_index_position
word_index = tokenizer.word_index

# word_index_position => word
inverse_word_index = texts_helper.build_inverse_word_index(word_index)

In [8]:
corpus_size = len(texts)
labels_min_doc_count = int(corpus_size * LABELS_MIN_DOC_PROPORTION)

corpus_size,labels_min_doc_count

(222000, 133)

In [9]:
truncated_labels = tags_helper.truncate_labels(labels,labels_min_doc_count)

In [10]:
lb = preprocessing.MultiLabelBinarizer()
binary_labels = lb.fit_transform(truncated_labels)

In [11]:
# make each document (sequence of word indices) be truncated to 
# MAX_SEQUENCE_LENGTH
tokenized_texts = []

for seq in sequences:
    truncated_seq = seq[:MAX_SEQUENCE_LENGTH]
    tokenized_txt = " ".join([inverse_word_index[idx] for idx in truncated_seq])
    tokenized_texts.append(tokenized_txt)

In [12]:
# extracting IDF weights to weight the embeddings

# snooping
vect = TfidfVectorizer(max_features=MAX_NB_WORDS).fit(tokenized_texts)

feature_names = vect.get_feature_names()
idf = vect.idf_

# word => word IDF
idf_index = dict(zip(vect.get_feature_names(), idf))

In [13]:
document_vectors = vect.transform(tokenized_texts)

In [23]:
NUM_WORDS = document_vectors[0].shape[1]
NUM_WORDS

4692

In [15]:
# document_id => [tag1, tag2, tag3, ...]
document_tag_index = tags_helper.get_document_tag_index(binary_labels,lb.classes_)

# tag => [document_id1, document_id2, document_id3, ...]
tag_document_index = tags_helper.get_tag_document_index(binary_labels, lb.classes_)

### build tag_vocabulary

In [16]:
tag_vocabulary = lb.classes_

In [17]:
len(tag_vocabulary)

618

In [18]:
tag_vocabulary

array(['.htaccess', '.net', '.net-4.0', '3d', 'access-vba',
       'actionscript-3', 'active-directory', 'activerecord', 'ajax',
       'algorithm', 'amazon-ec2', 'amazon-s3', 'amazon-web-services',
       'android', 'android-activity', 'android-asynctask',
       'android-edittext', 'android-emulator', 'android-fragments',
       'android-intent', 'android-layout', 'android-listview',
       'android-ndk', 'android-studio', 'android-viewpager',
       'angular-ui-router', 'angular2', 'angularjs', 'angularjs-directive',
       'angularjs-scope', 'animation', 'annotations', 'ant', 'apache',
       'apache-spark', 'api', 'architecture', 'arduino', 'arraylist',
       'arrays', 'asp-classic', 'asp.net', 'asp.net-core', 'asp.net-mvc',
       'asp.net-mvc-2', 'asp.net-mvc-3', 'asp.net-mvc-4', 'asp.net-mvc-5',
       'asp.net-web-api', 'assembly', 'asynchronous', 'attributes',
       'audio', 'authentication', 'autocomplete', 'autolayout',
       'automation', 'awk', 'azure', 'backbone.js', 

In [19]:
pickle.dump(tag_vocabulary,open(PICKLE_DIR_ROOT+"/tag_vocabulary.p","wb"))

In [20]:
i = 1
tokenized_texts[i],document_tag_index[i]

("replacing asp datagrid control to custom jquery control ( , or other library ) . we have a aspx application . we are using so many asp server controls in our application which in ui look n feel . i have been looking into and other asp custom control library . we are also looking for open source library which doesnt come with license . i want to ask how can we start replacing all the asp . net control to any good ui custom control library . as we want to change as much less our code . i need pointers to open source asp custom control ui library . and steps needed . in aspx code : in the server side we have a bind function private sub ( ) dim as data . ' bind data grid = / / function which is returning sets from database / / me . . datasource = me . . databind ( ) end sub and couple more handlers to handle data grid events like private sub ( byval source as system . object , byval e as system . web . ui . . ) handles .",
 ['asp.net', 'jquery', 'vb.net'])

### build tag_vectors_index

> if a tag is the ONLY tag assigned to some document, that's probably a more representative document of this tag, than if this tag were only one among many others

In [24]:
# tag => tag_vector
tag_vectors_index = dict()

for (tag,document_ids) in tag_document_index.items():
    
    tag_vector = np.zeros(NUM_WORDS)
    
    for document_id in document_ids:
        document_vector = document_vectors[document_id]
        num_tags = len(document_tag_index[document_id])


        weight = 1.0 / num_tags

        weighted_document_vector = document_vector * weight
        
        tag_vector = tag_vector + weighted_document_vector
        
    tag_vectors_index[tag] = np.asarray(tag_vector).ravel()

In [25]:
pickle.dump(tag_vectors_index,open(PICKLE_DIR_ROOT+"/tag_vectors_index.p","wb"))

In [None]:
normalized_tag_vectors_index = dict()    

## THIS SUCKS.. WHY?
## normalize vectors to make cosine similarity more accurate  
for (tag,tag_vector) in tag_vectors_index.items():    
    normalized_tag_vectors_index[tag] = LA.norm(tag_vector, 2)

In [26]:
def _cosine_similarity(a,b):
    return 1 - spatial.distance.cosine(a, b)

In [27]:
target_tags = ['sql-server-2008','sql','sql-server','mysql','database','postgresql','select','join','oracle','tsql','sqlite']

pairs = [ (a,b) for a in target_tags for b in target_tags if a != b ]

In [28]:
for (tag_a,tag_b) in pairs:
    tag_a_vector = tag_vectors_index[tag_a]
    tag_b_vector = tag_vectors_index[tag_b]
    
    sim = _cosine_similarity(tag_a_vector,tag_b_vector)
    
    print("SIM '{}' '{}' => {}".format(tag_a,tag_b,sim))
    

SIM 'sql-server-2008' 'sql' => 0.9395895746882705
SIM 'sql-server-2008' 'sql-server' => 0.9821119605172598
SIM 'sql-server-2008' 'mysql' => 0.8226097503208992
SIM 'sql-server-2008' 'database' => 0.8275114596159142
SIM 'sql-server-2008' 'postgresql' => 0.7747714200885653
SIM 'sql-server-2008' 'select' => 0.7972666890913032
SIM 'sql-server-2008' 'join' => 0.7392739084398778
SIM 'sql-server-2008' 'oracle' => 0.7797090548419625
SIM 'sql-server-2008' 'tsql' => 0.9378127522029255
SIM 'sql-server-2008' 'sqlite' => 0.6490600970004755
SIM 'sql' 'sql-server-2008' => 0.9395895746882705
SIM 'sql' 'sql-server' => 0.9505872018144442
SIM 'sql' 'mysql' => 0.9103751542945812
SIM 'sql' 'database' => 0.8543220589394381
SIM 'sql' 'postgresql' => 0.8322720471873928
SIM 'sql' 'select' => 0.8906146287347845
SIM 'sql' 'join' => 0.847780243207564
SIM 'sql' 'oracle' => 0.8386410400720171
SIM 'sql' 'tsql' => 0.9554279729324378
SIM 'sql' 'sqlite' => 0.6869169237548929
SIM 'sql-server' 'sql-server-2008' => 0.98211

if all other tags are, on average, more similar to A than to B, then A is probably more generic than B

should it be normalized by the difference between A and B?

will fail for tags that are below B?

In [29]:
sql_vector = tag_vectors_index['sql']
avg_diff = list()

for (tag_name,tag_vector) in tag_vectors_index.items():
    avg_diff.append(_cosine_similarity(tag_a_vector,sql_vector))
    
np.mean(np.array(avg_diff))

0.68691692375489299

In [30]:
sql_vector = tag_vectors_index['sql-server']
avg_diff = list()

for (tag_name,tag_vector) in tag_vectors_index.items():
    avg_diff.append(_cosine_similarity(tag_a_vector,sql_vector))
    
np.mean(np.array(avg_diff))

0.66235291625552073

In [31]:
sql_vector = tag_vectors_index['sql-server-2008']
avg_diff = list()

for (tag_name,tag_vector) in tag_vectors_index.items():
    avg_diff.append(_cosine_similarity(tag_a_vector,sql_vector))
    
np.mean(np.array(avg_diff))

0.64906009700047551

In [32]:
def get_sim(tag_a,tag_b):
    
    sim = _cosine_similarity(tag_vectors_index[tag_a],tag_vectors_index[tag_b])
    
    return (tag_a,tag_b,sim)

In [33]:
all_tags = [tag for (tag,_) in tag_vectors_index.items()]

In [34]:
sims = Parallel(n_jobs=-1)(delayed(get_sim)(a,b) for a in all_tags for b in all_tags)

In [35]:
sims[:10]

[('asp.net-mvc-5', 'asp.net-mvc-5', 0.99999999999999978),
 ('asp.net-mvc-5', 'visual-studio-2013', 0.6078486953790867),
 ('asp.net-mvc-5', 'windows', 0.64490353939086387),
 ('asp.net-mvc-5', 'file-upload', 0.53429951630358219),
 ('asp.net-mvc-5', 'crystal-reports', 0.37989519582539),
 ('asp.net-mvc-5', 'console', 0.52062722307343112),
 ('asp.net-mvc-5', 'user-controls', 0.58667391415372983),
 ('asp.net-mvc-5', 'io', 0.59035272012368512),
 ('asp.net-mvc-5', 'replace', 0.4965473126579063),
 ('asp.net-mvc-5', 'flex', 0.53910200836349897)]

In [36]:
pairwise_similarity_dict = dict()

for tag_a,tag_b, sim in sims:
    
    if pairwise_similarity_dict.get(tag_a) is None:
        pairwise_similarity_dict[tag_a] = [(tag_b,sim)]
    else:
        pairwise_similarity_dict[tag_a].append((tag_b,sim))

In [37]:
pairwise_similarity_dict["sql"][:20]

[('asp.net-mvc-5', 0.59769479208929133),
 ('visual-studio-2013', 0.51486099784763351),
 ('windows', 0.60750827829231147),
 ('file-upload', 0.46489986971607467),
 ('crystal-reports', 0.42603387134466697),
 ('console', 0.48132889780765997),
 ('user-controls', 0.46797348138180328),
 ('io', 0.56905764523141578),
 ('replace', 0.52591183414567777),
 ('flex', 0.49748939598572317),
 ('angular2', 0.47559463419384995),
 ('url', 0.48541590471914975),
 ('dns', 0.43886394356552116),
 ('reactjs', 0.42166100528760331),
 ('magento', 0.42848213488395936),
 ('cocos2d-iphone', 0.53961322512835186),
 ('java-ee', 0.5792233793669771),
 ('dataframe', 0.55324184704451007),
 ('git', 0.33804186710049988),
 ('xaml', 0.55775193200694506)]

In [38]:
sorted(pairwise_similarity_dict["sql"],key=lambda tpl : tpl[1],reverse=True)[1:20]

[('tsql', 0.95542797293243775),
 ('sql-server', 0.95058720181444423),
 ('sql-server-2008', 0.93958957468827053),
 ('mysql', 0.91037515429458116),
 ('select', 0.89061462873478447),
 ('sql-server-2005', 0.88581025847870809),
 ('sql-server-2008-r2', 0.87935180799215285),
 ('database', 0.85432205893943813),
 ('sql-server-2012', 0.85136154912507944),
 ('join', 0.84778024320756395),
 ('oracle', 0.83864104007201712),
 ('postgresql', 0.83227204718739278),
 ('ms-access', 0.81932215209769499),
 ('group-by', 0.79698865335013647),
 ('database-design', 0.79539319358328964),
 ('oracle11g', 0.78170311306235873),
 ('table', 0.7751362391324248),
 ('linq-to-sql', 0.76852875453844216),
 ('insert', 0.75992647257611134)]

In [39]:
sorted_pairwise_similarity_dict = dict()

for (tag, similarities_to_other_tags) in pairwise_similarity_dict.items():
    
    sorted_similarities = sorted(similarities_to_other_tags,key=lambda tpl: tpl[1],reverse=True)
    sorted_without_self = sorted_similarities[1:]
    
    sorted_pairwise_similarity_dict[tag] = sorted_without_self

In [40]:
sorted_pairwise_similarity_dict["sql"][:20]

[('tsql', 0.95542797293243775),
 ('sql-server', 0.95058720181444423),
 ('sql-server-2008', 0.93958957468827053),
 ('mysql', 0.91037515429458116),
 ('select', 0.89061462873478447),
 ('sql-server-2005', 0.88581025847870809),
 ('sql-server-2008-r2', 0.87935180799215285),
 ('database', 0.85432205893943813),
 ('sql-server-2012', 0.85136154912507944),
 ('join', 0.84778024320756395),
 ('oracle', 0.83864104007201712),
 ('postgresql', 0.83227204718739278),
 ('ms-access', 0.81932215209769499),
 ('group-by', 0.79698865335013647),
 ('database-design', 0.79539319358328964),
 ('oracle11g', 0.78170311306235873),
 ('table', 0.7751362391324248),
 ('linq-to-sql', 0.76852875453844216),
 ('insert', 0.75992647257611134),
 ('coldfusion', 0.75616402682368966)]

In [41]:
pickle.dump(sorted_pairwise_similarity_dict,open(PICKLE_DIR_ROOT+"/sorted_pairwise_similarity_dict.p",'wb'))

### build global similarity index

In [42]:
global_similarity_index = dict()

for tag in tag_vocabulary:
    
    similarities_with_current_tag = list()
    
    for other_tag, similarity_to_other_tag in sorted_pairwise_similarity_dict[tag]:
        
        if other_tag != tag:
            similarities_with_current_tag.append(similarity_to_other_tag)

    global_avg_sim_wrt_tag = np.array(similarities_with_current_tag).mean()
    
    global_similarity_index[tag] = global_avg_sim_wrt_tag
            
pickle.dump(global_similarity_index,open(PICKLE_DIR_ROOT+"/global_similarity_index.p",'wb'))

In [43]:
# global_similarity_index

In [44]:
sorted(global_similarity_index.items(),key=lambda tpl: tpl[1],reverse=True)[:20]

[('c#', 0.61429474374359472),
 ('.net', 0.60856767935547096),
 ('coldfusion', 0.60133827473238266),
 ('web', 0.58749988631852479),
 ('c#-4.0', 0.58703059122895906),
 ('design', 0.57828630242397228),
 ('mfc', 0.57473086547385188),
 ('performance', 0.57384818812305094),
 ('optimization', 0.56875933925500555),
 ('.net-4.0', 0.56798499202169783),
 ('javascript', 0.56788158824929913),
 ('java', 0.56740237398190985),
 ('web-applications', 0.56587494861950982),
 ('user-interface', 0.5655724222812859),
 ('security', 0.56419463079111587),
 ('php', 0.56297895418686694),
 ('cocoa', 0.56254237759167014),
 ('asp.net', 0.56224469352671291),
 ('coffeescript', 0.55656888421314732),
 ('winforms', 0.55459198118370545)]

In [48]:
evaluate_cobrinha('android','android-studio',tag_vectors_index,sorted_pairwise_similarity_dict, global_similarity_index) 

(0.1304544666069315, 0.82292004418484122)

In [49]:
evaluate_cobrinha('asp.net','asp.net-mvc',tag_vectors_index,sorted_pairwise_similarity_dict, global_similarity_index) 

(0.034968450719514266, 0.84565024635719455)

In [50]:
evaluate_cobrinha('asp.net-mvc','asp.net-mvc-5',tag_vectors_index,sorted_pairwise_similarity_dict, global_similarity_index) 

(0.0070798645057762455, 0.95671340973047336)

what about unrelated stuff

In [51]:
evaluate_cobrinha('java','arrays',tag_vectors_index,sorted_pairwise_similarity_dict, global_similarity_index) 

(0.12649887344671018, 0.62727084914851483)

In [52]:
evaluate_cobrinha('ruby','python-2.7',tag_vectors_index,sorted_pairwise_similarity_dict, global_similarity_index) 

(-0.016851103574039517, 0.64133340229546998)

In [53]:
evaluate_cobrinha('database','python-2.7',tag_vectors_index,sorted_pairwise_similarity_dict, global_similarity_index) 

(0.027103394222880106, 0.6490934775334003)

what's the mean and stddev of the similarity between all tags and each other?

In [54]:
running_avgs = []

for (tag, similarities_to_other_tags) in pairwise_similarity_dict.items():
    sims = [tpl[1] for tpl in similarities_to_other_tags]
    
    avg = np.array(sims).mean()
    
    running_avgs.append(avg)
    
np.array(running_avgs).mean(),np.array(running_avgs).std()

(0.42193599264174947, 0.077541119988049412)