In [1]:
import csv
import os
import pickle
import re
import sys
import numpy as np

from joblib import Parallel, delayed

from difflib import SequenceMatcher,get_close_matches

from scipy import spatial

from tqdm import *

module_path = os.path.abspath(os.path.join('../helpers/'))
if module_path not in sys.path:
    sys.path.append(module_path)

# my stuff in the helpers/ directory
import embeddings_helper, files_helper, texts_helper, metrics_helper, tags_helper, cobrinha_helper

from cobrinha_helper import get_metrics_for_tag_pair

In [2]:
# Directory holding the pickled tag-hierarchy artifacts loaded below.
# Override with the TAG_HIERARCHY_PICKLE_ROOT environment variable to run the
# notebook on another machine; the default is the original local path.
# os.path.join(..., "") guarantees the trailing separator that the
# `PICKLE_ROOT + "<file>.p"` concatenations in the following cells rely on.
PICKLE_ROOT = os.path.join(
    os.environ.get(
        "TAG_HIERARCHY_PICKLE_ROOT",
        "/media/felipe/SSD_VOLUME/auto-tagger/data/tag-hierarchy/",
    ),
    "",
)

In [3]:
# Load the tag vocabulary; the context manager closes the file handle.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open(PICKLE_ROOT+"tag_vocabulary.p","rb") as pickle_file:
    tag_vocabulary = pickle.load(pickle_file)

In [4]:
# Load the per-tag lists of (other_tag, similarity) pairs; the context manager
# closes the file handle.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open(PICKLE_ROOT+"sorted_pairwise_similarity_dict.p","rb") as pickle_file:
    sorted_pairwise_similarity_dict = pickle.load(pickle_file)

In [5]:
# Load the tag vectors index; the context manager closes the file handle.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open(PICKLE_ROOT+"tag_vectors_index.p","rb") as pickle_file:
    tag_vectors_index = pickle.load(pickle_file)

In [6]:
# Load the tag -> frequency index; the context manager closes the file handle.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open(PICKLE_ROOT+"tag_frequency_index.p","rb") as pickle_file:
    tag_frequency_index = pickle.load(pickle_file)

### build global similarity index

In [68]:
# Generality score per tag: the mean similarity to every *other* tag,
# divided by the natural log of the tag's own frequency (log damping).
global_similarity_index = dict()

for tag in tag_vocabulary:

    # Drop the tag's similarity with itself before averaging.
    sims_to_others = [
        sim
        for (other, sim) in sorted_pairwise_similarity_dict[tag]
        if other != tag
    ]

    frequency = tag_frequency_index[tag]

    mean_similarity = np.array(sims_to_others).mean()

    global_similarity_index[tag] = mean_similarity / np.log(frequency)

In [69]:
# Order the (tag, score) pairs by ascending global average similarity.
sorted_global_similarity_index = dict()
as_tpls = list(global_similarity_index.items())
sorted_tags_by_global_avg_similarities = sorted(
    as_tpls,
    key=lambda tag_and_score: tag_and_score[1],
)

### most general tags

In [70]:
# Order the (tag, score) pairs by descending global average similarity,
# so the highest-scoring ("most general") tags come first.
sorted_global_similarity_index = dict()
as_tpls = list(global_similarity_index.items())
sorted_tags_by_global_avg_similarities = sorted(
    as_tpls,
    key=lambda tag_and_score: tag_and_score[1],
    reverse=True,
)

In [71]:
sorted_tags_by_global_avg_similarities[:20]

[('language-agnostic', 0.098461997477516688),
 ('.net-4.0', 0.096956342134913359),
 ('uwp', 0.096089051381918839),
 ('synchronization', 0.095579031632504044),
 ('statistics', 0.09505454166976697),
 ('coding-style', 0.094875445105894532),
 ('nosql', 0.093511747126850647),
 ('automation', 0.093280635058997655),
 ('windows-runtime', 0.092698271847448985),
 ('nested', 0.091472604414723047),
 ('formatting', 0.089220839212342098),
 ('get', 0.089171133091095756),
 ('network-programming', 0.087998828290756212),
 ('set', 0.08762864623089385),
 ('operating-system', 0.087234527345558852),
 ('design', 0.087185826801526492),
 ('initialization', 0.087169782389706837),
 ('website', 0.087059734517938567),
 ('scripting', 0.086664148146310155),
 ('com', 0.086475458109207134)]

Looks a bit better now... the fact that `language-agnostic` is at the top is a good sign. 

Nothing can be more generic than that.

In [19]:
def evaluate(a,b):
    """Return the cobrinha metrics for the ordered tag pair ``(a, b)``.

    Thin wrapper around ``get_metrics_for_tag_pair`` that binds the
    notebook-level indexes (tag vectors, pairwise similarities and the
    global similarity index) so callers only pass the two tag names.

    Per the notebook's notes, the result is a tuple of the form
    ``(difference_of_global_averages, mutual_similarity)``.
    """
    metrics = get_metrics_for_tag_pair(
        a,
        b,
        tag_vectors_index,
        sorted_pairwise_similarity_dict,
        global_similarity_index,
    )
    return metrics

> the result is a tuple of the form (difference_of_global_averages, mutual_similarity)

positive difference means the first tag is "more general" than the second tag (according to our measure)


In [20]:
# USES hierarchy
evaluate('sql','sql-server')

(-0.0034978322183831859, 0.95254598931613399)

In [21]:
# IS-A hierarchy
evaluate('sql-server','sql-server-2008')

(-0.007392260101224811, 0.98293809979910707)

In [22]:
# IS-A hierarchy
evaluate('python','python-3.x')

(-0.014158187837021469, 0.97338151804814499)

In [23]:
# siblings
evaluate('python-2.7','python-3.x')

(-0.0012258016964297042, 0.97285139543273913)

In [24]:
# IS-A hierarchy
evaluate('database','oracle')

(0.0082165382611330759, 0.76230161650880313)

In [25]:
# USES hierarchy
evaluate('ruby','ruby-on-rails')

(0.0026412682182784877, 0.89413843055747877)

In [26]:
# IS-A hierarchy
evaluate('frameworks','ruby-on-rails')

(0.025344549669578165, 0.57048883172717779)

In [27]:
# PART-OF hierarchy
evaluate('web-applications','ruby-on-rails')

(0.032462531874949352, 0.69688410924691535)

In [28]:
# PART-OF hierarchy
evaluate('web','ruby-on-rails')

(0.032556606451813921, 0.71642165092562582)

In [29]:
# PART-OF hierarchy
evaluate('web','web-applications')

(9.4074576864569548e-05, 0.93570023662497925)

In [30]:
# IS-A hierarchy
evaluate('.net','.net-4.0')

(-0.031287169465293849, 0.96030815224117505)

In [31]:
# USES hierarchy
evaluate('android','android-studio') 

(-0.0043501447807454546, 0.82370215299345206)

In [32]:
# PART-OF hierarchy
evaluate('asp.net','asp.net-mvc') 

(-0.00062812641264734653, 0.84275196216780723)

In [33]:
# IS-A hierarchy
evaluate('asp.net-mvc','asp.net-mvc-5') 

(-0.023854420855456235, 0.96185433698816181)

In [72]:
# IS-A hierarchy
evaluate('asp.net-mvc','asp.net-mvc-4') 

(-0.0095397746784952192, 0.97239882654272058)

In [34]:
# PART-of hierarchy
evaluate('asp.net','asp.net-mvc-5')

(-0.024482547268103581, 0.80538136703023122)

## what about unrelated stuff?

this will probably fluctuate around zero

In [35]:
evaluate('java','arrays')

(0.0033810052090272671, 0.63315370365888091)

In [36]:
evaluate('ruby','python-2.7') 

(-0.0088852857207939659, 0.64061660110488106)

In [37]:
evaluate('database','python-2.7') 

(-0.0014707787135152389, 0.65083198793931007)

## what are the tag pairs that are the best cobrinha parts?

In [38]:
similarities_to_other_tags = sorted_pairwise_similarity_dict["sql"]

In [39]:
similarities_to_other_tags[1]

('sql-server', 0.95254598931613399)

In [40]:
lst = sorted_pairwise_similarity_dict.items()

In [41]:
def make_pairwise_cobrinha_factors_v1(pairwise_similarity_dict):
    """Compute a "cobrinha factor" for every ordered tag pair in the input.

    For each ``tag`` and each ``(other_tag, similarity)`` entry listed under
    it, the factor is the product of the two values returned by
    ``evaluate(tag, other_tag)``: the difference of global averages and the
    mutual similarity.

    Parameters
    ----------
    pairwise_similarity_dict : dict
        Maps each tag to an iterable of ``(other_tag, similarity)`` tuples.
        The similarity stored in the tuple is not read here; the mutual
        similarity returned by ``evaluate`` is used instead.

    Returns
    -------
    tuple of (dict, dict)
        ``(outgoing, incoming)``. ``outgoing[tag]`` lists
        ``("tag ->  other_tag", factor)`` for every pair starting at ``tag``;
        ``incoming[other_tag]`` lists the same tuples grouped by the pair's
        second tag. Lists are in iteration order, not sorted.
    """
    # Pre-seed with the whole vocabulary so that tags without any pair still
    # map to an empty list in both results.
    outgoing_cobrinha_factors_dict = {tag: list() for tag in tag_vocabulary}
    incoming_cobrinha_factors_dict = {tag: list() for tag in tag_vocabulary}

    for (tag, similarities_to_other_tags) in tqdm(pairwise_similarity_dict.items()):

        for (other_tag, _stored_similarity) in similarities_to_other_tags:

            (generality_difference, mutual_similarity) = evaluate(tag, other_tag)

            cobrinha_factor = generality_difference * mutual_similarity
            pair = "{} ->  {}".format(tag, other_tag)

            # setdefault instead of plain indexing: a tag present in the
            # argument but missing from tag_vocabulary no longer raises
            # KeyError.
            outgoing_cobrinha_factors_dict.setdefault(tag, list()).append((pair, cobrinha_factor))
            incoming_cobrinha_factors_dict.setdefault(other_tag, list()).append((pair, cobrinha_factor))

    return (outgoing_cobrinha_factors_dict, incoming_cobrinha_factors_dict)

In [42]:
outgoing_factors_dict,incoming_factors_dict = make_pairwise_cobrinha_factors_v1(sorted_pairwise_similarity_dict)

100%|██████████| 737/737 [00:24<00:00, 29.85it/s]


## each tag, with its most likely child/parent tags in its cobrinha

"parent" -> "child"

a leaf/terminal tag has no "child" tag
the root tag has no "parent" tag

In [43]:
# For every tag, order its (pair description, factor) tuples by factor,
# largest first -- once for the outgoing pairs and once for the incoming ones.
sorted_outgoing_cobrinha_factors_dict = {
    tag: sorted(factors, key=lambda pair_and_factor: pair_and_factor[1], reverse=True)
    for (tag, factors) in outgoing_factors_dict.items()
}

sorted_incoming_cobrinha_factors_dict = {
    tag: sorted(factors, key=lambda pair_and_factor: pair_and_factor[1], reverse=True)
    for (tag, factors) in incoming_factors_dict.items()
}

In [44]:
sorted_incoming_cobrinha_factors_dict['android'][:10]

[('uwp ->  android', 0.033255820770293369),
 ('.net-4.0 ->  android', 0.032828980505127288),
 ('language-agnostic ->  android', 0.03212580905231719),
 ('synchronization ->  android', 0.031656875488256644),
 ('windows-runtime ->  android', 0.030263179386971123),
 ('automation ->  android', 0.03001170916089678),
 ('coding-style ->  android', 0.029939229165608178),
 ('statistics ->  android', 0.028095846639527442),
 ('nosql ->  android', 0.027848622535994093),
 ('c#-4.0 ->  android', 0.027773373848560955)]

In [45]:
sorted_incoming_cobrinha_factors_dict['scala'][:10]

[('language-agnostic ->  scala', 0.026102490901386395),
 ('.net-4.0 ->  scala', 0.024385483627414713),
 ('coding-style ->  scala', 0.024122750565675241),
 ('synchronization ->  scala', 0.022962513461221277),
 ('statistics ->  scala', 0.02221887036718953),
 ('nosql ->  scala', 0.021821106029345162),
 ('uwp ->  scala', 0.021770658924324535),
 ('automation ->  scala', 0.021444858232063761),
 ('functional-programming ->  scala', 0.021090711517254668),
 ('nested ->  scala', 0.020926731438902418)]

In [46]:
sorted_incoming_cobrinha_factors_dict['javascript'][:10]

[('language-agnostic ->  javascript', 0.034945462244093319),
 ('.net-4.0 ->  javascript', 0.032925406353192632),
 ('coding-style ->  javascript', 0.032306922098138777),
 ('uwp ->  javascript', 0.031786574803723384),
 ('automation ->  javascript', 0.030991900029976489),
 ('statistics ->  javascript', 0.03081017213264672),
 ('synchronization ->  javascript', 0.030800982672318161),
 ('get ->  javascript', 0.029778614103775712),
 ('nosql ->  javascript', 0.029386082811344028),
 ('windows-runtime ->  javascript', 0.029203026020183247)]

Note that the list above is nearly the same as the raw list of most general tags.

It looks like the mutual tag similarity isn't affecting it enough...


if "sql" is a "good" parent tag, then the avg of all OUTGOING cobrinha factors should be higher than the average INCOMING cobrinha factors, right?

In [47]:
TAG_NAME='sql'

# Each entry is a (pair description, factor) tuple; only the factor is needed
# for the averages, so the description is not parsed.
outgoing_factors = [factor for (_, factor) in sorted_outgoing_cobrinha_factors_dict[TAG_NAME]]

incoming_factors = [factor for (_, factor) in sorted_incoming_cobrinha_factors_dict[TAG_NAME]]

print("average of outgoing factors: {} \naverage of incoming factors: {}".format(
    np.array(outgoing_factors).mean(),
    np.array(incoming_factors).mean(),
))

average of outgoing factors: -0.006720296365256086 
average of incoming factors: 0.006720296365256086


In [48]:
TAG_NAME='performance'

# Each entry is a (pair description, factor) tuple; only the factor is needed
# for the averages, so the description is not parsed.
outgoing_factors = [factor for (_, factor) in sorted_outgoing_cobrinha_factors_dict[TAG_NAME]]

incoming_factors = [factor for (_, factor) in sorted_incoming_cobrinha_factors_dict[TAG_NAME]]

print("average of outgoing factors: {} \naverage of incoming factors: {}".format(
    np.array(outgoing_factors).mean(),
    np.array(incoming_factors).mean(),
))

average of outgoing factors: 0.004260296917591758 
average of incoming factors: -0.004260296917591758


In [49]:
TAG_NAME='user-interface'

# Each entry is a (pair description, factor) tuple; only the factor is needed
# for the averages, so the description is not parsed.
outgoing_factors = [factor for (_, factor) in sorted_outgoing_cobrinha_factors_dict[TAG_NAME]]

incoming_factors = [factor for (_, factor) in sorted_incoming_cobrinha_factors_dict[TAG_NAME]]

print("average of outgoing factors: {} \naverage of incoming factors: {}".format(
    np.array(outgoing_factors).mean(),
    np.array(incoming_factors).mean(),
))

average of outgoing factors: 0.0067160087015690085 
average of incoming factors: -0.0067160087015690085


now let's see whether a reasonably "bad" parent tag shows the opposite behaviour.

In [50]:
TAG_NAME = 'android-fragments'

# Each entry is a (pair description, factor) tuple; only the factor is needed
# for the averages, so the description is not parsed.
outgoing_factors = [factor for (_, factor) in sorted_outgoing_cobrinha_factors_dict[TAG_NAME]]

incoming_factors = [factor for (_, factor) in sorted_incoming_cobrinha_factors_dict[TAG_NAME]]

print("average of outgoing factors: {} \naverage of incoming factors: {}".format(
    np.array(outgoing_factors).mean(),
    np.array(incoming_factors).mean(),
))

average of outgoing factors: -0.006735859006789629 
average of incoming factors: 0.006735859006789628


## now let's try to find a triple: two tags B and C that have a good cobrinha factor, and a third tag A that has a good cobrinha factor with both B and C (but cobrinha(A,B) should probably be higher than cobrinha(A,C))


A -> B -> C

(TAG C is the most specific tag of the three)

### let's experiment first

In [51]:
# Compare the generality scores of 'scipy' and 'numpy'
# (displayed as a (scipy_score, numpy_score) tuple).
#
# NOTE(review): the "not very good" remark below looks stale. In the recorded
# output scipy scores higher than numpy, which is the ordering the remark
# itself expects. It may predate the log damping applied when building
# global_similarity_index -- confirm, then update or drop the remark.

# this is not very good, because numpy is a subset of scipy
# so scipy should, in theory, be more general than numpy

# perhaps this has to do with the fact that numpy tags are much more common than scipy, and this
# causes numpy-tagged documents to have more words, in general, than scipy?

global_similarity_index['scipy'],global_similarity_index['numpy']

(0.053286080863990483, 0.046774060959158967)

it's bad that 'c#' and things like '.net' and 'design' are at the top, **but** seeing 'python', 'performance',
'optimization' and 'statistics' near the top indicates that something is correct

In [52]:
sorted_incoming_cobrinha_factors_dict['numpy'][:10]

[('statistics ->  numpy', 0.026144294431562264),
 ('language-agnostic ->  numpy', 0.025409634596762955),
 ('coding-style ->  numpy', 0.022193465352248448),
 ('.net-4.0 ->  numpy', 0.021502378686525738),
 ('nosql ->  numpy', 0.020871521779075589),
 ('synchronization ->  numpy', 0.020672158987537412),
 ('automation ->  numpy', 0.020381120661946174),
 ('nested ->  numpy', 0.020190065952798208),
 ('optimization ->  numpy', 0.020172055275786301),
 ('uwp ->  numpy', 0.019975917963927697)]

In [53]:
sorted_incoming_cobrinha_factors_dict['statistics'][:10]

[('language-agnostic ->  statistics', 0.0027956892721835353),
 ('.net-4.0 ->  statistics', 0.0013541776950901935),
 ('uwp ->  statistics', 0.00068569451373005029),
 ('synchronization ->  statistics', 0.00037100443914509635),
 ('coding-style ->  statistics', -0.00013234523485511731),
 ('nosql ->  statistics', -0.0011667871013878717),
 ('automation ->  statistics', -0.0012953492461209991),
 ('windows-runtime ->  statistics', -0.0015575956369949706),
 ('nested ->  statistics', -0.0024627804738334815),
 ('get ->  statistics', -0.0039243416415011374)]

### let's try some triples, starting with tags very low in the hierarchy (i.e. low generality, high specificity)

In [54]:
sorted(global_similarity_index.items(),key=lambda tpl: tpl[1])[:10]

[('gulp', 0.033517105895289334),
 ('git', 0.037247930783205624),
 ('docker', 0.037816760959739584),
 ('boost', 0.038639510721142301),
 ('pip', 0.039126334005130471),
 ('paypal', 0.03999486360695359),
 ('npm', 0.040452189118261483),
 ('socket.io', 0.040635671308347422),
 ('d3.js', 0.040972176172355025),
 ('.htaccess', 0.041231543801691181)]

In [56]:
sorted_incoming_cobrinha_factors_dict['gulp'][:10]

[('.net-4.0 ->  gulp', 0.016599449254427353),
 ('automation ->  gulp', 0.016485782138995504),
 ('uwp ->  gulp', 0.016242511768419902),
 ('language-agnostic ->  gulp', 0.015989805969127489),
 ('synchronization ->  gulp', 0.015460871916904513),
 ('web ->  gulp', 0.015382110713190551),
 ('coding-style ->  gulp', 0.015304935995178718),
 ('windows-runtime ->  gulp', 0.015062820224054245),
 ('compilation ->  gulp', 0.014626718092489372),
 ('web-applications ->  gulp', 0.014623286128637076)]

In [57]:
sorted_incoming_cobrinha_factors_dict['.net-4.0'][:10]

[('language-agnostic ->  .net-4.0', 0.0011616854022605597),
 ('uwp ->  .net-4.0', -0.0006850457487796004),
 ('synchronization ->  .net-4.0', -0.0010456057081472793),
 ('statistics ->  .net-4.0', -0.0013541776950901935),
 ('coding-style ->  .net-4.0', -0.0015589990767072793),
 ('nosql ->  .net-4.0', -0.0024783486233709239),
 ('automation ->  .net-4.0', -0.0027972527123922395),
 ('windows-runtime ->  .net-4.0', -0.0033456671146546925),
 ('nested ->  .net-4.0', -0.0037704357183631059),
 ('formatting ->  .net-4.0', -0.0054081657389775288)]

so we get **"language-agnostic" -> ".net-4.0" -> "gulp"**, which is clearly not good

In [58]:
sorted_incoming_cobrinha_factors_dict['hive'][:10]

[('nosql ->  hive', 0.019191633551789385),
 ('language-agnostic ->  hive', 0.018203759127021293),
 ('statistics ->  hive', 0.018167193506568754),
 ('.net-4.0 ->  hive', 0.017677818308878615),
 ('synchronization ->  hive', 0.01741565585055339),
 ('automation ->  hive', 0.01694153021714228),
 ('coding-style ->  hive', 0.016339279597100461),
 ('uwp ->  hive', 0.015925589012948885),
 ('c#-4.0 ->  hive', 0.015859551683684761),
 ('formatting ->  hive', 0.015783154119934389)]

In [59]:
sorted_incoming_cobrinha_factors_dict['nosql'][:10]

[('language-agnostic ->  nosql', 0.0038976107070543991),
 ('.net-4.0 ->  nosql', 0.0024783486233709239),
 ('uwp ->  nosql', 0.0017244171732186497),
 ('synchronization ->  nosql', 0.0015366062376343671),
 ('statistics ->  nosql', 0.0011667871013878717),
 ('coding-style ->  nosql', 0.00098854157898503278),
 ('automation ->  nosql', -0.00016680776304761632),
 ('windows-runtime ->  nosql', -0.00054410898647803702),
 ('nested ->  nosql', -0.0014231371282940801),
 ('formatting ->  nosql', -0.0029393713112211249)]

so we get **"language-agnostic" -> "nosql" -> "hive"**, which makes a little sense

In [60]:
sorted_incoming_cobrinha_factors_dict['docker'][:10]

[('.net-4.0 ->  docker', 0.020679547392247432),
 ('language-agnostic ->  docker', 0.0201275736808211),
 ('automation ->  docker', 0.020078795113008082),
 ('uwp ->  docker', 0.019363924027328321),
 ('synchronization ->  docker', 0.019355503886943844),
 ('nosql ->  docker', 0.018061271760883738),
 ('web ->  docker', 0.01802983121033877),
 ('windows-runtime ->  docker', 0.017985388209507125),
 ('statistics ->  docker', 0.017955001046870988),
 ('coding-style ->  docker', 0.017705687113756592)]

In [61]:
sorted_incoming_cobrinha_factors_dict['.net'][:10]

[('.net-4.0 ->  .net', 0.030045323898072848),
 ('language-agnostic ->  .net', 0.02718325010248299),
 ('uwp ->  .net', 0.025229746901614034),
 ('synchronization ->  .net', 0.024492442970529237),
 ('coding-style ->  .net', 0.023516991472811138),
 ('statistics ->  .net', 0.022574322663960731),
 ('automation ->  .net', 0.022481233666942352),
 ('windows-runtime ->  .net', 0.022344246154517307),
 ('nosql ->  .net', 0.02199700553943125),
 ('c#-4.0 ->  .net', 0.019510675442893647)]

so we get ".net-4.0" -> ".net" -> "docker", which is also clearly not good

(.net-4.0 and .net are the wrong way around)

> For version 2, I think the raw tag generality is a lot better, but results like "asp.net-mvc" being LESS general than "asp.net-mvc-5" still look bad.

I think the log damping was a good step but maybe we need a function that dampens high values but doesn't punish small values too much so that the above doesn't happen.
