In [1]:
import csv
import os
import pickle
import re
import sys
import numpy as np

from joblib import Parallel, delayed

from difflib import SequenceMatcher,get_close_matches

from scipy import spatial

from tqdm import *

module_path = os.path.abspath(os.path.join('../helpers/'))
if module_path not in sys.path:
    sys.path.append(module_path)

# my stuff in the helpers/ directory
import embeddings_helper, files_helper, texts_helper, metrics_helper, tags_helper, cobrinha_helper

from cobrinha_helper import get_metrics_for_tag_pair

In [2]:
# Root directory of the pickled tag-hierarchy artifacts used by this notebook.
# File paths are built by plain string concatenation (PICKLE_ROOT + "<name>.p"),
# so the trailing slash is required.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
PICKLE_ROOT = "/media/felipe/SSD_VOLUME/auto-tagger/data/tag-hierarchy/"

In [3]:
# Load the pickled `tag_vocabulary` object. The context manager closes the file
# handle, which a bare open() call would leave open.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open(PICKLE_ROOT + "tag_vocabulary.p", "rb") as pickle_file:
    tag_vocabulary = pickle.load(pickle_file)

In [4]:
# Load the pickled `sorted_pairwise_similarity_dict` object. The context manager
# closes the file handle, which a bare open() call would leave open.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open(PICKLE_ROOT + "sorted_pairwise_similarity_dict.p", "rb") as pickle_file:
    sorted_pairwise_similarity_dict = pickle.load(pickle_file)

In [5]:
# Load the pickled `tag_vectors_index` object. The context manager closes the
# file handle, which a bare open() call would leave open.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open(PICKLE_ROOT + "tag_vectors_index.p", "rb") as pickle_file:
    tag_vectors_index = pickle.load(pickle_file)

### build global similarity index

In [6]:
# Map every tag in the vocabulary to the mean of its similarities to all
# *other* tags; an entry pairing the tag with itself is left out of the mean.
global_similarity_index = {
    tag: np.array(
        [
            similarity
            for candidate, similarity in sorted_pairwise_similarity_dict[tag]
            if candidate != tag
        ]
    ).mean()
    for tag in tag_vocabulary
}

In [7]:
# Rank (tag, average global similarity) pairs in ascending order, so the tags
# with the lowest average similarity come first.
sorted_global_similarity_index = {}
as_tpls = list(global_similarity_index.items())
sorted_tags_by_global_avg_similarities = sorted(as_tpls, key=lambda pair: pair[1])

In [8]:
sorted_tags_by_global_avg_similarities[:20]

[('gulp', 0.19527066254503755),
 ('pip', 0.21634628933565575),
 ('vagrant', 0.23903271161456216),
 ('ckeditor', 0.23906470957447498),
 ('sbt', 0.2395941030817981),
 ('recyclerview', 0.240645562610522),
 ('socket.io', 0.24387152236585527),
 ('npm', 0.2501623604716901),
 ('docker', 0.25114799860316461),
 ('gem', 0.25210788788163307),
 ('webpack', 0.25273662633079658),
 ('hive', 0.25340772779634463),
 ('boost', 0.25453970942388382),
 ('tensorflow', 0.25511990598826478),
 ('rubygems', 0.25631605934988944),
 ('paypal', 0.25804881791810436),
 ('gdb', 0.26075799247517195),
 ('gruntjs', 0.26333284945243524),
 ('jqgrid', 0.26348713577539085),
 ('uicollectionview', 0.26389225109749653)]

### most general tags

In [9]:
# Rank (tag, average global similarity) pairs in descending order, so the tags
# with the highest average similarity come first.
sorted_global_similarity_index = {}
as_tpls = list(global_similarity_index.items())
sorted_tags_by_global_avg_similarities = sorted(
    as_tpls,
    key=lambda pair: pair[1],
    reverse=True,
)

In [10]:
sorted_tags_by_global_avg_similarities[:20]

[('c#', 0.60393332042983183),
 ('.net', 0.59913818075582181),
 ('c#-4.0', 0.58743554183478974),
 ('web', 0.58263167756894441),
 ('design', 0.57053636477186953),
 ('performance', 0.56889648744908738),
 ('optimization', 0.56422751980785046),
 ('javascript', 0.5601175189768971),
 ('user-interface', 0.55934441416543368),
 ('cocoa', 0.55806260270253794),
 ('web-applications', 0.55782925701871977),
 ('java', 0.55779187068343317),
 ('.net-4.0', 0.55713084251874179),
 ('dynamic', 0.55689723683270642),
 ('language-agnostic', 0.55481263969358785),
 ('security', 0.55287751783657502),
 ('php', 0.5524340257056598),
 ('asp.net', 0.55056913493752191),
 ('automation', 0.54957631362077564),
 ('ios', 0.5439883533841402)]

note that tags such as "web", "performance", "design", "security" and "user-interface" are **nowhere near** the top assigned tags on SO, but they show up here... and it makes sense because they're very general indeed

In [11]:
def evaluate(a,b):
    """Return the pair metrics for tags `a` and `b`.

    Thin convenience wrapper: forwards both tags, together with the
    notebook-level `tag_vectors_index`, `sorted_pairwise_similarity_dict` and
    `global_similarity_index`, to `get_metrics_for_tag_pair` and returns its
    result unchanged.
    """
    metrics = get_metrics_for_tag_pair(
        a,
        b,
        tag_vectors_index,
        sorted_pairwise_similarity_dict,
        global_similarity_index,
    )
    return metrics

> the result is a tuple of the form (difference_of_global_averages, mutual_similarity)

positive difference means the first tag is "more general" than the second tag (according to our measure)


In [12]:
# USES hierarchy
evaluate('sql','sql-server')

(0.0028483713477022166, 0.95254598931613399)

In [13]:
# IS-A hierarchy
evaluate('sql-server','sql-server-2008')

(0.019295938046841321, 0.98293809979910707)

In [14]:
# IS-A hierarchy
evaluate('python','python-3.x')

(0.030133297622367938, 0.97338151804814499)

In [15]:
# siblings
evaluate('python-2.7','python-3.x')

(0.0038214584489413284, 0.97285139543273913)

In [16]:
# IS-A hierarchy
evaluate('database','oracle')

(0.093997495495894812, 0.76230161650880313)

In [17]:
# USES hierarchy
evaluate('ruby','ruby-on-rails')

(0.0005084793071781224, 0.89413843055747877)

In [18]:
# IS-A hierarchy
evaluate('frameworks','ruby-on-rails')

(-0.029689993357889966, 0.57048883172717779)

In [19]:
# PART-OF hierarchy
evaluate('web-applications','ruby-on-rails')

(0.061193973521479372, 0.69688410924691535)

In [20]:
# PART-OF hierarchy
evaluate('web','ruby-on-rails')

(0.085996394071704008, 0.71642165092562582)

In [21]:
# PART-OF hierarchy
evaluate('web','web-applications')

(0.024802420550224635, 0.93570023662497925)

In [22]:
# IS-A hierarchy
evaluate('.net','.net-4.0')

(0.042007338237080027, 0.96030815224117505)

In [23]:
# USES hierarchy
evaluate('android','android-studio') 

(0.12799580440187042, 0.82370215299345206)

In [24]:
# PART-OF hierarchy
evaluate('asp.net','asp.net-mvc') 

(0.037485702123526088, 0.84275196216780723)

In [25]:
# IS-A hierarchy
evaluate('asp.net-mvc','asp.net-mvc-5') 

(0.011575250312908847, 0.96185433698816181)

In [26]:
# PART-OF hierarchy
evaluate('asp.net','asp.net-mvc-5')

(0.049060952436434935, 0.80538136703023122)

## what about unrelated stuff?

for unrelated tags, the difference of global averages will probably fluctuate around zero

In [27]:
evaluate('java','arrays')

(0.12109613945695896, 0.63315370365888091)

In [28]:
evaluate('ruby','python-2.7') 

(-0.011485884854176454, 0.64061660110488106)

In [29]:
evaluate('database','python-2.7') 

(0.031984013568033376, 0.65083198793931007)

## what are the tag pairs that are the best cobrinha parts?

In [30]:
similarities_to_other_tags = sorted_pairwise_similarity_dict["sql"]

In [31]:
similarities_to_other_tags[1]

('sql-server', 0.95254598931613399)

In [32]:
lst = sorted_pairwise_similarity_dict.items()

In [33]:
def make_pairwise_cobrinha_factors_v1(pairwise_similarity_dict):
    """Compute the "cobrinha factor" of every (tag, other_tag) pair.

    For each pair found in `pairwise_similarity_dict`, `evaluate(tag, other_tag)`
    is called and the two numbers it returns (the notebook describes them as the
    difference of global averages and the mutual similarity) are multiplied.

    Parameters
    ----------
    pairwise_similarity_dict : dict
        Maps a tag to an iterable of `(other_tag, similarity)` tuples. Only the
        tag names are used; the stored similarity value is ignored.

    Returns
    -------
    tuple of (dict, dict)
        `(outgoing, incoming)`. Both map a tag to a list of
        `("tag ->  other_tag", cobrinha_factor)` tuples: `outgoing[tag]` holds the
        pairs where `tag` is on the left-hand side, `incoming[tag]` the pairs
        where it is on the right-hand side. Every tag of the module-level
        `tag_vocabulary` is present as a key, possibly with an empty list.
    """
    outgoing_cobrinha_factors_dict = {tag: [] for tag in tag_vocabulary}
    incoming_cobrinha_factors_dict = {tag: [] for tag in tag_vocabulary}

    for tag, similarities_to_other_tags in tqdm(pairwise_similarity_dict.items()):

        for other_tag, _similarity_to_other_tag in similarities_to_other_tags:

            generality_difference, mutual_similarity = evaluate(tag, other_tag)

            cobrinha_factor = generality_difference * mutual_similarity
            pair = "{} ->  {}".format(tag,other_tag)
            entry = (pair, cobrinha_factor)

            # setdefault keeps this working when the argument mentions a tag that
            # is not in tag_vocabulary (a plain lookup would raise KeyError).
            outgoing_cobrinha_factors_dict.setdefault(tag, []).append(entry)
            incoming_cobrinha_factors_dict.setdefault(other_tag, []).append(entry)

    return (outgoing_cobrinha_factors_dict,incoming_cobrinha_factors_dict)

In [34]:
outgoing_factors_dict,incoming_factors_dict = make_pairwise_cobrinha_factors_v1(sorted_pairwise_similarity_dict)

100%|██████████| 737/737 [00:24<00:00, 29.93it/s]


## each tag, with its most likely child/parent tags in its cobrinha

"parent" -> "child"

a leaf/terminal tag has no "child" tag
the root tag has no "parent" tag

In [38]:
# For every tag, order its (pair description, cobrinha factor) entries from the
# highest factor to the lowest -- once for outgoing pairs, once for incoming.
sorted_outgoing_cobrinha_factors_dict = {
    tag: sorted(entries, key=lambda entry: entry[1], reverse=True)
    for tag, entries in outgoing_factors_dict.items()
}

sorted_incoming_cobrinha_factors_dict = {
    tag: sorted(entries, key=lambda entry: entry[1], reverse=True)
    for tag, entries in incoming_factors_dict.items()
}

In [47]:
sorted_incoming_cobrinha_factors_dict['android'][:10]

[('c# ->  android', 0.069443708768014006),
 ('.net ->  android', 0.063583917605022938),
 ('c#-4.0 ->  android', 0.055285809691326973),
 ('web ->  android', 0.050111929669495701),
 ('design ->  android', 0.040999492252817883),
 ('performance ->  android', 0.038999282076389336),
 ('java ->  android', 0.035541515646188314),
 ('user-interface ->  android', 0.035157020736310822),
 ('optimization ->  android', 0.034164527914380315),
 ('cocoa ->  android', 0.031956620730637791)]

In [50]:
sorted_incoming_cobrinha_factors_dict['scala'][:10]

[('c# ->  scala', 0.12305054476325752),
 ('.net ->  scala', 0.11701096947596533),
 ('c#-4.0 ->  scala', 0.11064242595402106),
 ('web ->  scala', 0.097594288451952643),
 ('design ->  scala', 0.095809983801927984),
 ('java ->  scala', 0.095203822441709829),
 ('performance ->  scala', 0.094615661060767561),
 ('optimization ->  scala', 0.093029182173753652),
 ('dynamic ->  scala', 0.08737851388859591),
 ('language-agnostic ->  scala', 0.086275570695435164)]

In [49]:
sorted_incoming_cobrinha_factors_dict['javascript'][:10]

[('c# ->  javascript', 0.035512134292219548),
 ('.net ->  javascript', 0.030833022041302956),
 ('c#-4.0 ->  javascript', 0.021551564800825197),
 ('web ->  javascript', 0.018789550178211128),
 ('design ->  javascript', 0.0078801601177236145),
 ('performance ->  javascript', 0.0067050583752320632),
 ('optimization ->  javascript', 0.0031506925748634087),
 ('user-interface ->  javascript', -0.00058418204873059349),
 ('cocoa ->  javascript', -0.0015390447421814348),
 ('java ->  javascript', -0.0016300551610532099)]

Note that the lists above are nearly the same as the raw list of the most general tags.

It looks like the mutual tag similarity isn't affecting it enough...


if "sql" is a "good" parent tag, then the avg of all OUTGOING cobrinha factors should be higher than the average INCOMING cobrinha factors, right?

In [41]:
TAG_NAME='sql'

# Only the numeric factor of each (pair description, factor) entry is needed for
# the averages, so the description string is not parsed.
outgoing_factors = [factor for _descr, factor in sorted_outgoing_cobrinha_factors_dict[TAG_NAME]]
incoming_factors = [factor for _descr, factor in sorted_incoming_cobrinha_factors_dict[TAG_NAME]]

print("average of outgoing factors: {} \naverage of incoming factors: {}".format(np.array(outgoing_factors).mean(),np.array(incoming_factors).mean()))

average of outgoing factors: 0.031799859384609795 
average of incoming factors: -0.031799859384609795


In [42]:
TAG_NAME='performance'

# Only the numeric factor of each (pair description, factor) entry is needed for
# the averages, so the description string is not parsed.
outgoing_factors = [factor for _descr, factor in sorted_outgoing_cobrinha_factors_dict[TAG_NAME]]
incoming_factors = [factor for _descr, factor in sorted_incoming_cobrinha_factors_dict[TAG_NAME]]

print("average of outgoing factors: {} \naverage of incoming factors: {}".format(np.array(outgoing_factors).mean(),np.array(incoming_factors).mean()))

average of outgoing factors: 0.08331681178985266 
average of incoming factors: -0.08331681178985266


In [43]:
TAG_NAME='user-interface'

# Only the numeric factor of each (pair description, factor) entry is needed for
# the averages, so the description string is not parsed.
outgoing_factors = [factor for _descr, factor in sorted_outgoing_cobrinha_factors_dict[TAG_NAME]]
incoming_factors = [factor for _descr, factor in sorted_incoming_cobrinha_factors_dict[TAG_NAME]]

print("average of outgoing factors: {} \naverage of incoming factors: {}".format(np.array(outgoing_factors).mean(),np.array(incoming_factors).mean()))

average of outgoing factors: 0.07693146712993348 
average of incoming factors: -0.0769314671299335


now let's see whether a reasonably "bad" parent tag shows the opposite behaviour.

In [44]:
TAG_NAME = 'android-fragments'

# Only the numeric factor of each (pair description, factor) entry is needed for
# the averages, so the description string is not parsed.
outgoing_factors = [factor for _descr, factor in sorted_outgoing_cobrinha_factors_dict[TAG_NAME]]
incoming_factors = [factor for _descr, factor in sorted_incoming_cobrinha_factors_dict[TAG_NAME]]

print("average of outgoing factors: {} \naverage of incoming factors: {}".format(np.array(outgoing_factors).mean(),np.array(incoming_factors).mean()))

average of outgoing factors: -0.036906575333133504 
average of incoming factors: 0.036906575333133504


## now let's try to find a triple. two tags B and C that have good cobrinha factor and a third tag A that has good cobrinha factor with tag B and tag C (but cobrinha(A,B) should probably be higher than cobrinha (A,C))


A -> B -> C

(TAG C is the most specific tag of the three)

### let's experiment first

In [51]:
# this is not very good, because numpy is a subset of scipy
# so scipy should, in theory, be more general than numpy

# perhaps this has to do with the fact that numpy tags are much more common than scipy, and this
# causes numpy-tagged documents to have more words, in general, than scipy?

global_similarity_index['scipy'],global_similarity_index['numpy']

(0.30949315039067699, 0.3329764129312906)

it's bad that 'c#' and things like '.net' and 'design' are at the top **but** 'python' and 'performance' 
and 'optimization' and 'statistics' at the top are indicative that something is correct

In [59]:
sorted_incoming_cobrinha_factors_dict['numpy'][:10]

[('optimization ->  numpy', 0.12690520655630516),
 ('c# ->  numpy', 0.12675415644882873),
 ('.net ->  numpy', 0.12396412448538291),
 ('python ->  numpy', 0.12089679569627088),
 ('performance ->  numpy', 0.12009387439255283),
 ('c#-4.0 ->  numpy', 0.11454889392597092),
 ('web ->  numpy', 0.112109053487704),
 ('design ->  numpy', 0.10978622616194599),
 ('language-agnostic ->  numpy', 0.10905402386014457),
 ('statistics ->  numpy', 0.10551707009304812)]

In [61]:
sorted_incoming_cobrinha_factors_dict['scipy'][:10]

[('optimization ->  scipy', 0.13239369398562054),
 ('c# ->  scipy', 0.12697070983275771),
 ('.net ->  scipy', 0.12553673878070437),
 ('python ->  scipy', 0.12472987151595769),
 ('performance ->  scipy', 0.11941611235100101),
 ('c#-4.0 ->  scipy', 0.1154874373257008),
 ('web ->  scipy', 0.11523784750646386),
 ('statistics ->  scipy', 0.11447571694585298),
 ('design ->  scipy', 0.11193569069957214),
 ('language-agnostic ->  scipy', 0.11138279952459677)]

### let's try some triples, starting with tags very low in the hierarchy (i.e. low generality, high specificity)

In [62]:
sorted(global_similarity_index.items(),key=lambda tpl: tpl[1])[:10]

[('gulp', 0.19527066254503755),
 ('pip', 0.21634628933565575),
 ('vagrant', 0.23903271161456216),
 ('ckeditor', 0.23906470957447498),
 ('sbt', 0.2395941030817981),
 ('recyclerview', 0.240645562610522),
 ('socket.io', 0.24387152236585527),
 ('npm', 0.2501623604716901),
 ('docker', 0.25114799860316461),
 ('gem', 0.25210788788163307)]

In [63]:
sorted_incoming_cobrinha_factors_dict['gulp'][:20]

[('javascript ->  gulp', 0.1165377901599134),
 ('web ->  gulp', 0.11252837978131403),
 ('c# ->  gulp', 0.11079307772772183),
 ('.net ->  gulp', 0.11067822164359718),
 ('node.js ->  gulp', 0.10335771316200482),
 ('c#-4.0 ->  gulp', 0.10318618523697559),
 ('web-applications ->  gulp', 0.10030571586983099),
 ('linux ->  gulp', 0.097864926545714617),
 ('automation ->  gulp', 0.09773528866157645),
 ('performance ->  gulp', 0.096468732524700646),
 ('windows ->  gulp', 0.095785874012095082),
 ('.net-4.0 ->  gulp', 0.094683985033850268),
 ('optimization ->  gulp', 0.094366675749437495),
 ('html5 ->  gulp', 0.094356872046374288),
 ('design ->  gulp', 0.094243052608060138),
 ('configuration ->  gulp', 0.093773236240157862),
 ('cocoa ->  gulp', 0.092433370519195202),
 ('php ->  gulp', 0.092304436537185702),
 ('dynamic ->  gulp', 0.092294376882344473),
 ('html ->  gulp', 0.092279526938796444)]

In [64]:
sorted_incoming_cobrinha_factors_dict['javascript'][:20]

[('c# ->  javascript', 0.035512134292219548),
 ('.net ->  javascript', 0.030833022041302956),
 ('c#-4.0 ->  javascript', 0.021551564800825197),
 ('web ->  javascript', 0.018789550178211128),
 ('design ->  javascript', 0.0078801601177236145),
 ('performance ->  javascript', 0.0067050583752320632),
 ('optimization ->  javascript', 0.0031506925748634087),
 ('user-interface ->  javascript', -0.00058418204873059349),
 ('cocoa ->  javascript', -0.0015390447421814348),
 ('java ->  javascript', -0.0016300551610532099),
 ('web-applications ->  javascript', -0.0017124202389043389),
 ('.net-4.0 ->  javascript', -0.002176369967637184),
 ('dynamic ->  javascript', -0.0026171154991778359),
 ('language-agnostic ->  javascript', -0.00397048688823857),
 ('security ->  javascript', -0.0052844859978735349),
 ('php ->  javascript', -0.0061005975161411551),
 ('asp.net ->  javascript', -0.0073511076631431662),
 ('automation ->  javascript', -0.0078704852688523813),
 ('ios ->  javascript', -0.011702158863775

so we get **"c#" -> "javascript" -> "gulp"**, which is clearly not good

In [65]:
sorted_incoming_cobrinha_factors_dict['hive'][:10]

[('c# ->  hive', 0.12778353181970245),
 ('.net ->  hive', 0.12369783237880885),
 ('c#-4.0 ->  hive', 0.12155164018203478),
 ('database ->  hive', 0.11756862236890592),
 ('performance ->  hive', 0.11720354105256708),
 ('optimization ->  hive', 0.11423880000067886),
 ('java ->  hive', 0.11281993088864363),
 ('web ->  hive', 0.11260837494979277),
 ('design ->  hive', 0.10778895781917346),
 ('dynamic ->  hive', 0.1059951701618147)]

In [67]:
sorted_incoming_cobrinha_factors_dict['c#'][:10]

[('.net ->  c#', -0.0046296089996649163),
 ('c#-4.0 ->  c#', -0.016185519882942825),
 ('web ->  c#', -0.01790181205880997),
 ('design ->  c#', -0.028385499289171001),
 ('performance ->  c#', -0.029279289411717018),
 ('optimization ->  c#', -0.032525567858528788),
 ('javascript ->  c#', -0.035512134292219548),
 ('user-interface ->  c#', -0.037076056097362989),
 ('cocoa ->  c#', -0.037366751573831541),
 ('web-applications ->  c#', -0.037446569463747942)]

so we get **".net" -> "c#" -> "hive"**, which is also clearly not good

In [68]:
sorted_incoming_cobrinha_factors_dict['docker'][:10]

[('.net ->  docker', 0.12832555081187366),
 ('c# ->  docker', 0.12567965695399747),
 ('web ->  docker', 0.12284676876231916),
 ('linux ->  docker', 0.11832055461812542),
 ('c#-4.0 ->  docker', 0.11573627120965377),
 ('web-applications ->  docker', 0.11152213332473625),
 ('performance ->  docker', 0.11109195161762249),
 ('design ->  docker', 0.11028875892843248),
 ('windows ->  docker', 0.10930461290736113),
 ('security ->  docker', 0.10807951808120234)]

In [69]:
sorted_incoming_cobrinha_factors_dict['.net'][:10]

[('c# ->  .net', 0.0046296089996649163),
 ('c#-4.0 ->  .net', -0.011036661366274248),
 ('web ->  .net', -0.014094951178712121),
 ('design ->  .net', -0.024754635024207806),
 ('performance ->  .net', -0.025677379206722646),
 ('optimization ->  .net', -0.02876889606165953),
 ('javascript ->  .net', -0.030833022041302956),
 ('user-interface ->  .net', -0.032788448132743536),
 ('cocoa ->  .net', -0.033645874937763373),
 ('java ->  .net', -0.034558376846936259)]

so we get "c#" -> ".net" -> "docker", which is also clearly not good

> i think the lesson here is that tags that are very common like "c#" and ".net" are dominating the effect here, although it's nice to see more reasonable parent tags a bit down the ranking..

> maybe we need to tweak the weighting so that tag similarity has more say than generality.