### version 2 uses normalized generality

In [1]:
import csv
import os
import pickle
import re
import sys
import numpy as np

from joblib import Parallel, delayed

from difflib import SequenceMatcher,get_close_matches

from scipy import spatial

from tqdm import *

module_path = os.path.abspath(os.path.join('../helpers/'))
if module_path not in sys.path:
    sys.path.append(module_path)

# my stuff in the helpers/ directory
import embeddings_helper, files_helper, texts_helper, metrics_helper, tags_helper, cobrinha_helper

from cobrinha_helper import get_metrics_for_tag_pair

In [2]:
# Root directory of the pickled tag-hierarchy artifacts. File names are appended
# with plain string concatenation, so the value must end with a path separator.
# Set the AUTO_TAGGER_PICKLE_ROOT environment variable to point the notebook at a
# different location; the default is the original hard-coded path.
PICKLE_ROOT = os.environ.get(
    "AUTO_TAGGER_PICKLE_ROOT",
    "/media/felipe/SSD_VOLUME/auto-tagger/data/tag-hierarchy/",
)

In [3]:
# Collection of tags iterated over by the cells below.
# The context manager closes the file handle, which a bare open() call would leave open.
# NOTE: pickle.load can execute arbitrary code -- only load files you produced yourself.
with open(PICKLE_ROOT + "tag_vocabulary.p", "rb") as pickle_file:
    tag_vocabulary = pickle.load(pickle_file)

In [4]:
# Maps each tag to a list of (other_tag, similarity) tuples.
# The context manager closes the file handle, which a bare open() call would leave open.
# NOTE: pickle.load can execute arbitrary code -- only load files you produced yourself.
with open(PICKLE_ROOT + "sorted_pairwise_similarity_dict.p", "rb") as pickle_file:
    sorted_pairwise_similarity_dict = pickle.load(pickle_file)

In [5]:
# Index handed to cobrinha_helper.get_metrics_for_tag_pair further down.
# The context manager closes the file handle, which a bare open() call would leave open.
# NOTE: pickle.load can execute arbitrary code -- only load files you produced yourself.
with open(PICKLE_ROOT + "tag_vectors_index.p", "rb") as pickle_file:
    tag_vectors_index = pickle.load(pickle_file)

In [6]:
# Maps each tag to a precomputed global similarity score (a float).
# The context manager closes the file handle, which a bare open() call would leave open.
# NOTE: pickle.load can execute arbitrary code -- only load files you produced yourself.
with open(PICKLE_ROOT + "global_similarity_index.p", "rb") as pickle_file:
    global_similarity_index = pickle.load(pickle_file)

In [7]:
# Maps each tag to the number of times it was assigned.
# The context manager closes the file handle, which a bare open() call would leave open.
# NOTE: pickle.load can execute arbitrary code -- only load files you produced yourself.
with open(PICKLE_ROOT + "tag_frequency_index.p", "rb") as pickle_file:
    tag_frequency_index = pickle.load(pickle_file)

In [8]:
# the 20 most frequently assigned tags, most frequent first
sorted(tag_frequency_index.items(),key=lambda tpl: tpl[1],reverse=True)[:20]

[('javascript', 49956),
 ('java', 45428),
 ('c#', 39420),
 ('php', 38873),
 ('android', 36024),
 ('jquery', 30606),
 ('python', 26592),
 ('html', 23401),
 ('c++', 18631),
 ('ios', 18456),
 ('css', 16798),
 ('mysql', 16690),
 ('sql', 13905),
 ('asp.net', 11573),
 ('objective-c', 10239),
 ('ruby-on-rails', 10018),
 ('.net', 9169),
 ('c', 8896),
 ('angularjs', 8468),
 ('iphone', 8163)]

In [71]:
# Version-2 generality score: a tag's mean similarity to every *other* tag,
# divided by the natural log of the tag's frequency.
global_similarity_index_v2 = dict()

for tag in tag_vocabulary:

    # leave out the tag's similarity to itself, should the pairwise list contain it
    similarities_with_current_tag = [
        similarity_to_other_tag
        for other_tag, similarity_to_other_tag in sorted_pairwise_similarity_dict[tag]
        if other_tag != tag
    ]

    freq = tag_frequency_index[tag]

    # log(1) == 0 would turn the score into inf (and log is unusable below 1);
    # an empty similarity list would give nan. Fail loudly instead of letting
    # such values silently distort the rankings computed from this index.
    if freq <= 1:
        raise ValueError(
            "cannot normalize tag {!r}: frequency {} has no positive logarithm".format(tag, freq)
        )
    if not similarities_with_current_tag:
        raise ValueError(
            "cannot score tag {!r}: no similarities to other tags".format(tag)
        )

    global_avg_sim_wrt_tag = np.mean(similarities_with_current_tag) / np.log(freq)

    print("tag: "+tag+" freq: "+ str(freq))

    global_similarity_index_v2[tag] = global_avg_sim_wrt_tag

tag: .htaccess freq: 2074
tag: .net freq: 9169
tag: .net-4.0 freq: 313
tag: 3d freq: 387
tag: access-vba freq: 319
tag: actionscript-3 freq: 1447
tag: active-directory freq: 381
tag: activerecord freq: 816
tag: ajax freq: 6053
tag: algorithm freq: 2729
tag: amazon-ec2 freq: 591
tag: amazon-s3 freq: 653
tag: amazon-web-services freq: 1383
tag: android freq: 36024
tag: android-actionbar freq: 322
tag: android-activity freq: 890
tag: android-asynctask freq: 455
tag: android-edittext freq: 357
tag: android-emulator freq: 325
tag: android-fragments freq: 1169
tag: android-intent freq: 892
tag: android-layout freq: 1647
tag: android-listview freq: 577
tag: android-ndk freq: 425
tag: android-studio freq: 1380
tag: android-viewpager freq: 319
tag: angular-ui-router freq: 305
tag: angular2 freq: 1558
tag: angularjs freq: 8468
tag: angularjs-directive freq: 540
tag: animation freq: 1107
tag: ant freq: 492
tag: apache freq: 2535
tag: apache-spark freq: 915
tag: api freq: 1890
tag: architecture fr

### least general tags

In [72]:
# Rank every tag by its normalized generality score, lowest (least general) first.
sorted_global_similarity_index = dict()
as_tpls = list(global_similarity_index_v2.items())
sorted_tags_by_global_avg_similarities = sorted(
    as_tpls,
    key=lambda tag_and_score: tag_and_score[1],
)

In [73]:
sorted_tags_by_global_avg_similarities[:20]

[('gulp', 0.03459942504344083),
 ('git', 0.038525571638842182),
 ('docker', 0.039225768413069829),
 ('boost', 0.039923982102107236),
 ('paypal', 0.041578266020776744),
 ('socket.io', 0.041685333589716281),
 ('npm', 0.042146220246530995),
 ('d3.js', 0.042685676986351132),
 ('.htaccess', 0.042775596888459638),
 ('google-maps', 0.042841501390705403),
 ('google-maps-api-3', 0.043174855685175241),
 ('android-fragments', 0.043189690057949037),
 ('recyclerview', 0.04319886381021866),
 ('gradle', 0.043316949304465342),
 ('tkinter', 0.04383055286005174),
 ('apache-spark', 0.044167096431171955),
 ('heroku', 0.044197525787492774),
 ('webview', 0.044249421670785508),
 ('uitableview', 0.044373724086656095),
 ('hive', 0.04438275494483529)]

note that tags such as "web", "performance", "design", "security" and "user-interface" are **nowhere near** the top assigned tags on SO, but they show up here... and it makes sense because they're very general indeed

### most general tags

In [74]:
# Rank every tag by its normalized generality score, highest (most general) first.
# This rebinds the name that held the ascending ranking in the previous section.
sorted_global_similarity_index = dict()
as_tpls = list(global_similarity_index_v2.items())
sorted_tags_by_global_avg_similarities = sorted(
    as_tpls,
    key=lambda tag_and_score: tag_and_score[1],
    reverse=True,
)

In [75]:
sorted_tags_by_global_avg_similarities[:20]

[('coldfusion', 0.10126949169741077),
 ('.net-4.0', 0.099866593822834748),
 ('coffeescript', 0.099787675571294795),
 ('vb6', 0.098526648545393503),
 ('telerik', 0.097878109648431591),
 ('blackberry', 0.097861085857123886),
 ('mfc', 0.097778432998253603),
 ('automation', 0.096189595661966998),
 ('windows-runtime', 0.09481087770903647),
 ('internationalization', 0.092074587196500551),
 ('formatting', 0.091981182119723559),
 ('get', 0.091764572420760263),
 ('webforms', 0.090860124239995815),
 ('network-programming', 0.09083603234962459),
 ('prolog', 0.090515022580088367),
 ('concurrency', 0.089983335814520421),
 ('website', 0.089760608017421459),
 ('design', 0.089688198073155617),
 ('com', 0.089611591522300971),
 ('web-scraping', 0.089456021716997278)]

In [25]:
def evaluate(a,b):
    """Return the cobrinha metrics for the ordered tag pair ``(a, b)``.

    Thin wrapper around ``get_metrics_for_tag_pair`` that supplies the
    notebook-level indexes, using the version-2 (frequency-normalized)
    generality index.

    According to the notes in this notebook the result is a tuple
    ``(difference_of_global_averages, mutual_similarity)``; a positive
    difference means ``a`` is "more general" than ``b``.
    """
    metrics = get_metrics_for_tag_pair(
        a,
        b,
        tag_vectors_index,
        sorted_pairwise_similarity_dict,
        global_similarity_index_v2,
    )
    return metrics

> the result is a tuple of the form (difference_of_global_averages, mutual_similarity)

positive difference means the first tag is "more general" than the second tag (according to our measure)


In [26]:
evaluate('sql','sql-server')

(-0.0038743695114508425, 0.95360188572903903)

In [27]:
evaluate('sql-server','sql-server-2008')

(-0.0077137429211412839, 0.98371627237454562)

In [28]:
evaluate('python','python-3.x')

(-0.014578811401950692, 0.97481770134639489)

In [29]:
evaluate('python-2.7','python-3.x')

(-0.0011267936724401184, 0.97453923487543426)

In [30]:
evaluate('database','oracle')

(0.0083521136297889098, 0.76511062673459118)

In [31]:
# hmmm... maybe because rails is a much stronger child of something like "web-framework"?
evaluate('ruby','ruby-on-rails')

(0.0024735863134016309, 0.89492664209399198)

In [32]:
# hmmmm
evaluate('frameworks','ruby-on-rails')

(0.026283454915515536, 0.57081851125952732)

In [33]:
evaluate('web','ruby-on-rails')

(0.03364917427786146, 0.71753596334150105)

In [34]:
evaluate('.net','.net-4.0')

(-0.032337918512040834, 0.96277274170640192)

In [35]:
evaluate('android','android-studio') 

(-0.0041151581314893132, 0.82069625351587672)

In [36]:
evaluate('asp.net','asp.net-mvc') 

(-0.0007618717296940386, 0.84710098775189802)

In [37]:
evaluate('asp.net-mvc','asp.net-mvc-5') 

(-0.024928081832259155, 0.96488890593856147)

In [38]:
evaluate('asp.net','asp.net-mvc-5') 

(-0.025689953561953194, 0.8120323924510352)

## what about unrelated stuff?

this will probably fluctuate around zero

In [39]:
evaluate('java','arrays')

(0.0033904039905508221, 0.63486068665743045)

In [40]:
evaluate('ruby','python-2.7') 

(-0.0097213999368659584, 0.64158676537827219)

In [41]:
evaluate('database','python-2.7') 

(-0.0016449620448198837, 0.65515824936886746)

## what are the tag pairs that are the best cobrinha parts?

In [42]:
similarities_to_other_tags = sorted_pairwise_similarity_dict["sql"]

In [43]:
similarities_to_other_tags[1]

('sql-server', 0.95360188572903903)

In [None]:
# NOTE(review): `lst` is not referenced by any later cell in the visible notebook, and this
# cell has no execution count -- looks like a leftover; confirm before removing.
lst = sorted_pairwise_similarity_dict.items()

In [44]:
def make_pairwise_cobrinha_factors(pairwise_similarity_dict):
    """Compute the "cobrinha factor" of every ordered tag pair.

    For each ``tag`` and each ``(other_tag, similarity)`` entry listed for it,
    the factor is the product of the two values returned by
    ``evaluate(tag, other_tag)``: the generality difference and the mutual
    similarity. The similarity stored in ``pairwise_similarity_dict`` itself
    is not used; only the pairing is.

    Parameters
    ----------
    pairwise_similarity_dict : dict
        Maps a tag to an iterable of ``(other_tag, similarity)`` tuples.

    Returns
    -------
    tuple of (dict, dict)
        ``(outgoing, incoming)``. ``outgoing[tag]`` lists ``(label, factor)``
        for every pair that starts at ``tag``; ``incoming[tag]`` lists the
        same tuples for every pair that ends at ``tag``. ``label`` is the
        string ``"<tag> ->  <other_tag>"``. Every tag in the module-level
        ``tag_vocabulary`` has an entry in both dicts, even if its list is empty.
    """
    # seed with the whole vocabulary so that tags without any pair still get a list
    outgoing_cobrinha_factors_dict = {tag: list() for tag in tag_vocabulary}
    incoming_cobrinha_factors_dict = {tag: list() for tag in tag_vocabulary}

    for (tag, similarities_to_other_tags) in tqdm(pairwise_similarity_dict.items()):

        for (other_tag, _similarity_to_other_tag) in similarities_to_other_tags:

            (generality_difference, mutual_similarity) = evaluate(tag, other_tag)

            cobrinha_factor = generality_difference * mutual_similarity
            pair = "{} ->  {}".format(tag, other_tag)

            # setdefault keeps a tag that is missing from tag_vocabulary from
            # raising KeyError; vocabulary tags behave exactly as before
            outgoing_cobrinha_factors_dict.setdefault(tag, list()).append((pair, cobrinha_factor))
            incoming_cobrinha_factors_dict.setdefault(other_tag, list()).append((pair, cobrinha_factor))

    return (outgoing_cobrinha_factors_dict, incoming_cobrinha_factors_dict)

In [45]:
outgoing_factors_dict,incoming_factors_dict = make_pairwise_cobrinha_factors(sorted_pairwise_similarity_dict)

100%|██████████| 617/617 [00:13<00:00, 45.27it/s]


In [46]:
incoming_factors_dict['sql'][:10]

[('graph ->  sql', 0.0081831529207102155),
 ('uitableview ->  sql', -0.0032182639527065437),
 ('deployment ->  sql', 0.014899860283764237),
 ('regex ->  sql', -0.00090107271024596495),
 ('command-line ->  sql', 0.014748599008997595),
 ('datagrid ->  sql', 0.003518207603231919),
 ('serialization ->  sql', 0.012575405780442995),
 ('garbage-collection ->  sql', 0.006465973228088155),
 ('html ->  sql', 0.0014545052682485619),
 ('url ->  sql', 0.0067025797091360287)]

## each tag, with its most likely child/parent tags in its cobrinha

"parent" -> "child"

a leaf/terminal tag has no "child" tag
the root tag has no "parent" tag

In [47]:
# For every tag, order its (pair label, cobrinha factor) tuples from the
# highest factor to the lowest, separately for outgoing and incoming pairs.
sorted_outgoing_cobrinha_factors_dict = {
    source_tag: sorted(labelled_factors, key=lambda entry: entry[1], reverse=True)
    for source_tag, labelled_factors in outgoing_factors_dict.items()
}

sorted_incoming_cobrinha_factors_dict = {
    target_tag: sorted(labelled_factors, key=lambda entry: entry[1], reverse=True)
    for target_tag, labelled_factors in incoming_factors_dict.items()
}

In [48]:
sorted_outgoing_cobrinha_factors_dict['sql'][:20]

[('sql ->  git', 0.0048014206588106647),
 ('sql ->  docker', 0.0037894525352867537),
 ('sql ->  boost', 0.0036038663207627873),
 ('sql ->  hive', 0.0036018893805477587),
 ('sql ->  gulp', 0.0035785764153825387),
 ('sql ->  pandas', 0.0034261453941780377),
 ('sql ->  .htaccess', 0.0033605553612145395),
 ('sql ->  paypal', 0.0033552110052657751),
 ('sql ->  d3.js', 0.0033172476459604008),
 ('sql ->  google-maps', 0.0032649929583456658),
 ('sql ->  uitableview', 0.0032182639527065437),
 ('sql ->  apache-spark', 0.0032084346564580494),
 ('sql ->  android-fragments', 0.002947077868197116),
 ('sql ->  socket.io', 0.0029097162416245113),
 ('sql ->  facebook', 0.0028964925316431324),
 ('sql ->  google-maps-api-3', 0.0028082296377408208),
 ('sql ->  npm', 0.0027684122617202085),
 ('sql ->  heroku', 0.0027569156188631898),
 ('sql ->  gradle', 0.0027289091031215012),
 ('sql ->  css', 0.0027001111852271117)]

In [49]:
sorted_incoming_cobrinha_factors_dict['sql'][:20]

[('coldfusion ->  sql', 0.036699516455666957),
 ('vb6 ->  sql', 0.031343884611312038),
 ('telerik ->  sql', 0.030669812708359171),
 ('.net-4.0 ->  sql', 0.030071566004863614),
 ('coffeescript ->  sql', 0.029769292923173089),
 ('mfc ->  sql', 0.028862776322690954),
 ('automation ->  sql', 0.027847623277168033),
 ('blackberry ->  sql', 0.027413423506744611),
 ('c#-4.0 ->  sql', 0.026313845005058265),
 ('formatting ->  sql', 0.026029890231648197),
 ('design ->  sql', 0.025386641505940291),
 ('optimization ->  sql', 0.024965918228188402),
 ('get ->  sql', 0.02476508000155591),
 ('windows-runtime ->  sql', 0.024268531681949257),
 ('dynamic ->  sql', 0.024239633279662935),
 ('access-vba ->  sql', 0.02421942892754771),
 ('webforms ->  sql', 0.024074137582456145),
 ('concurrency ->  sql', 0.024013903487885586),
 ('web ->  sql', 0.023943727466055272),
 ('prolog ->  sql', 0.023703509468304938)]

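if "sql" is a "good" parent tag, then the average of all OUTGOING cobrinha factors should be higher than the average of all INCOMING cobrinha factors, right? (note: the factor for "a -> b" is exactly the negative of the factor for "b -> a", so the two averages printed below are always exact negatives of each other — in practice this check reduces to looking at the sign of the outgoing average)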

In [50]:
TAG_NAME='sql'

# Only the numeric factor of each (pair label, factor) tuple is needed for the
# averages; the label does not have to be parsed.
outgoing_factors = [factor for _, factor in sorted_outgoing_cobrinha_factors_dict[TAG_NAME]]
incoming_factors = [factor for _, factor in sorted_incoming_cobrinha_factors_dict[TAG_NAME]]

print("average of outgoing factors: {} \naverage of incoming factors: {}".format(np.array(outgoing_factors).mean(),np.array(incoming_factors).mean()))

average of outgoing factors: -0.007061846721720351 
average of incoming factors: 0.007061846721720351


In [51]:
TAG_NAME='performance'

# Only the numeric factor of each (pair label, factor) tuple is needed for the
# averages; the label does not have to be parsed.
outgoing_factors = [factor for _, factor in sorted_outgoing_cobrinha_factors_dict[TAG_NAME]]
incoming_factors = [factor for _, factor in sorted_incoming_cobrinha_factors_dict[TAG_NAME]]

print("average of outgoing factors: {} \naverage of incoming factors: {}".format(np.array(outgoing_factors).mean(),np.array(incoming_factors).mean()))

average of outgoing factors: 0.004605382828130135 
average of incoming factors: -0.004605382828130135


In [52]:
TAG_NAME='user-interface'

# Only the numeric factor of each (pair label, factor) tuple is needed for the
# averages; the label does not have to be parsed.
outgoing_factors = [factor for _, factor in sorted_outgoing_cobrinha_factors_dict[TAG_NAME]]
incoming_factors = [factor for _, factor in sorted_incoming_cobrinha_factors_dict[TAG_NAME]]

print("average of outgoing factors: {} \naverage of incoming factors: {}".format(np.array(outgoing_factors).mean(),np.array(incoming_factors).mean()))

average of outgoing factors: 0.007133652265720398 
average of incoming factors: -0.007133652265720398


now let's see whether a reasonably "bad" parent tag shows the opposite behaviour.

In [53]:
TAG_NAME = 'android-fragments'

# Only the numeric factor of each (pair label, factor) tuple is needed for the
# averages; the label does not have to be parsed.
outgoing_factors = [factor for _, factor in sorted_outgoing_cobrinha_factors_dict[TAG_NAME]]
incoming_factors = [factor for _, factor in sorted_incoming_cobrinha_factors_dict[TAG_NAME]]

print("average of outgoing factors: {} \naverage of incoming factors: {}".format(np.array(outgoing_factors).mean(),np.array(incoming_factors).mean()))

average of outgoing factors: -0.007077082195748274 
average of incoming factors: 0.007077082195748274


## now let's try to find a triple. two tags B and C that have good cobrinha factor and a third tag A that has good cobrinha factor with tag B and tag C (but cobrinha(A,B) should probably be higher than cobrinha (A,C))


A -> B -> C

(TAG C is the most specific tag of the three)

### let's experiment first

In [54]:
# this is not a great result: numpy is, roughly speaking, a subset of scipy,
# so scipy should, in theory, come out as more general than numpy

# perhaps this is because the numpy tag is much more common than the scipy tag, which
# causes numpy-tagged documents to contain more words, in general, than scipy-tagged ones?

# NOTE(review): this reads `global_similarity_index` (the index loaded from disk), not the
# frequency-normalized `global_similarity_index_v2` built in this notebook -- confirm intended
global_similarity_index['scipy'],global_similarity_index['numpy']

(0.32409407806410806, 0.34442704064918939)

it's bad that 'c#' and things like 'coldfusion' and 'mfc' are at the top, **but** seeing 'python', 'performance',
'optimization' and 'python-2.7' near the top is an indication that something is working correctly

In [55]:
sorted_incoming_cobrinha_factors_dict['numpy'][:20]

[('coldfusion ->  numpy', 0.02589522754484154),
 ('coffeescript ->  numpy', 0.023194067983073725),
 ('vb6 ->  numpy', 0.022473967850230644),
 ('.net-4.0 ->  numpy', 0.02230892690997718),
 ('mfc ->  numpy', 0.022260899214391623),
 ('telerik ->  numpy', 0.02148387132231477),
 ('automation ->  numpy', 0.021192255433562596),
 ('optimization ->  numpy', 0.020957514553867327),
 ('blackberry ->  numpy', 0.02075549073651467),
 ('formatting ->  numpy', 0.019935200181282489),
 ('machine-learning ->  numpy', 0.019900148388518842),
 ('prolog ->  numpy', 0.019298590206521839),
 ('design ->  numpy', 0.019246674115799462),
 ('windows-runtime ->  numpy', 0.018713591555311354),
 ('web-scraping ->  numpy', 0.018699001300237206),
 ('concurrency ->  numpy', 0.018584825640340624),
 ('c#-4.0 ->  numpy', 0.018431479334317342),
 ('web ->  numpy', 0.018428156885954335),
 ('get ->  numpy', 0.018296387959262913),
 ('internationalization ->  numpy', 0.018274287454960712)]

In [56]:
sorted_incoming_cobrinha_factors_dict['scipy'][:20]

[('coldfusion ->  scipy', 0.021100536995893607),
 ('coffeescript ->  scipy', 0.018822896009353364),
 ('.net-4.0 ->  scipy', 0.01823018832150634),
 ('vb6 ->  scipy', 0.018088128702347456),
 ('mfc ->  scipy', 0.017955834215231462),
 ('telerik ->  scipy', 0.01714364171896593),
 ('automation ->  scipy', 0.017122301326080409),
 ('blackberry ->  scipy', 0.016867420547047051),
 ('optimization ->  scipy', 0.016114805890325139),
 ('formatting ->  scipy', 0.015289262685859841),
 ('machine-learning ->  scipy', 0.015255920497826045),
 ('windows-runtime ->  scipy', 0.014983639476206241),
 ('design ->  scipy', 0.014864218628366678),
 ('prolog ->  scipy', 0.014796391239193124),
 ('web-scraping ->  scipy', 0.014670244408865295),
 ('concurrency ->  scipy', 0.014351915184365982),
 ('internationalization ->  scipy', 0.014344979159856547),
 ('web ->  scipy', 0.014326536112164818),
 ('get ->  scipy', 0.014285562883384161),
 ('c#-4.0 ->  scipy', 0.014083150810259834)]

In [57]:
sorted_outgoing_cobrinha_factors_dict['numpy'][:20]

[('numpy ->  git', 0.0023435256821268814),
 ('numpy ->  gulp', 0.0020583385627834837),
 ('numpy ->  boost', 0.0018662191878075834),
 ('numpy ->  docker', 0.0018179587187993683),
 ('numpy ->  matplotlib', 0.0016278764618811905),
 ('numpy ->  pandas', 0.0015533838716925974),
 ('numpy ->  d3.js', 0.0014333545472973821),
 ('numpy ->  paypal', 0.0013781166618693577),
 ('numpy ->  google-maps', 0.0013233817710609447),
 ('numpy ->  .htaccess', 0.0013165131944484333),
 ('numpy ->  npm', 0.0012562542027551274),
 ('numpy ->  tkinter', 0.0012187733592343849),
 ('numpy ->  socket.io', 0.00119632940984289),
 ('numpy ->  apache-spark', 0.0011289127302659748),
 ('numpy ->  google-maps-api-3', 0.0011046817198514809),
 ('numpy ->  android-fragments', 0.0010778307132150073),
 ('numpy ->  gradle', 0.0010650951725953391),
 ('numpy ->  uitableview', 0.0010580771811844693),
 ('numpy ->  heroku', 0.00095179872968751485),
 ('numpy ->  jenkins', 0.00088539079554069008)]

In [58]:
sorted_outgoing_cobrinha_factors_dict['scipy'][:20]

[('scipy ->  numpy', 0.0053938559735865459),
 ('scipy ->  matplotlib', 0.0047604339100780985),
 ('scipy ->  pandas', 0.0040084782180982315),
 ('scipy ->  git', 0.0039607197741449857),
 ('scipy ->  boost', 0.0032995543100331061),
 ('scipy ->  docker', 0.0032139296252932494),
 ('scipy ->  gulp', 0.0031349480958149005),
 ('scipy ->  d3.js', 0.0031300054245027159),
 ('scipy ->  tkinter', 0.0031009875541846131),
 ('scipy ->  google-maps', 0.0029509687044743508),
 ('scipy ->  .htaccess', 0.0029045637467599817),
 ('scipy ->  apache-spark', 0.0028936533537261564),
 ('scipy ->  css', 0.0028779577868379641),
 ('scipy ->  uitableview', 0.0028149385977678642),
 ('scipy ->  paypal', 0.0027741704082717162),
 ('scipy ->  tensorflow', 0.0027442882815996729),
 ('scipy ->  npm', 0.0027412028849619972),
 ('scipy ->  facebook', 0.0026301994514870393),
 ('scipy ->  maven', 0.0026152977243037705),
 ('scipy ->  gradle', 0.0025718506909711359)]

### let's try some triples, starting with tags very low in the hierarchy (i.e. low generality, high specificity)

In [59]:
# the ten lowest-scoring tags, in ascending order
# NOTE(review): this ranks by `global_similarity_index` (the index loaded from disk), whereas the
# cobrinha factors explored below are computed from `global_similarity_index_v2` -- confirm intended
sorted(global_similarity_index.items(),key=lambda tpl: tpl[1])[:10]

[('gulp', 0.20157625401838811),
 ('recyclerview', 0.2486415240114675),
 ('socket.io', 0.250170981198097),
 ('docker', 0.26050547377925304),
 ('npm', 0.26063850119479154),
 ('webpack', 0.26227745762404953),
 ('hive', 0.26293479328716896),
 ('boost', 0.26300122888860139),
 ('tensorflow', 0.26525913580033628),
 ('paypal', 0.2682650078066654)]

In [60]:
sorted_incoming_cobrinha_factors_dict['gulp'][:20]

[('coffeescript ->  gulp', 0.024095914565408025),
 ('coldfusion ->  gulp', 0.018941245493473639),
 ('ecmascript-6 ->  gulp', 0.018126324091426959),
 ('.net-4.0 ->  gulp', 0.017201433929503272),
 ('automation ->  gulp', 0.017132660136769472),
 ('vb6 ->  gulp', 0.016211174186261151),
 ('blackberry ->  gulp', 0.016114185444949029),
 ('mfc ->  gulp', 0.015983907374467657),
 ('web ->  gulp', 0.015896569082224286),
 ('telerik ->  gulp', 0.015617813677730673),
 ('windows-runtime ->  gulp', 0.015379855112670151),
 ('configuration ->  gulp', 0.015265058658836966),
 ('web-applications ->  gulp', 0.015187863365688749),
 ('compilation ->  gulp', 0.015035171890604803),
 ('scripting ->  gulp', 0.014712328191724178),
 ('internationalization ->  gulp', 0.014692411624676027),
 ('c#-4.0 ->  gulp', 0.014412297259483993),
 ('get ->  gulp', 0.014325632357744533),
 ('io ->  gulp', 0.014135404027554357),
 ('design ->  gulp', 0.013957761702973285)]

In [61]:
sorted_incoming_cobrinha_factors_dict['coffeescript'][:20]

[('coldfusion ->  coffeescript', 0.0012282083985074152),
 ('.net-4.0 ->  coffeescript', 5.8687116919705623e-05),
 ('vb6 ->  coffeescript', -0.00093164050656022938),
 ('telerik ->  coffeescript', -0.0014308448127878226),
 ('blackberry ->  coffeescript', -0.0014331446505911989),
 ('mfc ->  coffeescript', -0.0015458192337177803),
 ('automation ->  coffeescript', -0.0027457789361602434),
 ('windows-runtime ->  coffeescript', -0.0035640745079460212),
 ('formatting ->  coffeescript', -0.0054593717871049318),
 ('internationalization ->  coffeescript', -0.0055021942749518332),
 ('network-programming ->  coffeescript', -0.0060310353260931522),
 ('get ->  coffeescript', -0.0061037896423316714),
 ('prolog ->  coffeescript', -0.0064266732299154156),
 ('webforms ->  coffeescript', -0.0065155993827617905),
 ('website ->  coffeescript', -0.0068110962741301934),
 ('com ->  coffeescript', -0.007117721283698324),
 ('concurrency ->  coffeescript', -0.0071517100942702291),
 ('web-scraping ->  coffeescript

so we get **"coldfusion" -> "coffeescript" -> "gulp"**, which is clearly not good

In [62]:
sorted_incoming_cobrinha_factors_dict['hive'][:10]

[('coldfusion ->  hive', 0.022155420173935188),
 ('vb6 ->  hive', 0.018646110147832861),
 ('coffeescript ->  hive', 0.01842062788029061),
 ('.net-4.0 ->  hive', 0.01832536755584702),
 ('telerik ->  hive', 0.018059791512740175),
 ('automation ->  hive', 0.017592278113604254),
 ('mfc ->  hive', 0.017585357235741696),
 ('blackberry ->  hive', 0.017352343808624278),
 ('c#-4.0 ->  hive', 0.016408095896826375),
 ('formatting ->  hive', 0.016325062207043377)]

In [63]:
sorted_incoming_cobrinha_factors_dict['coldfusion'][:10]

[('.net-4.0 ->  coldfusion', -0.0011559416986044108),
 ('coffeescript ->  coldfusion', -0.0012282083985074152),
 ('vb6 ->  coldfusion', -0.0022719507628618891),
 ('blackberry ->  coldfusion', -0.0027608328628917775),
 ('telerik ->  coldfusion', -0.0027823264698254844),
 ('mfc ->  coldfusion', -0.0028914134196486312),
 ('automation ->  coldfusion', -0.0042175503957080753),
 ('windows-runtime ->  coldfusion', -0.0049426597175209587),
 ('internationalization ->  coldfusion', -0.0070508627876512044),
 ('formatting ->  coldfusion', -0.0072748780542959014)]

so we get **".net-4.0" -> "coldfusion" -> "hive"**, which is also clearly not good

In [64]:
sorted_incoming_cobrinha_factors_dict['docker'][:10]

[('coldfusion ->  docker', 0.02340743564498704),
 ('.net-4.0 ->  docker', 0.021495603397879967),
 ('coffeescript ->  docker', 0.0212899348226562),
 ('blackberry ->  docker', 0.021079526661830297),
 ('automation ->  docker', 0.020938468586005635),
 ('vb6 ->  docker', 0.020421622847714069),
 ('mfc ->  docker', 0.02029129749487063),
 ('telerik ->  docker', 0.01883763788123502),
 ('web ->  docker', 0.018647233168941214),
 ('configuration ->  docker', 0.018397032071080986)]

In [65]:
sorted_incoming_cobrinha_factors_dict['coldfusion'][:10]

[('.net-4.0 ->  coldfusion', -0.0011559416986044108),
 ('coffeescript ->  coldfusion', -0.0012282083985074152),
 ('vb6 ->  coldfusion', -0.0022719507628618891),
 ('blackberry ->  coldfusion', -0.0027608328628917775),
 ('telerik ->  coldfusion', -0.0027823264698254844),
 ('mfc ->  coldfusion', -0.0028914134196486312),
 ('automation ->  coldfusion', -0.0042175503957080753),
 ('windows-runtime ->  coldfusion', -0.0049426597175209587),
 ('internationalization ->  coldfusion', -0.0070508627876512044),
 ('formatting ->  coldfusion', -0.0072748780542959014)]

so we get ".net-4.0" -> "coldfusion" -> "docker", which is also clearly not good

> i think the lesson here is that tags that are very common like "c#" and ".net" (and "coldfusion"??) are dominating the effect here, although it's nice to see more reasonable parent tags a bit down the ranking..

> maybe we need to tweak the weighting so that tag similarity has more say than generality.