In [1]:
import csv
import os
import pickle
import re
import sys
import numpy as np

from joblib import Parallel, delayed

from difflib import SequenceMatcher,get_close_matches

from scipy import spatial

from tqdm import *

module_path = os.path.abspath(os.path.join('../helpers/'))
if module_path not in sys.path:
    sys.path.append(module_path)

# my stuff in the helpers/ directory
import embeddings_helper, files_helper, texts_helper, metrics_helper, tags_helper, cobrinha_helper

from cobrinha_helper import get_metrics_for_tag_pair

In [2]:
# Root directory of the pickled tag-hierarchy artefacts loaded and written by this notebook.
# File names are appended with plain string concatenation, so the trailing slash matters.
# NOTE(review): absolute, machine-specific path -- adjust before running on another machine.
PICKLE_ROOT = "/media/felipe/SSD_VOLUME/auto-tagger/data/tag-hierarchy/"

In [3]:
# Load the tag vocabulary; the context manager closes the file handle deterministically
# instead of leaving it open until garbage collection.
with open(PICKLE_ROOT+"tag_vocabulary.p","rb") as pickle_file:
    tag_vocabulary = pickle.load(pickle_file)

In [4]:
# Load the per-tag similarity lists; the context manager closes the file handle
# deterministically instead of leaving it open until garbage collection.
with open(PICKLE_ROOT+"sorted_pairwise_similarity_dict.p","rb") as pickle_file:
    sorted_pairwise_similarity_dict = pickle.load(pickle_file)

In [5]:
# Load the tag vector index; the context manager closes the file handle
# deterministically instead of leaving it open until garbage collection.
with open(PICKLE_ROOT+"tag_vectors_index.p","rb") as pickle_file:
    tag_vectors_index = pickle.load(pickle_file)

### build global similarity index

In [6]:
# Global similarity index: for every tag in the vocabulary, the mean of its
# similarities to all *other* tags (an entry pairing the tag with itself is skipped).
global_similarity_index = {}

for tag in tag_vocabulary:

    sims_to_others = [
        sim
        for (neighbour, sim) in sorted_pairwise_similarity_dict[tag]
        if neighbour != tag
    ]

    global_similarity_index[tag] = np.array(sims_to_others).mean()

In [7]:
# Rank tags from the lowest to the highest global average similarity.
sorted_global_similarity_index = {}
as_tpls = list(global_similarity_index.items())
sorted_tags_by_global_avg_similarities = sorted(as_tpls, key=lambda pair: pair[1])

In [8]:
sorted_tags_by_global_avg_similarities[:20]

[('gulp', 0.20157625401838811),
 ('recyclerview', 0.2486415240114675),
 ('socket.io', 0.250170981198097),
 ('docker', 0.26050547377925304),
 ('npm', 0.26063850119479154),
 ('webpack', 0.26227745762404953),
 ('hive', 0.26293479328716896),
 ('boost', 0.26300122888860139),
 ('tensorflow', 0.26525913580033628),
 ('paypal', 0.2682650078066654),
 ('jqgrid', 0.27335205238806776),
 ('datepicker', 0.27457654086216854),
 ('youtube', 0.27527423917699367),
 ('redis', 0.27565122750670068),
 ('gruntjs', 0.27628801171262746),
 ('tkinter', 0.27881940450204129),
 ('webview', 0.28032248688184008),
 ('google-maps-api-3', 0.28471284945307385),
 ('ffmpeg', 0.2851533827007513),
 ('android-viewpager', 0.28785705728288735)]

note that tags such as "web", "performance", "design", "security" and "user-interface" are **nowhere near** the top assigned tags on SO, but they show up in the "most general tags" list below... and it makes sense because they're very general indeed

### most general tags

In [9]:
# Rank tags from the highest to the lowest global average similarity.
sorted_global_similarity_index = {}
as_tpls = list(global_similarity_index.items())
sorted_tags_by_global_avg_similarities = sorted(as_tpls, key=lambda pair: pair[1], reverse=True)

In [10]:
sorted_tags_by_global_avg_similarities[:20]

[('c#', 0.62086018089096218),
 ('.net', 0.61610350838601591),
 ('coldfusion', 0.61594321744620228),
 ('c#-4.0', 0.60513992363716151),
 ('web', 0.5995461781195971),
 ('design', 0.58691166177828435),
 ('performance', 0.58572425127871253),
 ('optimization', 0.58234921856048838),
 ('mfc', 0.58107761052288953),
 ('coffeescript', 0.57715447085540483),
 ('cocoa', 0.57547813285594474),
 ('user-interface', 0.57538308841469177),
 ('javascript', 0.57492880330418661),
 ('web-applications', 0.5749284108778886),
 ('.net-4.0', 0.5738537400531506),
 ('java', 0.57303658627661958),
 ('dynamic', 0.57137276132148918),
 ('security', 0.57017136654076717),
 ('php', 0.56842198234585617),
 ('vb6', 0.56802495758111193)]

In [11]:
def evaluate(a,b):
    """Return the metrics of the ordered tag pair (a, b).

    Thin wrapper around `get_metrics_for_tag_pair` that fills in the
    notebook-level lookup structures (`tag_vectors_index`,
    `sorted_pairwise_similarity_dict` and `global_similarity_index`) so that
    callers only have to pass the two tag names.

    The result is passed through unchanged; this notebook treats it as a tuple
    of the form (difference_of_global_averages, mutual_similarity).
    """
    metrics = get_metrics_for_tag_pair(
        a,
        b,
        tag_vectors_index,
        sorted_pairwise_similarity_dict,
        global_similarity_index,
    )
    return metrics

> the result is a tuple of the form (difference_of_global_averages, mutual_similarity)

positive difference means the first tag is "more general" than the second tag (according to our measure)


In [12]:
evaluate('sql','sql-server')

(0.00047482132722476145, 0.95360188572903903)

In [13]:
evaluate('sql-server','sql-server-2008')

(0.019398963335061636, 0.98371627237454562)

In [14]:
evaluate('python','python-3.x')

(0.031045849292555494, 0.97481770134639489)

In [15]:
evaluate('python-2.7','python-3.x')

(0.0049889424541228466, 0.97453923487543426)

In [16]:
evaluate('database','oracle')

(0.095925720399016368, 0.76511062673459118)

In [17]:
# hmmm... maybe because rails is a much stronger child of something like "web-framework"?
evaluate('ruby','ruby-on-rails')

(-0.0015628093592291448, 0.89492664209399198)

In [18]:
# hmmmm
evaluate('frameworks','ruby-on-rails')

(-0.028865729862849354, 0.57081851125952732)

In [19]:
evaluate('web','ruby-on-rails')

(0.089850936924137592, 0.71753596334150105)

In [20]:
evaluate('.net','.net-4.0')

(0.042249768332865312, 0.96277274170640192)

In [21]:
evaluate('android','android-studio') 

(0.13463097078015446, 0.82069625351587672)

In [22]:
evaluate('asp.net','asp.net-mvc') 

(0.037657070813489568, 0.84710098775189802)

In [23]:
evaluate('asp.net-mvc','asp.net-mvc-5') 

(0.010203698004170092, 0.96488890593856147)

In [24]:
evaluate('asp.net','asp.net-mvc-5') 

(0.04786076881765966, 0.8120323924510352)

## what about unrelated stuff?

this will probably fluctuate around zero

In [25]:
evaluate('java','arrays')

(0.12366041154176743, 0.63486068665743045)

In [26]:
evaluate('ruby','python-2.7') 

(-0.016712129281118426, 0.64158676537827219)

In [27]:
evaluate('database','python-2.7') 

(0.031927816047780722, 0.65515824936886746)

## what are the tag pairs that are the best cobrinha parts?

In [28]:
similarities_to_other_tags = sorted_pairwise_similarity_dict["sql"]

In [29]:
similarities_to_other_tags[1]

('sql-server', 0.95360188572903903)

In [30]:
lst = sorted_pairwise_similarity_dict.items()

In [31]:
def make_pairwise_cobrinha_factors(pairwise_similarity_dict):
    """Compute the "cobrinha factor" of every ordered tag pair.

    For each `tag` and each `(other_tag, similarity)` entry listed under it,
    the factor is the product of the two values returned by
    `evaluate(tag, other_tag)`. The similarity stored in the input entry is
    not used. Pairs are not filtered, so if a tag's list contains an entry
    for the tag itself, that self-pair is included.

    Args:
        pairwise_similarity_dict: mapping of tag -> iterable of
            (other_tag, similarity) tuples.

    Returns:
        A tuple `(outgoing, incoming)` of dicts mapping a tag to a list of
        `("tag ->  other_tag", factor)` tuples. `outgoing[t]` holds the pairs
        in which `t` is on the left-hand side, `incoming[t]` those in which it
        is on the right-hand side. Every tag in the global `tag_vocabulary`
        gets an entry, even if it takes part in no pair.
    """
    # Seed from the global vocabulary so that every known tag has a (possibly empty) list.
    outgoing_cobrinha_factors_dict = {tag: list() for tag in tag_vocabulary}
    incoming_cobrinha_factors_dict = {tag: list() for tag in tag_vocabulary}

    for (tag, similarities_to_other_tags) in tqdm(pairwise_similarity_dict.items()):

        for (other_tag, _unused_similarity) in similarities_to_other_tags:

            (generality_difference, mutual_similarity) = evaluate(tag,other_tag)

            cobrinha_factor = generality_difference * mutual_similarity
            pair = "{} ->  {}".format(tag,other_tag)

            # setdefault keeps this working when the input mentions a tag that is
            # missing from tag_vocabulary (previously a KeyError).
            outgoing_cobrinha_factors_dict.setdefault(tag, list()).append( (pair,cobrinha_factor))
            incoming_cobrinha_factors_dict.setdefault(other_tag, list()).append( (pair,cobrinha_factor))

    return (outgoing_cobrinha_factors_dict,incoming_cobrinha_factors_dict)

In [32]:
outgoing_factors_dict,incoming_factors_dict = make_pairwise_cobrinha_factors(sorted_pairwise_similarity_dict)

100%|██████████| 617/617 [00:14<00:00, 43.92it/s]


In [33]:
incoming_factors_dict['sql'][:10]

[('version-control ->  sql', -0.018510537108355547),
 ('google-chrome-extension ->  sql', -0.055317792305215822),
 ('cakephp ->  sql', -0.038234430465597213),
 ('interface ->  sql', -0.042381660583867727),
 ('error-handling ->  sql', 0.021530847064390107),
 ('winforms ->  sql', 0.036942423751299421),
 ('gradle ->  sql', -0.062316860849039757),
 ('dll ->  sql', -0.053332503858211228),
 ('performance ->  sql', 0.062679989136039949),
 ('web-services ->  sql', -0.0061070761099806681)]

## each tag, with its most likely child/parent tags in its cobrinha

"parent" -> "child"

a leaf/terminal tag has no "child" tag
the root tag has no "parent" tag

In [34]:
# Order each tag's list of (pair, factor) tuples from the highest to the lowest factor.
sorted_outgoing_cobrinha_factors_dict = {
    tag: sorted(factors, key=lambda entry: entry[1], reverse=True)
    for (tag, factors) in outgoing_factors_dict.items()
}

sorted_incoming_cobrinha_factors_dict = {
    tag: sorted(factors, key=lambda entry: entry[1], reverse=True)
    for (tag, factors) in incoming_factors_dict.items()
}

In [35]:
sorted_outgoing_cobrinha_factors_dict['sql'][:20]

[('sql ->  hive', 0.10481384505682605),
 ('sql ->  pdo', 0.096508894621038982),
 ('sql ->  stored-procedures', 0.095564603684111377),
 ('sql ->  sqlalchemy', 0.092958113106314522),
 ('sql ->  join', 0.087701825404291278),
 ('sql ->  cassandra', 0.083233442388011156),
 ('sql ->  group-by', 0.081127930118162092),
 ('sql ->  jqgrid', 0.079138030461739423),
 ('sql ->  solr', 0.078568611011026246),
 ('sql ->  pandas', 0.078068274537160448),
 ('sql ->  datepicker', 0.077660894115956378),
 ('sql ->  date', 0.077457620670715155),
 ('sql ->  triggers', 0.077448203846902208),
 ('sql ->  apache-spark', 0.076403101087904934),
 ('sql ->  django-models', 0.074770701660689304),
 ('sql ->  redis', 0.073780347937918697),
 ('sql ->  doctrine2', 0.072550926419718109),
 ('sql ->  mongoose', 0.071363510832935129),
 ('sql ->  highcharts', 0.071247514648943591),
 ('sql ->  plsql', 0.07114640551682086)]

In [36]:
# Persist the outgoing ranking; the context manager flushes and closes the file
# handle as soon as the dump finishes instead of leaving it open.
with open(PICKLE_ROOT+"/sorted_outgoing_cobrinha_factors_dict.p","wb") as pickle_file:
    pickle.dump(sorted_outgoing_cobrinha_factors_dict,pickle_file)

In [37]:
sorted_incoming_cobrinha_factors_dict['sql'][:20]

[('coldfusion ->  sql', 0.086136477638461426),
 ('c# ->  sql', 0.08582034591512909),
 ('.net ->  sql', 0.080569148464709583),
 ('c#-4.0 ->  sql', 0.074846616126380744),
 ('web ->  sql', 0.0643909379411267),
 ('performance ->  sql', 0.062679989136039949),
 ('optimization ->  sql', 0.059888399258906294),
 ('design ->  sql', 0.058318854548678481),
 ('mfc ->  sql', 0.050708268713768295),
 ('dynamic ->  sql', 0.048020965251519769),
 ('php ->  sql', 0.047619066960061442),
 ('coffeescript ->  sql', 0.047600222416206116),
 ('database ->  sql', 0.047279777849082945),
 ('cocoa ->  sql', 0.046804702293624809),
 ('javascript ->  sql', 0.046457566462457502),
 ('web-applications ->  sql', 0.04632288676818893),
 ('user-interface ->  sql', 0.046132164635539347),
 ('.net-4.0 ->  sql', 0.045903890936721883),
 ('vb6 ->  sql', 0.045264723691653783),
 ('java ->  sql', 0.044742733509705805)]

In [38]:
# Persist the incoming ranking; the context manager flushes and closes the file
# handle as soon as the dump finishes instead of leaving it open.
with open(PICKLE_ROOT+"/sorted_incoming_cobrinha_factors_dict.p","wb") as pickle_file:
    pickle.dump(sorted_incoming_cobrinha_factors_dict,pickle_file)

In [39]:
sorted_incoming_cobrinha_factors_dict["sql"][:20]

[('coldfusion ->  sql', 0.086136477638461426),
 ('c# ->  sql', 0.08582034591512909),
 ('.net ->  sql', 0.080569148464709583),
 ('c#-4.0 ->  sql', 0.074846616126380744),
 ('web ->  sql', 0.0643909379411267),
 ('performance ->  sql', 0.062679989136039949),
 ('optimization ->  sql', 0.059888399258906294),
 ('design ->  sql', 0.058318854548678481),
 ('mfc ->  sql', 0.050708268713768295),
 ('dynamic ->  sql', 0.048020965251519769),
 ('php ->  sql', 0.047619066960061442),
 ('coffeescript ->  sql', 0.047600222416206116),
 ('database ->  sql', 0.047279777849082945),
 ('cocoa ->  sql', 0.046804702293624809),
 ('javascript ->  sql', 0.046457566462457502),
 ('web-applications ->  sql', 0.04632288676818893),
 ('user-interface ->  sql', 0.046132164635539347),
 ('.net-4.0 ->  sql', 0.045903890936721883),
 ('vb6 ->  sql', 0.045264723691653783),
 ('java ->  sql', 0.044742733509705805)]

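if "sql" is a "good" parent tag, then the avg of all OUTGOING cobrinha factors should be higher than the average INCOMING cobrinha factors, right?

note: in every output below the two averages are exact negatives of each other (which is what you would expect if the factor simply flips sign when the pair is reversed), so this check boils down to asking whether the average outgoing factor is positive.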

In [42]:
TAG_NAME = 'sql'

# Only the numeric factors are needed; the "parent ->  child" labels are ignored.
outgoing_factors = [factor for (_label, factor) in sorted_outgoing_cobrinha_factors_dict[TAG_NAME]]
incoming_factors = [factor for (_label, factor) in sorted_incoming_cobrinha_factors_dict[TAG_NAME]]

print("average of outgoing factors: {} \naverage of incoming factors: {}".format(np.array(outgoing_factors).mean(),np.array(incoming_factors).mean()))

average of outgoing factors: 0.02851584842915533 
average of incoming factors: -0.02851584842915533


In [44]:
TAG_NAME = 'performance'

# Only the numeric factors are needed; the "parent ->  child" labels are ignored.
outgoing_factors = [factor for (_label, factor) in sorted_outgoing_cobrinha_factors_dict[TAG_NAME]]
incoming_factors = [factor for (_label, factor) in sorted_incoming_cobrinha_factors_dict[TAG_NAME]]

print("average of outgoing factors: {} \naverage of incoming factors: {}".format(np.array(outgoing_factors).mean(),np.array(incoming_factors).mean()))

average of outgoing factors: 0.08253090373308757 
average of incoming factors: -0.08253090373308758


In [48]:
TAG_NAME = 'user-interface'

# Only the numeric factors are needed; the "parent ->  child" labels are ignored.
outgoing_factors = [factor for (_label, factor) in sorted_outgoing_cobrinha_factors_dict[TAG_NAME]]
incoming_factors = [factor for (_label, factor) in sorted_incoming_cobrinha_factors_dict[TAG_NAME]]

print("average of outgoing factors: {} \naverage of incoming factors: {}".format(np.array(outgoing_factors).mean(),np.array(incoming_factors).mean()))

average of outgoing factors: 0.07535414028267097 
average of incoming factors: -0.07535414028267097


now let's see whether a reasonably "bad" parent tag shows the opposite behaviour.

In [43]:
TAG_NAME = 'android-fragments'

# Only the numeric factors are needed; the "parent ->  child" labels are ignored.
outgoing_factors = [factor for (_label, factor) in sorted_outgoing_cobrinha_factors_dict[TAG_NAME]]
incoming_factors = [factor for (_label, factor) in sorted_incoming_cobrinha_factors_dict[TAG_NAME]]

print("average of outgoing factors: {} \naverage of incoming factors: {}".format(np.array(outgoing_factors).mean(),np.array(incoming_factors).mean()))

average of outgoing factors: -0.04185923679648722 
average of incoming factors: 0.04185923679648721


## now let's try to find a triple. two tags B and C that have good cobrinha factor and a third tag A that has good cobrinha factor with tag B and tag C (but cobrinha(A,B) should probably be higher than cobrinha (A,C))


A -> B -> C

(TAG C is the most specific tag of the three)

### let's experiment first

In [50]:
# this is not very good, because numpy is a subset of scipy
# so scipy should, in theory, be more general than numpy

# perhaps this has to do with the fact that numpy tags are much more common than scipy, and this
# causes numpy-tagged documents to have more words, in general, than scipy?

global_similarity_index['scipy'],global_similarity_index['numpy']

(0.32409407806410806, 0.34442704064918939)

it's bad that 'c#' and things like 'coldfusion' and 'mfc' are at the top, **but** seeing 'python', 'performance',
'optimization' and 'python-2.7' near the top suggests that something is working correctly

In [52]:
sorted_incoming_cobrinha_factors_dict['numpy'][:20]

[('coldfusion ->  numpy', 0.13294348909544823),
 ('optimization ->  numpy', 0.13179793666404241),
 ('c# ->  numpy', 0.13023244205211987),
 ('.net ->  numpy', 0.12760040886279764),
 ('python ->  numpy', 0.12477790856052123),
 ('performance ->  numpy', 0.12361369609795958),
 ('c#-4.0 ->  numpy', 0.11842401984044743),
 ('web ->  numpy', 0.11581092296405479),
 ('design ->  numpy', 0.11298754210660097),
 ('mfc ->  numpy', 0.10664968960041467),
 ('math ->  numpy', 0.10630016751059837),
 ('cocoa ->  numpy', 0.10600117020453721),
 ('php ->  numpy', 0.10535413303967064),
 ('dynamic ->  numpy', 0.10511197854063933),
 ('coffeescript ->  numpy', 0.10500697868247204),
 ('python-2.7 ->  numpy', 0.1048952400092816),
 ('user-interface ->  numpy', 0.10478427288961617),
 ('javascript ->  numpy', 0.1039326267560098),
 ('r ->  numpy', 0.10377448373066621),
 ('machine-learning ->  numpy', 0.10360721834871484)]

In [53]:
sorted_incoming_cobrinha_factors_dict['scipy'][:20]

[('optimization ->  scipy', 0.1368306508024093),
 ('coldfusion ->  scipy', 0.13543514497039677),
 ('c# ->  scipy', 0.13062910523665469),
 ('.net ->  scipy', 0.12935110824846979),
 ('python ->  scipy', 0.12816671937576549),
 ('performance ->  scipy', 0.12305910198449684),
 ('c#-4.0 ->  scipy', 0.11936142871636783),
 ('web ->  scipy', 0.1189429516749895),
 ('design ->  scipy', 0.11527826293085172),
 ('math ->  scipy', 0.11090748232639235),
 ('python-2.7 ->  scipy', 0.11017031682550597),
 ('mfc ->  scipy', 0.10992188633486821),
 ('machine-learning ->  scipy', 0.10950161498490187),
 ('user-interface ->  scipy', 0.10835503745024376),
 ('coffeescript ->  scipy', 0.1082877035029647),
 ('cocoa ->  scipy', 0.10809540046591634),
 ('r ->  scipy', 0.10733245087498154),
 ('javascript ->  scipy', 0.10700642139649028),
 ('dynamic ->  scipy', 0.10474759219960336),
 ('php ->  scipy', 0.10361920646609851)]

In [54]:
sorted_outgoing_cobrinha_factors_dict['numpy'][:20]

[('numpy ->  tensorflow', 0.024112718429186834),
 ('numpy ->  gulp', 0.021332975132876873),
 ('numpy ->  matplotlib', 0.020133343369748199),
 ('numpy ->  boost', 0.01796500638219584),
 ('numpy ->  hive', 0.017648024101791688),
 ('numpy ->  tkinter', 0.017566060511581338),
 ('numpy ->  npm', 0.01687844200415424),
 ('numpy ->  socket.io', 0.016837013279648507),
 ('numpy ->  docker', 0.016661506034059088),
 ('numpy ->  webpack', 0.016607012738235862),
 ('numpy ->  recyclerview', 0.016334797554224314),
 ('numpy ->  paypal', 0.015425577041885071),
 ('numpy ->  jqgrid', 0.015339256273071493),
 ('numpy ->  redis', 0.015274553664503769),
 ('numpy ->  scipy', 0.01478593309992198),
 ('numpy ->  datepicker', 0.014245628066827517),
 ('numpy ->  gruntjs', 0.014157920320947964),
 ('numpy ->  youtube', 0.014015837863164033),
 ('numpy ->  ffmpeg', 0.01389883640113882),
 ('numpy ->  d3.js', 0.013412977220033446)]

In [55]:
sorted_outgoing_cobrinha_factors_dict['scipy'][:20]

[('scipy ->  gulp', 0.018116861035357836),
 ('scipy ->  tensorflow', 0.016409688889617854),
 ('scipy ->  socket.io', 0.013317604216152483),
 ('scipy ->  npm', 0.012739711058474382),
 ('scipy ->  boost', 0.012697124155380169),
 ('scipy ->  webpack', 0.012483636713490774),
 ('scipy ->  docker', 0.012330579392634952),
 ('scipy ->  hive', 0.012301044209264249),
 ('scipy ->  recyclerview', 0.011991545594783729),
 ('scipy ->  tkinter', 0.011729593578654373),
 ('scipy ->  paypal', 0.010890364459380171),
 ('scipy ->  redis', 0.010266316992668308),
 ('scipy ->  jqgrid', 0.010151805353142548),
 ('scipy ->  gruntjs', 0.0097410207189764144),
 ('scipy ->  datepicker', 0.0096095973432967965),
 ('scipy ->  youtube', 0.009504826286414381),
 ('scipy ->  matplotlib', 0.0091859024619993148),
 ('scipy ->  ffmpeg', 0.0088545063144286808),
 ('scipy ->  webview', 0.0082015532094536717),
 ('scipy ->  google-maps-api-3', 0.0079395781157411156)]

### let's try some triples, starting with tags very low in the hierarchy (i.e. low generality, high specificity)

In [56]:
sorted(global_similarity_index.items(),key=lambda tpl: tpl[1])[:10]

[('gulp', 0.20157625401838811),
 ('recyclerview', 0.2486415240114675),
 ('socket.io', 0.250170981198097),
 ('docker', 0.26050547377925304),
 ('npm', 0.26063850119479154),
 ('webpack', 0.26227745762404953),
 ('hive', 0.26293479328716896),
 ('boost', 0.26300122888860139),
 ('tensorflow', 0.26525913580033628),
 ('paypal', 0.2682650078066654)]

In [57]:
sorted_incoming_cobrinha_factors_dict['gulp'][:20]

[('coffeescript ->  gulp', 0.13882717441030512),
 ('javascript ->  gulp', 0.12033206487075539),
 ('coldfusion ->  gulp', 0.11772339180950987),
 ('web ->  gulp', 0.11633950191405606),
 ('c# ->  gulp', 0.11452330026514988),
 ('.net ->  gulp', 0.11448401290734911),
 ('c#-4.0 ->  gulp', 0.10699476788839357),
 ('node.js ->  gulp', 0.10678131807941359),
 ('web-applications ->  gulp', 0.10419416569360677),
 ('ecmascript-6 ->  gulp', 0.10197472323314236),
 ('automation ->  gulp', 0.10157133557243157),
 ('linux ->  gulp', 0.10107469324106806),
 ('performance ->  gulp', 0.10017051222068918),
 ('windows ->  gulp', 0.09909466731890966),
 ('optimization ->  gulp', 0.098718772929079393),
 ('.net-4.0 ->  gulp', 0.098115280610889866),
 ('configuration ->  gulp', 0.097985843258104652),
 ('design ->  gulp', 0.097631867646236437),
 ('html5 ->  gulp', 0.096834775374255),
 ('cocoa ->  gulp', 0.096202414661816102)]

In [58]:
sorted_incoming_cobrinha_factors_dict['coffeescript'][:20]

[('c# ->  coffeescript', 0.035445919050105981),
 ('coldfusion ->  coffeescript', 0.032150186174086902),
 ('.net ->  coffeescript', 0.03126648138536204),
 ('c#-4.0 ->  coffeescript', 0.022034150047126558),
 ('web ->  coffeescript', 0.018076878485529555),
 ('design ->  coffeescript', 0.0075495281995318604),
 ('performance ->  coffeescript', 0.0066114267753566566),
 ('optimization ->  coffeescript', 0.0039848127730503129),
 ('mfc ->  coffeescript', 0.0030182840220132723),
 ('cocoa ->  coffeescript', -0.0012907318889337019),
 ('user-interface ->  coffeescript', -0.0013297721046117872),
 ('web-applications ->  coffeescript', -0.0016975697879196233),
 ('javascript ->  coffeescript', -0.0019645150807168048),
 ('.net-4.0 ->  coffeescript', -0.0024545700231874386),
 ('java ->  coffeescript', -0.002990910195516673),
 ('dynamic ->  coffeescript', -0.0044574828723866365),
 ('security ->  coffeescript', -0.0051950130275499589),
 ('php ->  coffeescript', -0.0066807928131792814),
 ('vb6 ->  coffeescr

so we get **"c#" -> "coffeescript" -> "gulp"**, which is clearly not good

In [60]:
sorted_incoming_cobrinha_factors_dict['hive'][:10]

[('coldfusion ->  hive', 0.13748459498035853),
 ('c# ->  hive', 0.13161798431971952),
 ('.net ->  hive', 0.12769980195420483),
 ('c#-4.0 ->  hive', 0.12596012215642938),
 ('database ->  hive', 0.12145453843415036),
 ('performance ->  hive', 0.12104652250281114),
 ('optimization ->  hive', 0.11872944117317449),
 ('web ->  hive', 0.11638067545269859),
 ('java ->  hive', 0.1156458923354256),
 ('design ->  hive', 0.1111543344936201)]

In [61]:
sorted_incoming_cobrinha_factors_dict['coldfusion'][:10]

[('c# ->  coldfusion', 0.0042803293067116633),
 ('.net ->  coldfusion', 0.00014084887208769701),
 ('c#-4.0 ->  coldfusion', -0.0091766853357258069),
 ('web ->  coldfusion', -0.014480419471328708),
 ('design ->  coldfusion', -0.024499406618856384),
 ('performance ->  coldfusion', -0.025661573715688574),
 ('optimization ->  coldfusion', -0.028176300926012431),
 ('mfc ->  coldfusion', -0.028876880175804281),
 ('coffeescript ->  coldfusion', -0.032150186174086902),
 ('user-interface ->  coldfusion', -0.032662688192373973)]

so we get **"c#" -> "coldfusion" -> "hive"**, which is also clearly not good

In [63]:
sorted_incoming_cobrinha_factors_dict['docker'][:10]

[('coldfusion ->  docker', 0.13409714424381344),
 ('.net ->  docker', 0.13260708249310407),
 ('c# ->  docker', 0.1296229575297374),
 ('web ->  docker', 0.12707355307046081),
 ('linux ->  docker', 0.12178583748256994),
 ('c#-4.0 ->  docker', 0.11985084216284854),
 ('web-applications ->  docker', 0.11579560027797013),
 ('performance ->  docker', 0.11527832747947893),
 ('design ->  docker', 0.11406034237402304),
 ('windows ->  docker', 0.11313187630533728)]

In [64]:
sorted_incoming_cobrinha_factors_dict['coldfusion'][:10]

[('c# ->  coldfusion', 0.0042803293067116633),
 ('.net ->  coldfusion', 0.00014084887208769701),
 ('c#-4.0 ->  coldfusion', -0.0091766853357258069),
 ('web ->  coldfusion', -0.014480419471328708),
 ('design ->  coldfusion', -0.024499406618856384),
 ('performance ->  coldfusion', -0.025661573715688574),
 ('optimization ->  coldfusion', -0.028176300926012431),
 ('mfc ->  coldfusion', -0.028876880175804281),
 ('coffeescript ->  coldfusion', -0.032150186174086902),
 ('user-interface ->  coldfusion', -0.032662688192373973)]

so we get **"c#" -> "coldfusion" -> "docker"**, which is also clearly not good

> i think the lesson here is that tags that are very common like "c#" and ".net" (and "coldfusion"??) are dominating the effect here, although it's nice to see more reasonable parent tags a bit down the ranking..

> maybe we need to tweak the weighting so that tag similarity has more say than generality.