In [1]:
import csv
import os
import pickle
import re
import sys
import numpy as np

from joblib import Parallel, delayed

from difflib import SequenceMatcher,get_close_matches

from scipy import spatial

from tqdm import *

module_path = os.path.abspath(os.path.join('../helpers/'))
if module_path not in sys.path:
    sys.path.append(module_path)

# my stuff in the helpers/ directory
import embeddings_helper, files_helper, texts_helper, metrics_helper, tags_helper, cobrinha_helper

from cobrinha_helper import get_metrics_for_tag_pair

In [2]:
PICKLE_ROOT = "/media/felipe/SSD_VOLUME/auto-tagger/data/tag-hierarchy/"

In [3]:
tag_vocabulary = pickle.load(open(PICKLE_ROOT+"tag_vocabulary.p","rb"))

In [4]:
sorted_pairwise_similarity_dict = pickle.load(open(PICKLE_ROOT+"sorted_pairwise_similarity_dict.p","rb"))

In [5]:
tag_vectors_index = pickle.load(open(PICKLE_ROOT+"tag_vectors_index.p","rb"))

In [6]:
global_similarity_index = pickle.load(open(PICKLE_ROOT+"global_similarity_index.p","rb"))

### top and bottom tags by avg global similarity

In [7]:
# bottom
sorted_global_similarity_index = dict()
as_tpls = [(tag,avg_global_sim) for tag,avg_global_sim in global_similarity_index.items()]
sorted_tags_by_global_avg_similarities = sorted(as_tpls, key=lambda tpl:tpl[1] )
sorted_tags_by_global_avg_similarities

[('gulp', 0.19766495986060426),
 ('vagrant', 0.23857715599421261),
 ('socket.io', 0.24158315695390806),
 ('recyclerview', 0.24221057782043745),
 ('npm', 0.24231542434386466),
 ('hive', 0.24605001039331367),
 ('webpack', 0.2493951735363871),
 ('docker', 0.25095802293264158),
 ('jqgrid', 0.2598639010154486),
 ('youtube', 0.26563900160188986),
 ('datepicker', 0.26816533883839311),
 ('paypal', 0.2709259754453166),
 ('tensorflow', 0.27096232790665409),
 ('boost', 0.27126078716224117),
 ('gruntjs', 0.27268325513792768),
 ('tkinter', 0.27322499288040253),
 ('webview', 0.27369232506109586),
 ('redis', 0.27777218314489327),
 ('ffmpeg', 0.27789775174679776),
 ('solr', 0.2779502962572698),
 ('three.js', 0.28348669966985779),
 ('cmake', 0.28349518135341456),
 ('google-maps-api-3', 0.28553878286108819),
 ('android-viewpager', 0.28608377022949311),
 ('ftp', 0.28628729752226861),
 ('gradle', 0.2892250476315788),
 ('django-models', 0.2897025535409814),
 ('d3.js', 0.29068508230329743),
 ('linked-list',

note that tags such as "web", "performance", "design", "security" and "user-interface" are **nowhere near** the top assigned tags on SO, but they show up here... and it makes sense because they're very general indeed

In [8]:
# top
sorted_global_similarity_index = dict()
as_tpls = [(tag,avg_global_sim) for tag,avg_global_sim in global_similarity_index.items()]
sorted_tags_by_global_avg_similarities = sorted(as_tpls, key=lambda tpl:tpl[1],reverse=True )
sorted_tags_by_global_avg_similarities

[('c#', 0.61429474374359472),
 ('.net', 0.60856767935547096),
 ('coldfusion', 0.60133827473238266),
 ('web', 0.58749988631852479),
 ('c#-4.0', 0.58703059122895906),
 ('design', 0.57828630242397228),
 ('mfc', 0.57473086547385188),
 ('performance', 0.57384818812305094),
 ('optimization', 0.56875933925500555),
 ('.net-4.0', 0.56798499202169783),
 ('javascript', 0.56788158824929913),
 ('java', 0.56740237398190985),
 ('web-applications', 0.56587494861950982),
 ('user-interface', 0.5655724222812859),
 ('security', 0.56419463079111587),
 ('php', 0.56297895418686694),
 ('cocoa', 0.56254237759167014),
 ('asp.net', 0.56224469352671291),
 ('coffeescript', 0.55656888421314732),
 ('winforms', 0.55459198118370545),
 ('ios', 0.5540301052911113),
 ('dynamic', 0.55351068327095154),
 ('concurrency', 0.55267148872013983),
 ('cocoa-touch', 0.55240060669064717),
 ('windows', 0.55174105696106779),
 ('linux', 0.5482666224087257),
 ('html5', 0.54759174101172481),
 ('database', 0.54677282384283743),
 ('model-v

In [9]:
def evaluate(a,b):
    return get_metrics_for_tag_pair(a,b,tag_vectors_index, sorted_pairwise_similarity_dict, global_similarity_index)

> the result is a tuple of the form (difference_of_global_averages, mutual_similarity)

positive difference means the first tag is "more general" than the second tag (according to our measure)


In [10]:
evaluate('sql','sql-server')

(0.0014637286328647581, 0.95058720181444423)

In [11]:
evaluate('sql-server','sql-server-2008')

(0.010008514319155559, 0.9821119605172598)

In [12]:
evaluate('python','python-3.x')

(0.033323675934456398, 0.97409185491307604)

In [13]:
evaluate('python-2.7','python-3.x')

(0.0080123725767506926, 0.9684609799143078)

In [15]:
evaluate('database','oracle')

(0.096227895157305732, 0.75508846182626643)

In [16]:
# hmmm... maybe because rails is a much more stronger child of something like "web-framework"?
evaluate('ruby','ruby-on-rails')

(-0.0033985158054713471, 0.89351526935353753)

In [17]:
# hmmmm
evaluate('frameworks','ruby-on-rails')

(-0.032766935362076199, 0.57008494724236014)

In [20]:
evaluate('web','ruby-on-rails')

(0.081283044467135634, 0.71068977434233216)

In [18]:
evaluate('.net','.net-4.0')

(0.040582687333773126, 0.94375767300965463)

In [19]:
evaluate('android','android-studio') 

(0.1304544666069315, 0.82292004418484122)

In [21]:
evaluate('asp.net','asp.net-mvc') 

(0.034968450719514266, 0.84565024635719455)

In [22]:
evaluate('asp.net-mvc','asp.net-mvc-5') 

(0.0070798645057762455, 0.95671340973047336)

In [23]:
evaluate('asp.net','asp.net-mvc-5') 

(0.042048315225290511, 0.82161437322541109)

## what about unrelated stuff?

this will probably fluctuate around zero

In [25]:
evaluate('java','arrays')

(0.12649887344671018, 0.62727084914851483)

In [26]:
evaluate('ruby','python-2.7') 

(-0.016851103574039517, 0.64133340229546998)

In [27]:
evaluate('database','python-2.7') 

(0.027103394222880106, 0.6490934775334003)

## what are the tag pairs that are the best cobrinha parts?

In [28]:
similarities_to_other_tags = sorted_pairwise_similarity_dict["sql"]

In [29]:
similarities_to_other_tags[1]

('sql-server', 0.95058720181444423)

In [30]:
lst = sorted_pairwise_similarity_dict.items()

In [34]:
def make_pairwise_cobrinha_factor(pairwise_similarity_dict):

    outgoing_cobrinha_factors_dict = dict()
    incoming_cobrinha_factors_dict = dict()


    for tag in tag_vocabulary:
        outgoing_cobrinha_factors_dict[tag] = list()
        incoming_cobrinha_factors_dict[tag] = list()
        
        
    for (tag, similarities_to_other_tags) in tqdm(pairwise_similarity_dict.items()):
    
        for (other_tag, similarity_to_other_tag) in similarities_to_other_tags:

            (avg_avg, mutual_similarity)= evaluate(tag,other_tag)

            cobrinha_factor = avg_avg * mutual_similarity
            pair = "{} ->  {}".format(tag,other_tag)    

            outgoing_cobrinha_factors_dict[tag].append( (pair,cobrinha_factor))
            incoming_cobrinha_factors_dict[other_tag].append( (pair,cobrinha_factor))    
        
        
    return (outgoing_cobrinha_factors_dict,incoming_cobrinha_factors_dict)

In [35]:
outgoing_factors_dict,incoming_factors_dict = make_pairwise_cobrinha_factor(sorted_pairwise_similarity_dict)

100%|██████████| 618/618 [00:13<00:00, 44.90it/s]


In [36]:
incoming_factors_dict['sql']

[('webpack ->  sql', -0.061934197280978119),
 ('lambda ->  sql', -0.046427250594213633),
 ('github ->  sql', -0.055538934548633449),
 ('go ->  sql', -0.03288196186202097),
 ('xsd ->  sql', -0.063911578657909196),
 ('ionic-framework ->  sql', -0.044915735440478298),
 ('heroku ->  sql', -0.063812697415960726),
 ('windows ->  sql', 0.03419272204024755),
 ('asp.net-mvc-2 ->  sql', -0.0026520842940314987),
 ('file-io ->  sql', -0.0049470694267037922),
 ('merge ->  sql', -0.057815894315084188),
 ('http ->  sql', 0.0093973697669063189),
 ('xpath ->  sql', -0.058869605615832328),
 ('mfc ->  sql', 0.050435049802104677),
 ('emacs ->  sql', -0.062873235027305824),
 ('foreach ->  sql', -0.038926975092705235),
 ('table ->  sql', -0.018789427674080365),
 ('iis-7 ->  sql', -0.014264685605627951),
 ('android-asynctask ->  sql', -0.053976504875829258),
 ('xml ->  sql', -0.021027932459946459),
 ('typescript ->  sql', -0.031751804585411388),
 ('pdf ->  sql', -0.06374043412267133),
 ('tfs ->  sql', -0.055

## each tag, with its most likely child/parent tags in its cobrinha

"parent" -> "child"

a leaf/terminal tag has no "parent" tag
the root tag has no "parent" tag

In [37]:
sorted_outgoing_factors_dict = dict()
sorted_incoming_factors_dict = dict()

for (tag,similarities) in outgoing_factors_dict.items():
    sorted_similarities = sorted(similarities,key=lambda tpl: tpl[1], reverse=True)
    
    sorted_outgoing_factors_dict[tag] = sorted_similarities
    
for (tag,similarities) in incoming_factors_dict.items():
    sorted_similarities = sorted(similarities,key=lambda tpl: tpl[1], reverse=True)
    
    sorted_incoming_factors_dict[tag] = sorted_similarities        

In [38]:
sorted_outgoing_factors_dict['sql'][:20]

[('sql ->  hive', 0.10276002713402385),
 ('sql ->  sqlalchemy', 0.094713737813830662),
 ('sql ->  stored-procedures', 0.093697645399657289),
 ('sql ->  pdo', 0.092448899800979589),
 ('sql ->  cassandra', 0.081286907002972864),
 ('sql ->  triggers', 0.080766614926709529),
 ('sql ->  group-by', 0.079990703914387812),
 ('sql ->  solr', 0.079526179575641884),
 ('sql ->  crystal-reports', 0.079489616749060299),
 ('sql ->  join', 0.079430678775405286),
 ('sql ->  jqgrid', 0.079298781585696851),
 ('sql ->  pandas', 0.075814129772925537),
 ('sql ->  datepicker', 0.075305507452054235),
 ('sql ->  apache-spark', 0.074622211190207607),
 ('sql ->  django-models', 0.074128399924297636),
 ('sql ->  date', 0.074089251831816569),
 ('sql ->  doctrine2', 0.073740729782105288),
 ('sql ->  redis', 0.072463749411050854),
 ('sql ->  awk', 0.071576014960321135),
 ('sql ->  mongoose', 0.071425372118284536)]

In [39]:
pickle.dump(sorted_outgoing_factors_dict,open(PICKLE_ROOT+"/sorted_outgoing_cobrinha_factors_dict.p","wb"))

In [40]:
sorted_incoming_factors_dict['sql'][:20]

[('c# ->  sql', 0.085390300456521517),
 ('coldfusion ->  sql', 0.080063227078340182),
 ('.net ->  sql', 0.079232246702481607),
 ('c#-4.0 ->  sql', 0.064849222935643103),
 ('web ->  sql', 0.059771984718923983),
 ('performance ->  sql', 0.058155180925289235),
 ('design ->  sql', 0.056220383191762383),
 ('optimization ->  sql', 0.053104891599728074),
 ('mfc ->  sql', 0.050435049802104677),
 ('php ->  sql', 0.047990328803707426),
 ('.net-4.0 ->  sql', 0.046292334747653791),
 ('javascript ->  sql', 0.045768591706195058),
 ('java ->  sql', 0.045135552585241989),
 ('web-applications ->  sql', 0.044673000616481426),
 ('security ->  sql', 0.044533369686082541),
 ('asp.net ->  sql', 0.044172766699079873),
 ('database ->  sql', 0.043839806447319593),
 ('user-interface ->  sql', 0.043635938933251225),
 ('cocoa ->  sql', 0.042109292814787716),
 ('dynamic ->  sql', 0.038831760049065585)]

In [41]:
pickle.dump(sorted_incoming_factors_dict,open(PICKLE_ROOT+"/sorted_incoming_cobrinha_factors_dict.p","wb"))

In [42]:
sorted_incoming_factors_dict["sql"][:20]

[('c# ->  sql', 0.085390300456521517),
 ('coldfusion ->  sql', 0.080063227078340182),
 ('.net ->  sql', 0.079232246702481607),
 ('c#-4.0 ->  sql', 0.064849222935643103),
 ('web ->  sql', 0.059771984718923983),
 ('performance ->  sql', 0.058155180925289235),
 ('design ->  sql', 0.056220383191762383),
 ('optimization ->  sql', 0.053104891599728074),
 ('mfc ->  sql', 0.050435049802104677),
 ('php ->  sql', 0.047990328803707426),
 ('.net-4.0 ->  sql', 0.046292334747653791),
 ('javascript ->  sql', 0.045768591706195058),
 ('java ->  sql', 0.045135552585241989),
 ('web-applications ->  sql', 0.044673000616481426),
 ('security ->  sql', 0.044533369686082541),
 ('asp.net ->  sql', 0.044172766699079873),
 ('database ->  sql', 0.043839806447319593),
 ('user-interface ->  sql', 0.043635938933251225),
 ('cocoa ->  sql', 0.042109292814787716),
 ('dynamic ->  sql', 0.038831760049065585)]