In [1]:
import csv
import os
import pickle
import re
import sys
import numpy as np

from joblib import Parallel, delayed

from difflib import SequenceMatcher,get_close_matches

from scipy import spatial

from tqdm import *

# Make the sibling helpers/ directory importable from this notebook.
module_path = os.path.abspath('../helpers/')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

# my stuff in the helpers/ directory
import embeddings_helper, files_helper, texts_helper, metrics_helper, tags_helper, cobrinha_helper

from cobrinha_helper import evaluate_cobrinha

In [2]:
# Directory holding the pickled tag-hierarchy artifacts that this notebook
# reads and writes. The original machine-specific location stays the default;
# set the TAG_HIERARCHY_PICKLE_ROOT environment variable to point elsewhere.
# os.path.join(..., "") guarantees the trailing separator that the
# `PICKLE_ROOT + "<name>.p"` concatenations in other cells rely on.
PICKLE_ROOT = os.path.join(
    os.environ.get(
        "TAG_HIERARCHY_PICKLE_ROOT",
        "/media/felipe/SSD_VOLUME/auto-tagger/data/tag-hierarchy/",
    ),
    "",
)

In [3]:
# Load the tag vocabulary. The context manager closes the file handle as soon
# as the load finishes (the bare open() call left it to the garbage collector).
# pickle can run arbitrary code while loading: only read files you produced yourself.
with open(os.path.join(PICKLE_ROOT, "tag_vocabulary.p"), "rb") as pickle_file:
    tag_vocabulary = pickle.load(pickle_file)

In [4]:
# Load the per-tag similarity lists. The context manager closes the file handle
# as soon as the load finishes (the bare open() call left it to the garbage collector).
# pickle can run arbitrary code while loading: only read files you produced yourself.
with open(os.path.join(PICKLE_ROOT, "sorted_similarity_dict.p"), "rb") as pickle_file:
    sorted_similarity_dict = pickle.load(pickle_file)

In [5]:
# Load the tag vectors index. The context manager closes the file handle as soon
# as the load finishes (the bare open() call left it to the garbage collector).
# pickle can run arbitrary code while loading: only read files you produced yourself.
with open(os.path.join(PICKLE_ROOT, "tag_vectors_index.p"), "rb") as pickle_file:
    tag_vectors_index = pickle.load(pickle_file)

In [6]:
# Load the tag -> average global similarity mapping. The context manager closes
# the file handle as soon as the load finishes (the bare open() call left it to
# the garbage collector).
# pickle can run arbitrary code while loading: only read files you produced yourself.
with open(os.path.join(PICKLE_ROOT, "global_similarity_index.p"), "rb") as pickle_file:
    global_similarity_index = pickle.load(pickle_file)

### top and bottom tags by avg global similarity

In [7]:
# bottom: (tag, average global similarity) pairs, lowest similarity first
sorted_global_similarity_index = {}  # NOTE(review): created but never filled in this cell
as_tpls = list(global_similarity_index.items())
sorted_tags_by_global_avg_similarities = sorted(
    as_tpls,
    key=lambda tag_and_avg: tag_and_avg[1],
)
sorted_tags_by_global_avg_similarities

[('nsdateformatter', 0.1369113894923151),
 ('pygame', 0.14859146231145598),
 ('gulp', 0.15354641191116433),
 ('theano', 0.15478170217439616),
 ('sftp', 0.15503543619381696),
 ('paypal-sandbox', 0.15899195507098682),
 ('axis2', 0.15983060823164733),
 ('log4j', 0.16019117586477241),
 ('mechanize', 0.16387178585079126),
 ('gitlab', 0.1638739455529411),
 ('fancybox', 0.16389120151796757),
 ('jetty', 0.16918591219495652),
 ('javamail', 0.17060498284020989),
 ('virtualenv', 0.17240526727876299),
 ('slf4j', 0.17546285220394753),
 ('apache-kafka', 0.17999755723900276),
 ('quartz-scheduler', 0.18159744811305786),
 ('passport.js', 0.18271926144252998),
 ('realm', 0.18346472342061457),
 ('travis-ci', 0.18411426161946165),
 ('cocoapods', 0.18420145539203461),
 ('retrofit', 0.18458239812421867),
 ('xdebug', 0.18534810597472184),
 ('composer-php', 0.18556924445154585),
 ('hbase', 0.18591040501330608),
 ('phpmailer', 0.18657447573128971),
 ('android-notifications', 0.18668397735601744),
 ('webpack', 

Note that tags such as "web", "performance", "design", "security" and "user-interface" (see the top list below) are **nowhere near** the most frequently assigned tags on SO, yet they rank near the top by average global similarity. This makes sense, because they are very general tags.

In [8]:
# top
sorted_global_similarity_index = dict()
as_tpls = [(tag,avg_global_sim) for tag,avg_global_sim in global_similarity_index.items()]
sorted_tags_by_global_avg_similarities = sorted(as_tpls, key=lambda tpl:tpl[1],reverse=True )
sorted_tags_by_global_avg_similarities

[('c#', 0.52796384399674001),
 ('.net', 0.52237488943626997),
 ('web', 0.49698595034123894),
 ('c#-4.0', 0.49237870705839931),
 ('performance', 0.49045843519413501),
 ('design', 0.48792240596086045),
 ('java', 0.48656225358421379),
 ('javascript', 0.4859904125176337),
 ('asp.net', 0.48341469702671458),
 ('security', 0.48225525430704314),
 ('php', 0.47973539893855099),
 ('user-interface', 0.47965186513161667),
 ('.net-4.0', 0.47752612700700081),
 ('html5', 0.47723670445625721),
 ('optimization', 0.47654262078466109),
 ('windows', 0.47547176112432721),
 ('ios', 0.47507086838459761),
 ('cocoa', 0.47291592025162482),
 ('cocoa-touch', 0.47099675887289727),
 ('winforms', 0.46932929711010801),
 ('linux', 0.46915636196451127),
 ('html', 0.46857508306472001),
 ('python', 0.46815861572867451),
 ('database', 0.46749503757698785),
 ('iphone', 0.46603809814152419),
 ('debugging', 0.46454308828181434),
 ('objective-c', 0.46374456510672163),
 ('dynamic', 0.46250069077647887),
 ('web-applications', 0.

In [9]:
def ev(a, b):
    """Evaluate the tag pair (a, b) with the indexes loaded in this notebook.

    Thin wrapper around ``evaluate_cobrinha`` that fills in
    ``tag_vectors_index``, ``sorted_similarity_dict`` and
    ``global_similarity_index`` so that callers only pass the two tags.

    Per the notebook's notes, the result is a tuple of the form
    (difference_of_global_averages, mutual_similarity).
    """
    result = evaluate_cobrinha(
        a,
        b,
        tag_vectors_index,
        sorted_similarity_dict,
        global_similarity_index,
    )
    return result

remember, the result is a tuple of the form (difference_of_global_averages, mutual_similarity)

positive difference means the first tag is "more general" than the second tag (according to our measure)


In [10]:
# presumed parent -> child pair; a positive first element would mean 'sql' is the more general tag
ev('sql','sql-server')

(0.0046481632200305323, 0.94608012307374756)

In [11]:
# presumed parent -> child pair; a positive first element would mean 'sql-server' is the more general tag
ev('sql-server','sql-server-2008')

(0.0072115339038011372, 0.96609280857516122)

In [12]:
# presumed parent -> child pair; a positive first element would mean 'python' is the more general tag
ev('python','python-3.x')

(0.037947191617892284, 0.95415102565588328)

In [13]:
# two version tags of the same language; presumably siblings rather than parent/child
ev('python-2.7','python-3.x')

(0.017031930552592223, 0.93856081340971576)

In [14]:
# presumed parent -> child pair; a positive first element would mean 'database' is the more general tag
ev('database','oracle')

(0.085258126321881567, 0.74186643781216666)

In [15]:
# hmmm... the difference is almost zero here; maybe because rails is a much
# stronger child of something like "web-framework"?
ev('ruby','ruby-on-rails')

(3.0801422104487219e-05, 0.90130886624722428)

In [16]:
# hmmmm... presumably checking whether 'frameworks' behaves like a parent of
# 'ruby-on-rails'; a positive first element would mean it is the more general tag
ev('frameworks','ruby-on-rails')

(-0.021379740850297202, 0.58776099862052555)

In [17]:
# presumed parent -> child pair; a positive first element would mean '.net' is the more general tag
ev('.net','.net-3.5')

(0.10189813925712238, 0.82610366044219774)

In [18]:
# makes sense
# (two version tags of the same framework compared against each other)
ev('.net-3.5','.net-4.0') 

(-0.057049376827853215, 0.76058674607166943)

In [19]:
# presumed parent -> child pair; a positive first element would mean 'android' is the more general tag
ev('android','android-service') 

(0.13861274631348885, 0.68468875828253828)

In [20]:
# presumed parent -> child pair; a positive first element would mean 'asp.net' is the more general tag
ev('asp.net','asp.net-mvc') 

(0.034241093906521636, 0.84005391327352352)

In [21]:
# presumed parent -> child pair; a positive first element would mean 'asp.net-mvc' is the more general tag
ev('asp.net-mvc','asp.net-mvc-5') 

(0.01188788746170899, 0.89472933091423534)

In [22]:
# presumed grandparent -> grandchild pair, spanning the two pairs evaluated just before
ev('asp.net','asp.net-mvc-5') 

(0.046128981368230626, 0.78797945773202083)

## what about unrelated stuff?

For unrelated tags, the difference of global averages will probably fluctuate around zero.

In [23]:
# presumably unrelated tags; the section note expects the difference to hover around zero
ev('java','arrays')

(0.1155915868012774, 0.62033156246617382)

In [24]:
# presumably unrelated tags; the section note expects the difference to hover around zero
ev('ruby','python-2.7') 

(-0.01237052618558937, 0.64212738224793176)

In [25]:
# presumably unrelated tags; the section note expects the difference to hover around zero
ev('database','python-2.7') 

(0.0202516829136134, 0.64969036563896276)

## what are the mean and stddev, across all tags, of each tag's average similarity to the other tags?

In [26]:
# One entry per tag: the mean of the similarity values in that tag's list.
running_avgs = []

for tag, similarities_to_other_tags in sorted_similarity_dict.items():
    # each entry is indexed as (other_tag, similarity); keep only the similarity
    sims = [entry[1] for entry in similarities_to_other_tags]
    avg = np.mean(sims)
    running_avgs.append(avg)

# mean and standard deviation of the per-tag averages
np.mean(running_avgs), np.std(running_avgs)

(0.31264621826436056, 0.06498841791742585)

## what are the tag pairs that are the best cobrinha parts?

In [27]:
# pull out the similarity list stored for the 'sql' tag so its entries can be inspected
similarities_to_other_tags = sorted_similarity_dict["sql"]

In [28]:
# peek at the entry at index 1; the recorded output shows a (tag, similarity) tuple
similarities_to_other_tags[1]

('tsql', 0.93644411780142545)

In [29]:
# NOTE(review): `lst` is not referenced by any later cell visible in this notebook
lst = sorted_similarity_dict.items()

In [30]:
# One empty list per known tag, to be filled with (pair, cobrinha_factor) tuples.
# "outgoing" is keyed by the first tag of a pair, "incoming" by the second.
outgoing_cobrinha_factors_dict = {tag: [] for tag in tag_vocabulary}
incoming_cobrinha_factors_dict = {tag: [] for tag in tag_vocabulary}

In [31]:
# Score every (tag, other_tag) pair and file the result under both tags.
#
# Both accumulators are rebuilt here so that re-running this cell starts from
# empty lists instead of appending a second copy of every pair.
outgoing_cobrinha_factors_dict = {tag: [] for tag in tag_vocabulary}
incoming_cobrinha_factors_dict = {tag: [] for tag in tag_vocabulary}

for (tag, similarities_to_other_tags) in tqdm(sorted_similarity_dict.items()):

    # the similarity stored in the list is not used: ev() returns its own mutual_similarity
    for (other_tag, _stored_similarity) in similarities_to_other_tags:

        # per the notebook's notes, ev() returns (difference_of_global_averages, mutual_similarity)
        (difference_of_global_averages, mutual_similarity) = ev(tag, other_tag)

        # factor = generality gap weighted by how similar the two tags are
        cobrinha_factor = difference_of_global_averages * mutual_similarity
        pair = "{} ->  {}".format(tag, other_tag)
        scored_pair = (pair, cobrinha_factor)

        # outgoing: keyed by the left-hand tag; incoming: keyed by the right-hand tag
        outgoing_cobrinha_factors_dict[tag].append(scored_pair)
        incoming_cobrinha_factors_dict[other_tag].append(scored_pair)

100%|██████████| 1704/1704 [02:24<00:00, 11.12it/s]


In [32]:
# every incoming pair recorded for 'sql', in insertion order (not yet sorted by factor)
# NOTE(review): this displays the entire list; consider slicing it
incoming_cobrinha_factors_dict['sql']

[('configuration ->  sql', 0.010280384490069393),
 ('maven-3 ->  sql', -0.050896557159101061),
 ('java-stream ->  sql', -0.050898597275896781),
 ('asp.net-core ->  sql', -0.025797617654279733),
 ('pycharm ->  sql', -0.053092905656013427),
 ('three.js ->  sql', -0.052412598513822017),
 ('equals ->  sql', -0.05641813699977135),
 ('join ->  sql', -0.07000414139385297),
 ('internationalization ->  sql', -0.017830817802244662),
 ('aop ->  sql', -0.04847186149941695),
 ('anchor ->  sql', -0.038966931125919521),
 ('asterisk ->  sql', -0.058180113511762992),
 ('user-interface ->  sql', 0.035165200504195346),
 ('draw ->  sql', -0.048870065566385297),
 ('c++-cli ->  sql', -0.018712655144286019),
 ('chef ->  sql', -0.057808556500292818),
 ('compare ->  sql', -0.034713225551558327),
 ('mongodb-query ->  sql', -0.067258238362668213),
 ('open-source ->  sql', -0.037007341849379399),
 ('uiimagepickercontroller ->  sql', -0.051972099202729695),
 ('xquery ->  sql', -0.054421979020099667),
 ('load-balan

## each tag, with its most likely child/parent tags in its cobrinha

"parent" -> "child"

a leaf/terminal tag has no "child" tag
the root tag has no "parent" tag

In [33]:
# Rank each tag's (pair, cobrinha_factor) entries from the highest factor to the lowest.
sorted_outgoing_cobrinha_factors_dict = {
    tag: sorted(scored_pairs, key=lambda scored: scored[1], reverse=True)
    for tag, scored_pairs in outgoing_cobrinha_factors_dict.items()
}

sorted_incoming_cobrinha_factors_dict = {
    tag: sorted(scored_pairs, key=lambda scored: scored[1], reverse=True)
    for tag, scored_pairs in incoming_cobrinha_factors_dict.items()
}

In [34]:
# top 20 outgoing pairs for 'sql' (pairs with 'sql' on the left-hand side), highest cobrinha factor first
sorted_outgoing_cobrinha_factors_dict['sql'][:20]

[('sql ->  left-join', 0.11577437145661604),
 ('sql ->  union', 0.10682778444440537),
 ('sql ->  inner-join', 0.10638646934945287),
 ('sql ->  distinct', 0.10072914563365006),
 ('sql ->  hql', 0.097797428955005053),
 ('sql ->  subquery', 0.097569194466405745),
 ('sql ->  sql-order-by', 0.095941251164230801),
 ('sql ->  case', 0.092984011162197469),
 ('sql ->  pivot', 0.089149057808170534),
 ('sql ->  data.table', 0.088978850192963643),
 ('sql ->  greatest-n-per-group', 0.0876375345161753),
 ('sql ->  sql-azure', 0.086058702325092967),
 ('sql ->  stored-procedures', 0.084834823617687599),
 ('sql ->  prepared-statement', 0.083812741599067681),
 ('sql ->  query-optimization', 0.083544261256610866),
 ('sql ->  innodb', 0.083249143747091137),
 ('sql ->  hive', 0.083010051724714304),
 ('sql ->  sparql', 0.081786956529279073),
 ('sql ->  spark-dataframe', 0.080874610418233836),
 ('sql ->  primary-key', 0.080668747788729053)]

In [35]:
# Persist the sorted outgoing factors. The context manager flushes and closes
# the file deterministically (the bare open() call left that to the garbage
# collector), and os.path.join avoids the doubled "/" that
# PICKLE_ROOT + "/..." produced.
with open(os.path.join(PICKLE_ROOT, "sorted_outgoing_cobrinha_factors_dict.p"), "wb") as pickle_file:
    pickle.dump(sorted_outgoing_cobrinha_factors_dict, pickle_file)

In [36]:
# top 20 incoming pairs for 'sql' (pairs with 'sql' on the right-hand side), highest cobrinha factor first
sorted_incoming_cobrinha_factors_dict['sql'][:20]

[('c# ->  sql', 0.076017263109879704),
 ('.net ->  sql', 0.069735249488555889),
 ('performance ->  sql', 0.05062492185752912),
 ('web ->  sql', 0.048105766071413376),
 ('c#-4.0 ->  sql', 0.048010251944838801),
 ('design ->  sql', 0.043711189046683356),
 ('php ->  sql', 0.04118491960810703),
 ('asp.net ->  sql', 0.040728545186396403),
 ('java ->  sql', 0.040250902230947044),
 ('javascript ->  sql', 0.040198527752782404),
 ('database ->  sql', 0.039079570732830547),
 ('security ->  sql', 0.038636913519285138),
 ('optimization ->  sql', 0.038418706355791005),
 ('user-interface ->  sql', 0.035165200504195346),
 ('.net-4.0 ->  sql', 0.034749300219532715),
 ('html5 ->  sql', 0.033611038485945169),
 ('windows ->  sql', 0.032167628434307011),
 ('ios ->  sql', 0.031834341239257026),
 ('cocoa ->  sql', 0.030729127259777458),
 ('cocoa-touch ->  sql', 0.029636974571906265)]

In [37]:
# Persist the sorted incoming factors. The context manager flushes and closes
# the file deterministically (the bare open() call left that to the garbage
# collector), and os.path.join avoids the doubled "/" that
# PICKLE_ROOT + "/..." produced.
with open(os.path.join(PICKLE_ROOT, "sorted_incoming_cobrinha_factors_dict.p"), "wb") as pickle_file:
    pickle.dump(sorted_incoming_cobrinha_factors_dict, pickle_file)

In [38]:
# top 20 incoming pairs for 'sql' (pairs with 'sql' on the right-hand side), highest cobrinha factor first
# NOTE(review): same expression as an earlier cell; the dict is not modified in between
sorted_incoming_cobrinha_factors_dict["sql"][:20]

[('c# ->  sql', 0.076017263109879704),
 ('.net ->  sql', 0.069735249488555889),
 ('performance ->  sql', 0.05062492185752912),
 ('web ->  sql', 0.048105766071413376),
 ('c#-4.0 ->  sql', 0.048010251944838801),
 ('design ->  sql', 0.043711189046683356),
 ('php ->  sql', 0.04118491960810703),
 ('asp.net ->  sql', 0.040728545186396403),
 ('java ->  sql', 0.040250902230947044),
 ('javascript ->  sql', 0.040198527752782404),
 ('database ->  sql', 0.039079570732830547),
 ('security ->  sql', 0.038636913519285138),
 ('optimization ->  sql', 0.038418706355791005),
 ('user-interface ->  sql', 0.035165200504195346),
 ('.net-4.0 ->  sql', 0.034749300219532715),
 ('html5 ->  sql', 0.033611038485945169),
 ('windows ->  sql', 0.032167628434307011),
 ('ios ->  sql', 0.031834341239257026),
 ('cocoa ->  sql', 0.030729127259777458),
 ('cocoa-touch ->  sql', 0.029636974571906265)]