In [1]:
import csv
import os
import pickle
import re
import sys
import numpy as np

from joblib import Parallel, delayed

from difflib import SequenceMatcher,get_close_matches

from scipy import spatial

from tqdm import *

# Make the sibling helpers/ directory importable (added to sys.path only once).
module_path = os.path.abspath('../helpers/')
if not (module_path in sys.path):
    sys.path.append(module_path)

# my stuff in the helpers/ directory
import embeddings_helper, files_helper, texts_helper, metrics_helper, tags_helper, cobrinha_helper

from cobrinha_helper import evaluate_cobrinha

In [2]:
# Directory holding the pickled tag-hierarchy artifacts used by this notebook.
# Set the AUTO_TAGGER_PICKLE_ROOT environment variable to point at another
# location; the original machine-specific path stays the default.
# Joining with "" guarantees a trailing separator, because other cells build
# file names with plain string concatenation (PICKLE_ROOT + "name.p").
PICKLE_ROOT = os.path.join(
    os.environ.get(
        "AUTO_TAGGER_PICKLE_ROOT",
        "/media/felipe/ssd_vol/auto-tagger/data/tag-hierarchy/",
    ),
    "",
)

In [4]:
# Load the tag vocabulary. The `with` block closes the file handle right after
# reading instead of leaving it open until garbage collection.
# NOTE: pickle.load can execute arbitrary code -- only load files produced by this project.
with open(os.path.join(PICKLE_ROOT, "tag_vocabulary.p"), "rb") as pickle_file:
    tag_vocabulary = pickle.load(pickle_file)

In [5]:
# Load the per-tag similarity lists. The `with` block closes the file handle
# right after reading instead of leaving it open until garbage collection.
# NOTE: pickle.load can execute arbitrary code -- only load files produced by this project.
with open(os.path.join(PICKLE_ROOT, "sorted_similarity_dict.p"), "rb") as pickle_file:
    sorted_similarity_dict = pickle.load(pickle_file)

In [6]:
# Load the tag vector index. The `with` block closes the file handle right
# after reading instead of leaving it open until garbage collection.
# NOTE: pickle.load can execute arbitrary code -- only load files produced by this project.
with open(os.path.join(PICKLE_ROOT, "tag_vectors_index.p"), "rb") as pickle_file:
    tag_vectors_index = pickle.load(pickle_file)

In [7]:
# Load the tag -> average global similarity mapping. The `with` block closes the
# file handle right after reading instead of leaving it open until garbage collection.
# NOTE: pickle.load can execute arbitrary code -- only load files produced by this project.
with open(os.path.join(PICKLE_ROOT, "global_similarity_index.p"), "rb") as pickle_file:
    global_similarity_index = pickle.load(pickle_file)

## top and bottom tags by avg global similarity

In [8]:
# bottom: (tag, average global similarity) pairs, lowest average first
sorted_global_similarity_index = {}
as_tpls = list(global_similarity_index.items())
sorted_tags_by_global_avg_similarities = sorted(
    as_tpls,
    key=lambda tag_and_avg: tag_and_avg[1],
)
sorted_tags_by_global_avg_similarities

[('nsdateformatter', 0.1235006242737529),
 ('gulp', 0.12550886299406785),
 ('theano', 0.13279889214672427),
 ('iad', 0.13424252876119119),
 ('pygame', 0.13871944265382741),
 ('axis2', 0.14036686548531579),
 ('sftp', 0.14242771100503593),
 ('paypal-sandbox', 0.14398429016189587),
 ('travis-ci', 0.14473097238693353),
 ('gitlab', 0.14503271788250655),
 ('log4j', 0.14508752021779084),
 ('slf4j', 0.14819543986540723),
 ('javamail', 0.14902018199608563),
 ('mechanize', 0.15049932244217945),
 ('apache-kafka', 0.15106057493225525),
 ('fancybox', 0.15159771490274485),
 ('jetty', 0.15201504807841859),
 ('virtualenv', 0.15447451239571078),
 ('webpack', 0.15522150974420912),
 ('llvm', 0.15725147767239797),
 ('quartz-scheduler', 0.15783489671850803),
 ('midi', 0.15820764687164901),
 ('cocoapods', 0.15948048952623028),
 ('retrofit', 0.1596460411736727),
 ('wxpython', 0.16242012845974035),
 ('hbase', 0.16243101847306801),
 ('android-volley', 0.16243944126192991),
 ('rmi', 0.16364924228231351),
 ('pas

In [9]:
# top: (tag, average global similarity) pairs, highest average first
sorted_global_similarity_index = {}
as_tpls = list(global_similarity_index.items())
sorted_tags_by_global_avg_similarities = sorted(
    as_tpls,
    key=lambda tag_and_avg: tag_and_avg[1],
    reverse=True,
)
sorted_tags_by_global_avg_similarities

# note that tags such as "web", "performance", "design" and "security" are NOWHERE NEAR the most frequently
# assigned tags on SO, but they show up here... which makes sense, because they are very general indeed


[('c#', 0.49919061461063147),
 ('.net', 0.4906330688920813),
 ('web', 0.46236556224477521),
 ('javascript', 0.46090793161679788),
 ('java', 0.45979486356464466),
 ('performance', 0.45854124725191381),
 ('design', 0.45395573094628389),
 ('asp.net', 0.45367976141959593),
 ('c#-4.0', 0.45299475453155036),
 ('php', 0.45174358633581246),
 ('security', 0.45005267514074676),
 ('ios', 0.4496675786386487),
 ('html5', 0.44855217883790166),
 ('user-interface', 0.44806437813616451),
 ('windows', 0.4443752702298982),
 ('objective-c', 0.44265952429592326),
 ('python', 0.44144165140387531),
 ('html', 0.4412673770548915),
 ('winforms', 0.43905899622518629),
 ('cocoa-touch', 0.4389965363565857),
 ('database', 0.43897614932603191),
 ('linux', 0.4387044930399584),
 ('iphone', 0.43767839535312786),
 ('optimization', 0.43441716788329221),
 ('cocoa', 0.43375038948195138),
 ('c++', 0.4332760811469023),
 ('.net-4.0', 0.43154741280860065),
 ('debugging', 0.43022635613098187),
 ('osx', 0.42898463182403673),
 ('

In [10]:
def ev(a,b):
    """Evaluate the ordered tag pair (a, b) with `evaluate_cobrinha`.

    Shorthand that forwards both tags together with the three module-level
    lookup structures loaded from the pickles (`tag_vectors_index`,
    `sorted_similarity_dict`, `global_similarity_index`).

    Returns whatever `evaluate_cobrinha` returns; the cells in this notebook
    unpack it as a 2-tuple `(avg_avg, mutual_similarity)`.
    """
    lookup_tables = (tag_vectors_index, sorted_similarity_dict, global_similarity_index)
    return evaluate_cobrinha(a, b, *lookup_tables)

In [11]:
ev('sql','sql-server')

(0.0070967009816618742, 0.93703034503959293)

In [12]:
ev('sql-server','sql-server-2008')

(0.0096505228770326079, 0.95314746117205129)

In [13]:
ev('python','python-3.x')

(0.040985284208367223, 0.93970513911177911)

In [14]:
ev('python-2.7','python-3.x')

(0.015970770594200379, 0.91621126289995569)

In [15]:
ev('database','oracle')

(0.084673539379494567, 0.7216454102798322)

In [16]:
# hmmm... maybe because rails is a much stronger child of something like "web-framework"?
ev('ruby','ruby-on-rails')

(-0.0001910435892515383, 0.89301743279045898)

In [17]:
ev('frameworks','ruby-on-rails')

(-0.026744766541324128, 0.57094666325493837)

In [18]:
ev('.net','.net-3.5')

(0.11443380397153846, 0.78668759626158458)

In [19]:
# makes sense
ev('.net-3.5','.net-4.0') 

(-0.055348147888057808, 0.70594395665365461)

In [20]:
ev('android','android-service') 

(0.14700006370638508, 0.64239765071837984)

In [21]:
ev('asp.net','asp.net-mvc') 

(0.03085921578057349, 0.83935673681214829)

In [22]:
ev('asp.net-mvc','asp.net-mvc-5') 

(0.027155937346624803, 0.855903967197713)

## what about unrelated stuff?

In [23]:
ev('java','arrays') 

(0.11210779948226035, 0.61990945740723769)

In [24]:
ev('ruby','python-2.7') 

(-0.0091886073917304967, 0.62794700331060238)

In [25]:
ev('database','python-2.7') 

(0.022549011536323438, 0.63820873170802761)

## what's the mean and stddev of the similarity between all tags and each other?

In [26]:
running_avgs = []

# For every tag, average the similarity scores (second element of each entry)
# stored for it, and collect those per-tag averages.
for tag, similarities_to_other_tags in sorted_similarity_dict.items():
    sims = [entry[1] for entry in similarities_to_other_tags]
    avg = np.mean(sims)
    running_avgs.append(avg)

# Mean and standard deviation of the per-tag averages.
per_tag_avgs = np.array(running_avgs)
per_tag_avgs.mean(), per_tag_avgs.std()

(0.27801305871657722, 0.062845079461859549)

## what are the tag pairs that are the best cobrinha parts?

In [27]:
similarities_to_other_tags = sorted_similarity_dict["sql"]

In [28]:
similarities_to_other_tags[1]

('tsql', 0.92462096587922771)

In [29]:
lst = sorted_similarity_dict.items()

In [30]:
# One empty list per vocabulary tag; later cells append (pair label, factor)
# tuples to them. Each dict gets its own fresh list objects.
outgoing_cobrinha_factors_dict = {vocabulary_tag: [] for vocabulary_tag in tag_vocabulary}
incoming_cobrinha_factors_dict = {vocabulary_tag: [] for vocabulary_tag in tag_vocabulary}

    
    

In [31]:
# TOP_VALUES_TO_CONSIDER = 50

# Start from empty per-tag lists every time this cell runs. The loop below only
# appends, so re-running this (slow) cell on top of already-filled dicts would
# store every pair twice.
outgoing_cobrinha_factors_dict = {known_tag: [] for known_tag in tag_vocabulary}
incoming_cobrinha_factors_dict = {known_tag: [] for known_tag in tag_vocabulary}

for (tag, similarities_to_other_tags) in tqdm(sorted_similarity_dict.items()):

    for (other_tag, similarity_to_other_tag) in similarities_to_other_tags:

        # ev() yields a pair that is unpacked here; their product is used as the pair's score
        (avg_avg, mutual_similarity) = ev(tag, other_tag)

        cobrinha_factor = avg_avg * mutual_similarity
        pair = "{} ->  {}".format(tag, other_tag)

        # file the same (label, score) entry under the source tag and under the target tag
        outgoing_cobrinha_factors_dict[tag].append((pair, cobrinha_factor))
        incoming_cobrinha_factors_dict[other_tag].append((pair, cobrinha_factor))

100%|██████████| 1704/1704 [22:56<00:00,  1.34it/s]


In [32]:
incoming_cobrinha_factors_dict['sql']

[('preg-replace ->  sql', -0.057389376315855722),
 ('vaadin ->  sql', -0.052744085811893292),
 ('android-layout ->  sql', -0.013391365354426444),
 ('httpclient ->  sql', -0.049718240971936958),
 ('exchange-server ->  sql', -0.057637841663218781),
 ('tableview ->  sql', -0.063631191013914082),
 ('awk ->  sql', -0.059712127536309278),
 ('package ->  sql', -0.041086969452543386),
 ('scrollview ->  sql', -0.04965718483216211),
 ('regex ->  sql', -0.024466701217179837),
 ('file-io ->  sql', -0.0088514777254997123),
 ('richfaces ->  sql', -0.05200920307339691),
 ('.net-4.0 ->  sql', 0.019316479509167046),
 ('load-balancing ->  sql', -0.045901005841555542),
 ('git ->  sql', -0.052097396536755609),
 ('udp ->  sql', -0.052668528542991158),
 ('webview ->  sql', -0.049219534535902755),
 ('segmentation-fault ->  sql', -0.053737536331156605),
 ('winapi ->  sql', 0.0073153785124733701),
 ('field ->  sql', -0.042016321793309268),
 ('maven ->  sql', -0.046559275080789927),
 ('clr ->  sql', -0.04410380

## each tag, with its most likely child/parent tags in its cobrinha

In [33]:
# For each tag, order its (pair label, cobrinha factor) entries from the highest
# factor to the lowest; entries with equal factors keep their original order.
sorted_outgoing_cobrinha_factors_dict = {
    tag: sorted(scored_pairs, key=lambda scored_pair: scored_pair[1], reverse=True)
    for tag, scored_pairs in outgoing_cobrinha_factors_dict.items()
}

sorted_incoming_cobrinha_factors_dict = {
    tag: sorted(scored_pairs, key=lambda scored_pair: scored_pair[1], reverse=True)
    for tag, scored_pairs in incoming_cobrinha_factors_dict.items()
}

In [35]:
# Persist the sorted outgoing factors. The `with` block closes the file when the
# dump finishes, so the data is flushed to disk before any later cell or process
# reads it. os.path.join points at the same file as PICKLE_ROOT + "/..." did.
with open(os.path.join(PICKLE_ROOT, "sorted_outgoing_cobrinha_factors_dict.p"), "wb") as pickle_file:
    pickle.dump(sorted_outgoing_cobrinha_factors_dict, pickle_file)

In [36]:
# Persist the sorted incoming factors. The `with` block closes the file when the
# dump finishes, so the data is flushed to disk before any later cell or process
# reads it. os.path.join points at the same file as PICKLE_ROOT + "/..." did.
with open(os.path.join(PICKLE_ROOT, "sorted_incoming_cobrinha_factors_dict.p"), "wb") as pickle_file:
    pickle.dump(sorted_incoming_cobrinha_factors_dict, pickle_file)

In [37]:
sorted_incoming_cobrinha_factors_dict["cocoa"][:20]

[('c# ->  cocoa', 0.050269015322713412),
 ('.net ->  cocoa', 0.043480403166406396),
 ('web ->  cocoa', 0.020357344291883593),
 ('javascript ->  cocoa', 0.019193286853413313),
 ('java ->  cocoa', 0.017796699554601537),
 ('performance ->  cocoa', 0.017509169789314431),
 ('design ->  cocoa', 0.014648085949147127),
 ('asp.net ->  cocoa', 0.013691914414744058),
 ('c#-4.0 ->  cocoa', 0.013334739405938469),
 ('ios ->  cocoa', 0.01301635419733287),
 ('php ->  cocoa', 0.012021867130245124),
 ('security ->  cocoa', 0.011434863962037465),
 ('user-interface ->  cocoa', 0.010672397030586688),
 ('html5 ->  cocoa', 0.010382024748637336),
 ('windows ->  cocoa', 0.0075579940416240707),
 ('objective-c ->  cocoa', 0.0074903425439164289),
 ('python ->  cocoa', 0.005285674128554331),
 ('html ->  cocoa', 0.0051343533811515889),
 ('cocoa-touch ->  cocoa', 0.0043010629891521643),
 ('winforms ->  cocoa', 0.0037189101104810156)]

In [38]:
# (tag, average global similarity) pairs, lowest average first
sorted_global_similarity_index = {}
as_tpls = list(global_similarity_index.items())
sorted_tags_by_global_avg_similarities = sorted(
    as_tpls,
    key=lambda tag_and_avg: tag_and_avg[1],
)
sorted_tags_by_global_avg_similarities


[('nsdateformatter', 0.1235006242737529),
 ('gulp', 0.12550886299406785),
 ('theano', 0.13279889214672427),
 ('iad', 0.13424252876119119),
 ('pygame', 0.13871944265382741),
 ('axis2', 0.14036686548531579),
 ('sftp', 0.14242771100503593),
 ('paypal-sandbox', 0.14398429016189587),
 ('travis-ci', 0.14473097238693353),
 ('gitlab', 0.14503271788250655),
 ('log4j', 0.14508752021779084),
 ('slf4j', 0.14819543986540723),
 ('javamail', 0.14902018199608563),
 ('mechanize', 0.15049932244217945),
 ('apache-kafka', 0.15106057493225525),
 ('fancybox', 0.15159771490274485),
 ('jetty', 0.15201504807841859),
 ('virtualenv', 0.15447451239571078),
 ('webpack', 0.15522150974420912),
 ('llvm', 0.15725147767239797),
 ('quartz-scheduler', 0.15783489671850803),
 ('midi', 0.15820764687164901),
 ('cocoapods', 0.15948048952623028),
 ('retrofit', 0.1596460411736727),
 ('wxpython', 0.16242012845974035),
 ('hbase', 0.16243101847306801),
 ('android-volley', 0.16243944126192991),
 ('rmi', 0.16364924228231351),
 ('pas

In [None]:
sorted_tags_by_global_avg_similarities

In [None]:
# For the first 50 entries of the ranking, print the tag followed by the first
# 10 entries of its sorted incoming cobrinha factors.
# The unused second tuple element gets a real name instead of `_`: assigning `_`
# at the top level of a notebook overrides IPython's "last output" variable.
for very_specific_tag, avg_global_similarity in sorted_tags_by_global_avg_similarities[:50]:
    print(very_specific_tag, sorted_incoming_cobrinha_factors_dict[very_specific_tag][:10], "\n")

## now let's try to find a triple: two tags C and B that have a good cobrinha factor, and a third tag A that has a good cobrinha factor with both B and C (but cobrinha(A,B) should probably be higher than cobrinha(A,C))


A -> B -> C

(TAG C is the most specific tag of the three)
