In [1]:
import csv
import os
import pickle
import re
import sys
import numpy as np

from joblib import Parallel, delayed

from difflib import SequenceMatcher,get_close_matches

from scipy import spatial

from tqdm import *

# Put the sibling helpers/ directory (resolved against the current working
# directory) on the import path, unless it is already there.
module_path = os.path.abspath('../helpers/')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

# my stuff in the helpers/ directory
import embeddings_helper, files_helper, texts_helper, metrics_helper, tags_helper, cobrinha_helper

from cobrinha_helper import evaluate_cobrinha

In [2]:
# Directory that holds the pickled tag-hierarchy artifacts. Set the
# TAG_HIERARCHY_PICKLE_ROOT environment variable to run the notebook on another
# machine; when it is unset, the original local path is used.
# os.path.join(..., "") guarantees a trailing separator, because file names are
# built by plain string concatenation (PICKLE_ROOT + "<name>.p").
PICKLE_ROOT = os.path.join(
    os.environ.get(
        "TAG_HIERARCHY_PICKLE_ROOT",
        "/media/felipe/SSD_VOLUME/auto-tagger/data/tag-hierarchy/",
    ),
    "",
)

In [3]:
# Load the pickled tag vocabulary. The context manager closes the file handle as
# soon as unpickling finishes instead of leaving it to the garbage collector.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open(PICKLE_ROOT + "tag_vocabulary.p", "rb") as pickle_file:
    tag_vocabulary = pickle.load(pickle_file)

In [4]:
# Load the pickled sorted pairwise-similarity dict. The context manager closes
# the file handle as soon as unpickling finishes.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open(PICKLE_ROOT + "sorted_pairwise_similarity_dict.p", "rb") as pickle_file:
    sorted_pairwise_similarity_dict = pickle.load(pickle_file)

In [5]:
# Load the pickled tag-vectors index. The context manager closes the file handle
# as soon as unpickling finishes.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open(PICKLE_ROOT + "tag_vectors_index.p", "rb") as pickle_file:
    tag_vectors_index = pickle.load(pickle_file)

In [6]:
# Load the pickled global-similarity index. The context manager closes the file
# handle as soon as unpickling finishes.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open(PICKLE_ROOT + "global_similarity_index.p", "rb") as pickle_file:
    global_similarity_index = pickle.load(pickle_file)

## each tag, with its most likely child/parent tags in its cobrinha

"parent" -> "child"

a leaf/terminal tag has no "child" tag
the root tag has no "parent" tag

In [7]:
# Load the pickled outgoing cobrinha factors. The context manager closes the
# file handle as soon as unpickling finishes. PICKLE_ROOT already ends with a
# separator, so the file name is appended without a leading "/".
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open(PICKLE_ROOT + "sorted_outgoing_cobrinha_factors_dict.p", "rb") as pickle_file:
    sorted_outgoing_cobrinha_factors_dict = pickle.load(pickle_file)

In [8]:
# Tag whose outgoing and incoming cobrinha factors are inspected.
TAG_NAME = "sql"

In [9]:
# Show the first 20 (description, factor) entries stored for TAG_NAME in the
# outgoing dict, i.e. pairs where TAG_NAME is on the left of "->".
sorted_outgoing_cobrinha_factors_dict[TAG_NAME][:20]

[('sql ->  left-join', 0.11577437145661604),
 ('sql ->  union', 0.10682778444440537),
 ('sql ->  inner-join', 0.10638646934945287),
 ('sql ->  distinct', 0.10072914563365006),
 ('sql ->  hql', 0.097797428955005053),
 ('sql ->  subquery', 0.097569194466405745),
 ('sql ->  sql-order-by', 0.095941251164230801),
 ('sql ->  case', 0.092984011162197469),
 ('sql ->  pivot', 0.089149057808170534),
 ('sql ->  data.table', 0.088978850192963643),
 ('sql ->  greatest-n-per-group', 0.0876375345161753),
 ('sql ->  sql-azure', 0.086058702325092967),
 ('sql ->  stored-procedures', 0.084834823617687599),
 ('sql ->  prepared-statement', 0.083812741599067681),
 ('sql ->  query-optimization', 0.083544261256610866),
 ('sql ->  innodb', 0.083249143747091137),
 ('sql ->  hive', 0.083010051724714304),
 ('sql ->  sparql', 0.081786956529279073),
 ('sql ->  spark-dataframe', 0.080874610418233836),
 ('sql ->  primary-key', 0.080668747788729053)]

In [10]:
# Load the pickled incoming cobrinha factors. The context manager closes the
# file handle as soon as unpickling finishes. PICKLE_ROOT already ends with a
# separator, so the file name is appended without a leading "/".
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open(PICKLE_ROOT + "sorted_incoming_cobrinha_factors_dict.p", "rb") as pickle_file:
    sorted_incoming_cobrinha_factors_dict = pickle.load(pickle_file)

In [11]:
# Show the first 20 (description, factor) entries stored for TAG_NAME in the
# incoming dict, i.e. pairs where TAG_NAME is on the right of "->".
sorted_incoming_cobrinha_factors_dict[TAG_NAME][:20]

[('c# ->  sql', 0.076017263109879704),
 ('.net ->  sql', 0.069735249488555889),
 ('performance ->  sql', 0.05062492185752912),
 ('web ->  sql', 0.048105766071413376),
 ('c#-4.0 ->  sql', 0.048010251944838801),
 ('design ->  sql', 0.043711189046683356),
 ('php ->  sql', 0.04118491960810703),
 ('asp.net ->  sql', 0.040728545186396403),
 ('java ->  sql', 0.040250902230947044),
 ('javascript ->  sql', 0.040198527752782404),
 ('database ->  sql', 0.039079570732830547),
 ('security ->  sql', 0.038636913519285138),
 ('optimization ->  sql', 0.038418706355791005),
 ('user-interface ->  sql', 0.035165200504195346),
 ('.net-4.0 ->  sql', 0.034749300219532715),
 ('html5 ->  sql', 0.033611038485945169),
 ('windows ->  sql', 0.032167628434307011),
 ('ios ->  sql', 0.031834341239257026),
 ('cocoa ->  sql', 0.030729127259777458),
 ('cocoa-touch ->  sql', 0.029636974571906265)]

If "sql" is a "good" parent tag, then the average of its OUTGOING cobrinha factors should be higher than the average of its INCOMING cobrinha factors, right?

In [16]:
# Compare the mean outgoing factor (TAG_NAME on the left of "->") with the mean
# incoming factor (TAG_NAME on the right of "->"). Only the numeric factor of
# each (description, factor) entry is needed, so the description string is no
# longer split into a variable that was never used.
outgoing_factors = [factor for _, factor in sorted_outgoing_cobrinha_factors_dict[TAG_NAME]]
incoming_factors = [factor for _, factor in sorted_incoming_cobrinha_factors_dict[TAG_NAME]]

print("average of outgoing factors: {} \naverage of incoming factors: {}".format(np.array(outgoing_factors).mean(),np.array(incoming_factors).mean()))

average of outgoing factors: 0.039994866739055904 
average of incoming factors: -0.039994866739055904


## Now let's try to find a triple: two tags B and C that have a good cobrinha factor, and a third tag A that has a good cobrinha factor with both tag B and tag C (but cobrinha(A,B) should probably be higher than cobrinha(A,C))


A -> B -> C

(TAG C is the most specific tag of the three)


In [None]:
# use a fixed example tag as C and echo it as the cell output

tag_C = "numpy"
tag_C

In [None]:
# Look up the global similarity value stored for tag_C.
global_similarity_index[tag_C]

In [None]:
# Display every incoming entry stored for tag_C (no slicing is applied here).
# NOTE(review): this can print a very long list; consider slicing, e.g. [:20].
sorted_incoming_cobrinha_factors_dict[tag_C]

### experimenting with different ways to calculate the cobrinha factor, using some known tags

In [None]:
# `sorted_similarity_dict` is never defined in this notebook; the pairwise
# similarity pickle is loaded under the name `sorted_pairwise_similarity_dict`,
# so that name is used here to keep the cell runnable on a fresh kernel.
# NOTE(review): assumes both names refer to the same data; confirm.
sorted_similarity_numpy = sorted_pairwise_similarity_dict['numpy']
sorted_similarity_numpy[:20]

In [None]:
# Look up the global similarity value stored for the 'numpy' tag.
global_similarity_index['numpy']

normalizing the global similarity factors to spread out the values

In [None]:
# Smallest and largest value found in the global similarity index.
min_global_similarity = np.array(list(global_similarity_index.values())).min()
max_global_similarity = np.array(list(global_similarity_index.values())).max()

(min_global_similarity, max_global_similarity)

In [None]:
# Min-max scale each global similarity value: (value - min) / (max - min).
normalized_global_similarity_index = {
    tag: (value - min_global_similarity) / (max_global_similarity - min_global_similarity)
    for tag, value in global_similarity_index.items()
}

In [None]:
# Sanity check: show the smallest normalized value.
np.array(list(normalized_global_similarity_index.values())).min()

In [None]:
# For each (tag, other_tag) pair listed in sorted_similarity_dict, compute a
# cobrinha factor and record it under both tags.
# NOTE(review): `sorted_similarity_dict`, `pairwise_similarity_index`,
# `outgoing_cobrinha_factors_dict` and `incoming_cobrinha_factors_dict` are not
# defined in any cell visible above; confirm where they come from before running
# on a fresh kernel. The `.append` calls on per-tag keys suggest the two result
# dicts are expected to be `collections.defaultdict(list)` (TODO confirm).
for (tag, similarities_to_other_tags) in tqdm(sorted_similarity_dict.items()):
    
    # similarity_to_other_tag is unpacked but not used below
    for (other_tag, similarity_to_other_tag) in similarities_to_other_tags:

        (avg_avg, mutual_similarity)= evaluate_cobrinha(tag,other_tag,tag_vectors_index,pairwise_similarity_index)
        
        # the factor is the product of the two values returned by evaluate_cobrinha
        cobrinha_factor = avg_avg * mutual_similarity
        pair = "{} ->  {}".format(tag,other_tag)    
        
        # the same (pair, factor) entry is stored as outgoing for `tag` and as incoming for `other_tag`
        outgoing_cobrinha_factors_dict[tag].append( (pair,cobrinha_factor))
        incoming_cobrinha_factors_dict[other_tag].append( (pair,cobrinha_factor))