In [1]:
import csv
import os
import pickle
import re
import sys
from collections import defaultdict

import numpy as np

from joblib import Parallel, delayed

from difflib import SequenceMatcher,get_close_matches

from scipy import spatial

from tqdm import *

# Make the sibling helpers/ directory importable; the membership check keeps
# sys.path free of duplicates when this cell is re-run.
module_path = os.path.abspath('../helpers/')
if module_path not in sys.path:
    sys.path.append(module_path)

# my stuff in the helpers/ directory
import embeddings_helper, files_helper, texts_helper, metrics_helper, tags_helper, cobrinha_helper

from cobrinha_helper import evaluate_cobrinha

In [2]:
# Directory holding the pickled tag-hierarchy artifacts loaded below.
# Override with the AUTO_TAGGER_PICKLE_ROOT environment variable on other
# machines; the default is the original local path.
# os.path.join(..., "") guarantees a trailing separator, because later cells
# build file names by plain string concatenation.
PICKLE_ROOT = os.path.join(
    os.environ.get(
        "AUTO_TAGGER_PICKLE_ROOT",
        "/media/felipe/SSD_VOLUME/auto-tagger/data/tag-hierarchy/",
    ),
    "",
)

In [4]:
# Load the pickled tag vocabulary; the context manager closes the file handle.
# NOTE: pickle.load can execute arbitrary code — only load files you produced yourself.
with open(os.path.join(PICKLE_ROOT, "tag_vocabulary.p"), "rb") as pickle_file:
    tag_vocabulary = pickle.load(pickle_file)

In [5]:
# Load the pickled per-tag pairwise similarities; the context manager closes the file handle.
# NOTE: pickle.load can execute arbitrary code — only load files you produced yourself.
with open(os.path.join(PICKLE_ROOT, "sorted_pairwise_similarity_dict.p"), "rb") as pickle_file:
    sorted_pairwise_similarity_dict = pickle.load(pickle_file)

In [6]:
# Load the pickled tag vectors index; the context manager closes the file handle.
# NOTE: pickle.load can execute arbitrary code — only load files you produced yourself.
with open(os.path.join(PICKLE_ROOT, "tag_vectors_index.p"), "rb") as pickle_file:
    tag_vectors_index = pickle.load(pickle_file)

In [7]:
# Load the pickled global similarity index; the context manager closes the file handle.
# NOTE: pickle.load can execute arbitrary code — only load files you produced yourself.
with open(os.path.join(PICKLE_ROOT, "global_similarity_index.p"), "rb") as pickle_file:
    global_similarity_index = pickle.load(pickle_file)

## each tag, with its most likely child/parent tags in its cobrinha

"parent" -> "child"

a leaf/terminal tag has no "child" tag
the root tag has no "parent" tag

In [8]:
# Load the pickled outgoing cobrinha factors. os.path.join avoids the doubled
# slash that PICKLE_ROOT + "/..." produced, and the context manager closes the file.
# NOTE: pickle.load can execute arbitrary code — only load files you produced yourself.
with open(os.path.join(PICKLE_ROOT, "sorted_outgoing_cobrinha_factors_dict.p"), "rb") as pickle_file:
    sorted_outgoing_cobrinha_factors_dict = pickle.load(pickle_file)

In [9]:
# Tag whose incoming/outgoing cobrinha factors are inspected in the next cells.
TAG_NAME = 'sql'

In [10]:
# Show the first 20 outgoing ("TAG_NAME -> other tag") cobrinha factors for TAG_NAME.
sorted_outgoing_cobrinha_factors_dict[TAG_NAME][:20]

[('sql ->  left-join', 0.11577437145661604),
 ('sql ->  union', 0.10682778444440537),
 ('sql ->  inner-join', 0.10638646934945287),
 ('sql ->  distinct', 0.10072914563365006),
 ('sql ->  hql', 0.097797428955005053),
 ('sql ->  subquery', 0.097569194466405745),
 ('sql ->  sql-order-by', 0.095941251164230801),
 ('sql ->  case', 0.092984011162197469),
 ('sql ->  pivot', 0.089149057808170534),
 ('sql ->  data.table', 0.088978850192963643),
 ('sql ->  greatest-n-per-group', 0.0876375345161753),
 ('sql ->  sql-azure', 0.086058702325092967),
 ('sql ->  stored-procedures', 0.084834823617687599),
 ('sql ->  prepared-statement', 0.083812741599067681),
 ('sql ->  query-optimization', 0.083544261256610866),
 ('sql ->  innodb', 0.083249143747091137),
 ('sql ->  hive', 0.083010051724714304),
 ('sql ->  sparql', 0.081786956529279073),
 ('sql ->  spark-dataframe', 0.080874610418233836),
 ('sql ->  primary-key', 0.080668747788729053)]

In [11]:
# Load the pickled incoming cobrinha factors. os.path.join avoids the doubled
# slash that PICKLE_ROOT + "/..." produced, and the context manager closes the file.
# NOTE: pickle.load can execute arbitrary code — only load files you produced yourself.
with open(os.path.join(PICKLE_ROOT, "sorted_incoming_cobrinha_factors_dict.p"), "rb") as pickle_file:
    sorted_incoming_cobrinha_factors_dict = pickle.load(pickle_file)

In [12]:
# Show the first 20 incoming ("other tag -> TAG_NAME") cobrinha factors for TAG_NAME.
sorted_incoming_cobrinha_factors_dict[TAG_NAME][:20]

[('c# ->  sql', 0.076017263109879704),
 ('.net ->  sql', 0.069735249488555889),
 ('performance ->  sql', 0.05062492185752912),
 ('web ->  sql', 0.048105766071413376),
 ('c#-4.0 ->  sql', 0.048010251944838801),
 ('design ->  sql', 0.043711189046683356),
 ('php ->  sql', 0.04118491960810703),
 ('asp.net ->  sql', 0.040728545186396403),
 ('java ->  sql', 0.040250902230947044),
 ('javascript ->  sql', 0.040198527752782404),
 ('database ->  sql', 0.039079570732830547),
 ('security ->  sql', 0.038636913519285138),
 ('optimization ->  sql', 0.038418706355791005),
 ('user-interface ->  sql', 0.035165200504195346),
 ('.net-4.0 ->  sql', 0.034749300219532715),
 ('html5 ->  sql', 0.033611038485945169),
 ('windows ->  sql', 0.032167628434307011),
 ('ios ->  sql', 0.031834341239257026),
 ('cocoa ->  sql', 0.030729127259777458),
 ('cocoa-touch ->  sql', 0.029636974571906265)]

if "sql" is a "good" parent tag, then the average of its OUTGOING cobrinha factors should be higher than the average of its INCOMING cobrinha factors, right?

In [13]:
# Compare the mean outgoing and mean incoming cobrinha factor of TAG_NAME.
# Each entry is a (description, factor) pair; only the factor is needed here,
# so the description is ignored instead of being parsed into an unused variable.
outgoing_factors = [factor for _descr, factor in sorted_outgoing_cobrinha_factors_dict[TAG_NAME]]
incoming_factors = [factor for _descr, factor in sorted_incoming_cobrinha_factors_dict[TAG_NAME]]

print("average of outgoing factors: {} \naverage of incoming factors: {}".format(
    np.array(outgoing_factors).mean(),
    np.array(incoming_factors).mean(),
))

average of outgoing factors: 0.039994866739055904 
average of incoming factors: -0.039994866739055904


## now let's try to find a triple. two tags B and C that have good cobrinha factor and a third tag A that has good cobrinha factor with tag B and tag C (but cobrinha(A,B) should probably be higher than cobrinha (A,C))


A -> B -> C

(TAG C is the most specific tag of the three)


### let's experiment first

In [31]:
# This result is not very good: numpy is a subset of scipy, so scipy should,
# in theory, be the more general tag of the two.

# Possible explanation (TODO confirm): numpy tags are much more common than
# scipy tags, which may cause numpy-tagged documents to have more words in general.

global_similarity_index['scipy'],global_similarity_index['numpy']

(0.26037167905094943, 0.28028507548017551)

it's bad that 'c#' and things like '.net' and 'web' are at the top **but** 'python' and 'performance' 
and 'optimization' and 'python-2.7' at the top are indicative that something is correct

In [33]:
# Show the first 20 incoming ("other tag -> numpy") cobrinha factors.
sorted_incoming_cobrinha_factors_dict['numpy'][:20]

[('c# ->  numpy', 0.11162422141423706),
 ('python ->  numpy', 0.10862859843715135),
 ('.net ->  numpy', 0.10828346874922866),
 ('performance ->  numpy', 0.1071618732484329),
 ('optimization ->  numpy', 0.099375054051086334),
 ('python-2.7 ->  numpy', 0.092768478809388349),
 ('web ->  numpy', 0.091382434149046135),
 ('design ->  numpy', 0.090448274587472399),
 ('php ->  numpy', 0.089974847089701049),
 ('javascript ->  numpy', 0.088386052013526492),
 ('c#-4.0 ->  numpy', 0.088379977466540999),
 ('r ->  numpy', 0.087842816623927386),
 ('java ->  numpy', 0.086332934658997787),
 ('user-interface ->  numpy', 0.08467547164562525),
 ('security ->  numpy', 0.083996403208891499),
 ('math ->  numpy', 0.081984727986086997),
 ('windows ->  numpy', 0.081096259401745044),
 ('html5 ->  numpy', 0.080889803940833491),
 ('cocoa ->  numpy', 0.080830250454214173),
 ('asp.net ->  numpy', 0.080570333051964274)]

In [34]:
# Show the first 20 incoming ("other tag -> scipy") cobrinha factors.
sorted_incoming_cobrinha_factors_dict['scipy'][:20]

[('python ->  scipy', 0.11491094022020819),
 ('c# ->  scipy', 0.11043251279541243),
 ('.net ->  scipy', 0.10880932146609334),
 ('performance ->  scipy', 0.10348102320358005),
 ('optimization ->  scipy', 0.10156871215722899),
 ('python-2.7 ->  scipy', 0.10125245870618636),
 ('web ->  scipy', 0.094734034192475086),
 ('design ->  scipy', 0.091747143068092965),
 ('r ->  scipy', 0.089216383765767279),
 ('javascript ->  scipy', 0.088538077031058135),
 ('c#-4.0 ->  scipy', 0.088373332626036125),
 ('security ->  scipy', 0.087370145250604575),
 ('java ->  scipy', 0.087123272339262967),
 ('user-interface ->  scipy', 0.086999546988538054),
 ('php ->  scipy', 0.086985520182252724),
 ('windows ->  scipy', 0.086524408252761809),
 ('python-3.x ->  scipy', 0.085846739655010496),
 ('linux ->  scipy', 0.085010985009069145),
 ('math ->  scipy', 0.083425033099711263),
 ('asp.net ->  scipy', 0.082494122974798162)]

In [30]:
# tag_C was not defined in any remaining cell, so this cell failed on a fresh
# kernel; the recorded output below is for 'scipy', so it is set explicitly here.
tag_C = 'scipy'
sorted_outgoing_cobrinha_factors_dict[tag_C]

[('scipy ->  theano', 0.027203280894129649),
 ('scipy ->  virtualenv', 0.02566795131270112),
 ('scipy ->  pip', 0.017732215805306054),
 ('scipy ->  pygame', 0.016871348260870649),
 ('scipy ->  nltk', 0.015423768128593817),
 ('scipy ->  mechanize', 0.014764132707962271),
 ('scipy ->  twisted', 0.014496057132803636),
 ('scipy ->  urllib2', 0.014310340078749892),
 ('scipy ->  gnuplot', 0.013901513490291397),
 ('scipy ->  nsdateformatter', 0.013883133440107923),
 ('scipy ->  scrapy', 0.013324713594493384),
 ('scipy ->  log4j', 0.013192715301918147),
 ('scipy ->  kivy', 0.013131996369553253),
 ('scipy ->  gulp', 0.013001027234831515),
 ('scipy ->  sftp', 0.012964230620984162),
 ('scipy ->  wxpython', 0.012902549653441203),
 ('scipy ->  cocoapods', 0.012826433705457158),
 ('scipy ->  fancybox', 0.012817051127308009),
 ('scipy ->  paypal-sandbox', 0.012741629662739836),
 ('scipy ->  gitlab', 0.012456007146906409),
 ('scipy ->  travis-ci', 0.012389281809148096),
 ('scipy ->  hbase', 0.01202084

### experimenting with different ways to calculate the cobrinha factor, using some known tags

note how raw 'pairwise similarity' (i.e. average of documents) expresses itself in all sorts of different *types* of relationships, such as:

- 'numpy' **IS A** 'scipy','python', ETC
- 'numpy' **USES** 'multidimensional-array', 'loops', ETC
- 'numpy' **CO-OCCURS WITH** 'matplotlib', 'indexing', 'statistics', 'machine-learning' ETC

In [26]:
# Pairwise similarities stored for 'numpy'; display the first 30 (tag, similarity) entries.
sorted_similarity_numpy = sorted_pairwise_similarity_dict['numpy']
sorted_similarity_numpy[:30]

[('scipy', 0.71220018976303312),
 ('python', 0.57821642530026207),
 ('arrays', 0.56707206244731345),
 ('python-2.7', 0.55565578746729072),
 ('python-3.x', 0.53378160036961086),
 ('multidimensional-array', 0.52358685958739304),
 ('performance', 0.50988691055663016),
 ('optimization', 0.50636346924187403),
 ('scikit-learn', 0.50269673341875387),
 ('iteration', 0.49489882739209157),
 ('r', 0.49125292152372624),
 ('algorithm', 0.48620205540964845),
 ('for-loop', 0.47844985879878799),
 ('machine-learning', 0.4777108537821213),
 ('math', 0.47610187885126209),
 ('loops', 0.47416232150673188),
 ('statistics', 0.46123269280994039),
 ('fft', 0.46111290957137097),
 ('matplotlib', 0.45871371350908507),
 ('data-structures', 0.45830745382589022),
 ('php', 0.45112764540868511),
 ('c#', 0.45069846858088591),
 ('.net', 0.44729573341793327),
 ('sorting', 0.44226460480910779),
 ('pandas', 0.44132086307074725),
 ('indexing', 0.44045952247265951),
 ('function', 0.43935526182288787),
 ('language-agnostic', 

In [21]:
# Global similarity value stored for 'numpy', before the normalization done in the following cells.
global_similarity_index['numpy']

0.28028507548017551

normalizing the global similarity factors to spread out the values

In [22]:
# Gather every tag's global similarity once, then take both extremes from the
# same array; they are the bounds used for min-max scaling.
global_similarity_values = np.array(list(global_similarity_index.values()))
min_global_similarity = global_similarity_values.min()
max_global_similarity = global_similarity_values.max()

min_global_similarity, max_global_similarity

(0.13690997865007154, 0.52795499262682699)

In [23]:
# Min-max scale every tag's global similarity:
# (value - min) / (max - min), with the bounds computed over all tags.
similarity_span = max_global_similarity - min_global_similarity

normalized_global_similarity_index = {
    tag: (value - min_global_similarity) / similarity_span
    for tag, value in global_similarity_index.items()
}

In [None]:
# Compute a cobrinha factor for every (tag, other_tag) pair and index it twice:
# under the source tag (outgoing) and under the target tag (incoming).
# Both indexes start empty here so the cell runs on a fresh kernel and
# re-running it does not append duplicate entries.
outgoing_cobrinha_factors_dict = defaultdict(list)
incoming_cobrinha_factors_dict = defaultdict(list)

# The original iterated over `sorted_similarity_dict`, which no cell defines;
# the loaded mapping of tag -> [(other_tag, similarity), ...] is
# `sorted_pairwise_similarity_dict`.
# NOTE(review): `pairwise_similarity_index` is not defined in any visible cell —
# confirm which object evaluate_cobrinha expects and load it before running this.
for (tag, similarities_to_other_tags) in tqdm(sorted_pairwise_similarity_dict.items()):

    # the stored similarity value itself is not used, only the tag pairing
    for (other_tag, _similarity_to_other_tag) in similarities_to_other_tags:

        (avg_avg, mutual_similarity) = evaluate_cobrinha(tag, other_tag, tag_vectors_index, pairwise_similarity_index)

        cobrinha_factor = avg_avg * mutual_similarity
        pair = "{} ->  {}".format(tag, other_tag)

        outgoing_cobrinha_factors_dict[tag].append((pair, cobrinha_factor))
        incoming_cobrinha_factors_dict[other_tag].append((pair, cobrinha_factor))