In [4]:
import csv
import os
import pickle
import re
import sys
import numpy as np

from joblib import Parallel, delayed

from difflib import SequenceMatcher,get_close_matches

from scipy import spatial

from tqdm import *

# Make the sibling helpers/ directory importable. The path is resolved against the
# current working directory, so the notebook must be started from its own folder.
module_path = os.path.abspath('../helpers/')
if module_path not in sys.path:
    sys.path.append(module_path)

# my stuff in the helpers/ directory
import embeddings_helper, files_helper, texts_helper, metrics_helper, tags_helper, cobrinha_helper

from cobrinha_helper import get_metrics_for_tag_pair

In [5]:
# Directory holding the pre-computed tag-hierarchy pickles loaded in the cells below.
# Set the AUTO_TAGGER_PICKLE_ROOT environment variable to use a different location;
# the default is the original author's local path.
# os.path.join(..., "") guarantees a trailing separator, because later cells build
# file paths by plain string concatenation.
# NOTE: these files are read with pickle.load, which can execute arbitrary code --
# only point this at data you trust.
PICKLE_ROOT = os.path.join(
    os.environ.get(
        "AUTO_TAGGER_PICKLE_ROOT",
        "/media/felipe/SSD_VOLUME/auto-tagger/data/tag-hierarchy/",
    ),
    "",
)

In [6]:
# Load the tag vocabulary; the context manager closes the file handle deterministically.
with open(os.path.join(PICKLE_ROOT, "tag_vocabulary.p"), "rb") as pickle_file:
    tag_vocabulary = pickle.load(pickle_file)

In [7]:
# Load the per-tag pairwise-similarity rankings; the context manager closes the file handle.
with open(os.path.join(PICKLE_ROOT, "sorted_pairwise_similarity_dict.p"), "rb") as pickle_file:
    sorted_pairwise_similarity_dict = pickle.load(pickle_file)

In [8]:
# Load the tag-vector index; the context manager closes the file handle.
with open(os.path.join(PICKLE_ROOT, "tag_vectors_index.p"), "rb") as pickle_file:
    tag_vectors_index = pickle.load(pickle_file)

In [9]:
# Load the per-tag global-similarity scores; the context manager closes the file handle.
with open(os.path.join(PICKLE_ROOT, "global_similarity_index.p"), "rb") as pickle_file:
    global_similarity_index = pickle.load(pickle_file)

## each tag, with its most likely child/parent tags in its cobrinha

"parent" -> "child"

a leaf/terminal tag has no "child" tag
the root tag has no "parent" tag

In [10]:
# Load the outgoing ("tag -> child") cobrinha factors, sorted per tag.
# os.path.join avoids the doubled "/" the old string concatenation produced, and the
# context manager closes the file handle.
with open(os.path.join(PICKLE_ROOT, "sorted_outgoing_cobrinha_factors_dict.p"), "rb") as pickle_file:
    sorted_outgoing_cobrinha_factors_dict = pickle.load(pickle_file)

In [11]:
# Tag whose outgoing/incoming cobrinha factors are inspected in the next cells.
TAG_NAME = "sql"

In [12]:
sorted_outgoing_cobrinha_factors_dict[TAG_NAME][:20]

[('sql ->  hive', 0.10276002713402385),
 ('sql ->  sqlalchemy', 0.094713737813830662),
 ('sql ->  stored-procedures', 0.093697645399657289),
 ('sql ->  pdo', 0.092448899800979589),
 ('sql ->  cassandra', 0.081286907002972864),
 ('sql ->  triggers', 0.080766614926709529),
 ('sql ->  group-by', 0.079990703914387812),
 ('sql ->  solr', 0.079526179575641884),
 ('sql ->  crystal-reports', 0.079489616749060299),
 ('sql ->  join', 0.079430678775405286),
 ('sql ->  jqgrid', 0.079298781585696851),
 ('sql ->  pandas', 0.075814129772925537),
 ('sql ->  datepicker', 0.075305507452054235),
 ('sql ->  apache-spark', 0.074622211190207607),
 ('sql ->  django-models', 0.074128399924297636),
 ('sql ->  date', 0.074089251831816569),
 ('sql ->  doctrine2', 0.073740729782105288),
 ('sql ->  redis', 0.072463749411050854),
 ('sql ->  awk', 0.071576014960321135),
 ('sql ->  mongoose', 0.071425372118284536)]

In [13]:
# Load the incoming ("parent -> tag") cobrinha factors, sorted per tag.
# os.path.join avoids the doubled "/" the old string concatenation produced, and the
# context manager closes the file handle.
with open(os.path.join(PICKLE_ROOT, "sorted_incoming_cobrinha_factors_dict.p"), "rb") as pickle_file:
    sorted_incoming_cobrinha_factors_dict = pickle.load(pickle_file)

In [14]:
sorted_incoming_cobrinha_factors_dict[TAG_NAME][:20]

[('c# ->  sql', 0.085390300456521517),
 ('coldfusion ->  sql', 0.080063227078340182),
 ('.net ->  sql', 0.079232246702481607),
 ('c#-4.0 ->  sql', 0.064849222935643103),
 ('web ->  sql', 0.059771984718923983),
 ('performance ->  sql', 0.058155180925289235),
 ('design ->  sql', 0.056220383191762383),
 ('optimization ->  sql', 0.053104891599728074),
 ('mfc ->  sql', 0.050435049802104677),
 ('php ->  sql', 0.047990328803707426),
 ('.net-4.0 ->  sql', 0.046292334747653791),
 ('javascript ->  sql', 0.045768591706195058),
 ('java ->  sql', 0.045135552585241989),
 ('web-applications ->  sql', 0.044673000616481426),
 ('security ->  sql', 0.044533369686082541),
 ('asp.net ->  sql', 0.044172766699079873),
 ('database ->  sql', 0.043839806447319593),
 ('user-interface ->  sql', 0.043635938933251225),
 ('cocoa ->  sql', 0.042109292814787716),
 ('dynamic ->  sql', 0.038831760049065585)]

if "sql" is a "good" parent tag, then the average of all its OUTGOING cobrinha factors should be higher than the average of all its INCOMING cobrinha factors, right?

In [15]:
# Compare the mean outgoing vs. mean incoming cobrinha factor for TAG_NAME.
# Each entry is a ("parent ->  child", factor) tuple; only the factor is needed here,
# so the pair description is not parsed.
outgoing_factors = [factor for _descr, factor in sorted_outgoing_cobrinha_factors_dict[TAG_NAME]]
incoming_factors = [factor for _descr, factor in sorted_incoming_cobrinha_factors_dict[TAG_NAME]]

print("average of outgoing factors: {} \naverage of incoming factors: {}".format(
    np.array(outgoing_factors).mean(),
    np.array(incoming_factors).mean(),
))

average of outgoing factors: 0.02967704688622151 
average of incoming factors: -0.02967704688622152


## now let's try to find a triple: two tags B and C that have a good cobrinha factor, and a third tag A that has a good cobrinha factor with both tag B and tag C (but cobrinha(A,B) should probably be higher than cobrinha(A,C))


A -> B -> C

(TAG C is the most specific tag of the three)


### let's experiment first

In [16]:
# Compare the global similarity of 'scipy' and 'numpy'.
#
# The result is not very good: numpy is (conceptually) a subset of scipy, so scipy
# should, in theory, be the more general of the two tags -- yet numpy gets the
# higher score.
#
# Hypothesis: numpy tags are much more common than scipy tags, which may cause
# numpy-tagged documents to contain more words, in general, than scipy-tagged ones.
# TODO confirm.

global_similarity_index['scipy'],global_similarity_index['numpy']

(0.32109000056951026, 0.34095440824790835)

it's bad that 'c#' and things like '.net' and 'web' are at the top, **but** seeing 'python', 'performance',
'optimization' and 'python-2.7' near the top indicates that something is working correctly

In [17]:
sorted_incoming_cobrinha_factors_dict['numpy'][:20]

[('c# ->  numpy', 0.12847555317981488),
 ('optimization ->  numpy', 0.12641248779887426),
 ('coldfusion ->  numpy', 0.1258992496446853),
 ('.net ->  numpy', 0.12541055144850613),
 ('python ->  numpy', 0.12346404854110155),
 ('performance ->  numpy', 0.12027819824676393),
 ('design ->  numpy', 0.11034647441294521),
 ('web ->  numpy', 0.11023087644509201),
 ('c#-4.0 ->  numpy', 0.10882455977295125),
 ('mfc ->  numpy', 0.10766332411621088),
 ('php ->  numpy', 0.10419585056303851),
 ('python-2.7 ->  numpy', 0.10384015079829514),
 ('r ->  numpy', 0.10352278112127131),
 ('math ->  numpy', 0.10317520640333971),
 ('javascript ->  numpy', 0.10223946431537063),
 ('user-interface ->  numpy', 0.10207049856573772),
 ('cocoa ->  numpy', 0.10055989159272466),
 ('machine-learning ->  numpy', 0.099747408944912005),
 ('java ->  numpy', 0.099275438663545088),
 ('.net-4.0 ->  numpy', 0.099070839805774052)]

In [20]:
sorted_incoming_cobrinha_factors_dict['scipy'][:20]

[('python ->  scipy', 0.12893465769770013),
 ('c# ->  scipy', 0.12892766907173855),
 ('optimization ->  scipy', 0.12870550711244355),
 ('coldfusion ->  scipy', 0.12825639455165852),
 ('.net ->  scipy', 0.12761206184327517),
 ('performance ->  scipy', 0.11919865762374271),
 ('web ->  scipy', 0.11456223962274026),
 ('design ->  scipy', 0.11315218506253372),
 ('python-2.7 ->  scipy', 0.11178686640977074),
 ('mfc ->  scipy', 0.11055195097575705),
 ('c#-4.0 ->  scipy', 0.10950663709182108),
 ('machine-learning ->  scipy', 0.10706595675074412),
 ('r ->  scipy', 0.10581884293358823),
 ('user-interface ->  scipy', 0.10542317942255876),
 ('math ->  scipy', 0.10541333669928703),
 ('javascript ->  scipy', 0.10429025460960396),
 ('python-3.x ->  scipy', 0.10299121453902661),
 ('cocoa ->  scipy', 0.10278439830524326),
 ('.net-4.0 ->  scipy', 0.10275653361422707),
 ('php ->  scipy', 0.10269273593356935)]

In [21]:
sorted_outgoing_cobrinha_factors_dict['numpy'][:20]

[('numpy ->  tensorflow', 0.022369597493423075),
 ('numpy ->  matplotlib', 0.021948282995422543),
 ('numpy ->  gulp', 0.021270398513764551),
 ('numpy ->  hive', 0.019474775332498966),
 ('numpy ->  npm', 0.018568786468442251),
 ('numpy ->  vagrant', 0.018468191912356609),
 ('numpy ->  webpack', 0.017774954927796193),
 ('numpy ->  tkinter', 0.017686390666989415),
 ('numpy ->  docker', 0.017484463249672122),
 ('numpy ->  socket.io', 0.017377061233184218),
 ('numpy ->  jqgrid', 0.016997906178795187),
 ('numpy ->  recyclerview', 0.016566225367826561),
 ('numpy ->  boost', 0.015999526703962708),
 ('numpy ->  youtube', 0.015119115654782538),
 ('numpy ->  scipy', 0.014659031564540129),
 ('numpy ->  datepicker', 0.0145125603891015),
 ('numpy ->  ffmpeg', 0.014509324864142855),
 ('numpy ->  paypal', 0.014508934403959053),
 ('numpy ->  solr', 0.014279051985933293),
 ('numpy ->  redis', 0.014235484849011127)]

In [22]:
sorted_outgoing_cobrinha_factors_dict['scipy'][:20]

[('scipy ->  gulp', 0.018323301785808675),
 ('scipy ->  npm', 0.014958369932752246),
 ('scipy ->  vagrant', 0.014657292087827949),
 ('scipy ->  hive', 0.014356348399593688),
 ('scipy ->  webpack', 0.014184347483725402),
 ('scipy ->  tensorflow', 0.014150049365652718),
 ('scipy ->  socket.io', 0.014134085854913126),
 ('scipy ->  docker', 0.013331978921138609),
 ('scipy ->  recyclerview', 0.012377320575864787),
 ('scipy ->  tkinter', 0.012075900007056098),
 ('scipy ->  jqgrid', 0.011706003343797556),
 ('scipy ->  matplotlib', 0.010907777015241166),
 ('scipy ->  boost', 0.010845701599994076),
 ('scipy ->  youtube', 0.010730472233130358),
 ('scipy ->  paypal', 0.010091584479948406),
 ('scipy ->  datepicker', 0.010012571438284112),
 ('scipy ->  gruntjs', 0.009896958241459277),
 ('scipy ->  ffmpeg', 0.009544040491082156),
 ('scipy ->  redis', 0.0093120541188558264),
 ('scipy ->  solr', 0.0092756884127813896)]

### let's try some triples, starting with tags very low in the hierarchy (i.e. low generality, high specificity)

In [27]:
# The ten tags with the lowest global similarity, in ascending order of score.
sorted(
    global_similarity_index.items(),
    key=lambda tag_and_score: tag_and_score[1],
)[:10]

[('gulp', 0.19766495986060426),
 ('vagrant', 0.23857715599421261),
 ('socket.io', 0.24158315695390806),
 ('recyclerview', 0.24221057782043745),
 ('npm', 0.24231542434386466),
 ('hive', 0.24605001039331367),
 ('webpack', 0.2493951735363871),
 ('docker', 0.25095802293264158),
 ('jqgrid', 0.2598639010154486),
 ('youtube', 0.26563900160188986)]

**gulp**

In [34]:
sorted_incoming_cobrinha_factors_dict['gulp'][:20]

[('coffeescript ->  gulp', 0.12777525041758539),
 ('javascript ->  gulp', 0.11807356807143261),
 ('c# ->  gulp', 0.11324329677604095),
 ('.net ->  gulp', 0.11266161915520347),
 ('web ->  gulp', 0.11175254915893425),
 ('coldfusion ->  gulp', 0.11138210259907434),
 ('node.js ->  gulp', 0.10423616618758133),
 ('c#-4.0 ->  gulp', 0.10114252641436468),
 ('windows ->  gulp', 0.099776613691774652),
 ('web-applications ->  gulp', 0.099295068531828637),
 ('linux ->  gulp', 0.09901007576224076),
 ('.net-4.0 ->  gulp', 0.098948597076992079),
 ('performance ->  gulp', 0.096398289357611411),
 ('html5 ->  gulp', 0.096162717669970282),
 ('optimization ->  gulp', 0.095342993218827846),
 ('mfc ->  gulp', 0.095089416595493761),
 ('php ->  gulp', 0.09497876192022589),
 ('security ->  gulp', 0.094552945362584975),
 ('design ->  gulp', 0.094352021696298624),
 ('configuration ->  gulp', 0.094005409555496688)]

In [33]:
sorted_incoming_cobrinha_factors_dict['coffeescript'][:20]

[('c# ->  coffeescript', 0.045593817359619523),
 ('.net ->  coffeescript', 0.040580788063916962),
 ('coldfusion ->  coffeescript', 0.035490180324854327),
 ('web ->  coffeescript', 0.02413187449524163),
 ('c#-4.0 ->  coffeescript', 0.02275534878679698),
 ('design ->  coffeescript', 0.016268077998788971),
 ('mfc ->  coffeescript', 0.01360654029285167),
 ('performance ->  coffeescript', 0.012742745591927488),
 ('javascript ->  coffeescript', 0.0096436547762082676),
 ('optimization ->  coffeescript', 0.0089898445247370874),
 ('.net-4.0 ->  coffeescript', 0.0083501564874030546),
 ('java ->  coffeescript', 0.007745308565757854),
 ('web-applications ->  coffeescript', 0.0068758764929214844),
 ('user-interface ->  coffeescript', 0.0065735452239365005),
 ('security ->  coffeescript', 0.0055238777509721828),
 ('php ->  coffeescript', 0.004772964972236466),
 ('cocoa ->  coffeescript', 0.0044496078473494141),
 ('asp.net ->  coffeescript', 0.0041192880208538634),
 ('winforms ->  coffeescript', -0.0

so we get **"c#" -> "coffeescript" -> "gulp"**, which is clearly not good

**hive**

In [32]:
sorted_incoming_cobrinha_factors_dict['hive'][:10]

[('coldfusion ->  hive', 0.13180310706480242),
 ('c# ->  hive', 0.12755044935404264),
 ('.net ->  hive', 0.12322638364537729),
 ('c#-4.0 ->  hive', 0.11654178507327721),
 ('performance ->  hive', 0.11557675787626696),
 ('java ->  hive', 0.11522054427271003),
 ('database ->  hive', 0.11456189832666856),
 ('optimization ->  hive', 0.11197134972375021),
 ('web ->  hive', 0.11017933361388417),
 ('design ->  hive', 0.10685465890596936)]

In [36]:
sorted_incoming_cobrinha_factors_dict['coldfusion'][:10]

[('c# ->  coldfusion', 0.011171848427792159),
 ('.net ->  coldfusion', 0.006262291713491637),
 ('web ->  coldfusion', -0.011801265280286287),
 ('c#-4.0 ->  coldfusion', -0.011813813693388979),
 ('design ->  coldfusion', -0.01902585119813301),
 ('mfc ->  coldfusion', -0.021781790262029428),
 ('performance ->  coldfusion', -0.02261719702195528),
 ('optimization ->  coldfusion', -0.026487990456318625),
 ('java ->  coldfusion', -0.026575144550786828),
 ('.net-4.0 ->  coldfusion', -0.027225658307209438)]

so we get **"c#" -> "coldfusion" -> "hive"**, which is also clearly not good

**vagrant**

In [37]:
sorted_incoming_cobrinha_factors_dict['vagrant'][:10]

[('.net ->  vagrant', 0.12755478719109667),
 ('coldfusion ->  vagrant', 0.12502620326175895),
 ('c# ->  vagrant', 0.12464896258138368),
 ('web ->  vagrant', 0.11825343334156184),
 ('linux ->  vagrant', 0.11712452451458935),
 ('windows ->  vagrant', 0.11454400943890299),
 ('configuration ->  vagrant', 0.11413024840816846),
 ('security ->  vagrant', 0.11014917905957004),
 ('c#-4.0 ->  vagrant', 0.10978488445558492),
 ('web-applications ->  vagrant', 0.10948348634312907)]

In [38]:
sorted_incoming_cobrinha_factors_dict['.net'][:10]

[('c# ->  .net', 0.0055145420529605438),
 ('coldfusion ->  .net', -0.006262291713491637),
 ('web ->  .net', -0.017830226180964654),
 ('c#-4.0 ->  .net', -0.019914010104834724),
 ('design ->  .net', -0.026139008135239228),
 ('performance ->  .net', -0.029170595040097506),
 ('mfc ->  .net', -0.029271747994794164),
 ('javascript ->  .net', -0.032096981664867873),
 ('optimization ->  .net', -0.032459393298804769),
 ('java ->  .net', -0.034343002431481806)]

so we get **"c#" -> ".net" -> "vagrant"**, which is also clearly not good

> i think the lesson here is that tags that are very common like "c#" and ".net" are dominating the effect here, although it's nice to see more reasonable parent tags a bit down the ranking..

> maybe we need to tweak the weighting so that tag similarity has more say than generality.

### experimenting with different ways to calculate the cobrinha factor, using some known tags

> try to make the results in the examples above work better 


it's interesting to see how raw 'pairwise similarity' (i.e. average of documents) expresses itself in all sorts of different *types* of relationships, such as:

- 'numpy' **IS A** 'scipy','python', ETC
- 'numpy' **USES** 'multidimensional-array', 'loops', ETC
- 'numpy' **CO-OCCURS WITH** 'matplotlib', 'indexing', 'statistics', 'machine-learning' ETC

In [41]:
sorted_similarity_numpy = sorted_pairwise_similarity_dict['numpy']
sorted_similarity_numpy[:20]

[('scipy', 0.73795462728452554),
 ('python', 0.6051378355446887),
 ('python-2.7', 0.58103762068282272),
 ('python-3.x', 0.57065000547652256),
 ('arrays', 0.56649383192167813),
 ('optimization', 0.5549155026628283),
 ('multidimensional-array', 0.53539223899624244),
 ('machine-learning', 0.52378698228612841),
 ('math', 0.52241755901039955),
 ('r', 0.52006878905730758),
 ('performance', 0.51645088293576003),
 ('algorithm', 0.50774531441482151),
 ('loops', 0.50507150303237203),
 ('for-loop', 0.50342738124795994),
 ('matplotlib', 0.4972864599245761),
 ('data-structures', 0.4912846772342272),
 ('coldfusion', 0.48351401853152909),
 ('c#', 0.47002047080549647),
 ('php', 0.46929878911534928),
 ('.net', 0.46862605478970987)]

In [42]:
global_similarity_index['numpy']

0.34095440824790835

normalizing the global similarity factors to spread out the values

In [None]:
# Extremes of the global-similarity scores, used below for min-max normalization.
# The scores are collected into one array so both extremes come from the same
# snapshot and the dict is only walked once.
global_similarity_values = np.array(list(global_similarity_index.values()))
min_global_similarity = global_similarity_values.min()
max_global_similarity = global_similarity_values.max()

min_global_similarity,max_global_similarity

In [None]:
# Min-max scale every tag's global similarity so the lowest score maps to 0 and the
# highest to 1, spreading out values that are otherwise bunched together.
global_similarity_range = max_global_similarity - min_global_similarity

# With identical scores for every tag the scaling is undefined (division by zero);
# fail loudly instead of silently filling the index with NaN.
if global_similarity_range == 0:
    raise ValueError("cannot normalize global similarity: all tags have the same value")

normalized_global_similarity_index = {
    tag: (value - min_global_similarity) / global_similarity_range
    for tag, value in global_similarity_index.items()
}

In [None]:
# Recompute a cobrinha factor for every (tag, other_tag) pair and record it twice:
# under `tag` as an outgoing factor and under `other_tag` as an incoming factor.
#
# NOTE(review): this cell has no execution count, and several names it uses are not
# defined anywhere in the visible notebook:
#   - `sorted_similarity_dict` (the dict loaded above is `sorted_pairwise_similarity_dict`)
#   - `evaluate_cobrinha` (the only helper imported by name is `get_metrics_for_tag_pair`)
#   - `pairwise_similarity_index`
#   - `outgoing_cobrinha_factors_dict` / `incoming_cobrinha_factors_dict`, which must
#     already hold a list for every key (e.g. a defaultdict(list)) for `.append` to work
# Confirm where these come from before running the cell.
for (tag, similarities_to_other_tags) in tqdm(sorted_similarity_dict.items()):
    
    # `similarity_to_other_tag` is unpacked but not used in this loop body
    for (other_tag, similarity_to_other_tag) in similarities_to_other_tags:

        (avg_avg, mutual_similarity)= evaluate_cobrinha(tag,other_tag,tag_vectors_index,pairwise_similarity_index)
        
        # the factor is the product of the two values returned by the helper
        cobrinha_factor = avg_avg * mutual_similarity
        # same "parent ->  child" label format (two spaces after the arrow) as the pickled dicts shown above
        pair = "{} ->  {}".format(tag,other_tag)    
        
        outgoing_cobrinha_factors_dict[tag].append( (pair,cobrinha_factor))
        incoming_cobrinha_factors_dict[other_tag].append( (pair,cobrinha_factor))