## Relational Embedding

In [35]:
# Enable IPython's autoreload extension so that edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules before executing the code in each cell.
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [36]:
import word2vec
import itertools
import numpy as np
from relational_embedder import composition
from data_prep import data_prep_utils as dpu
from scipy.spatial.distance import cosine

In [37]:
# Load the binary word2vec vectors from disk into `model`; every query cell
# below reads from this object.
# NOTE(review): absolute, machine-specific path. Judging by the file name the
# vectors were presumably trained on the Se_person and Drupal employee
# relations -- confirm, and consider a configurable data directory.
model = word2vec.load("/home/ubuntu/word2vec_c/data/seperson_drupalemployee.bin")

In [38]:
# Ask the model for the 20 entries that rank closest to the token 'kimball'.
# The call returns two values: vocabulary indexes and their matching scores.
indexes, metrics = model.cosine('kimball', n=20)

In [39]:
# Turn the (indexes, metrics) pair into a readable response and convert it to
# a plain Python list so it displays cleanly.
res = model.generate_response(indexes, metrics).tolist()

In [40]:
# Display the result list as the cell output.
res

[('kimball_richard_w', 0.7698834167080255),
 ('968548423', 0.7293372253649367),
 ('hicklin2', 0.7022531778883037),
 ('3-269', 0.6828941371679911),
 ('6172539707', 0.6112949479789522),
 ('poet', 0.5927492963115722),
 ('mclellan', 0.5882572489910447),
 ('mcam', 0.5664702002297626),
 ('7819816361', 0.5662322099605901),
 ('926279941', 0.5615623705031092),
 ('910006165', 0.5603171306918009),
 ('david_southard@ll_mit_edu', 0.5584729825706068),
 ('kimball_robert_p', 0.5427674581903972),
 ('900052036', 0.5411373575287313),
 ('7819816426', 0.5407423052343436),
 ('hicklin_ii_robert_william', 0.5379797395342437),
 ('zahmo', 0.5246461132707749),
 ('927045585', 0.5215292419146997),
 ('thurston_kimball_d', 0.51322404020071),
 ('jrye', 0.5016387809325586)]

#### Analogies

In [7]:
# Analogy query: 'kimball' and '3-269' are the positive terms, 'e23-266' the
# negative term; keep the 10 best-ranked entries.
# Rebinds `indexes` and `metrics` from the earlier cosine query.
indexes, metrics = model.analogy(pos=['kimball', '3-269'], neg=['e23-266'], n=10)

In [8]:
# Convert the analogy result into a plain list (rebinding `res`) and display
# it as the cell output.
res = model.generate_response(indexes, metrics).tolist()
res

[('kimball_richard_w', 0.28231005147222693),
 ('6172539707', 0.2623320371106695),
 ('910006165', 0.25320001353761157),
 ('7819816426', 0.23703705376079365),
 ('caryn', 0.23696845885282605),
 ('thurston_kimball_d', 0.23244744586696164),
 ('968548423', 0.22943614363511794),
 ('926279941', 0.2266240752917411),
 ('7819811585', 0.22512961794583908),
 ('7819815170', 0.22131277337164904)]

## Baseline composition

In [53]:
# Compose one embedding per column of the Se_person relation.
# `col_we_se` is used below as a mapping keyed by column name.
# NOTE(review): `missing_words` presumably holds the cell values that were not
# found in the model vocabulary -- confirm against composition.column_avg_composition.
path_to_relation = "/data/datasets/mitdwh/Se_person.csv"
col_we_se, missing_words = composition.column_avg_composition(path_to_relation, model)

In [54]:
# Compose one embedding per column of the Drupal employee directory relation.
# NOTE: this rebinds `path_to_relation` and `missing_words`, so the values
# produced for Se_person are no longer available after this cell runs.
path_to_relation = "/data/datasets/mitdwh/Drupal_employee_directory.csv"
col_we_drupal, missing_words = composition.column_avg_composition(path_to_relation, model)

In [55]:
# Reduce each relation's per-column embeddings to a single vector per relation
# (both results are compared with `cosine` in the next cell).
# NOTE(review): the exact composition rule lives in
# composition.relation_column_composition -- not visible here.
se_vec = composition.relation_column_composition(col_we_se)
drupal_vec = composition.relation_column_composition(col_we_drupal)

In [57]:
# Compare the two relation vectors.
# NOTE: `cosine` is scipy.spatial.distance.cosine, i.e. the cosine *distance*
# (1 - cosine similarity): 0 means same direction, larger means less similar.
cosine(se_vec, drupal_vec)

0.67595657658132569

In [46]:
# Compare every unordered pair of column embeddings of the Se_person relation.
# NOTE: `cosine` is scipy.spatial.distance.cosine, i.e. a cosine *distance*
# (1 - cosine similarity), so smaller printed values mean more similar columns
# even though the printed label reads "-sim-".
for a, b in itertools.combinations(col_we_se.keys(), 2):
    # Index the same mapping whose keys are being iterated. The previous
    # `col_we` name is not defined anywhere in the notebook and only worked
    # through leftover kernel state.
    we_a = col_we_se[a]
    we_b = col_we_se[b]

    cos = cosine(we_a, we_b)
    # Explicit str() concatenation is kept so the printed format is unchanged.
    print(str(a) + " -sim- " + str(b) + " is: " + str(cos))

Employee Type -sim- Middle Name is: 0.468415365418
Employee Type -sim- Last Name is: 0.620014389093
Employee Type -sim- Is Active is: 0.354705420918
Employee Type -sim- Mit Id is: 0.600311577721
Employee Type -sim- Full Name is: 0.561698343565
Employee Type -sim- First Name is: 0.846132504292
Employee Type -sim- Position Title is: 0.777724546264
Employee Type -sim- Organization is: 0.352706213248
Employee Type -sim- Krb Name is: 0.466058181094
Employee Type -sim- Payroll Rank is: 0.212323487352
Employee Type -sim- Office Location is: 0.487996025908
Middle Name -sim- Last Name is: 0.705980952575
Middle Name -sim- Is Active is: 0.62271853565
Middle Name -sim- Mit Id is: 0.744537221278
Middle Name -sim- Full Name is: 0.76888341386
Middle Name -sim- First Name is: 0.771352810961
Middle Name -sim- Position Title is: 0.933119558868
Middle Name -sim- Organization is: 0.730588432622
Middle Name -sim- Krb Name is: 0.803887993905
Middle Name -sim- Payroll Rank is: 0.645413842897
Middle Name -sim