## Relational Embedding

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import word2vec
import itertools
import numpy as np
from relational_embedder import composition
from data_prep import data_prep_utils as dpu
from scipy.spatial.distance import cosine
import pandas as pd

In [3]:
# Loading binary vectors
model = word2vec.load("/home/ubuntu/word2vec_c/data/mitdwhdata.bin")

In [4]:
indexes, metrics = model.cosine('madden_samuel_r', n=20)
res = model.generate_response(indexes, metrics).tolist()
res

[('6172586643', 0.7849313486501621),
 ('srmadden', 0.761890896610126),
 ('madden@csail_mit_edu', 0.7302375712183606),
 ('32-g938', 0.6994234649772965),
 ('alleyne_valerie', 0.6096030452657748),
 ('regev_aviv', 0.6001141852343141),
 ('http://db_csail_mit_edu/madden', 0.5726818600046826),
 ('924559605', 0.5446807208673953),
 ('cadogan_karen_m', 0.5440192014598965),
 ('cadogan@ll_mit_edu', 0.538459665843975),
 ('saxe_rebecca_r', 0.5261379573217481),
 ('lynn_jennifer_j', 0.5234204003276526),
 ('987515930', 0.5216549568539472),
 ('valleyne', 0.5056586181521495),
 ('920145823', 0.49755006654659845),
 ('nhuch_michelle_r', 0.49333601582586056),
 ('santos_adèle_naudé', 0.4914655348305545),
 ('braida_louis_d', 0.4896523932603062),
 ('julich_perez_april', 0.4838387563134501),
 ('hanafin_nicole', 0.4807667923350942)]

In [5]:
indexes, metrics = model.cosine('32-g938', n=20)
res = model.generate_response(indexes, metrics).tolist()
res

[('6172586643', 0.7704075547787494),
 ('madden_samuel_r', 0.6994234649772965),
 ('srmadden', 0.6367840073116177),
 ('madden@csail_mit_edu', 0.6346660052534312),
 ('ne30-6031', 0.5401929703967987),
 ('32_-_g938', 0.49038069710556587),
 ('32-g624', 0.48428400177943187),
 ('e51-296c', 0.48134845586704583),
 ('3-362', 0.4799728439087165),
 ('32-g644', 0.47550937456346765),
 ('32-g698', 0.47390516025064605),
 ('32-g939', 0.468810454226964),
 ('g938', 0.4650410016497478),
 ('32-g936', 0.46012925378551317),
 ('6173246179', 0.4592845652925835),
 ('6172534068', 0.45822708173608384),
 ('http://csg_csail_mit_edu/~devadas', 0.4580845377033581),
 ('devadas@mit_edu', 0.4533585811572982),
 ('900049965', 0.4508476901855568),
 ('6173244911', 0.4503895429486472)]

#### Basic Analogies

In [6]:
indexes, metrics = model.analogy(pos=['madden_samuel_r', '32-g936'], neg=['32-g938'], n=10)

In [7]:
res = model.generate_response(indexes, metrics).tolist()
res

[('katabi_dina', 0.2620763345752194),
 ('6173246027', 0.24025065821678065),
 ('dina@csail_mit_edu', 0.23896212073782416),
 ('wenger_rich', 0.21775141487554264),
 ('dinaktbi', 0.1936179557694433),
 ('951111241', 0.18444792210514846),
 ('pribble_daniel', 0.18236975950919623),
 ('wenger_richard', 0.1822369772140558),
 ('vuletic@mit_edu', 0.18132656363077188),
 ('essigmann_ellen_m', 0.18048992697689886)]

#### Analogies across Relations

In [8]:
indexes, metrics = model.analogy(pos=['gallop_sarah_e', '10-219'], neg=['11-245'], n=10)
res = model.generate_response(indexes, metrics).tolist()
res

[('hasseltine_ronald_e', 0.2610148318803308),
 ('6172530386', 0.24089952295960942),
 ('rhasselt@mit_edu', 0.21199482467337366),
 ('moazeni_hamid', 0.21169834408079385),
 ('medeiros-adams_judith_l', 0.2010895768989216),
 ('gallop', 0.1985037994512413),
 ('900000216', 0.19846011033483396),
 ('wong_yvonne', 0.19579254169153582),
 ('harris_jr_frederick_e', 0.19540866876017504),
 ('6172532152', 0.19526614543230933)]

## Baseline composition: Columns and Relations

In [9]:
# Files
import os
path = "/data/datasets/mitdwh/"
all_relations = [relation for relation in os.listdir(path)]

In [10]:
composition_vectors = dict()

In [11]:
# Fill `composition_vectors` with one entry per relation file and one entry
# per column, keyed "<relation>.<column>".
for relation in all_relations:
    print("Computing vectors for: " + str(relation))
    relation_path = path + "/" + relation
    # `missing_words` is returned alongside the column vectors but is not used here.
    col_we, missing_words = composition.column_avg_composition(relation_path, model)
    rel_we = composition.relation_column_composition(col_we)
    composition_vectors[relation] = rel_we
    composition_vectors.update(
        (relation + "." + column_name, column_vec)
        for column_name, column_vec in col_we.items()
    )
print("Done!")
print("Total vectors: " + str(len(composition_vectors)))

Computing vectors for: Fclt_rooms.csv


  if self.run_code(code, result):


Computing vectors for: Hr_org_unit_new.csv
Computing vectors for: Sdo_relatemask_table.csv
Computing vectors for: Sdo_datums_old_snapshot.csv
Computing vectors for: Zpm_rooms_load.csv
Computing vectors for: Mit_student_directory.csv
Computing vectors for: Sdo_coord_axis_names.csv
Computing vectors for: Sdo_ellipsoids_old.csv
Computing vectors for: Iap_subject_session.csv
Computing vectors for: Ctx_stopwords.csv
Computing vectors for: Space_unit.csv
Computing vectors for: Sdo_datum_vertical.csv
Computing vectors for: Sdo_crs_vertical.csv
Computing vectors for: All_olap_functions.csv
Computing vectors for: short_drupal_course_catalog.csv
Computing vectors for: Sdo_crs_geocentric.csv
Computing vectors for: Ctx_parameters.csv
Computing vectors for: Time_day.csv
Computing vectors for: Sdo_units_of_measure.csv
Computing vectors for: Si_thumbnail_format.csv
Computing vectors for: Fclt_building_address_list.csv
Computing vectors for: Student_department.csv
Computing vectors for: Fclt_org_dlc_k

  if self.run_code(code, result):


Computing vectors for: Mrv_olap2_descriptors.csv
Computing vectors for: Wm_installation.csv
Computing vectors for: Person_auth_area.csv
Computing vectors for: Sdo_coord_op_paths.csv
Computing vectors for: Sdo_coord_system.csv
Computing vectors for: Fields.csv
Computing vectors for: Hr_faculty_roster.csv
Computing vectors for: Space_usage.csv
Computing vectors for: Tip_material.csv
Computing vectors for: Space_floor.csv
Computing vectors for: Sdo_crs_geographic3d.csv
Computing vectors for: Zip_usa.csv
Computing vectors for: All_olap_descriptor_types.csv
Computing vectors for: Sis_subject_code.csv
Computing vectors for: Sdo_available_ops.csv
Computing vectors for: Academic_term_parameter.csv
Computing vectors for: All_olap_columns.csv
Computing vectors for: Field_dictionary_definition.csv
Computing vectors for: Drupal_employee_directory.csv
Computing vectors for: Sdo_prime_meridians.csv
Computing vectors for: Academic_terms.csv
Computing vectors for: short_subject_offered_summary.csv
Com

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Computing vectors for: Master_dept_hierarchy.csv
Computing vectors for: Ir_institution.csv
Computing vectors for: Num.csv
Computing vectors for: Sdo_crs_compound.csv
Computing vectors for: Cis_hass_attribute.csv
Computing vectors for: Sdo_crs_engineering.csv
Computing vectors for: Iap_subject_person.csv
Computing vectors for: Si_values.csv
Computing vectors for: Fac_major_use.csv
Computing vectors for: Sdo_datums_old_format.csv
Computing vectors for: Sis_term_address_category.csv
Computing vectors for: Moira_list.csv
Computing vectors for: All_olap2_uentity_desc_uses.csv
Computing vectors for: Fac_building.csv
Computing vectors for: User_view.csv
Computing vectors for: All_hversion_view.csv
Computing vectors for: Ctx_preferences.csv
Computing vectors for: Fclt_major_user_hist.csv
Computing vectors for: Sdo_non_available_elem_units.csv
Computing vectors for: Subject_iap_schedule.csv
Computing vectors for: Fclt_building.csv
Computing vectors for: Fclt_organization.csv
Computing vectors f

In [69]:
# SERIALIZE
import pickle
# Use a dedicated variable: `path` still holds the dataset directory read by
# the vector-building loop, and overwriting it here would break that cell if
# it were re-run afterwards.
vectors_pkl_path = "./temp/composition_vectors.pkl"
# Create the output directory if needed so a fresh checkout can run this cell.
os.makedirs(os.path.dirname(vectors_pkl_path), exist_ok=True)
with open(vectors_pkl_path, 'wb') as f:
    pickle.dump(composition_vectors, f)

In [22]:
# DESERIALIZE
import pickle
# Use a dedicated variable so the dataset directory held in `path` (read by the
# vector-building loop) is not overwritten by this cell.
vectors_pkl_path = "./temp/composition_vectors.pkl"
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# files that were produced by the SERIALIZE cell of this notebook.
with open(vectors_pkl_path, 'rb') as f:
    composition_vectors = pickle.load(f)

In [12]:
def similar_to(table, column=None, threshold=0.2, vectors=None):
    """Return the keys whose vector is within `threshold` cosine distance of a reference key.

    The reference key is `table` when `column` is None, otherwise
    `table + "." + column`.

    Args:
        table: relation key, e.g. "Fac_building.csv".
        column: optional column name, appended to `table` with a dot.
        threshold: exclusive upper bound on the cosine distance.
        vectors: mapping from key to vector. Defaults to the notebook-level
            `composition_vectors`, so existing calls behave as before.

    Returns:
        List of matching keys, in the mapping's iteration order. The reference
        key itself is normally part of the result (distance 0 to itself).

    Raises:
        KeyError: if the reference key is not present in the mapping.
    """
    if vectors is None:
        vectors = composition_vectors
    key = table if column is None else table + "." + column
    # Looked up once; the reference vector does not change during the scan.
    reference = vectors[key]
    similar_items = []
    for k, we in vectors.items():
        try:
            distance = cosine(reference, we)
        except ValueError:
            # Entries that cannot be compared with the reference are skipped
            # (presumably vectors of a different shape; verify against the data).
            continue
        if distance < threshold:
            similar_items.append(k)
    return similar_items

In [13]:
table = "Fac_building.csv"
column = "Building Name Long"
threshold = 0.2
# Pass the threshold explicitly; previously the variable was set but ignored,
# and the call only worked because it matched the function's default.
sim_items = similar_to(table, column=column, threshold=threshold)
for el in sim_items:
    print(str(el))

Buildings.csv.Building Name
Fclt_building_hist_1.csv.Building Name Long
Fclt_building_hist_1.csv.Building Name
Fclt_building_list.csv.Building Name Long
Fac_building.csv.Building Name
Fclt_building.csv.Building Name
Fclt_building_list.csv.Building Name
Fac_building.csv.Building Name Long
Fclt_building.csv.Building Name Long


In [14]:
table = "Library_subject_offered.csv"
column = "Course Number Desc"
sim_items = similar_to(table, column)
for el in sim_items:
    print(str(el))

Student_department.csv.Department Name
short_subject_offered_summary.csv.Offer Dept Name
short_subject_summary.csv.Department Name
short_cis_course_catalog.csv.Department Name
short_subjects_offered.csv.Offer Dept Name
short_subjects_offered.csv.Master Course Number Desc
short_tip_subject_offered.csv.Master Course Number Desc
short_drupal_course_catalog.csv.Department Name
Mit_student_directory.csv.Department Name
short_course_catalog_subject_offered.csv.Department Name
subject_grouping_slice.csv.Department Name
short_tip_subject_offered.csv.Offer Dept Name
Library_subject_offered.csv.Offer Dept Name
short_tip_subject_offered.csv.Course Number Desc
Sis_course_description.csv.Department Name
short_subjects_offered.csv.Course Number Desc
Library_subject_offered.csv.Master Course Number Desc
Library_subject_offered.csv.Course Number Desc


In [15]:
table = "Warehouse_users.csv"
sim_items = similar_to(table)
for el in sim_items:
    print(str(el))

Warehouse_users.csv
Employee_directory.csv.Middle Name
Warehouse_users.csv.Middle Name
Mit_student_directory.csv.Middle Name
Drupal_employee_directory.csv.Middle Name
Se_person.csv.Middle Name
Hr_faculty_roster.csv.Middle Name
Warehouse_users.csv.Title


In [16]:
table = "Fac_building.csv"
sim_items = similar_to(table, threshold=0.1)
for el in sim_items:
    print(str(el))

Fclt_building_hist_1.csv
Fac_building.csv
Fclt_building.csv
Fclt_building_list.csv


### Other examples

In [25]:
path_to_relation = "/data/datasets/mitdwh/Se_person.csv"
col_we_se, missing_words = composition.column_avg_composition(path_to_relation, model)

In [26]:
path_to_relation = "/data/datasets/mitdwh/Drupal_employee_directory.csv"
col_we_drupal, missing_words = composition.column_avg_composition(path_to_relation, model)

In [27]:
se_vec = composition.relation_column_composition(col_we_se)
drupal_vec = composition.relation_column_composition(col_we_drupal)

In [28]:
cosine(se_vec, drupal_vec)

0.17816106687438826

In [30]:
# Compare every unordered pair of column vectors of Se_person.
# NOTE: scipy's `cosine` is a distance (1 - cosine similarity), so smaller
# printed values mean more similar columns despite the "-sim-" label.
for a, b in itertools.combinations(col_we_se.keys(), 2):
    cos = cosine(col_we_se[a], col_we_se[b])
    pair_label = str(a) + " -sim- " + str(b)
    print(pair_label + " is: " + str(cos))

Is Active -sim- Organization is: 0.71580180098
Is Active -sim- Last Name is: 0.754346633966
Is Active -sim- Employee Type is: 0.593234068665
Is Active -sim- Krb Name is: 0.743188209349
Is Active -sim- Full Name is: 0.773458063759
Is Active -sim- Mit Id is: 0.713277708397
Is Active -sim- Office Location is: 1.04746727659
Is Active -sim- Position Title is: 0.654582383524
Is Active -sim- Payroll Rank is: 0.545001600076
Is Active -sim- Middle Name is: 1.0106378701
Is Active -sim- First Name is: 0.649957258588
Organization -sim- Last Name is: 0.550844356843
Organization -sim- Employee Type is: 0.431105480558
Organization -sim- Krb Name is: 0.49305379079
Organization -sim- Full Name is: 0.608568961034
Organization -sim- Mit Id is: 0.414800770861
Organization -sim- Office Location is: 0.929235802932
Organization -sim- Position Title is: 0.720257754811
Organization -sim- Payroll Rank is: 0.33552013582
Organization -sim- Middle Name is: 0.905511666916
Organization -sim- First Name is: 0.5615552

## Composition Rows

In [9]:
path_to_relation = "/data/datasets/mitdwh/Se_person.csv"
row_we, m_words = composition.row_avg_composition(path_to_relation, model)

In [24]:
df = pd.read_csv(path_to_relation, encoding='latin1')

In [35]:
# Cosine distance from the reference row vector to every row vector
# (including itself), sorted from closest to farthest.
row_ref = 0
distances = [
    (row_idx, cosine(row_we[row_ref], row_we[row_idx]))
    for row_idx in range(len(row_we))
]
distances.sort(key=lambda pair: pair[1])

In [36]:
len(distances)

16497

In [45]:
# Pair each of the k closest entries in `distances` with its DataFrame row.
k = 10
topk = [(df.iloc[distances[rank][0]], distances[rank]) for rank in range(k)]

In [47]:
df.iloc[0]

Mit Id                                   900000624
Krb Name                                    PJCORN
Full Name                        Cornelio, Paula J
Payroll Rank                         Support Staff
Position Title     Senior Administrative Assistant
Is Active                                        Y
Office Location                             24-122
Organization       Nuclear Science and Engineering
First Name                                   Paula
Last Name                                 Cornelio
Middle Name                                      J
Employee Type                        Support Staff
Name: 0, dtype: object

In [49]:
topk[1]

(Mit Id                                   900009589
 Krb Name                                   MAGNANO
 Full Name                  Magnano-Bleheen, Lisa J
 Payroll Rank                         Support Staff
 Position Title     Senior Administrative Assistant
 Is Active                                        Y
 Office Location                             24-107
 Organization       Nuclear Science and Engineering
 First Name                                    Lisa
 Last Name                          Magnano-Bleheen
 Middle Name                                      J
 Employee Type                        Support Staff
 Name: 2126, dtype: object, (2126, 0.20728029774227363))

In [50]:
topk[2]

(Mit Id                                     900053097
 Krb Name                                      TLAYTE
 Full Name                            Layte, Thomas E
 Payroll Rank                         Other Acad-Inst
 Position Title     Club Coach - Head Wrestling Coach
 Is Active                                          Y
 Office Location                              W20-549
 Organization                      Student Activities
 First Name                                    Thomas
 Last Name                                      Layte
 Middle Name                                        E
 Employee Type                   Other Academic Group
 Name: 1, dtype: object, (1, 0.24667526173039556))

In [51]:
topk[3]

(Mit Id                                   929452712
 Krb Name                                  RALLISON
 Full Name                        Allison, Robert L
 Payroll Rank                         Support Staff
 Position Title         Administrative Assistant II
 Is Active                                        Y
 Office Location                             24-209
 Organization       Nuclear Science and Engineering
 First Name                                  Robert
 Last Name                                  Allison
 Middle Name                                      L
 Employee Type                        Support Staff
 Name: 12127, dtype: object, (12127, 0.27658025668369934))

In [52]:
topk[4]

(Mit Id                               900001346
 Krb Name                              DROSSETT
 Full Name               Stewart, Denise Marian
 Payroll Rank                     Support Staff
 Position Title     Administrative Assistant II
 Is Active                                    Y
 Office Location                        E19-307
 Organization             MIT Energy Initiative
 First Name                              Denise
 Last Name                              Stewart
 Middle Name                             Marian
 Employee Type                    Support Staff
 Name: 4, dtype: object, (4, 0.2851865397543345))

## Operations across hierarchy

In [34]:
def topk_similar(vec, composition_vectors):
    """Rank every entry of `composition_vectors` by cosine distance to `vec`.

    Args:
        vec: query vector.
        composition_vectors: mapping from key to vector.

    Returns:
        A pair `(ranked, failed)`: `ranked` is a list of `(key, distance)`
        tuples sorted by ascending distance, and `failed` counts the entries
        skipped because `cosine` raised ValueError for them.
    """
    ranked = []
    failed = 0
    for name in composition_vectors:
        candidate = composition_vectors[name]
        try:
            ranked.append((name, cosine(vec, candidate)))
        except ValueError:
            failed += 1
    ranked.sort(key=lambda pair: pair[1])
    return ranked, failed

In [109]:
entity = 'madden_samuel_r'
vec = model.get_vector(entity)

In [110]:
distances, errors = topk_similar(vec, composition_vectors)
print("ERRORS: " + str(errors))

ERRORS: 2


In [113]:
for i in range(10):
    print(str(distances[i]))

('Employee_directory.csv.Directory Full Name', 0.67486948191596019)
('Drupal_employee_directory.csv.Full Name', 0.67506362935953867)
('Employee_directory.csv.Full Name Uppercase', 0.67520124740119769)
('Employee_directory.csv.Full Name', 0.67520124740119769)
('Se_person.csv.Full Name', 0.7062898513094884)
('Hr_faculty_roster.csv.Endowed Chair', 0.73855610957255335)
('Library_material_status.csv.Library Material Status', 0.75961084105141052)
('Fclt_organization_hist.csv.Fclt Organization Hist Key', 0.76099739213383732)
('Fclt_major_user_hist.csv.Fclt Major Use Hist Key', 0.76563099929479261)
('Employee_directory.csv', 0.76570343434449251)


#### Check rows

In [114]:
# Cosine distance from the entity vector `vec` to every row vector,
# sorted from closest to farthest.
distances = [
    (row_idx, cosine(vec, row_we[row_idx]))
    for row_idx in range(len(row_we))
]
distances.sort(key=lambda pair: pair[1])

In [115]:
# Pair each of the k closest entries in `distances` with its DataFrame row.
k = 10
topk = [(df.iloc[distances[rank][0]], distances[rank]) for rank in range(k)]

In [116]:
topk[0]

(Mit Id                                           987515930
 Krb Name                                          SRMADDEN
 Full Name                                 Madden, Samuel R
 Payroll Rank                               Faculty Tenured
 Position Title                                   Professor
 Is Active                                                Y
 Office Location                                    32-G938
 Organization       Electrical Engineering-Computer Science
 First Name                                          Samuel
 Last Name                                           Madden
 Middle Name                                              R
 Employee Type                                      Faculty
 Name: 11549, dtype: object, (11549, 0.31959939999764442))

In [117]:
topk[1]

(Mit Id                   927168762
 Krb Name                    AREGEV
 Full Name              Regev, Aviv
 Payroll Rank       Faculty Tenured
 Position Title           Professor
 Is Active                        Y
 Office Location          NE30-6031
 Organization               Biology
 First Name                    Aviv
 Last Name                    Regev
 Middle Name                    NaN
 Employee Type              Faculty
 Name: 13576, dtype: object, (13576, 0.51660243460771871))

In [118]:
topk[2]

(Mit Id                                           900040158
 Krb Name                                            RONITT
 Full Name                                Rubinfeld, Ronitt
 Payroll Rank                               Faculty Tenured
 Position Title                                   Professor
 Is Active                                                Y
 Office Location                                    32-G698
 Organization       Electrical Engineering-Computer Science
 First Name                                          Ronitt
 Last Name                                        Rubinfeld
 Middle Name                                            NaN
 Employee Type                                      Faculty
 Name: 9512, dtype: object, (9512, 0.57402982752297649))

In [119]:
topk[3]

(Mit Id                              920366745
 Krb Name                                 SAXE
 Full Name                     Saxe, Rebecca R
 Payroll Rank                  Faculty Tenured
 Position Title                      Professor
 Is Active                                   Y
 Office Location                       46-4019
 Organization       Brain & Cognitive Sciences
 First Name                            Rebecca
 Last Name                                Saxe
 Middle Name                                 R
 Employee Type                         Faculty
 Name: 6391, dtype: object, (6391, 0.58098534316260131))

### Designing API

In [42]:
def concept_qa(entity, attribute, n=20):
    """Rank the model's `n` neighbours of `entity` by closeness to an attribute vector.

    Args:
        entity: token looked up in the notebook-level word-embedding `model`.
        attribute: key into the notebook-level `composition_vectors`.
        n: number of neighbours requested from `model.cosine`.

    Returns:
        List of `(neighbour, distance)` tuples sorted by ascending cosine
        distance between the neighbour's vector and the attribute vector.
    """
    neighbour_ids, neighbour_scores = model.cosine(entity, n=n)
    neighbours = model.generate_response(neighbour_ids, neighbour_scores).tolist()
    attribute_vec = composition_vectors[attribute]
    ranked = [
        (name, cosine(model.get_vector(name), attribute_vec))
        for name, _ in neighbours
    ]
    ranked.sort(key=lambda pair: pair[1])
    return ranked

def entity_to_attribute(entity, n=20):
    """For each of the model's `n` neighbours of `entity`, list its closest composition keys.

    Args:
        entity: token looked up in the notebook-level word-embedding `model`.
        n: number of neighbours requested from `model.cosine`.

    Returns:
        List of `(neighbour, score, closest)` tuples in the model's order,
        where `closest` holds the first four `(key, distance)` pairs returned
        by `topk_similar` for that neighbour's vector.
    """
    neighbour_ids, neighbour_scores = model.cosine(entity, n=n)
    neighbours = model.generate_response(neighbour_ids, neighbour_scores).tolist()

    def closest_attributes(name):
        # The skipped-entry count from topk_similar is not needed here.
        ranked, _skipped = topk_similar(model.get_vector(name), composition_vectors)
        return ranked[:4]

    return [(name, score, closest_attributes(name)) for name, score in neighbours]

In [30]:
# finding attributes
# Case-insensitive substring search over the composition keys.
attribute = 'phone'
candidates = [key for key in composition_vectors if attribute in key.lower()]
candidates

['Sis_admin_department.csv.Department Phone Number',
 'Employee_directory.csv.Office Phone',
 'Warehouse_users.csv.Office Phone',
 'Sis_admin_department.csv.Department Phone Area Code',
 'Mit_student_directory.csv.Office Phone',
 'Drupal_employee_directory.csv.Office Phone']

In [36]:
indexes, metrics = model.cosine('madden_samuel_r', n=20)
res = model.generate_response(indexes, metrics).tolist()
res

[('6172586643', 0.7849313486501621),
 ('srmadden', 0.761890896610126),
 ('madden@csail_mit_edu', 0.7302375712183606),
 ('32-g938', 0.6994234649772965),
 ('alleyne_valerie', 0.6096030452657748),
 ('regev_aviv', 0.6001141852343141),
 ('http://db_csail_mit_edu/madden', 0.5726818600046826),
 ('924559605', 0.5446807208673953),
 ('cadogan_karen_m', 0.5440192014598965),
 ('cadogan@ll_mit_edu', 0.538459665843975),
 ('saxe_rebecca_r', 0.5261379573217481),
 ('lynn_jennifer_j', 0.5234204003276526),
 ('987515930', 0.5216549568539472),
 ('valleyne', 0.5056586181521495),
 ('920145823', 0.49755006654659845),
 ('nhuch_michelle_r', 0.49333601582586056),
 ('santos_adèle_naudé', 0.4914655348305545),
 ('braida_louis_d', 0.4896523932603062),
 ('julich_perez_april', 0.4838387563134501),
 ('hanafin_nicole', 0.4807667923350942)]

In [43]:
entity_to_attribute('madden_samuel_r', n=20)

[('6172586643',
  0.7849313486501621,
  [('Hr_faculty_roster.csv.Endowed Chair', 0.66050691902555725),
   ('Employee_directory.csv.Email Address Uppercase', 0.70789657052496335),
   ('Employee_directory.csv.Email Address', 0.70789657052496335),
   ('Drupal_employee_directory.csv.Email Address', 0.70817566308491586)]),
 ('srmadden',
  0.761890896610126,
  [('Employee_directory.csv.Krb Name', 0.57197901273202312),
   ('Employee_directory.csv.Krb Name Uppercase', 0.57197901273202312),
   ('Person_auth_area.csv.User Name', 0.62726848217216447),
   ('Employee_directory.csv.Email Address Uppercase', 0.63008359054053731)]),
 ('madden@csail_mit_edu',
  0.7302375712183606,
  [('Employee_directory.csv.Email Address Uppercase', 0.63465140246012186),
   ('Employee_directory.csv.Email Address', 0.63465140246012186),
   ('Drupal_employee_directory.csv.Email Address', 0.63508656041952327),
   ('Hr_faculty_roster.csv.Endowed Chair', 0.6456518395636851)]),
 ('32-g938',
  0.6994234649772965,
  [('Employ

In [48]:
concept_qa('madden_samuel_r', 'Employee_directory.csv.Krb Name', n=10)

[('srmadden', 0.57197901273202312),
 ('cadogan@ll_mit_edu', 0.63590380760722576),
 ('madden@csail_mit_edu', 0.65477149798365653),
 ('http://db_csail_mit_edu/madden', 0.67920530214335784),
 ('924559605', 0.70867021430271904),
 ('6172586643', 0.76185283156160677),
 ('alleyne_valerie', 0.78329192108562595),
 ('cadogan_karen_m', 0.81693550841877094),
 ('regev_aviv', 0.85707862488625808),
 ('32-g938', 0.90008351149239774)]

# DENOISER

In [2]:
import word2vec
import pickle
we_model_path = "/data/raulcf/relemb/mitdwh/bench_mitdwh/we_db2vec_75_ns50_fns20_i40.bin"
we = word2vec.load(we_model_path)
relemb_path = "/data/raulcf/relemb/mitdwh/bench_mitdwh/relemb_db2vec_75_ns50_fn20/row.pkl"
with open(relemb_path, "rb") as f:
    relemb = pickle.load(f)

In [3]:
from relational_embedder.api import Fabric
from relational_embedder.data_prep import data_prep_utils as dpu
api = Fabric(we, None, relemb, None, None)

In [5]:
entity = "madden_samuel_r"
ev = api.row_vector_for(entity)
res = api.topk_related_entities(ev, k=10)
res

[('987515930', 0.9128845427368912),
 ('13975', 0.9076621200518711),
 ('madden@csail_mit_edu', 0.8777760586477653),
 ('srmadden', 0.8770624134650294),
 ('32-g938', 0.8736097595243766),
 ('calomo', 0.8600310711556872),
 ('vanwinkle', 0.8525091899721615),
 ('jaakkola_tommi_s', 0.8521197450874609),
 ('megretski_alexandre', 0.8444329342503496),
 ('ameh_samuel', 0.8438668363395274)]

In [6]:
entity = "madden_samuel_r"
ev = api.row_vector_for(entity)
res = api.topk_related_entities_conditional_denoising(ev, k=10)
res

[('987515930', 0.9128845427368912),
 ('13975', 0.9076621200518711),
 ('madden@csail_mit_edu', 0.8777760586477653),
 ('srmadden', 0.8770624134650294),
 ('32-g938', 0.8736097595243766),
 ('madden_samuel_r', 0.5),
 ('13975', 0.4),
 ('srmadden', 0.4),
 ('32-g938', 0.4),
 ('madden@csail_mit_edu', 0.4)]

In [7]:
entity = "32-g938"
ev = api.row_vector_for(entity)
res = api.topk_related_entities(ev, k=10)
res

[('madden@csail_mit_edu', 0.97023240954564),
 ('srmadden', 0.9573691164066921),
 ('987515930', 0.9442821775896386),
 ('13975', 0.9387502287421707),
 ('madden_samuel_r', 0.8736097595243766),
 ('32-d610a', 0.8712829959239551),
 ('tommi@csail_mit_edu', 0.8588132244661102),
 ('32-g470', 0.8549681790949837),
 ('32-d624', 0.8496715818319918),
 ('26-453', 0.8357115969857967)]

In [8]:
entity = "32-g938"
ev = api.row_vector_for(entity)
res = api.topk_related_entities_conditional_denoising(ev, k=10)
res

[('madden@csail_mit_edu', 0.97023240954564),
 ('srmadden', 0.9573691164066921),
 ('987515930', 0.9442821775896386),
 ('13975', 0.9387502287421707),
 ('madden_samuel_r', 0.8736097595243766),
 ('madden@csail_mit_edu', 0.6),
 ('32-g938', 0.5),
 ('32-g470', 0.4),
 ('srmadden', 0.4),
 ('987515930', 0.4)]

### debugging denoiser

In [31]:
from collections import defaultdict
import numpy as np
def test(el, k=10):
    res = api.topk_related_entities(el, k=k)
    fixed_group = [e for e, _ in res[:5]]  # top 5 elements
    print("Fixed Group: " + str(fixed_group))
    coh_set = defaultdict(int)
    for e, score in res:
        ev = api.M_R.get_vector(e)
        if np.array_equal(el, ev):  # don't include the querying vector
            continue
        sres = api.topk_related_entities(ev, k=10)
        for se, s_score in sres:
            coh_set[se] += 1

    coh_set = {key: (v / k) for key, v in coh_set.items()}

    # filter fixed_group elements from coh_set
    print("Original coh_set: " + str(coh_set))
    coh_set = {k: v for k, v in coh_set.items() if k not in fixed_group and not np.array_equal(el, api.M_R.get_vector(k))}
    print("Filtered coh_set: " + str(coh_set))

    final_res = sorted(coh_set.items(), key=lambda x: x[1], reverse=True)

    size_to_fill = 5  # fixed for now
    candidate_replacements = len(coh_set)
    if candidate_replacements >= size_to_fill:
        total_replacements = 5
    else:
        total_replacements = size_to_fill - candidate_replacements
    denoised_ranking = res[:5] + res[5:][:(5 - total_replacements)] + final_res[:total_replacements]

    assert(len(denoised_ranking) == k)
    return denoised_ranking

In [32]:
entity = "32-g938"
ev = api.row_vector_for(entity)
res = test(ev, k=10)
res

Fixed Group: ['madden@csail_mit_edu', 'srmadden', '987515930', '13975', 'madden_samuel_r']
Original coh_set: {'32-g938': 0.5, '32-g470': 0.4, 'srmadden': 0.4, '987515930': 0.4, '32-d610a': 0.4, '13975': 0.4, '32-d624': 0.3, 'madden_samuel_r': 0.4, 'asuman@mit_edu': 0.4, 'tommi@csail_mit_edu': 0.3, 'madden@csail_mit_edu': 0.6, 'http://db_csail_mit_edu/madden': 0.3, '900047958': 0.4, 'jaakkola': 0.3, '6468': 0.4, '988099961': 0.3, '1174': 0.2, 'calomo': 0.1, 'vanwinkle': 0.1, 'jaakkola_tommi_s': 0.1, 'megretski_alexandre': 0.1, 'ameh_samuel': 0.1, '32-g744': 0.1, 'asuman': 0.1, '32g-840': 0.1, 'samana@mit_edu': 0.1, '32-g492': 0.2, '32-g768': 0.1, 'ameyer@mit_edu': 0.3, 'chan@mit_edu': 0.2, 'tlp@mit_edu': 0.1, 'dimitrib@mit_edu': 0.1, 'jaillet@mit_edu': 0.1, '32-251': 0.1, 'phw@mit_edu': 0.1, '26-339': 0.1, '32-d662': 0.1, 'penfield@mit_edu': 0.1, 'jaillet': 0.1, 'redwine@mit_edu': 0.1, 'redwine': 0.1, '6-411': 0.1, 'jaffe@mit_edu': 0.1, '11566': 0.1, '900022090': 0.1, 'chenm@mit_edu': 0

[('madden@csail_mit_edu', 0.97023240954564),
 ('srmadden', 0.9573691164066921),
 ('987515930', 0.9442821775896386),
 ('13975', 0.9387502287421707),
 ('madden_samuel_r', 0.8736097595243766),
 ('32-g470', 0.4),
 ('32-d610a', 0.4),
 ('asuman@mit_edu', 0.4),
 ('900047958', 0.4),
 ('6468', 0.4)]

In [33]:
entity = "madden_samuel_r"
ev = api.row_vector_for(entity)
res = test(ev, k=10)
res

Fixed Group: ['987515930', '13975', 'madden@csail_mit_edu', 'srmadden', '32-g938']
Original coh_set: {'13975': 0.4, 'srmadden': 0.4, '32-g938': 0.4, 'madden_samuel_r': 0.5, 'madden@csail_mit_edu': 0.4, 'http://db_csail_mit_edu/madden': 0.3, '900047958': 0.4, '988099961': 0.2, '6468': 0.4, '1174': 0.1, '987515930': 0.4, 'jaakkola': 0.2, '32-g470': 0.2, '32-d610a': 0.3, '32-d624': 0.2, 'asuman@mit_edu': 0.1, 'tommi@csail_mit_edu': 0.2, '26-453': 0.1, 'calomo_jr_samuel_p': 0.1, 'eubank': 0.1, 'gerke': 0.1, 'vanwinkle': 0.2, 'flanigan': 0.1, 'ameh_samuel': 0.1, 'stambler': 0.2, 'rey': 0.1, 'benitz': 0.1, 'perli': 0.1, 'vanwinkle_samuel_c': 0.1, 'whelan': 0.1, 'melot': 0.1, 'vento': 0.1, 'sullenberger': 0.1, 'varey': 0.1, 'milechin': 0.1, 'jordy': 0.1, 'baah': 0.1, 'ozdaglar_asuman_e': 0.1, 'mithal_arvind': 0.1, 'meyer_albert_r': 0.1, 'solar_lezama_armando': 0.1, 'slotine': 0.1, 'amarasinghe': 0.1, 'suri_tavneet': 0.1, 'megretski_alexandre': 0.1, 'megretski': 0.1, 'tsitsiklis': 0.1, 'jaakko

[('987515930', 0.9128845427368912),
 ('13975', 0.9076621200518711),
 ('madden@csail_mit_edu', 0.8777760586477653),
 ('srmadden', 0.8770624134650294),
 ('32-g938', 0.8736097595243766),
 ('900047958', 0.4),
 ('6468', 0.4),
 ('http://db_csail_mit_edu/madden', 0.3),
 ('32-d610a', 0.3),
 ('988099961', 0.2)]