## Relational Embedding

In [17]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to the local packages used in
# this notebook are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
import word2vec
import itertools
import numpy as np
from relational_embedder import composition
from data_prep import data_prep_utils as dpu
from scipy.spatial.distance import cosine

In [19]:
# Read the word2vec vectors stored in binary format; `model` is queried by the cells below.
MODEL_PATH = "/home/ubuntu/word2vec_c/data/mitdwhdata.bin"
model = word2vec.load(MODEL_PATH)

In [4]:
# Nearest-neighbour query for the token 'madden_samuel_r', asking for 20 results.
# NOTE(review): presumably `indexes` are vocabulary positions and `metrics` the
# matching similarity scores — confirm against the word2vec package docs.
indexes, metrics = model.cosine('madden_samuel_r', n=20)

In [5]:
# Turn the (indexes, metrics) pair into a plain Python list; the output below
# shows it holds (token, score) tuples.
res = model.generate_response(indexes, metrics).tolist()

In [6]:
# Display the neighbours computed in the previous cell.
res

[('6172586643', 0.7849313486501621),
 ('srmadden', 0.761890896610126),
 ('madden@csail_mit_edu', 0.7302375712183606),
 ('32-g938', 0.6994234649772965),
 ('alleyne_valerie', 0.6096030452657748),
 ('regev_aviv', 0.6001141852343141),
 ('http://db_csail_mit_edu/madden', 0.5726818600046826),
 ('924559605', 0.5446807208673953),
 ('cadogan_karen_m', 0.5440192014598965),
 ('cadogan@ll_mit_edu', 0.538459665843975),
 ('saxe_rebecca_r', 0.5261379573217481),
 ('lynn_jennifer_j', 0.5234204003276526),
 ('987515930', 0.5216549568539472),
 ('valleyne', 0.5056586181521495),
 ('920145823', 0.49755006654659845),
 ('nhuch_michelle_r', 0.49333601582586056),
 ('santos_adèle_naudé', 0.4914655348305545),
 ('braida_louis_d', 0.4896523932603062),
 ('julich_perez_april', 0.4838387563134501),
 ('hanafin_nicole', 0.4807667923350942)]

#### Analogies

In [20]:
# Analogy query over the embedding.
# NOTE(review): judging by the parameter names, `pos` vectors are added and `neg`
# vectors subtracted — confirm in the word2vec package docs.
# '32-g938' is one of madden_samuel_r's nearest neighbours in the output above, so
# this asks which entity relates to '32-g936' the way madden_samuel_r relates to '32-g938'.
indexes, metrics = model.analogy(pos=['madden_samuel_r', '32-g936'], neg=['32-g938'], n=10)

In [21]:
# Convert the analogy result into a list of (token, score) tuples and display it.
res = model.generate_response(indexes, metrics).tolist()
res

[('katabi_dina', 0.2620763345752194),
 ('6173246027', 0.24025065821678065),
 ('dina@csail_mit_edu', 0.23896212073782416),
 ('wenger_rich', 0.21775141487554264),
 ('dinaktbi', 0.1936179557694433),
 ('951111241', 0.18444792210514846),
 ('pribble_daniel', 0.18236975950919623),
 ('wenger_richard', 0.1822369772140558),
 ('vuletic@mit_edu', 0.18132656363077188),
 ('essigmann_ellen_m', 0.18048992697689886)]

## Baseline composition

In [43]:
# Files
# Every entry found in the dataset directory is treated as one relation.
import os
path = "/data/datasets/mitdwh/"
all_relations = os.listdir(path)

In [46]:
# Accumulator for the composed vectors, filled by the composition loop.
composition_vectors = {}

In [47]:
# Compose vectors for every relation in the dataset directory and store them
# in `composition_vectors` under two kinds of keys:
#   "<relation>"           -> relation-level vector
#   "<relation>.<column>"  -> column-level vector
for relation in all_relations:
    print("Computing vectors for: " + str(relation))
    # os.path.join avoids the doubled separator that `path + "/" + relation`
    # produced, since `path` already ends in "/".
    relation_path = os.path.join(path, relation)
    # `missing_words` is returned by the helper but not used in this notebook cell.
    col_we, missing_words = composition.column_avg_composition(relation_path, model)
    rel_we = composition.relation_column_composition(col_we)
    composition_vectors[relation] = rel_we
    # Use the value yielded by items() instead of indexing the dict a second time.
    for column_name, column_we in col_we.items():
        composition_vectors[relation + "." + column_name] = column_we
print("Done!")
print("Total vectors: " + str(len(composition_vectors)))

Computing vectors for: Fclt_rooms.csv


  if self.run_code(code, result):


Computing vectors for: Hr_org_unit_new.csv
Computing vectors for: Sdo_relatemask_table.csv
Computing vectors for: Sdo_datums_old_snapshot.csv
Computing vectors for: Zpm_rooms_load.csv
Computing vectors for: Mit_student_directory.csv
Computing vectors for: Sdo_coord_axis_names.csv
Computing vectors for: Sdo_ellipsoids_old.csv
Computing vectors for: Iap_subject_session.csv
Computing vectors for: Ctx_stopwords.csv
Computing vectors for: Space_unit.csv
Computing vectors for: Sdo_datum_vertical.csv
Computing vectors for: Sdo_crs_vertical.csv
Computing vectors for: All_olap_functions.csv
Computing vectors for: short_drupal_course_catalog.csv
Computing vectors for: Sdo_crs_geocentric.csv
Computing vectors for: Ctx_parameters.csv
Computing vectors for: Time_day.csv
Computing vectors for: Sdo_units_of_measure.csv
Computing vectors for: Si_thumbnail_format.csv
Computing vectors for: Fclt_building_address_list.csv
Computing vectors for: Student_department.csv
Computing vectors for: Fclt_org_dlc_k

  if self.run_code(code, result):


Computing vectors for: Mrv_olap2_descriptors.csv
Computing vectors for: Wm_installation.csv
Computing vectors for: Person_auth_area.csv
Computing vectors for: Sdo_coord_op_paths.csv
Computing vectors for: Sdo_coord_system.csv
Computing vectors for: Fields.csv
Computing vectors for: Hr_faculty_roster.csv
Computing vectors for: Space_usage.csv
Computing vectors for: Tip_material.csv
Computing vectors for: Space_floor.csv
Computing vectors for: Sdo_crs_geographic3d.csv
Computing vectors for: Zip_usa.csv
Computing vectors for: All_olap_descriptor_types.csv
Computing vectors for: Sis_subject_code.csv
Computing vectors for: Sdo_available_ops.csv
Computing vectors for: Academic_term_parameter.csv
Computing vectors for: All_olap_columns.csv
Computing vectors for: Field_dictionary_definition.csv
Computing vectors for: Drupal_employee_directory.csv
Computing vectors for: Sdo_prime_meridians.csv
Computing vectors for: Academic_terms.csv
Computing vectors for: short_subject_offered_summary.csv
Com

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Computing vectors for: Num.csv
Computing vectors for: Sdo_crs_compound.csv
Computing vectors for: Cis_hass_attribute.csv
Computing vectors for: Sdo_crs_engineering.csv
Computing vectors for: Iap_subject_person.csv
Computing vectors for: Si_values.csv
Computing vectors for: Fac_major_use.csv
Computing vectors for: Sdo_datums_old_format.csv
Computing vectors for: Sis_term_address_category.csv
Computing vectors for: Moira_list.csv
Computing vectors for: All_olap2_uentity_desc_uses.csv
Computing vectors for: Fac_building.csv
Computing vectors for: User_view.csv
Computing vectors for: All_hversion_view.csv
Computing vectors for: Ctx_preferences.csv
Computing vectors for: Fclt_major_user_hist.csv
Computing vectors for: Sdo_non_available_elem_units.csv
Computing vectors for: Subject_iap_schedule.csv
Computing vectors for: Fclt_building.csv
Computing vectors for: Fclt_organization.csv
Computing vectors for: Fclt_building_hist_1.csv
Computing vectors for: Buildings.csv
Computing vectors for: Sd

In [69]:
# SERIALIZE
# Persist the composed vectors so a later session can reload them instead of
# re-running the composition loop.
import os
import pickle
# Use a dedicated name: assigning to `path` here would overwrite the dataset
# directory that the composition loop reads, breaking that cell on a re-run.
pickle_path = "./temp/composition_vectors.pkl"
# open() does not create missing directories, so make sure ./temp exists first.
os.makedirs(os.path.dirname(pickle_path), exist_ok=True)
with open(pickle_path, 'wb') as f:
    pickle.dump(composition_vectors, f)

In [22]:
# DESERIALIZE
# Restore the vectors written by the SERIALIZE cell instead of recomputing them.
import pickle
# Use a dedicated name so the dataset directory held in `path` is not overwritten.
pickle_path = "./temp/composition_vectors.pkl"
# NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.
with open(pickle_path, 'rb') as f:
    composition_vectors = pickle.load(f)

In [12]:
def similar_to(table, column=None, threshold=0.2):
    """Return the keys of `composition_vectors` whose vector is close to a reference.

    The reference is the vector stored under `table`, or under
    `table + "." + column` when `column` is given.

    Args:
        table: relation key, e.g. "Fac_building.csv".
        column: optional column name within that relation.
        threshold: exclusive upper bound on the cosine distance for a key to be
            reported. `cosine` is scipy's cosine *distance*, so smaller values
            mean more similar vectors.

    Returns:
        List of matching keys, in the iteration order of `composition_vectors`.

    Raises:
        KeyError: if the reference key is not present in `composition_vectors`.
    """
    key = table if column is None else table + "." + column
    # Loop-invariant: fetch the reference vector once instead of per candidate.
    reference = composition_vectors[key]
    similar_items = []
    for k, we in composition_vectors.items():
        try:
            distance = cosine(reference, we)
        except ValueError:
            # Candidates that cannot be compared with the reference are skipped.
            continue
        # A NaN distance compares False here, so such candidates are dropped too.
        if distance < threshold:
            similar_items.append(k)
    return similar_items

In [13]:
# Keys whose composed vector lies within `threshold` cosine distance of the
# "Building Name Long" column of Fac_building.csv.
table = "Fac_building.csv"
column = "Building Name Long"
threshold = 0.2
# Pass the threshold explicitly: it was assigned but never handed to similar_to,
# so editing it had no effect on the query.
sim_items = similar_to(table, column=column, threshold=threshold)
for el in sim_items:
    print(el)

Fac_building.csv.Building Name Long
Fclt_building.csv.Building Name
Buildings.csv.Building Name
Fclt_building_list.csv.Building Name
Fclt_building.csv.Building Name Long
Fac_building.csv.Building Name
Fclt_building_hist_1.csv.Building Name Long
Fclt_building_list.csv.Building Name Long
Fclt_building_hist_1.csv.Building Name


In [14]:
# Column-level query with the default threshold of similar_to.
table, column = "Library_subject_offered.csv", "Course Number Desc"
sim_items = similar_to(table, column)
for el in sim_items:
    print(el)

short_subject_summary.csv.Department Name
short_tip_subject_offered.csv.Master Course Number Desc
short_cis_course_catalog.csv.Department Name
short_subjects_offered.csv.Master Course Number Desc
Student_department.csv.Department Name
short_subjects_offered.csv.Course Number Desc
short_subjects_offered.csv.Offer Dept Name
subject_grouping_slice.csv.Department Name
short_subject_offered_summary.csv.Offer Dept Name
Library_subject_offered.csv.Offer Dept Name
Library_subject_offered.csv.Course Number Desc
Library_subject_offered.csv.Master Course Number Desc
short_drupal_course_catalog.csv.Department Name
Sis_course_description.csv.Department Name
Mit_student_directory.csv.Department Name
short_tip_subject_offered.csv.Course Number Desc
short_course_catalog_subject_offered.csv.Department Name
short_tip_subject_offered.csv.Offer Dept Name


In [15]:
# Relation-level query: no column is given, so the key looked up is the table name itself.
table = "Warehouse_users.csv"
sim_items = similar_to(table)
for el in sim_items: print(el)

Warehouse_users.csv.Title
Hr_faculty_roster.csv.Middle Name
Employee_directory.csv.Middle Name
Se_person.csv.Middle Name
Drupal_employee_directory.csv.Middle Name
Warehouse_users.csv
Warehouse_users.csv.Middle Name
Mit_student_directory.csv.Middle Name


In [16]:
# Relation-level query with a tighter threshold (0.1 instead of the default 0.2).
table = "Fac_building.csv"
sim_items = similar_to(table, threshold=0.1)
for el in sim_items: print(el)

Fclt_building_list.csv
Fac_building.csv
Fclt_building_hist_1.csv
Fclt_building.csv


### Other examples

In [25]:
# Column-level vectors for the Se_person relation.
# NOTE(review): presumably a mapping of column name -> vector (it is used via
# .keys() and indexing further down) — confirm in composition.column_avg_composition.
path_to_relation = "/data/datasets/mitdwh/Se_person.csv"
col_we_se, missing_words = composition.column_avg_composition(path_to_relation, model)

In [26]:
# Column-level vectors for the Drupal_employee_directory relation.
# Note that `path_to_relation` and `missing_words` are reassigned here, replacing
# the values from the Se_person cell.
path_to_relation = "/data/datasets/mitdwh/Drupal_employee_directory.csv"
col_we_drupal, missing_words = composition.column_avg_composition(path_to_relation, model)

In [27]:
# Build one vector per relation from its column vectors.
# NOTE(review): presumably this aggregates the column vectors into a single
# relation-level vector — confirm in composition.relation_column_composition.
se_vec = composition.relation_column_composition(col_we_se)
drupal_vec = composition.relation_column_composition(col_we_drupal)

In [28]:
# `cosine` comes from scipy.spatial.distance, so this is a cosine *distance*
# (1 - cosine similarity): lower values mean the two relations are more alike.
cosine(se_vec, drupal_vec)

0.17816106687438826

In [30]:
# Compare every unordered pair of Se_person columns.
# Despite the "-sim-" label, the printed number is scipy's cosine *distance*
# (0 means same direction; values above 1 mean the vectors point apart).
for a, b in itertools.combinations(col_we_se.keys(), 2):
    we_a, we_b = col_we_se[a], col_we_se[b]
    cos = cosine(we_a, we_b)
    print("".join([str(a), " -sim- ", str(b), " is: ", str(cos)]))

Is Active -sim- Organization is: 0.71580180098
Is Active -sim- Last Name is: 0.754346633966
Is Active -sim- Employee Type is: 0.593234068665
Is Active -sim- Krb Name is: 0.743188209349
Is Active -sim- Full Name is: 0.773458063759
Is Active -sim- Mit Id is: 0.713277708397
Is Active -sim- Office Location is: 1.04746727659
Is Active -sim- Position Title is: 0.654582383524
Is Active -sim- Payroll Rank is: 0.545001600076
Is Active -sim- Middle Name is: 1.0106378701
Is Active -sim- First Name is: 0.649957258588
Organization -sim- Last Name is: 0.550844356843
Organization -sim- Employee Type is: 0.431105480558
Organization -sim- Krb Name is: 0.49305379079
Organization -sim- Full Name is: 0.608568961034
Organization -sim- Mit Id is: 0.414800770861
Organization -sim- Office Location is: 0.929235802932
Organization -sim- Position Title is: 0.720257754811
Organization -sim- Payroll Rank is: 0.33552013582
Organization -sim- Middle Name is: 0.905511666916
Organization -sim- First Name is: 0.5615552