# Read dataset and plot graph

In [1]:
import numpy as np
import pandas as pd
import json
import networkx as nx
import matplotlib.pyplot as plt
from random import randrange

def read_dataset(path, drop_columns=None, keep_columns=None):
    """Load a CSV file into a DataFrame, optionally selecting columns.

    Args:
        path: Anything ``pandas.read_csv`` accepts (file path or file-like object).
        drop_columns: Column labels to remove. Ignored when ``keep_columns``
            is given.
        keep_columns: Column labels to keep. Takes precedence over
            ``drop_columns``.

    Returns:
        The loaded DataFrame, filtered as requested.
    """
    csv_data = pd.read_csv(path)

    # Identity checks are used on purpose: ``!= None`` is evaluated element-wise
    # for array-like selections (numpy arrays, pandas Index) and then fails
    # inside ``if`` with an ambiguous-truth-value error.
    if keep_columns is not None:
        # keep only these columns
        return csv_data.filter(items=keep_columns)

    if drop_columns is not None:
        # drop these and keep the rest
        return csv_data.drop(drop_columns, axis=1)

    # nothing to drop or filter
    return csv_data

def plot_graph(g, ds_nodes=None, attribute_nodes=None, feat_nodes=None, lit_nodes=None):
    """Draw ``g`` with one colour per node category and show the figure.

    Args:
        g: Graph to draw.
        ds_nodes: Nodes drawn in blue.
        attribute_nodes: Nodes drawn in green.
        feat_nodes: Nodes drawn in grey.
        lit_nodes: Nodes drawn in red.

    Each node list defaults to "no nodes". ``None`` is used as the default
    instead of a shared mutable list.
    """
    pos = nx.spring_layout(g)

    colored_groups = (
        (ds_nodes, "blue"),
        (attribute_nodes, "green"),
        (feat_nodes, "grey"),
        (lit_nodes, "red"),
    )
    for nodes, color in colored_groups:
        # An explicit empty list is required here: nodelist=None would make
        # networkx draw every node of the graph in this colour.
        nodelist = [] if nodes is None else nodes
        nx.draw_networkx_nodes(g, pos, nodelist=nodelist, node_color=color, node_size=900)

    nx.draw_networkx_edges(g, pos, width=3)
    nx.draw_networkx_labels(g, pos, font_size=8)
    plt.show()

## Graph  construction

In [2]:
def code_ds_id(data):
    """Return the graph node id of a dataset: ``data`` prefixed with "DS_"."""
    return "".join(("DS_", data))
def code_attr_id(data, parent):
    """Return the node id of an attribute: its name and the parent id joined by "|"."""
    return "|".join((data, parent))
def code_feat_id(data, parent):
    """Return the node id of a feature: its name and the parent id joined by "|"."""
    return "|".join((data, parent))
def code_literal_id(data, parent):
    """Return the node id of a literal: "literal_" + ``data`` + "|" + ``parent``."""
    return "".join(("literal_", data, "|", parent))

In [3]:
def graph_dataset(datasets, g=None, wem="fasttext", instances=0):
    """Add dataset, feature and literal nodes for the rows of ``datasets``.

    Column 0 holds the dataset id (the OpenML id, per the original notebook
    comment); every other column is treated as a feature. For each processed
    row the graph receives a "dataset" node and, per feature, a
    "feature dataset" node linked to the dataset node and a "literal dataset"
    node linked to the feature node. Every node stores its embedding under
    ``vector`` and its category under ``tipo``.

    Args:
        datasets: DataFrame with one dataset per row.
        g: Graph to extend; a new ``nx.Graph`` is created when ``None``.
        wem: Embedding model name forwarded to ``word_embedding``.
        instances: ``0`` processes every row and every feature. Any other
            value caps both the number of rows and the number of features
            per row.

    Returns:
        The graph that was extended.
    """
    if g is None:
        g = nx.Graph()

    features = datasets.columns[1:]

    if instances == 0:
        number_instances = len(datasets)
        number_features = len(features)
    else:
        # Clamp both caps so a value larger than the table cannot index past
        # its end (the attribute builders already clamp the row cap).
        number_instances = min(instances, len(datasets))
        number_features = min(instances, len(features))

    for r in range(number_instances):
        # Explicit positional access (.iloc): integer keys passed to [] on a
        # label-indexed Series are deprecated in recent pandas.
        record = datasets.iloc[r]
        dataset_id = code_ds_id(str(record.iloc[0]))
        g.add_node(dataset_id, vector=word_embedding("dataset", wem), tipo="dataset")
        row = record.iloc[1:]

        for i in range(number_features):
            feature_dataset_id = code_feat_id(features[i], dataset_id)
            literal_dataset_id = code_literal_id(str(i), dataset_id)
            g.add_node(feature_dataset_id, vector=word_embedding("feature dataset|" + features[i], wem), tipo="feature dataset")
            g.add_node(literal_dataset_id, vector=word_embedding(row.iloc[i], wem), tipo="literal dataset")
            g.add_edge(dataset_id, feature_dataset_id)
            g.add_edge(feature_dataset_id, literal_dataset_id)

    return g


def graph_attribute(datasets, g=None, wem="fasttext", instances=0):
    """Add attribute, feature and literal nodes for the rows of ``datasets``.

    Column 0 holds the dataset id, column 1 the attribute name, and every
    remaining column is treated as a feature of that attribute. For each
    processed row the graph receives an "attribute" node linked to its
    dataset node and, per feature, a "feature attribute" node linked to the
    attribute node and a "literal attribute" node linked to the feature node.

    Args:
        datasets: DataFrame with one attribute per row.
        g: Graph to extend; a new ``nx.Graph`` is created when ``None``.
        wem: Embedding model name forwarded to ``word_embedding``.
        instances: ``0`` processes every row and every feature. Any other
            value caps both the number of rows and the number of features
            per row.

    Returns:
        The graph that was extended.
    """
    if g is None:
        g = nx.Graph()

    features = datasets.columns[2:]

    if instances == 0:
        number_instances = len(datasets)
        number_features = len(features)
    else:
        number_instances = min(instances, len(datasets))
        number_features = min(instances, len(features))

    for r in range(number_instances):
        # Explicit positional access (.iloc): integer keys passed to [] on a
        # label-indexed Series are deprecated in recent pandas.
        record = datasets.iloc[r]
        dataset_id = code_ds_id(str(record.iloc[0]))
        attribute_id = code_attr_id(record.iloc[1], dataset_id)
        row = record.iloc[2:]

        g.add_node(attribute_id, vector=word_embedding("attribute", wem), tipo="attribute")

        # relation between the dataset and one of its attributes
        g.add_edge(dataset_id, attribute_id)

        for i in range(number_features):
            feature_attribute_id = code_feat_id(features[i], attribute_id)
            literal_attribute_id = code_literal_id(str(i), attribute_id)
            g.add_node(feature_attribute_id, vector=word_embedding("feature attribute|" + features[i], wem), tipo="feature attribute")
            g.add_node(literal_attribute_id, vector=word_embedding(row.iloc[i], wem), tipo="literal attribute")
            g.add_edge(attribute_id, feature_attribute_id)
            g.add_edge(feature_attribute_id, literal_attribute_id)

    return g

In [4]:
def graph_dataset_names(datasets, g=None, wem="fasttext", instances=0):
    """Add dataset, feature and literal nodes, embedding the dataset name.

    Column 0 holds the dataset id and column 1 the dataset name; every
    remaining column is treated as a feature. The dataset node is embedded
    from "dataset|<name>". For each feature a "feature dataset" node is
    linked to the dataset node and a "literal dataset" node to the feature
    node.

    Args:
        datasets: DataFrame with one dataset per row.
        g: Graph to extend; a new ``nx.Graph`` is created when ``None``.
        wem: Embedding model name forwarded to ``word_embedding``.
        instances: ``0`` processes every row and every feature. Any other
            value caps both the number of rows and the number of features
            per row.

    Returns:
        The graph that was extended.
    """
    if g is None:
        g = nx.Graph()

    features = datasets.columns[2:]

    if instances == 0:
        number_instances = len(datasets)
        number_features = len(features)
    else:
        # Clamp both caps so a value larger than the table cannot index past
        # its end (the attribute builders already clamp the row cap).
        number_instances = min(instances, len(datasets))
        number_features = min(instances, len(features))

    for r in range(number_instances):
        # Explicit positional access (.iloc): integer keys passed to [] on a
        # label-indexed Series are deprecated in recent pandas.
        record = datasets.iloc[r]
        dataset_id = code_ds_id(str(record.iloc[0]))
        g.add_node(dataset_id, vector=word_embedding("dataset|" + record.iloc[1], wem), tipo="dataset")
        row = record.iloc[2:]

        for i in range(number_features):
            feature_dataset_id = code_feat_id(features[i], dataset_id)
            literal_dataset_id = code_literal_id(str(i), dataset_id)
            g.add_node(feature_dataset_id, vector=word_embedding("feature dataset|" + features[i], wem), tipo="feature dataset")
            g.add_node(literal_dataset_id, vector=word_embedding(row.iloc[i], wem), tipo="literal dataset")
            g.add_edge(dataset_id, feature_dataset_id)
            g.add_edge(feature_dataset_id, literal_dataset_id)

    return g


def graph_attribute_names(datasets, g=None, wem="fasttext", instances=0):
    """Add attribute, feature and literal nodes, embedding the attribute name.

    Column 0 holds the dataset id, column 1 the attribute name, and every
    remaining column is treated as a feature. The attribute node is embedded
    from "attribute|<name>" and linked to its dataset node. For each feature
    a "feature attribute" node is linked to the attribute node and a
    "literal attribute" node to the feature node.

    Args:
        datasets: DataFrame with one attribute per row.
        g: Graph to extend; a new ``nx.Graph`` is created when ``None``.
        wem: Embedding model name forwarded to ``word_embedding``.
        instances: ``0`` processes every row and every feature. Any other
            value caps both the number of rows and the number of features
            per row.

    Returns:
        The graph that was extended.
    """
    if g is None:
        g = nx.Graph()

    features = datasets.columns[2:]

    if instances == 0:
        number_instances = len(datasets)
        number_features = len(features)
    else:
        number_instances = min(instances, len(datasets))
        number_features = min(instances, len(features))

    for r in range(number_instances):
        # Explicit positional access (.iloc): integer keys passed to [] on a
        # label-indexed Series are deprecated in recent pandas.
        record = datasets.iloc[r]
        dataset_id = code_ds_id(str(record.iloc[0]))
        attribute_name = record.iloc[1]
        attribute_id = code_attr_id(attribute_name, dataset_id)
        row = record.iloc[2:]

        g.add_node(attribute_id, vector=word_embedding("attribute|" + attribute_name, wem), tipo="attribute")

        # relation between the dataset and one of its attributes
        g.add_edge(dataset_id, attribute_id)

        for i in range(number_features):
            feature_attribute_id = code_feat_id(features[i], attribute_id)
            literal_attribute_id = code_literal_id(str(i), attribute_id)
            g.add_node(feature_attribute_id, vector=word_embedding("feature attribute|" + features[i], wem), tipo="feature attribute")
            g.add_node(literal_attribute_id, vector=word_embedding(row.iloc[i], wem), tipo="literal attribute")
            g.add_edge(attribute_id, feature_attribute_id)
            g.add_edge(feature_attribute_id, literal_attribute_id)

    return g

In [5]:
def graph_dataset_short(datasets, g=None, wem="fasttext", instances=0):
    """Add dataset and feature nodes, folding each value into its feature node.

    Column 0 holds the dataset id and column 1 the dataset name; every
    remaining column is treated as a feature. Unlike ``graph_dataset_names``
    no literal nodes are created: each "feature dataset" node is embedded
    from "<feature name>|<value>" and linked directly to the dataset node.

    Args:
        datasets: DataFrame with one dataset per row.
        g: Graph to extend; a new ``nx.Graph`` is created when ``None``.
        wem: Embedding model name forwarded to ``word_embedding``.
        instances: ``0`` processes every row and every feature. Any other
            value caps both the number of rows and the number of features
            per row.

    Returns:
        The graph that was extended.
    """
    if g is None:
        g = nx.Graph()

    features = datasets.columns[2:]

    if instances == 0:
        number_instances = len(datasets)
        number_features = len(features)
    else:
        # Clamp both caps so a value larger than the table cannot index past
        # its end (the attribute builders already clamp the row cap).
        number_instances = min(instances, len(datasets))
        number_features = min(instances, len(features))

    for r in range(number_instances):
        # Explicit positional access (.iloc): integer keys passed to [] on a
        # label-indexed Series are deprecated in recent pandas.
        record = datasets.iloc[r]
        dataset_id = code_ds_id(str(record.iloc[0]))
        g.add_node(dataset_id, vector=word_embedding("dataset|" + record.iloc[1], wem), tipo="dataset")
        row = record.iloc[2:]

        for i in range(number_features):
            feature_dataset_id = code_feat_id(features[i], dataset_id)
            g.add_node(feature_dataset_id, vector=word_embedding(features[i] + "|" + str(row.iloc[i]), wem), tipo="feature dataset")
            g.add_edge(dataset_id, feature_dataset_id)

    return g


def graph_attribute_short(datasets, g=None, wem="fasttext", instances=0):
    """Add attribute and feature nodes, folding each value into its feature node.

    Column 0 holds the dataset id, column 1 the attribute name, and every
    remaining column is treated as a feature. The attribute node is embedded
    from "attribute|<name>" and linked to its dataset node. No literal nodes
    are created: each "feature attribute" node is embedded from
    "<feature name>|<value>" and linked directly to the attribute node.

    Args:
        datasets: DataFrame with one attribute per row.
        g: Graph to extend; a new ``nx.Graph`` is created when ``None``.
        wem: Embedding model name forwarded to ``word_embedding``.
        instances: ``0`` processes every row and every feature. Any other
            value caps both the number of rows and the number of features
            per row.

    Returns:
        The graph that was extended.
    """
    if g is None:
        g = nx.Graph()

    features = datasets.columns[2:]

    if instances == 0:
        number_instances = len(datasets)
        number_features = len(features)
    else:
        number_instances = min(instances, len(datasets))
        number_features = min(instances, len(features))

    for r in range(number_instances):
        # Explicit positional access (.iloc): integer keys passed to [] on a
        # label-indexed Series are deprecated in recent pandas.
        record = datasets.iloc[r]
        dataset_id = code_ds_id(str(record.iloc[0]))
        attribute_name = record.iloc[1]
        attribute_id = code_attr_id(attribute_name, dataset_id)
        row = record.iloc[2:]

        g.add_node(attribute_id, vector=word_embedding("attribute|" + attribute_name, wem), tipo="attribute")

        # relation between the dataset and one of its attributes
        g.add_edge(dataset_id, attribute_id)

        for i in range(number_features):
            feature_attribute_id = code_feat_id(features[i], attribute_id)
            g.add_node(feature_attribute_id, vector=word_embedding(features[i] + "|" + str(row.iloc[i]), wem), tipo="feature attribute")
            g.add_edge(attribute_id, feature_attribute_id)

    return g

## Auxiliary functions

### Check if input is number

In [6]:
def is_number(s):
    """Return True when ``s`` can be parsed as a finite float.

    Every spelling of NaN and of positive or negative infinity is rejected,
    as are values that cannot be converted to ``float`` at all.

    Args:
        s: Value to test, typically a string.

    Returns:
        True for finite numbers, False otherwise.
    """
    import math

    try:
        parsed = float(s)
    except (TypeError, ValueError, OverflowError):
        return False
    # NaN never compares equal to anything (itself included), so an equality
    # test cannot detect it; isfinite covers NaN and both infinities.
    return math.isfinite(parsed)
    
def fill_ones(bin_rep, value=1, neg=1):
    """Spread the first "1" of a bit sequence over all later positions.

    Positions before the first "1" become "0"; the first "1" and everything
    after it become ``str(neg)``.

    Args:
        bin_rep: Iterable of characters, normally a string of "0"/"1".
        value: Unused; kept so existing callers keep working.
        neg: Marker written (as a string) from the first "1" onwards.

    Returns:
        A list of strings with one entry per input position.
    """
    marker = str(neg)
    seen_one = False
    filled = []
    for bit in bin_rep:
        seen_one = seen_one or bit == "1"
        filled.append(marker if seen_one else "0")
    return filled

### From numbers to binary tensor vectors

In [9]:
from decimal import Decimal
import bitstring
import torch

def num2vec_old3(num):
    """Encode a number as a 64-element tensor (legacy variant 3).

    The number is formatted in scientific notation with five fractional
    digits. The exponent magnitude plus one (capped at 2047) is written into
    one of four 11-bit slots, chosen by the sign of the mantissa and the sign
    of the exponent. A 20-bit slot holds the mantissa digits. Every slot is
    passed through ``fill_ones``, and the mantissa slot uses -1 as its marker
    for negative numbers.

    Args:
        num: Finite number to encode.

    Returns:
        A float64 tensor with 4 * 11 + 20 = 64 elements.
    """
    rep_sc = '{:.5E}'.format(num)
    pieces = rep_sc.split("E")
    exp_part = int(pieces[1])
    exp_magnitude = min(abs(exp_part) + 1, 2047)
    if exp_part < 0:
        exp_pos, exp_neg = 0, exp_magnitude
    else:
        exp_pos, exp_neg = exp_magnitude, 0

    mantissa_pieces = pieces[0].split(".")
    dec_part_frac = mantissa_pieces[1][0:min(5, exp_pos + exp_neg)]
    dec_part = int(mantissa_pieces[0] + dec_part_frac)

    if dec_part < 0:
        factor = -1
        pos_exp_pos, pos_exp_neg = 0, 0
        neg_exp_pos, neg_exp_neg = exp_pos, exp_neg
    else:
        factor = 1
        pos_exp_pos, pos_exp_neg = exp_pos, exp_neg
        neg_exp_pos, neg_exp_neg = 0, 0

    bin_pos_exp_pos = fill_ones(bitstring.Bits(uint=pos_exp_pos, length=11).bin)
    bin_pos_exp_neg = fill_ones(bitstring.Bits(uint=pos_exp_neg, length=11).bin)
    bin_neg_exp_pos = fill_ones(bitstring.Bits(uint=neg_exp_pos, length=11).bin)
    bin_neg_exp_neg = fill_ones(bitstring.Bits(uint=neg_exp_neg, length=11).bin)
    bin_dec = fill_ones(bitstring.Bits(uint=abs(dec_part), length=20).bin, neg=factor)

    rep_bin = bin_pos_exp_pos + bin_pos_exp_neg + bin_neg_exp_pos + bin_neg_exp_neg + bin_dec
    # This line was commented out while the function still returned
    # bin_tensor, so every call failed with NameError.
    bin_tensor = torch.tensor(np.array([float(x) for x in rep_bin]))
    return bin_tensor


def num2vec_old2(num):
    """Encode a number as a 64-element tensor (legacy variant 2).

    Layout: an 11-bit slot for a non-negative exponent, an 11-bit slot for a
    negative exponent (each holding the exponent magnitude plus one, capped
    at 1023) and a 42-bit slot for the mantissa digits. All slots go through
    ``fill_ones``; the whole vector is multiplied by -1 for negative numbers.
    """
    sci_text = str('{:.11E}'.format(num))
    sci_pieces = sci_text.split("E")
    exponent = int(sci_pieces[1])
    magnitude = min(abs(exponent) + 1, 1023)
    if exponent < 0:
        exp_pos, exp_neg = 0, magnitude
    else:
        exp_pos, exp_neg = magnitude, 0

    mantissa_pieces = sci_pieces[0].split(".")
    kept_fraction = mantissa_pieces[1][0:min(11, exp_pos + exp_neg)]
    mantissa = int(mantissa_pieces[0] + kept_fraction)
    sign = -1 if mantissa < 0 else 1

    bits = fill_ones(bitstring.Bits(uint=exp_pos, length=11).bin)
    bits = bits + fill_ones(bitstring.Bits(uint=exp_neg, length=11).bin)
    bits = bits + fill_ones(bitstring.Bits(uint=abs(mantissa), length=42).bin)

    return torch.tensor(np.array([float(bit) * sign for bit in bits]))

def num2vec_old(num):
    """Encode a number as a 64-element tensor (oldest legacy variant).

    Layout: four 10-bit exponent slots (positive number / non-negative
    exponent, positive number / negative exponent, negative number /
    non-negative exponent, negative number / negative exponent) followed by
    two 12-bit slots built from the exponent magnitude, whose marker is
    ``exp_pos - exp_neg``. All slots go through ``fill_ones``.
    """
    sci_text = str('{:.3E}'.format(num))
    sci_pieces = sci_text.split("E")
    exponent = int(sci_pieces[1])
    magnitude = min(abs(exponent) + 1, 1023)
    if exponent < 0:
        exp_pos, exp_neg = 0, magnitude
    else:
        exp_pos, exp_neg = magnitude, 0

    mantissa_pieces = sci_pieces[0].split(".")
    mantissa = int(mantissa_pieces[0] + mantissa_pieces[1])

    if mantissa < 0:
        dec_pos, dec_neg = 0, abs(mantissa)
        exponent_slots = (0, 0, exp_pos, exp_neg)
    else:
        dec_pos, dec_neg = abs(mantissa), 0
        exponent_slots = (exp_pos, exp_neg, 0, 0)

    encoded = []
    for slot in exponent_slots:
        encoded = encoded + fill_ones(bitstring.Bits(uint=slot, length=10).bin, slot * 1000000)

    # Both mantissa slots are derived from the exponent magnitude; the mantissa
    # itself is only handed to fill_ones as its (unused) ``value`` argument.
    for mantissa_value in (dec_pos, dec_neg):
        magnitude_bits = bitstring.Bits(uint=(exp_pos + exp_neg), length=12).bin
        encoded = encoded + fill_ones(magnitude_bits, mantissa_value, exp_pos - exp_neg)

    return torch.tensor(np.array([float(item) for item in encoded]))

In [593]:
def num2vec_just_old(num):
    """Encode a number as a 64-element tensor of scaled decimal digits.

    The number is formatted in scientific notation (eight fractional digits)
    and printed. A 16-character string is built from the non-negative
    exponent (3 digits), the negative exponent magnitude (3 digits) and the
    mantissa digits (10 digits). Each character yields four vector entries.
    Inside each exponent group the entries are cumulative prefixes ("d1",
    "d1d2", "d1d2d3"); mantissa digits stand alone. Entries are divided by a
    position-dependent weight and multiplied by the sign of the number.
    """
    sci_text = str('{:.8E}'.format(num))
    print(sci_text)
    exponent = int(sci_text.split("E")[1])
    exp_pos = exponent if exponent >= 0 else 0
    exp_neg = -exponent if exponent < 0 else 0

    mantissa_text = sci_text.split("E")[0]
    whole_digits = mantissa_text.split(".")[0]
    kept_fraction = mantissa_text.split(".")[1][0:min(11, exp_pos + exp_neg)]
    mantissa = int(whole_digits + kept_fraction)
    sign = -1.0 if mantissa < 0 else 1.0

    digits = str("{:03}{:03}{:010}".format(exp_pos, exp_neg, abs(mantissa)))

    # Positions 0-5 (the two exponent groups) use the prefix of their own
    # 3-digit group; positions 6-15 (mantissa) use the single digit.
    tokens = []
    for position in range(16):
        if position < 6:
            group_start = 3 * (position // 3)
            tokens.append(digits[group_start:position + 1])
        else:
            tokens.append(digits[position])

    # One weight per output entry: 4 + 4 + 4 + 12 entries for the exponent
    # part, then weight 1 for the 40 mantissa entries.
    weights = [1/10000] * 4 + [1/1000] * 4 + [1/100] * 4 + [1/10] * 12 + [1] * 40

    scaled = [float(tokens[index // 4]) / weights[index] * sign for index in range(64)]
    return torch.tensor(np.array(scaled))


In [40]:
#clean
def num2vec(num):
    """Encode a number as a 64-element tensor of 0.0/1.0 values.

    The number is formatted in scientific notation with eleven fractional
    digits. Its non-negative exponent (3 digits), negative exponent magnitude
    (3 digits) and mantissa digits (12 digits) are concatenated into one
    decimal integer, which is negated for negative numbers and rendered as a
    64-bit signed bit string. Both the scientific text and the digit string
    are printed.
    """
    sci_text = str('{:.11E}'.format(num))
    print(sci_text)
    sci_pieces = sci_text.split("E")

    mantissa = int(sci_pieces[0].replace(".", ""))
    sign = -1 if mantissa < 0 else 1

    exponent = int(sci_pieces[1])
    exp_pos = exponent if exponent >= 0 else 0
    exp_neg = -exponent if exponent < 0 else 0

    digits = str("{:03}{:03}{:012}".format(exp_pos, exp_neg, abs(mantissa)))
    print(digits)

    packed = int(digits) * sign
    bit_text = bitstring.Bits(int=packed, length=64).bin

    return torch.tensor(np.array([float(bit) for bit in bit_text]))

In [516]:
##clean_5
def num2vec_clean5(num):
    """Encode a number as a 64-element tensor of three constant runs.

    The number is formatted in scientific notation with five fractional
    digits. The vector holds 22 entries of the non-negative exponent / 22,
    22 entries of the negative exponent magnitude / 22 and 20 entries of the
    mantissa magnitude / 200, all multiplied by the sign of the mantissa.
    """
    sci_pieces = str('{:.5E}'.format(num)).split("E")
    exponent = int(sci_pieces[1])
    exp_pos = exponent if exponent >= 0 else 0
    exp_neg = -exponent if exponent < 0 else 0

    mantissa = float(sci_pieces[0])
    sign = -1.0 if mantissa < 0 else 1.0
    magnitude = abs(mantissa)

    layout = [exp_pos / 22] * 22 + [exp_neg / 22] * 22 + [magnitude / 200] * 20
    return torch.tensor(np.array([float(entry) * sign for entry in layout]))

In [517]:
##clean6
def num2vec_clean6(num):
    """Encode a number as a 64-element tensor with a sign-dependent half.

    The number is formatted in scientific notation with two fractional
    digits; the scientific text and the extracted mantissa are printed. Two
    values are built by concatenating a 3-digit exponent field with the
    3-digit mantissa magnitude: one uses the non-negative exponent, the other
    the negative exponent magnitude. Each is divided by 1000 and repeated 16
    times. Non-negative numbers occupy the first 32 entries, negative numbers
    the last 32; the other half is zero.

    Args:
        num: Finite number to encode.

    Returns:
        A float64 tensor with 64 elements.
    """
    rep_sc = '{:.2E}'.format(num)
    print(rep_sc)
    pieces = rep_sc.split("E")
    exp_part = int(pieces[1])
    exp_pos = exp_part if exp_part >= 0 else 0
    exp_neg = -exp_part if exp_part < 0 else 0

    mantissa_pieces = pieces[0].split(".")
    dec_part_frac = mantissa_pieces[1][0:min(2, exp_pos + exp_neg)]
    dec_part = int(mantissa_pieces[0] + dec_part_frac)
    print(dec_part)
    is_negative = dec_part < 0
    dec_part = abs(dec_part)

    # Both exponent fields are non-negative by construction, so the former
    # "if exp_pos >= 0 ... else 0" guards could never take their else branch.
    rep_exp_pos = int("{:03}{:03}".format(exp_pos, dec_part))
    rep_exp_neg = int("{:03}{:03}".format(exp_neg, dec_part))

    payload = [rep_exp_pos / 1000] * 16 + [rep_exp_neg / 1000] * 16
    padding = [0.0] * 32
    float_rep = padding + payload if is_negative else payload + padding

    return torch.tensor(np.array(float_rep))

In [61]:
import bitstring
import torch
import numpy as np
# Experiment: encode two negative sample values of very different magnitude
# with num2vec and compare the resulting vectors.
nums = [-0.00000001,-2.7766554*10**300]
tensors = []
for n in nums:
    tensors.append(num2vec(n))
#     print(tensors[-1])
    vec = tensors[-1].tolist()
    # Render the encoded vector as one compact string of digits for inspection.
    output = ""
    for v in vec:
        output = output + str(int(v))
    print(output)
# Cosine similarity between the two encodings (computed along dim 0).
cos = torch.nn.CosineSimilarity(dim=0, eps=1e-6)
result = cos(tensors[0],tensors[1])
print(result.float())
# p=2 pairwise distance; each tensor is wrapped with torch.stack so that it
# gains a leading dimension of size one before being compared.
pdist = torch.nn.PairwiseDistance(p=2)
norm_euclidean = pdist(torch.stack([tensors[0]]),torch.stack([tensors[1]])) 
print(norm_euclidean[0].float())

-1.00000000000E-08
000008100000000000
1111111111111111111110001010001000010010011000001001100000000000
-2.77665540000E+300
300000277665540000
1111101111010110001011110101011001000001001110100001100001100000
tensor(0.5806)
tensor(5.0990)


In [46]:
# 64-bit float bit patterns (as produced by bitstring) of the two values in
# `nums`, which must have been defined by an earlier cell.
# NOTE(review): presumably printed for comparison with the num2vec encoding - confirm.
b1 = bitstring.Bits(float=nums[0], length=64).bin
b2 = bitstring.Bits(float=nums[1], length=64).bin
print(b1)
print(b2)

0011111111100001100110011001100110011001100110011001100110011010
0100000011101010110110110000000000000000000000000000000000000000


## Fasttext

In [7]:
import numpy as np
import torch
import fasttext
#fasttext.util.download_model('en', if_exists='ignore')  # English
# Load the fastText model from disk; the module-level `ft` is used by the
# fastText embedding helpers defined below.
ft = fasttext.load_model('./resources/fasttext.bin')
# Show the dimensionality of the vectors produced by the loaded model.
print(ft.get_dimension())




300


In [8]:
def fasttex_simple(value):
    """Embed ``value`` with fastText, averaging over its "|"-separated parts.

    Args:
        value: Text or number; it is converted to ``str`` and split on "|".

    Returns:
        A tensor with ``ft.get_dimension()`` elements: the mean of the
        fastText sentence vectors of the parts.
    """
    # Unconditional conversion: any non-string input (not only the ones that
    # look like numbers) must be a string before it can be split.
    values = str(value).split("|")
    out_tensor = torch.zeros(ft.get_dimension())
    for v in values:
        # Embed the individual part. The previous code embedded the whole
        # ``value`` on every iteration, unlike the BERT helpers.
        out_tensor = out_tensor + torch.tensor(ft.get_sentence_vector(v))
    out_tensor = out_tensor / len(values)
    return out_tensor
    
def fasttex_(value):
    """Embed ``value`` as a fastText text vector plus a 64-element numeric vector.

    ``value`` is converted to ``str`` and split on "|". Numeric parts are
    encoded with ``num2vec`` into the last 64 positions; all other parts are
    embedded with fastText into the leading positions. The result is the mean
    over all parts.

    Args:
        value: Text or number, optionally several parts joined by "|".

    Returns:
        A tensor with ``ft.get_dimension() + 64`` elements.
    """
    values = str(value).split("|")
    text_dim = ft.get_dimension()
    out_tensor = torch.zeros(text_dim + 64)
    for v in values:
        if is_number(v):
            bin_tensor = num2vec(float(v))
            out_tensor = out_tensor + torch.cat((torch.zeros(text_dim), bin_tensor.float()))
        else:
            # Embed the individual part. The previous code embedded the whole
            # ``value`` here, unlike the BERT helpers.
            str_tensor = torch.tensor(ft.get_sentence_vector(v))
            out_tensor = out_tensor + torch.cat((str_tensor.float(), torch.zeros(64)))
    out_tensor = out_tensor / len(values)
    return out_tensor

## Bert


In [None]:
#pip install torch==1.4.0+cpu torchvision==0.5.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
#pip install transformers
from transformers import BertModel, BertTokenizer
import torch

# Load the pre-trained BERT tokenizer and model into memory; the module-level
# `tokenizer` and `model` are used by the BERT embedding helpers below.
tokenizer = BertTokenizer.from_pretrained('google/bert_uncased_L-12_H-768_A-12')
model = BertModel.from_pretrained('google/bert_uncased_L-12_H-768_A-12')
# Alternative checkpoint, kept for reference:
#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#model = BertModel.from_pretrained('bert-base-uncased')
# Switch to evaluation mode. This is IMPORTANT to have reproducible results
# during evaluation!
model.eval()

def bert_simple(value):
    """Embed ``value`` with BERT, averaging over its "|"-separated parts.

    Each part is tokenized (special tokens added, ``max_length=512``), run
    through the model, and reduced to one vector by averaging the last hidden
    states over the sequence.

    Args:
        value: Text or number; it is converted to ``str`` and split on "|".

    Returns:
        A 768-element tensor: the mean of the per-part vectors.
    """
    # Unconditional conversion: any non-string input (not only the ones that
    # look like numbers) must be a string before it can be split.
    values = str(value).split("|")
    out_tensor = torch.zeros(768)
    for v in values:
        tokenized = tokenizer.encode(v, add_special_tokens=True, max_length=512)
        input_ids = torch.tensor(tokenized).unsqueeze(0)  # batch size 1
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            outputs = model(input_ids)

        # The last hidden state is the first element of the output tuple;
        # shape: (batch size, sequence length, hidden dimension).
        last_hidden_states = outputs[0]

        # Average over the sequence to get one vector for the whole part.
        out_tensor = out_tensor + torch.mean(last_hidden_states[0], dim=0).detach()

    out_tensor = out_tensor / len(values)
    return out_tensor

def bert(value):
    """Embed ``value`` as a 768-element BERT vector plus a 64-element numeric vector.

    ``value`` is converted to ``str`` and split on "|". Numeric parts are
    encoded with ``num2vec`` into the last 64 positions; all other parts are
    embedded with BERT (mean of the last hidden states) into the first 768
    positions. The result is the mean over all parts.

    Args:
        value: Text or number, optionally several parts joined by "|".

    Returns:
        An 832-element tensor.
    """
    values = str(value).split("|")
    out_tensor = torch.zeros(832)
    for v in values:
        if is_number(v):
            bin_tensor = num2vec(float(v))
            out_tensor = out_tensor + torch.cat((torch.zeros(768), bin_tensor.float()))
        else:
            tokenized = tokenizer.encode(v, add_special_tokens=True, max_length=512)
            input_ids = torch.tensor(tokenized).unsqueeze(0)  # batch size 1
            # Inference only: skip building the autograd graph.
            with torch.no_grad():
                outputs = model(input_ids)
            # The last hidden state is the first element of the output tuple;
            # shape: (batch size, sequence length, hidden dimension).
            last_hidden_states = outputs[0]
            # Average over the sequence to get one vector for the whole part.
            str_tensor = torch.mean(last_hidden_states[0], dim=0).detach()
            out_tensor = out_tensor + torch.cat((str_tensor.float(), torch.zeros(64)))
    out_tensor = out_tensor / len(values)
    return out_tensor

## Choose word embedding

In [9]:
def word_embedding(data, model):
    """Embed ``data`` with the embedding function selected by ``model``.

    Args:
        data: Value to embed; passed unchanged to the selected function.
        model: One of "fasttext", "bert", "fasttext_simple", "bert_simple".

    Returns:
        The tensor produced by the selected embedding function.

    Raises:
        ValueError: If ``model`` is not one of the supported names.
    """
    # An if-chain (rather than a lookup table) keeps each embedding function
    # resolved only when it is requested, so a backend whose cell was not run
    # does not break the others.
    if model == "fasttext":
        return fasttex_(data)
    if model == "bert":
        return bert(data)
    if model == "fasttext_simple":
        return fasttex_simple(data)
    if model == "bert_simple":
        return bert_simple(data)
    # Previously an unknown name fell through and returned None, which ended
    # up stored as the node's vector.
    raise ValueError("unknown word embedding model: " + repr(model))

# Execute

In [10]:
#build graph
import pickle

word_emb = "fasttext"
df_dataset = read_dataset("./resources/monitor_clean/ds2.csv")
g = nx.Graph()
g = graph_dataset_short(df_dataset, g, word_emb)
df_attributes = read_dataset("./resources/monitor_clean/attr_cat2.csv")
g = graph_attribute_short(df_attributes, g, word_emb)
df_attributes_numeric = read_dataset("./resources/monitor_clean/attr_num2.csv")
g = graph_attribute_short(df_attributes_numeric, g, word_emb)
#write graph to file
# The file name is built once so that the printed name matches the file that
# is written (the printed name used to lack the underscore).
output_name = "clean_monitor_" + word_emb + ".gpickle"
# nx.write_gpickle was removed in NetworkX 3.0; pickling the graph directly
# works with old and new NetworkX versions.
with open("../word_embeddings/" + output_name, "wb") as graph_file:
    pickle.dump(g, graph_file, pickle.HIGHEST_PROTOCOL)
print(output_name)
##clean with older notation
##clean 2 with higher numbers that didnt work
##clean 3 with exp+,exp-,fraction
##clean 4 with +exp+,+exp-,-exp+,-exp-,fraction with sign




clean_fasttext_simple_short.gpickle


In [None]:
import pickle

# nx.write_gpickle was removed in NetworkX 3.0; pickling the graph directly
# works with old and new NetworkX versions.
with open("../word_embeddings/new_" + word_emb + "_short.gpickle", "wb") as graph_file:
    pickle.dump(g, graph_file, pickle.HIGHEST_PROTOCOL)
print("new_" + word_emb + "_short.gpickle")


In [None]:
#build graph
import pickle

word_emb = "fasttext"
df_dataset = read_dataset("./resources/openml_203ds_datasets_index.csv", drop_columns=["Num", "dataset_topic"])
g = nx.Graph()
g = graph_dataset_short(df_dataset, g, word_emb)
df_attributes = read_dataset("./resources/openml_203ds_attributes_nominal_index.csv", drop_columns=["dataset_name", "type_converted"])
g = graph_attribute_short(df_attributes, g, word_emb)
df_attributes_numeric = read_dataset("./resources/openml_203ds_attributes_numeric_index.csv", drop_columns=["dataset_name", "type_converted"])
g = graph_attribute_short(df_attributes_numeric, g, word_emb)
#write graph to file
# nx.write_gpickle was removed in NetworkX 3.0; pickling the graph directly
# works with old and new NetworkX versions.
with open("../word_embeddings/encoded_" + word_emb + "2_short.gpickle", "wb") as graph_file:
    pickle.dump(g, graph_file, pickle.HIGHEST_PROTOCOL)
print("encoded_" + word_emb + "2_short")

In [None]:
#build graph
import pickle

word_emb = "fasttext"
df_dataset = read_dataset("./resources/openml_203ds_datasets_index.csv", drop_columns=["Num", "dataset_topic"])
g = nx.Graph()
g = graph_dataset_short(df_dataset, g, word_emb)
df_attributes = read_dataset("./resources/openml_203ds_attributes_nominal_index.csv", drop_columns=["dataset_name", "type_converted"])
g = graph_attribute_short(df_attributes, g, word_emb)
df_attributes_numeric = read_dataset("./resources/openml_203ds_attributes_numeric_index.csv", drop_columns=["dataset_name", "type_converted"])
g = graph_attribute_short(df_attributes_numeric, g, word_emb)
#write graph to file
# nx.write_gpickle was removed in NetworkX 3.0; pickling the graph directly
# works with old and new NetworkX versions.
with open("../word_embeddings/encoded_" + word_emb + "2_short.gpickle", "wb") as graph_file:
    pickle.dump(g, graph_file, pickle.HIGHEST_PROTOCOL)
print("encoded_" + word_emb + "2_short")

#build graph
word_emb = "fasttext"
df_dataset = read_dataset("./resources/openml_203ds_datasets_index.csv", drop_columns=["Num", "dataset_topic"])
g = nx.Graph()
g = graph_dataset(df_dataset, g, word_emb)
df_attributes = read_dataset("./resources/openml_203ds_attributes_nominal_index.csv", drop_columns=["dataset_name", "type_converted"])
g = graph_attribute(df_attributes, g, word_emb)
df_attributes_numeric = read_dataset("./resources/openml_203ds_attributes_numeric_index.csv", drop_columns=["dataset_name", "type_converted"])
g = graph_attribute(df_attributes_numeric, g, word_emb)
#write graph to file
with open("../word_embeddings/encoded_" + word_emb + "2.gpickle", "wb") as graph_file:
    pickle.dump(g, graph_file, pickle.HIGHEST_PROTOCOL)
# The printed name now matches the file written above (it used to say "2_names").
print("encoded_" + word_emb + "2")

In [None]:


#build graph
import pickle

word_emb = "fasttext"
df_dataset = read_dataset("./resources/openml_203ds_datasets_index.csv", drop_columns=["Num", "dataset_topic"])
g = nx.Graph()
g = graph_dataset_names(df_dataset, g, word_emb)
df_attributes = read_dataset("./resources/openml_203ds_attributes_nominal_index.csv", drop_columns=["dataset_name", "type_converted"])
g = graph_attribute_names(df_attributes, g, word_emb)
df_attributes_numeric = read_dataset("./resources/openml_203ds_attributes_numeric_index.csv", drop_columns=["dataset_name", "type_converted"])
g = graph_attribute_names(df_attributes_numeric, g, word_emb)
#write graph to file
# nx.write_gpickle was removed in NetworkX 3.0; pickling the graph directly
# works with old and new NetworkX versions.
with open("../word_embeddings/encoded_" + word_emb + "2_names.gpickle", "wb") as graph_file:
    pickle.dump(g, graph_file, pickle.HIGHEST_PROTOCOL)
print("encoded_" + word_emb + "2_names")

## Read previously created graph

In [None]:
#read
import pickle

# nx.read_gpickle was removed in NetworkX 3.0; the .gpickle files are plain
# pickles of the graph object.
# NOTE: unpickling executes code from the file - only load graphs you created.
with open("../word_embeddings/encoded_bert_v2.gpickle", "rb") as graph_file:
    g = pickle.load(graph_file)

## Deprecated

In [None]:
# Group node ids by their "tipo" attribute in a single pass over the graph.
# .get() is used because a node created implicitly by add_edge carries no
# attributes, and indexing y['tipo'] would raise KeyError for it.
nodes_by_type = {}
for node_id, node_attrs in g.nodes(data=True):
    nodes_by_type.setdefault(node_attrs.get("tipo"), []).append(node_id)

datasets = nodes_by_type.get("dataset", [])
datasets_features = nodes_by_type.get("feature dataset", [])
datasets_literals = nodes_by_type.get("literal dataset", [])
attributes = nodes_by_type.get("attribute", [])
attributes_features = nodes_by_type.get("feature attribute", [])
attributes_literals = nodes_by_type.get("literal attribute", [])

In [None]:
# Print the number of nodes in each category, one count per line.
for node_group in (
    datasets,
    datasets_features,
    datasets_literals,
    attributes,
    attributes_features,
    attributes_literals,
):
    print(len(node_group))

In [None]:
#plot
# Group node ids by their "tipo" attribute in a single pass over the graph.
# .get() is used because a node created implicitly by add_edge carries no
# attributes, and indexing y['tipo'] would raise KeyError for it.
nodes_by_type = {}
for node_id, node_attrs in g.nodes(data=True):
    nodes_by_type.setdefault(node_attrs.get("tipo"), []).append(node_id)

datasets = nodes_by_type.get("dataset", [])
datasets_features = nodes_by_type.get("feature dataset", [])
datasets_literals = nodes_by_type.get("literal dataset", [])
attributes = nodes_by_type.get("attribute", [])
attributes_features = nodes_by_type.get("feature attribute", [])
attributes_literals = nodes_by_type.get("literal attribute", [])
# plot_graph(g,datasets,attributes,datasets_features + attributes_features,datasets_literals + attributes_literals);

## Get nodes and print data from nodes

In [None]:
# Collect the attribute dictionaries of all dataset nodes and print the
# embedding of the first one. .get() is used because a node created
# implicitly by add_edge carries no "tipo" attribute.
sample = [y for x, y in g.nodes(data=True) if y.get("tipo") == "dataset"]
print(sample[0]["vector"])