# Read dataset and plot graph

In [1]:
import numpy as np
import pandas as pd
import json
import networkx as nx
import matplotlib.pyplot as plt
from random import randrange

def read_dataset(path,drop_columns=None,keep_columns=None):
    #get rid of useless columns
    csv_data = pd.read_csv(path,sep="~")
    
    if keep_columns != None:
        #keep only these columns
        return csv_data.filter(items=keep_columns)
    
    if drop_columns!= None:
        #drop these and keep the rest
        return csv_data.drop(drop_columns, axis=1)
    
    #finally, didn't drop or filter any column
    return csv_data     

def plot_graph(g,ds_nodes=[],attribute_nodes=[],feat_nodes=[],lit_nodes=[]):
    pos=nx.spring_layout(g)    
    nx.draw_networkx_nodes(g,pos,nodelist=ds_nodes,node_color="blue",node_size=900)
    nx.draw_networkx_nodes(g,pos,nodelist=attribute_nodes,node_color="green",node_size=900)
    nx.draw_networkx_nodes(g,pos,nodelist=feat_nodes,node_color="grey",node_size=900)
    nx.draw_networkx_nodes(g,pos,nodelist=lit_nodes,node_color="red",node_size=900)

    nx.draw_networkx_edges(g,pos,width=3)
    nx.draw_networkx_labels(g,pos,font_size=8)
    plt.show() 

## Graph  construction

In [2]:
def code_ds_id(data):
    return "DS_"+data
def code_attr_id(data,parent):
    return data+"|"+parent
def code_feat_id(data,parent):
    return data+"|"+parent
def code_literal_id(data,parent):
    return "literal_"+data+"|"+parent

In [3]:
def graph_dataset_short(datasets,g=None,wem="fasttext",instances=0):
    if g == None:
        g = nx.Graph()
    
    #create nodes and edges at datasetLevel
    features = datasets.columns[2:]
    
    if instances==0:
        number_instances = len(datasets)
    else:
        number_instances = instances
    
    for r in range(number_instances): 
        #node id is the openML id which is in the first column
#         dataset_id = code_ds_id(str(datasets.iloc[r][0]))
        dataset_id = datasets.iloc[r][0]
        g.add_node(dataset_id,vector=word_embedding("dataset|"+datasets.iloc[r][1] ,wem),tipo="attribute")
        row = datasets.iloc[r][2:]
        
        if instances == 0:
            number_features = len(features)
        else:
            number_features = min(instances,len(features))
            
        for i in range (number_features):
            feature_dataset_id = code_feat_id(features[i],dataset_id)
#             literal_dataset_id = code_literal_id(str(i),dataset_id)
            g.add_node(feature_dataset_id,vector=word_embedding(features[i]+"|"+str(row[i]) ,wem),tipo="feature dataset")
#             g.add_node(literal_dataset_id,vector=word_embedding(row[i],wem),tipo="literal dataset")
            g.add_edge(dataset_id,feature_dataset_id)
#             g.add_edge(feature_dataset_id,literal_dataset_id)
            
    return g


def graph_attribute_short(datasets,g=None,wem="fasttext",instances=0):
    if g == None:
        g = nx.Graph()
        
    #create nodes and edges at datasetLevel
    features = datasets.columns[2:]
    
    if instances==0:
        number_instances = len(datasets)
    else:
        number_instances = min (instances,len(datasets))
    
    for r in range(number_instances): 
        #node id is the openML id which is in the first column
        #attr name is the 2nd column
#         dataset_id = code_ds_id(str(datasets.iloc[r][0]))
        dataset_id = datasets.iloc[r][0]
        attribute_id = code_ds_id(dataset_id+"//"+datasets.iloc[r][1])
        row = datasets.iloc[r][2:]
        
        g.add_node(attribute_id,vector=word_embedding("attribute|"+datasets.iloc[r][1],wem),tipo="dataset")
        
        #relation of dataset and an attribute
        g.add_edge(dataset_id,attribute_id)
        
        if instances == 0:
            number_features = len(features)
        else:
            number_features = min (instances,len(features))
            
        for i in range (number_features):
            feature_attribute_id = code_feat_id(features[i],attribute_id)
#             literal_dataset_id = code_literal_id(str(i),attribute_id)
            g.add_node(feature_attribute_id,vector=word_embedding(features[i]+"|"+str(row[i]),wem),tipo="feature attribute")
#             g.add_node(literal_dataset_id,vector=word_embedding(row[i],wem),tipo="literal attribute")
            g.add_edge(attribute_id,feature_attribute_id)
#             g.add_edge(feature_attribute_id,literal_dataset_id)
            
    return g

## Auxiliars

### Check if input is number

In [4]:
def is_number(s):
    #Returns True is string is a number.
    try:
        float(s)
        if float(s) == float("INF") or float(s) == float("NAN") or s == "NAN" or s == "nan":
            return False
        return True
    except ValueError:
        return False

### From numbers to bin tensor vector

In [46]:
from decimal import Decimal
import bitstring
import torch
#clean
def num2vec(num):
    rep_sc = str('{:.11E}'.format(num))
    print(rep_sc)
    dec_part = int(rep_sc.split("E")[0].replace(".",""))
    c = 1
    if dec_part <0:
        c = -1
    dec_part = abs(dec_part)
    
    exp_part = int(rep_sc.split("E")[1])
    if exp_part <0:
        exp_pos = 0
        exp_neg = exp_part
    else:
        exp_pos = exp_part
        exp_neg = 0

    exp_pos = abs(exp_pos)    
    exp_neg = abs(exp_neg)
    
    rep_str = str("{:03}{:03}{:012}".format(exp_pos,exp_neg,dec_part))
    print(rep_str)
    
    print(dec_part)
    rep_int = int(rep_str) * c
    rep_bin = bitstring.Bits(int=rep_int, length=64).bin
    print(rep_bin)

    bin_tensor = torch.tensor(np.array([float(x) for x in rep_bin]))
    return bin_tensor

In [49]:
num2vec(2)

2.00000000000E+00
000000200000000000
200000000000
0000000000000000000000000010111010010000111011011101000000000000


tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 1., 1., 0., 1., 0., 0., 1.,
        0., 0., 0., 0., 1., 1., 1., 0., 1., 1., 0., 1., 1., 1., 0., 1., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=torch.float64)

## Fasttext

In [6]:
import numpy as np
import torch
import fasttext
#fasttext.util.download_model('en', if_exists='ignore')  # English
ft = fasttext.load_model('./resources/fasttext.bin')
print(ft.get_dimension())


300




In [7]:
def fasttex_simple(value):
    if is_number(value):
        value = str(value)
    
    values = value.split("|")
    out_tensor = torch.zeros(300)
    for v in values:
        out_tensor = out_tensor + torch.tensor(ft.get_sentence_vector(value))
    out_tensor = out_tensor / len(values)
    return out_tensor
    
def fasttex_(value):
    value = str(value)
    values = value.split("|")
    out_tensor = torch.zeros(364)
    for v in values:
        if is_number(v):
            value_f = float(v)
            bin_tensor = num2vec(value_f)
            out_tensor = out_tensor + torch.cat((torch.zeros(300),bin_tensor.float()))
        else:
            str_tensor = torch.tensor(ft.get_sentence_vector(value))
            out_tensor = out_tensor + torch.cat((str_tensor.float(),torch.zeros(64)))
    out_tensor = out_tensor / len(values)
    return out_tensor

## Choose word embedding

In [6]:
def word_embedding(data, model):
    if model=="fasttext":
        return fasttex_(data)
    if model=="bert":
        return bert(data)
    if model=="fasttext_simple":
        return fasttex_simple(data)
    if model=="bert_simple":
        return bert_simple(data)

# Execute

In [10]:
#build graph
word_emb = "fasttext"
df_dataset = read_dataset("./resources/monitor_clean/ds2.csv");
g = g = nx.Graph()
g = graph_dataset_short(df_dataset,g,word_emb)
df_attributes = read_dataset("./resources/monitor_clean/attr_cat2.csv");
g = graph_attribute_short(df_attributes,g,word_emb)
df_attributes_numeric = read_dataset("./resources/monitor_clean/attr_num2.csv");
g = graph_attribute_short(df_attributes_numeric,g,word_emb)
#write graph to file
nx.write_gpickle(g, "../word_embeddings/clean_monitor_"+word_emb+"_short2.gpickle")
print("clean_monitor"+word_emb+"_short2.gpickle")
##clean with older notation
##clean 2 with higher numbers that didnt work
##clean 3 with exp+,exp-,fraction
##clean 4 with +exp+,+exp-,-exp+,-exp-,fraction with sign


clean_monitorfasttext_short2.gpickle


## Read previously created graph

In [7]:
#read
g = nx.read_gpickle("../word_embeddings/clean_monitor_fasttext_short.gpickle")

In [12]:
H = g.subgraph("www.best-deal-items.com//display resolution")

In [13]:
list(H.edges)

[]

In [15]:
nn = nx.number_of_nodes(g)

In [16]:
print(nn)

48911


In [18]:
nodesg = nx.nodes(g)

In [38]:
nodesg["number distinct values|DS_ca.pcpartpicker.com//builtin speakers"]["vector"]*2.0

tensor([ 9.0223e-03,  6.1975e-03, -8.1225e-03,  6.9885e-02, -4.3083e-02,
        -1.8552e-02, -4.3418e-03, -2.6688e-02, -1.4315e-02,  1.9211e-02,
        -3.9717e-03, -1.8107e-02,  3.9169e-02,  3.3390e-02,  3.2606e-04,
        -1.4275e-01,  2.1018e-02, -2.6587e-02,  2.2666e-02, -1.0658e-01,
        -9.2528e-04, -9.0795e-03, -2.1855e-02,  2.5881e-02, -4.2198e-02,
         1.1728e-02,  2.2016e-03,  5.4410e-02,  5.2185e-02,  3.6289e-02,
         2.4765e-02,  2.1216e-02,  2.8413e-02, -4.8492e-02,  1.8319e-02,
         2.0977e-02, -4.5620e-02, -7.1136e-02,  1.7818e-02,  1.2355e-02,
        -1.2732e-02, -1.9463e-03, -2.3491e-02,  9.3629e-03, -8.6660e-02,
        -1.1051e-02,  6.0026e-03, -8.4843e-02, -1.6608e-02,  2.4070e-02,
        -6.8959e-03,  5.4117e-03, -3.5538e-03, -1.4239e-02, -1.4355e-02,
         3.3411e-02,  3.4816e-02,  3.3450e-02, -8.9429e-02, -4.6620e-03,
        -3.6198e-02, -4.1949e-03, -1.6564e-02, -6.4998e-02,  3.0676e-02,
        -7.7396e-02, -2.5179e-02,  2.7802e-02, -2.4