# Read dataset and plot graph

In [1]:
import numpy as np
import pandas as pd
import json
import networkx as nx
import matplotlib.pyplot as plt
from random import randrange

def read_dataset(path,drop_columns=None,keep_columns=None):
    """Load a pipe-separated CSV file, optionally restricting its columns.

    Args:
        path: Location of the CSV file; fields must be separated by ``|``.
        drop_columns: Column labels to remove. Ignored when ``keep_columns``
            is given.
        keep_columns: Column labels to retain; takes precedence over
            ``drop_columns``.

    Returns:
        A pandas DataFrame holding the selected columns (all columns when
        neither argument is given).
    """
    csv_data = pd.read_csv(path, sep='|')

    # Identity checks instead of ``!= None``: comparing an array-like
    # (numpy array, pandas Index) with None is element-wise, and the
    # resulting array cannot be used as an ``if`` condition.
    if keep_columns is not None:
        # keep only these columns
        return csv_data.filter(items=keep_columns)

    if drop_columns is not None:
        # drop these and keep the rest
        return csv_data.drop(drop_columns, axis=1)

    # nothing was filtered or dropped
    return csv_data

def plot_graph(g,ds_nodes=None,attribute_nodes=None,feat_nodes=None,lit_nodes=None):
    """Draw ``g`` with a spring layout, coloring four groups of nodes.

    Args:
        g: The networkx graph to draw.
        ds_nodes: Nodes drawn in blue.
        attribute_nodes: Nodes drawn in green.
        feat_nodes: Nodes drawn in grey.
        lit_nodes: Nodes drawn in red.

    Every group defaults to "no nodes". The defaults are ``None`` rather
    than shared mutable lists; each one is normalized to an empty list below.
    """
    pos = nx.spring_layout(g)

    colored_groups = (
        (ds_nodes, "blue"),
        (attribute_nodes, "green"),
        (feat_nodes, "grey"),
        (lit_nodes, "red"),
    )
    for nodelist, color in colored_groups:
        # None must become [] here: passing nodelist=None to networkx would
        # draw every node of the graph instead of none.
        nodelist = [] if nodelist is None else nodelist
        nx.draw_networkx_nodes(g,pos,nodelist=nodelist,node_color=color,node_size=900)

    nx.draw_networkx_edges(g,pos,width=3)
    nx.draw_networkx_labels(g,pos,font_size=8)
    plt.show()

## Graph construction

In [2]:
def code_ds_id(data):
    """Return the dataset node id: ``data`` prefixed with ``DS_``."""
    return "".join(("DS_", data))
def code_attr_id(data,parent):
    """Return the attribute node id: ``data`` and ``parent`` joined by ``|``."""
    return "|".join((data, parent))
def code_feat_id(data,parent):
    """Return the feature node id: ``data`` and ``parent`` joined by ``|``."""
    return "|".join((data, parent))
def code_literal_id(data,parent):
    """Return the literal node id: ``literal_<data>|<parent>``."""
    return "|".join(("literal_" + data, parent))

In [3]:
def graph_dataset(datasets,g=None,wem="fasttext",instances=0):
    """Add dataset, feature and literal nodes for the rows of ``datasets``.

    For every processed row a node tagged ``tipo="dataset"`` is created from
    the first column. Each remaining column adds a ``"feature dataset"`` node
    linked to the dataset node, plus a ``"literal dataset"`` node (embedding
    the cell value) linked to the feature node.

    Args:
        datasets: DataFrame whose first column holds the dataset id and whose
            remaining columns are the features.
        g: Graph to extend; a new ``nx.Graph`` is created when omitted.
        wem: Embedding name forwarded to ``word_embedding``.
        instances: When non-zero, caps both the number of rows and the number
            of feature columns processed; 0 processes everything.

    Returns:
        The graph with the added nodes and edges.
    """
    if g is None:
        g = nx.Graph()

    #create nodes and edges at dataset level
    features = datasets.columns[1:]

    # The limits do not depend on the row, so compute them once. Both are
    # clamped to the data actually available: a cap larger than the frame
    # must not index past its end.
    if instances == 0:
        number_instances = len(datasets)
        number_features = len(features)
    else:
        number_instances = min(instances, len(datasets))
        number_features = min(instances, len(features))

    for r in range(number_instances):
        record = datasets.iloc[r]
        #node id is the openML id which is in the first column
        # .iloc makes the positional access explicit (plain [0] on a
        # label-indexed Series is ambiguous).
        dataset_id = code_ds_id(str(record.iloc[0]))
        g.add_node(dataset_id,vector=word_embedding("dataset",wem),tipo="dataset")
        row = record.iloc[1:]

        for i in range(number_features):
            feature_dataset_id = code_feat_id(features[i],dataset_id)
            literal_dataset_id = code_literal_id(str(i),dataset_id)
            g.add_node(feature_dataset_id,vector=word_embedding("feature dataset|" +features[i] ,wem),tipo="feature dataset")
            g.add_node(literal_dataset_id,vector=word_embedding(row.iloc[i],wem),tipo="literal dataset")
            g.add_edge(dataset_id,feature_dataset_id)
            g.add_edge(feature_dataset_id,literal_dataset_id)

    return g


def graph_attribute(datasets,g=None,wem="fasttext",instances=0):
    """Add attribute, feature and literal nodes for the rows of ``datasets``.

    For every processed row a root node is created from the first column
    (id built with ``code_ds_id``, tagged ``tipo="dataset"``, embedded from
    its own id string). Each remaining column adds a ``"feature attribute"``
    node linked to the root, plus a ``"literal attribute"`` node (embedding
    the cell value) linked to the feature node.

    Args:
        datasets: DataFrame whose first column identifies the attribute and
            whose remaining columns are its features.
        g: Graph to extend; a new ``nx.Graph`` is created when omitted.
        wem: Embedding name forwarded to ``word_embedding``.
        instances: When non-zero, caps both the number of rows and the number
            of feature columns processed; 0 processes everything.

    Returns:
        The graph with the added nodes and edges.
    """
    if g is None:
        g = nx.Graph()

    features = datasets.columns[1:]

    # The limits do not depend on the row, so compute them once.
    if instances == 0:
        number_instances = len(datasets)
        number_features = len(features)
    else:
        number_instances = min(instances, len(datasets))
        number_features = min(instances, len(features))

    for r in range(number_instances):
        record = datasets.iloc[r]
        # Build the id through the shared helper, as graph_dataset does;
        # str() keeps non-string ids from failing on concatenation, and
        # .iloc makes the positional access explicit.
        attribute_id = code_ds_id(str(record.iloc[0]))
        row = record.iloc[1:]

        g.add_node(attribute_id,vector=word_embedding(attribute_id,wem),tipo="dataset")

        # No dataset -> attribute edge is created here.

        for i in range(number_features):
            feature_attribute_id = code_feat_id(features[i],attribute_id)
            literal_attribute_id = code_literal_id(str(i),attribute_id)
            g.add_node(feature_attribute_id,vector=word_embedding("feature attribute|"+features[i],wem),tipo="feature attribute")
            g.add_node(literal_attribute_id,vector=word_embedding(row.iloc[i],wem),tipo="literal attribute")
            g.add_edge(attribute_id,feature_attribute_id)
            g.add_edge(feature_attribute_id,literal_attribute_id)

    return g

def graph_attribute_short(datasets,g=None,wem="fasttext",instances=0):
    """Add attribute and feature nodes (no literal nodes) for ``datasets``.

    For every processed row a root node is created from the first column
    (id built with ``code_ds_id``, tagged ``tipo="dataset"``, embedded from
    its own id string). Each remaining column adds one ``"feature attribute"``
    node whose vector embeds ``"<column name>|<cell value>"``, so the cell
    value is folded into the feature node instead of a separate literal node.

    Args:
        datasets: DataFrame whose first column identifies the attribute and
            whose remaining columns are its features.
        g: Graph to extend; a new ``nx.Graph`` is created when omitted.
        wem: Embedding name forwarded to ``word_embedding``.
        instances: When non-zero, caps both the number of rows and the number
            of feature columns processed; 0 processes everything.

    Returns:
        The graph with the added nodes and edges.
    """
    if g is None:
        g = nx.Graph()

    features = datasets.columns[1:]

    # The limits do not depend on the row, so compute them once.
    if instances == 0:
        number_instances = len(datasets)
        number_features = len(features)
    else:
        number_instances = min(instances, len(datasets))
        number_features = min(instances, len(features))

    for r in range(number_instances):
        record = datasets.iloc[r]
        # Build the id through the shared helper, as graph_dataset does;
        # str() keeps non-string ids from failing on concatenation, and
        # .iloc makes the positional access explicit.
        attribute_id = code_ds_id(str(record.iloc[0]))
        row = record.iloc[1:]

        g.add_node(attribute_id,vector=word_embedding(attribute_id,wem),tipo="dataset")

        # No dataset -> attribute edge is created here.

        for i in range(number_features):
            feature_attribute_id = code_feat_id(features[i],attribute_id)
            g.add_node(feature_attribute_id,vector=word_embedding(features[i]+"|"+str(row.iloc[i]),wem),tipo="feature attribute")
            g.add_edge(attribute_id,feature_attribute_id)

    return g

## Auxiliary functions

### Check if input is number

In [6]:
def is_number(s):
    """Return True if ``s`` converts to a float other than positive infinity.

    Values that ``float()`` rejects -- non-numeric strings, but also None,
    lists and other unsupported types -- yield False instead of raising.
    Only positive infinity is excluded: negative infinity and NaN are still
    reported as numbers.
    """
    try:
        parsed = float(s)
    except (TypeError, ValueError):
        # ValueError: text that is not a number; TypeError: None, list, ...
        return False
    return parsed != float("INF")

### From numbers to binary tensor vector

In [7]:
from decimal import Decimal
import bitstring
def num2vec(num):
    """Encode ``num`` as a tensor of 64 bit values (each 0.0 or 1.0).

    The number is rendered in scientific notation with 11 decimals, then
    packed into one decimal integer with the layout
    ``<positive exponent: 3 digits><negative exponent magnitude: 3 digits>
    <mantissa digits: 12 digits>``. The mantissa's sign is applied to the
    packed integer, which is rendered as a signed 64-bit binary string and
    converted bit by bit into a tensor.
    """
    mantissa_txt, _, exponent_txt = '{:.11E}'.format(num).partition("E")

    # Mantissa digits without the decimal point; the sign is split off.
    signed_digits = int(mantissa_txt.replace(".", ""))
    sign = -1 if signed_digits < 0 else 1
    digits = abs(signed_digits)

    # Only one of the two exponent slots is non-zero.
    exponent = int(exponent_txt)
    if exponent < 0:
        pos_exponent, neg_exponent = 0, abs(exponent)
    else:
        pos_exponent, neg_exponent = abs(exponent), 0

    packed = str("{:03}{:03}{:012}".format(pos_exponent, neg_exponent, digits))
    bits = bitstring.Bits(int=int(packed) * sign, length=64).bin

    return torch.tensor(np.array([float(bit) for bit in bits]))

## Fasttext

In [8]:
import numpy as np
import torch
import fasttext
# Optional one-time download of the English model (disabled):
#fasttext.util.download_model('en', if_exists='ignore')  # English
# Load the fastText model from the local resources folder; `ft` is the
# module-level model used by the fastText embedding helpers.
ft = fasttext.load_model('./resources/fasttext.bin')
# Show the embedding size of the loaded model (the recorded output is 300).
print(ft.get_dimension())




300


In [9]:
def fasttex_simple(value):
    """Embed ``value`` as the mean fastText sentence vector of its parts.

    ``value`` is converted to text, newlines are replaced by spaces, and the
    text is split on ``|``. Each part is embedded with the module-level
    fastText model ``ft``.

    Args:
        value: A string or any value with a usable ``str()`` form.

    Returns:
        A 1-D tensor of ``ft.get_dimension()`` elements (300 for the model
        loaded in this notebook): the average of the per-part vectors.
    """
    # str() leaves strings untouched and also covers inputs (None, inf, ...)
    # that are neither strings nor accepted by is_number().
    parts = str(value).replace("\n", " ").split("|")

    out_tensor = torch.zeros(ft.get_dimension())
    for part in parts:
        # Embed the individual part. Embedding the whole string on every
        # iteration would make the split and the averaging a no-op.
        out_tensor = out_tensor + torch.tensor(ft.get_sentence_vector(part))
    return out_tensor / len(parts)
    
def fasttex_(value):
    """Embed ``value`` as a text-plus-number vector averaged over its parts.

    ``value`` is converted to text, newlines are replaced by spaces, and the
    text is split on ``|``. A part that parses as a finite number fills the
    last 64 slots with its ``num2vec`` encoding (text slots zero). Any other
    part fills the leading slots with its fastText sentence vector (numeric
    slots zero).

    Args:
        value: A string or any value with a usable ``str()`` form.

    Returns:
        A 1-D tensor of ``ft.get_dimension() + 64`` elements (364 for the
        model loaded in this notebook): the average of the per-part vectors.
    """
    import math  # local import: only needed for the finiteness guard

    number_dim = 64  # width of the num2vec encoding
    text_dim = ft.get_dimension()

    # Newlines are normalized the same way fasttex_simple does.
    parts = str(value).replace("\n", " ").split("|")

    out_tensor = torch.zeros(text_dim + number_dim)
    for part in parts:
        # is_number() accepts "nan" and "-inf", but num2vec cannot encode
        # them (their formatted form has no digits), so they fall through
        # to the text branch.
        if is_number(part) and math.isfinite(float(part)):
            bin_tensor = num2vec(float(part))
            out_tensor = out_tensor + torch.cat((torch.zeros(text_dim),bin_tensor.float()))
        else:
            # Embed the individual part, not the whole input string.
            str_tensor = torch.tensor(ft.get_sentence_vector(part))
            out_tensor = out_tensor + torch.cat((str_tensor.float(),torch.zeros(number_dim)))
    return out_tensor / len(parts)

## Bert


In [6]:
#pip install torch==1.4.0+cpu torchvision==0.5.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
#pip install transformers
from transformers import BertModel, BertTokenizer
import torch

# Load the BERT tokenizer and model into memory; both are module-level
# objects used by the BERT embedding helpers.
tokenizer = BertTokenizer.from_pretrained('google/bert_uncased_L-12_H-768_A-12')
model = BertModel.from_pretrained('google/bert_uncased_L-12_H-768_A-12')
# Alternative checkpoint (disabled):
#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#model = BertModel.from_pretrained('bert-base-uncased')
# This is IMPORTANT to have reproducible results during evaluation!
# (eval() switches the model to evaluation mode.)
model.eval()

def bert_simple(value):
    """Embed ``value`` as the mean BERT vector of its ``|``-separated parts.

    ``value`` is converted to text, newlines are replaced by spaces, and the
    text is split on ``|``. Each part is encoded with the module-level
    ``tokenizer`` and run through the module-level ``model``. The part's
    vector is the mean of its last-layer hidden states over the tokens.

    Args:
        value: A string or any value with a usable ``str()`` form.

    Returns:
        A 1-D tensor of 768 elements: the average of the per-part vectors.
    """
    # str() leaves strings untouched and also covers inputs (None, inf, ...)
    # that are neither strings nor accepted by is_number().
    parts = str(value).replace("\n", " ").split("|")

    out_tensor = torch.zeros(768)
    # Inference only: skip building an autograd graph that is never used.
    with torch.no_grad():
        for part in parts:
            #add special tokens at the beginning and end, up to 512 tokens
            tokenized = tokenizer.encode(part, add_special_tokens=True,max_length=512)
            input_ids = torch.tensor(tokenized).unsqueeze(0)  # Batch size 1
            outputs = model(input_ids)

            # The last hidden state is the first element of the output tuple,
            # shaped (batch size, sequence length, hidden dimension).
            last_hidden_states = outputs[0]

            #average the token vectors into one vector for the whole part
            out_tensor = out_tensor + torch.mean(last_hidden_states[0],dim=0)

    return out_tensor / len(parts)

def bert(value):
    """Embed ``value`` as an 832-element text-plus-number vector.

    If the whole ``value`` parses as a finite number, the first 768 slots are
    zero and the last 64 hold its ``num2vec`` encoding. Otherwise ``value``
    is converted to text and split on ``|``; each part is run through the
    module-level ``tokenizer`` and ``model``, the token-averaged last-layer
    vectors are averaged over the parts, and 64 zeros are appended.

    Args:
        value: A number, a string, or any value with a usable ``str()`` form.

    Returns:
        A 1-D tensor of 768 + 64 elements.
    """
    import math  # local import: only needed for the finiteness guard

    # is_number() accepts NaN and -inf, but num2vec cannot encode them
    # (their formatted form has no digits), so they are embedded as text.
    if is_number(value) and math.isfinite(float(value)):
        bin_tensor = num2vec(float(value))
        return torch.cat((torch.zeros(768),bin_tensor.float()))

    # str() leaves strings untouched and keeps non-string inputs from
    # failing on .split().
    parts = str(value).split("|")
    out_tensor = torch.zeros(768)
    # Inference only: skip building an autograd graph that is never used.
    with torch.no_grad():
        for part in parts:
            #add special tokens at the beginning and end, up to 512 tokens
            tokenized = tokenizer.encode(part, add_special_tokens=True,max_length=512)
            input_ids = torch.tensor(tokenized).unsqueeze(0)  # Batch size 1
            outputs = model(input_ids)

            # The last hidden state is the first element of the output tuple,
            # shaped (batch size, sequence length, hidden dimension).
            last_hidden_states = outputs[0]

            #average the token vectors into one vector for the whole part
            out_tensor = out_tensor + torch.mean(last_hidden_states[0],dim=0)

    out_tensor = out_tensor / len(parts)
    return torch.cat((out_tensor.float(),torch.zeros(64)))
## Choose word embedding

In [10]:
def word_embedding(data, model):
    """Embed ``data`` with the embedding function selected by ``model``.

    Args:
        data: The value to embed; passed unchanged to the selected function.
        model: One of ``"fasttext"``, ``"bert"``, ``"fasttext_simple"`` or
            ``"bert_simple"``.

    Returns:
        Whatever the selected embedding function returns.

    Raises:
        ValueError: If ``model`` is not one of the supported names. Without
            this, a misspelled name would silently produce ``None`` vectors.
    """
    if model=="fasttext":
        return fasttex_(data)
    if model=="bert":
        return bert(data)
    if model=="fasttext_simple":
        return fasttex_simple(data)
    if model=="bert_simple":
        return bert_simple(data)
    raise ValueError("unknown word embedding model: {!r}".format(model))

# Execute

In [11]:
# Build the "short" attribute graph (feature nodes only, no literal nodes)
# from the nominal monitor data and store it on disk.
word_emb = "fasttext_simple"
g = nx.Graph()

# Disabled alternative: g = graph_dataset(df_dataset, g, word_emb), with
# df_dataset read from the same CSV and the same dropped columns.
df_attributes = read_dataset(
    "./resources/monitor_nominal.csv",
    drop_columns=[
        "value_counts_with_nan",
        "value_counts_without_nan",
        "distinct_count_with_nan",
        "distinct_count_without_nan",
        "n",
        "p_infinite",
        "n_infinite",
        "date_warning",
        "histogram_data",
        "scatter_data",
    ],
)
g = graph_attribute_short(df_attributes, g, word_emb)
# Disabled alternative: graph_attribute over
# ./resources/openml_203ds_attributes_numeric_index.csv, dropping the
# columns dataset_name and type_converted.

# Persist the graph.
nx.write_gpickle(g, "../word_embeddings/monitor_"+word_emb+"_short.gpickle")

In [8]:
# Build the full attribute graph (feature and literal nodes) from the
# nominal monitor data and store it on disk.
word_emb = "fasttext_simple"
g = nx.Graph()

# Disabled alternative: g = graph_dataset(df_dataset, g, word_emb), with
# df_dataset read from the same CSV and the same dropped columns.
df_attributes = read_dataset(
    "./resources/monitor_nominal.csv",
    drop_columns=[
        "value_counts_with_nan",
        "value_counts_without_nan",
        "distinct_count_with_nan",
        "distinct_count_without_nan",
        "n",
        "p_infinite",
        "n_infinite",
        "date_warning",
        "histogram_data",
        "scatter_data",
    ],
)
g = graph_attribute(df_attributes, g, word_emb)
# Disabled alternative: graph_attribute over
# ./resources/openml_203ds_attributes_numeric_index.csv, dropping the
# columns dataset_name and type_converted.

# Persist the graph.
nx.write_gpickle(g, "../word_embeddings/monitor_"+word_emb+".gpickle")

KeyboardInterrupt: 

## Read previously created graph

In [7]:
# Load a previously saved graph; the path matches the one the full
# (non-short) build cell writes when word_emb is "fasttext_simple".
# NOTE(review): read_gpickle unpickles the file -- only load graph files
# from a trusted source.
g = nx.read_gpickle("../word_embeddings/monitor_fasttext_simple.gpickle")

In [31]:
# Collect the node ids that, ignoring surrounding whitespace, equal the
# "mount type" attribute id, then print each match.
datasets = [
    node_id
    for node_id in g.nodes()
    if node_id.strip() == "DS_www.cleverboxes.com//mount type"
]
for d in datasets:
    print (d)

DS_www.cleverboxes.com//mount type


In [35]:
# Scratch check: strip() removes the trailing space, so this is True.
str("hola ".strip())=="hola"

True

In [11]:
# Scratch check: filter `datasets` against a literal that has no "DS_"
# prefix and spells "mount_type" with an underscore; the recorded output
# is an empty list.
d = [x for x in datasets if x=="www.cleverboxes.com//mount_type"]
print(d)

[]


In [38]:
# Scratch check: two identical string literals compare equal (True).
'DS_www.cleverboxes.com//mount_type' == 'DS_www.cleverboxes.com//mount_type'

True

## Deprecated

In [8]:
# Split the graph's node ids into one list per value of the 'tipo' attribute.
datasets = [node for node, attrs in g.nodes(data=True) if attrs['tipo'] == "dataset"]
datasets_features = [node for node, attrs in g.nodes(data=True) if attrs['tipo'] == "feature dataset"]
datasets_literals = [node for node, attrs in g.nodes(data=True) if attrs['tipo'] == "literal dataset"]
attributes = [node for node, attrs in g.nodes(data=True) if attrs['tipo'] == "attribute"]
attributes_features = [node for node, attrs in g.nodes(data=True) if attrs['tipo'] == "feature attribute"]
attributes_literals = [node for node, attrs in g.nodes(data=True) if attrs['tipo'] == "literal attribute"]

NameError: name 'g' is not defined

In [None]:
# Print the size of each node group, one count per line.
print(
    len(datasets),
    len(datasets_features),
    len(datasets_literals),
    len(attributes),
    len(attributes_features),
    len(attributes_literals),
    sep="\n",
)

In [None]:
# Split the graph's node ids into one list per 'tipo' value, ready for plotting.
datasets = [node for node, attrs in g.nodes(data=True) if attrs['tipo'] == "dataset"]
datasets_features = [node for node, attrs in g.nodes(data=True) if attrs['tipo'] == "feature dataset"]
datasets_literals = [node for node, attrs in g.nodes(data=True) if attrs['tipo'] == "literal dataset"]
attributes = [node for node, attrs in g.nodes(data=True) if attrs['tipo'] == "attribute"]
attributes_features = [node for node, attrs in g.nodes(data=True) if attrs['tipo'] == "feature attribute"]
attributes_literals = [node for node, attrs in g.nodes(data=True) if attrs['tipo'] == "literal attribute"]
# The plot call itself is disabled:
# plot_graph(g,datasets,attributes,datasets_features + attributes_features,datasets_literals + attributes_literals);

## Get nodes and print data from nodes

In [None]:
# Gather the attribute dicts of every node tagged 'dataset' and show the
# embedding vector stored on the first one.
sample = [attrs for _, attrs in g.nodes(data=True) if attrs['tipo'] == "dataset"]
print(sample[0]["vector"])