# Read dataset and plot graph

In [None]:
#pip install numpy
#pip install pandas
#pip install pandas-profiling
#pip install networkx
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport
import json
import networkx as nx
import matplotlib.pyplot as plt
from random import randrange

def read_dataset(path,drop_columns=None,keep_columns=None):
    """Load a CSV file into a DataFrame, optionally selecting columns.

    Args:
        path: Anything accepted by ``pd.read_csv`` (path or file-like object).
        drop_columns: Column labels to remove. Ignored when ``keep_columns``
            is given.
        keep_columns: Column labels to keep (selected with
            ``DataFrame.filter(items=...)``). Takes precedence over
            ``drop_columns``.

    Returns:
        The loaded DataFrame, restricted to the requested columns.
    """
    csv_data = pd.read_csv(path)

    # Identity checks on purpose: `!= None` is evaluated element-wise by
    # array-likes (numpy arrays, pandas Index), which makes the `if` raise.
    if keep_columns is not None:
        # keep only these columns
        return csv_data.filter(items=keep_columns)

    if drop_columns is not None:
        # drop these and keep the rest
        return csv_data.drop(drop_columns, axis=1)

    # nothing to drop or filter
    return csv_data

def plot_graph(g,ds_nodes=(),attribute_nodes=(),feat_nodes=(),lit_nodes=()):
    """Draw ``g`` with one colour per node category and show the figure.

    Args:
        g: Graph to draw; node positions come from ``nx.spring_layout``.
        ds_nodes: Node ids drawn in blue.
        attribute_nodes: Node ids drawn in green.
        feat_nodes: Node ids drawn in grey.
        lit_nodes: Node ids drawn in red.

    The defaults are immutable empty tuples instead of shared mutable lists.
    """
    pos = nx.spring_layout(g)

    # One draw call per category, in the same order as the colour legend above.
    colored_groups = (
        (ds_nodes, "blue"),
        (attribute_nodes, "green"),
        (feat_nodes, "grey"),
        (lit_nodes, "red"),
    )
    for nodelist, color in colored_groups:
        nx.draw_networkx_nodes(g, pos, nodelist=nodelist, node_color=color, node_size=900)

    nx.draw_networkx_edges(g, pos, width=3)
    nx.draw_networkx_labels(g, pos, font_size=8)
    plt.show()

## Graph  construction

In [None]:
def code_ds_id(data):
    """Return the dataset node id: ``data`` prefixed with ``DS_``."""
    prefix = "DS_"
    return prefix + data
def code_attr_id(data,parent):
    """Return the attribute node id: ``data`` and ``parent`` joined by ``|``."""
    separator = "|"
    return data + separator + parent
def code_feat_id(data,parent):
    """Return the feature node id: ``data`` and ``parent`` joined by ``|``."""
    separator = "|"
    return data + separator + parent
def code_literal_id(data,parent):
    """Return the literal node id: ``literal_`` + ``data`` + ``|`` + ``parent``."""
    prefixed = "literal_" + data
    return prefixed + "|" + parent

In [None]:
def graph_dataset(datasets,g=None,instances=0):
    """Add dataset-level nodes and edges to a graph.

    For every processed row a "dataset" node is created whose id is built from
    the first column. For every processed remaining column a "feature dataset"
    node and a "literal dataset" node (embedding of the cell value) are added
    and chained as dataset -- feature -- literal.

    Args:
        datasets: DataFrame whose first column holds the dataset id and whose
            remaining columns are the dataset features.
        g: Graph to extend. A new ``nx.Graph`` is created when ``None``.
        instances: ``0`` processes every row and every feature column. Any
            other value caps BOTH the number of rows and the number of feature
            columns that are processed.

    Returns:
        The graph ``g`` with the new nodes and edges.
    """
    if g is None:
        g = nx.Graph()

    # every column but the id column is a feature
    features = datasets.columns[1:]

    # Both limits are loop-invariant. The row limit is clamped to the frame
    # length (as graph_attribute does) so a large cap cannot index past the end.
    if instances == 0:
        number_instances = len(datasets)
        number_features = len(features)
    else:
        number_instances = min(instances, len(datasets))
        number_features = min(instances, len(features))

    for r in range(number_instances):
        # Explicit positional access (.iloc): integer keys on a label-indexed
        # Series are deprecated as positions in recent pandas.
        record = datasets.iloc[r]

        # node id is the openML id which is in the first column
        dataset_id = code_ds_id(str(record.iloc[0]))
        g.add_node(dataset_id,vector=word_embedding("dataset","fasttext"),tipo="dataset")
        row = record.iloc[1:]

        for i in range(number_features):
            feature_dataset_id = code_feat_id(features[i],dataset_id)
            literal_dataset_id = code_literal_id(str(i),dataset_id)
            g.add_node(feature_dataset_id,vector=word_embedding("feature dataset","fasttext"),tipo="feature dataset")
            g.add_node(literal_dataset_id,vector=word_embedding(row.iloc[i],"fasttext"),tipo="literal dataset")
            g.add_edge(dataset_id,feature_dataset_id)
            g.add_edge(feature_dataset_id,literal_dataset_id)

    return g

In [None]:
def graph_attribute(datasets,g=None,instances=0):
    """Add attribute-level nodes and edges to a graph.

    Each processed row describes one attribute of a dataset: the first column
    is the dataset id, the second the attribute name, the remaining columns
    are attribute features. The row yields an "attribute" node linked to its
    dataset node, plus one "feature attribute" node and one "literal attribute"
    node (embedding of the cell value) per processed feature column.

    Args:
        datasets: DataFrame laid out as described above.
        g: Graph to extend. A new ``nx.Graph`` is created when ``None``.
        instances: ``0`` processes every row and every feature column. Any
            other value caps BOTH the number of rows and the number of feature
            columns that are processed.

    Returns:
        The graph ``g`` with the new nodes and edges.
    """
    if g is None:
        g = nx.Graph()

    # every column after the dataset id and the attribute name is a feature
    features = datasets.columns[2:]

    # Both limits are loop-invariant, so compute them once.
    if instances == 0:
        number_instances = len(datasets)
        number_features = len(features)
    else:
        number_instances = min(instances, len(datasets))
        number_features = min(instances, len(features))

    for r in range(number_instances):
        # Explicit positional access (.iloc): integer keys on a label-indexed
        # Series are deprecated as positions in recent pandas.
        record = datasets.iloc[r]

        # node id is the openML id which is in the first column
        # attr name is the 2nd column
        dataset_id = code_ds_id(str(record.iloc[0]))
        attribute_id = code_attr_id(record.iloc[1],dataset_id)
        row = record.iloc[2:]

        # add_edge would create a missing dataset node without attributes,
        # and code that filters nodes on y['tipo'] would then raise KeyError.
        # Create it with the same attributes graph_dataset gives dataset nodes.
        if dataset_id not in g:
            g.add_node(dataset_id,vector=word_embedding("dataset","fasttext"),tipo="dataset")

        g.add_node(attribute_id,vector=word_embedding("attribute","fasttext"),tipo="attribute")

        # relation of dataset and an attribute
        g.add_edge(dataset_id,attribute_id)

        for i in range(number_features):
            feature_attribute_id = code_feat_id(features[i],attribute_id)
            literal_attribute_id = code_literal_id(str(i),attribute_id)
            g.add_node(feature_attribute_id,vector=word_embedding("feature attribute","fasttext"),tipo="feature attribute")
            g.add_node(literal_attribute_id,vector=word_embedding(row.iloc[i],"fasttext"),tipo="literal attribute")
            g.add_edge(attribute_id,feature_attribute_id)
            g.add_edge(feature_attribute_id,literal_attribute_id)

    return g

# Test

In [None]:
# Build a small test graph: 1 dataset row, then 2 nominal and 2 numeric
# attribute rows, all added to the same graph.
g = nx.Graph()

df_dataset = read_dataset(
    "./openML/openml_203ds_datasets_index.csv",
    drop_columns=["Num", "dataset_topic"],
)
g = graph_dataset(df_dataset, g=g, instances=1)

df_attributes = read_dataset(
    "./openML/openml_203ds_attributes_nominal_index.csv",
    drop_columns=["dataset_name", "type_converted"],
)
g = graph_attribute(df_attributes, g=g, instances=2)

df_attributes_numeric = read_dataset(
    "./openML/openml_203ds_attributes_numeric_index.csv",
    drop_columns=["dataset_name", "type_converted"],
)
g = graph_attribute(df_attributes_numeric, g=g, instances=2)

In [None]:
# Group node ids by their 'tipo' attribute, then draw each group in its colour.
datasets = [node for node, attrs in g.nodes(data=True) if attrs['tipo'] == "dataset"]
datasets_features = [node for node, attrs in g.nodes(data=True) if attrs['tipo'] == "feature dataset"]
datasets_literals = [node for node, attrs in g.nodes(data=True) if attrs['tipo'] == "literal dataset"]
attributes = [node for node, attrs in g.nodes(data=True) if attrs['tipo'] == "attribute"]
attributes_features = [node for node, attrs in g.nodes(data=True) if attrs['tipo'] == "feature attribute"]
attributes_literals = [node for node, attrs in g.nodes(data=True) if attrs['tipo'] == "literal attribute"]
plot_graph(
    g,
    datasets,
    attributes,
    datasets_features + attributes_features,
    datasets_literals + attributes_literals,
)

## Write and read graph in file

In [None]:
#write
# nx.write_gpickle was removed in NetworkX 3.0; pickling the graph directly
# works on every NetworkX version.
import pickle

with open("./word_embeddings/encoded_fasttext.gpickle", "wb") as graph_file:
    pickle.dump(g, graph_file, pickle.HIGHEST_PROTOCOL)

In [None]:
#read
# nx.read_gpickle was removed in NetworkX 3.0; unpickle the graph directly.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
# NOTE(review): this reads "encoded_features.gpickle" while the write cell
# saves "encoded_fasttext.gpickle" - confirm this is the intended file.
import pickle

with open("./word_embeddings/encoded_features.gpickle", "rb") as graph_file:
    g = pickle.load(graph_file)

## Get array of ids

In [None]:
# Collect the node ids of every category, selected by the 'tipo' node attribute.
datasets = [node for node, attrs in g.nodes(data=True) if attrs['tipo'] == "dataset"]
datasets_features = [node for node, attrs in g.nodes(data=True) if attrs['tipo'] == "feature dataset"]
datasets_literals = [node for node, attrs in g.nodes(data=True) if attrs['tipo'] == "literal dataset"]
attributes = [node for node, attrs in g.nodes(data=True) if attrs['tipo'] == "attribute"]
attributes_features = [node for node, attrs in g.nodes(data=True) if attrs['tipo'] == "feature attribute"]
attributes_literals = [node for node, attrs in g.nodes(data=True) if attrs['tipo'] == "literal attribute"]

In [None]:
# Print the size of every node category, one count per line.
print(
    len(datasets),
    len(datasets_features),
    len(datasets_literals),
    len(attributes),
    len(attributes_features),
    len(attributes_literals),
    sep="\n",
)

## Get nodes and print data from nodes

In [None]:
# Attribute dicts of all dataset nodes; show the stored vector of the first one.
sample = [attrs for node_id, attrs in g.nodes(data=True) if attrs['tipo'] == "dataset"]
first_dataset = sample[0]
print(first_dataset["vector"])

## Positive and negative samples

In [None]:
def get_samples(df):
    """Split pair rows into positive pairs and a ~10% sample of negative pairs.

    Args:
        df: DataFrame read positionally: column 0 and column 1 hold the two
            ids of a pair, column 2 holds the match flag (``1`` = match).

    Returns:
        ``(positive_pairs, negative_pairs)`` as two lists of ``(id1, id2)``
        tuples. Every positive row is kept; each negative row is kept with
        probability 1/10, so the negative list differs between runs.
    """
    positive_pairs = []
    negative_pairs = []
    for _, row in df.iterrows():
        # .iloc: integer keys on a label-indexed Series are deprecated as positions.
        pair = (row.iloc[0], row.iloc[1])
        if row.iloc[2] == 1:
            positive_pairs.append(pair)
        elif randrange(10) == 0:
            # randrange(10) draws 0..9, so exactly one value in ten is kept
            # (the previous `randrange(1, 10) > 8` kept one in nine).
            negative_pairs.append(pair)
    return positive_pairs, negative_pairs

In [None]:
# Load the pair-matching table (the column names include the single quotes)
# and report how many positive and sampled negative pairs were produced.
df_matching = read_dataset(
    "./openML/openml_203ds_datasets_matching.csv",
    keep_columns=["'dataset1_id'", "'dataset2_id'", "'matching_topic'"],
)
pos, neg = get_samples(df_matching)
print(f"Possitive samples: {len(pos)} Negative samples (10%): {len(neg)}")


## Convert strings to single words

In [None]:
def is_number(s):
    """Return True if ``s`` can be converted with ``float()``, else False.

    Values that ``float()`` rejects by type (``None``, lists, ...) count as
    "not a number" instead of raising ``TypeError``.
    """
    try:
        float(s)
    except (TypeError, ValueError):
        return False
    return True

In [None]:
import re
def get_tokens(nodes):
    """Split ``nodes`` into unique numeric values and unique nominal strings.

    Items accepted by ``is_number`` are converted to ``float``. Every other
    item has each run of characters outside ``A-Za-z0-9``, space and
    apostrophe replaced by one space, and is lower-cased.

    Returns:
        ``(numerical, nominal)`` as two duplicate-free lists (built from sets,
        so their order is not the input order).
    """
    numeric_values = set()
    nominal_values = set()
    for item in nodes:
        if is_number(item):
            numeric_values.add(float(item))
            continue
        cleaned = re.sub(r'[^A-Za-z0-9 \']+', ' ', item)
        nominal_values.add(cleaned.lower())
    return list(numeric_values), list(nominal_values)

In [None]:
def vector_from_number(n):
    """Placeholder for a numeric-value embedding; always returns a new empty list."""
    # TODO: get the embeddings for numerical values that make sense
    return []

In [None]:
# Split the literal values into unique numeric values and unique nominal strings.
# NOTE(review): `lit_nodes` is not assigned in any visible cell (it only appears
# as a parameter of plot_graph) - confirm where it is defined before running.
vec_num,vec_nom = get_tokens(lit_nodes)

In [None]:
# Preview at most the first 100 numeric/nominal tokens. The bound is clamped
# to the shorter list so a short list no longer raises IndexError.
for i in range(min(100, len(vec_num), len(vec_nom))):
    print("num: "+ str(vec_num[i]))
    print("nom: "+ vec_nom[i])

## Cosine similarity

In [1]:
from scipy.spatial.distance import cosine
def cosine_vectors(vec1,vec2):
    """Return the cosine similarity of two vectors (1 minus SciPy's cosine distance)."""
    distance = cosine(vec1, vec2)
    return 1 - distance

In [8]:
# Sanity check of cosine_vectors on two small vectors; the notebook displays
# the returned similarity as the cell output.
cosine_vectors([1,2,3],[1,1,1])

0.9258200997725514

## Fasttext

In [None]:
#pip install fasttext
import fasttext
#fasttext.util.download_model('en', if_exists='ignore')  # English
# Load the fastText model from a local binary file and print the embedding
# dimension reported by the model.
ft = fasttext.load_model('../models/cc.en.300.bin')
print(ft.get_dimension())

In [None]:
import torch
def fasttex_(value):
    """Return the fastText sentence vector of ``value`` as a torch tensor.

    Args:
        value: Text to embed. Values accepted by ``is_number`` are converted
            to their string form first.

    Returns:
        ``torch.Tensor`` built from ``ft.get_sentence_vector``.
    """
    if is_number(value):
        value = str(value)

    # fastText's get_sentence_vector processes one line at a time and raises
    # ValueError on text containing "\n", so newlines are turned into spaces.
    value = value.replace("\n", " ")

    return torch.tensor(ft.get_sentence_vector(value))

In [None]:
import torch
def fasttex_vectors(nodes_literals):
    """Return the fastText embedding of every item in ``nodes_literals``.

    Delegates to ``fasttex_`` so single-value and batch embedding share one
    implementation.

    Returns:
        List of tensors, one per input item, in input order.
    """
    return [fasttex_(literal) for literal in nodes_literals]

## Bert


In [None]:
#pip install torch==1.4.0+cpu torchvision==0.5.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
#pip install transformers
from transformers import BertModel, BertTokenizer
import torch

# Load the pre-trained BERT tokenizer and model into memory.
tokenizer = BertTokenizer.from_pretrained('google/bert_uncased_L-12_H-768_A-12')
model = BertModel.from_pretrained('google/bert_uncased_L-12_H-768_A-12')
# Alternative checkpoint, kept for reference:
#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#model = BertModel.from_pretrained('bert-base-uncased')
# Switch to evaluation mode. This is IMPORTANT to have reproducible results
# during evaluation!
model.eval()

In [None]:
def bert(value):
    """Return one BERT vector for ``value``: the mean of its last hidden states.

    Args:
        value: Text to embed. Values accepted by ``is_number`` are converted
            to their string form first.

    Returns:
        Detached ``torch.Tensor`` holding the mean over the token axis of the
        last hidden states of the single input sequence.
    """
    if is_number(value):
        value = str(value)

    # add special tokens at the beginning and end, and take up to 512 tokens
    tokenized = tokenizer.encode(value, add_special_tokens=True,max_length=512)
    input_ids = torch.tensor(tokenized).unsqueeze(0)  # Batch size 1

    # Inference only: skip building the autograd graph for the forward pass.
    with torch.no_grad():
        outputs = model(input_ids)

    # The last hidden state is the first element of the output:
    # (batch size, sequence length, model hidden dimension)
    last_hidden_states = outputs[0]

    # average the token vectors to get 1 vector for the whole sentence
    return torch.mean(last_hidden_states[0],dim=0).detach()

In [None]:
def bert_vectors(nodes_literals):
    """Return the BERT embedding of every item in ``nodes_literals``.

    Delegates to ``bert`` so single-value and batch embedding share one
    implementation.

    Returns:
        List of tensors, one per input item, in input order.
    """
    return [bert(literal) for literal in nodes_literals]

## Choose word embedding

In [None]:
def word_embedding(data, model):
    """Embed ``data`` with the selected word-embedding model.

    Args:
        data: Value to embed.
        model: ``"fasttext"`` or ``"bert"``.

    Returns:
        The vector produced by ``fasttex_`` or ``bert``.

    Raises:
        ValueError: If ``model`` is not a supported name. Previously this
            silently returned ``None``, which would have been stored as the
            node vector.
    """
    if model == "fasttext":
        return fasttex_(data)
    if model == "bert":
        return bert(data)
    raise ValueError(
        "Unknown word embedding model: %r (expected 'fasttext' or 'bert')" % (model,)
    )

## Get embeddings of words and calculate similarity

In [None]:
import numpy as np
import torch
# Embed two related phrases with both models and compare the representations.
s1 = "University degree"
s2 = "Bachelor"
vectors_b = bert_vectors([s1, s2])
vectors_ft = fasttex_vectors([s1, s2])
print("Fasttext dif: " + str(cosine_vectors(vectors_ft[0], vectors_ft[1])))
#print("Bert dif: " + str(cosine_vectors(vectors_b[0],vectors_b[1])))
print("Fasttext rep")
print(vectors_ft[0])
# The vectors are already tensors: copy with clone().detach() rather than
# torch.tensor(tensor), which emits a copy-construct UserWarning.
print(vectors_ft[0].clone().detach())
print("BERT rep")
print(vectors_b[0])
print(vectors_b[0].clone().detach())

## Trash

In [None]:
from decimal import Decimal
# Scratch: format a large integer in scientific notation with 4 decimal digits.
rep = '%.4E' % Decimal(88799777378)
print(rep)

In [None]:
# Scratch: write `vectors` to a one-column CSV file without the index column.
# NOTE(review): `vectors` is not assigned in any visible cell - confirm where
# it comes from before running this cell.
df = pd.DataFrame(vectors, columns=["colummn"])
df.to_csv('lit_vectors.csv', index=False)