In [1]:
# Install and import the required libraries
import pandas as pd
import os
from os import path
import numpy as np
from tqdm import tqdm
import time

import torch
!pip install transformers==3

from transformers import BertTokenizer, DistilBertTokenizer, AlbertTokenizer, RobertaTokenizer, XLNetTokenizer
from transformers import BertModel, DistilBertModel, AlbertModel, RobertaModel, XLNetModel
from transformers.tokenization_utils_base import BatchEncoding



# Data Collector

In [2]:
class DataCollector(object):
  """
  Fetches the CSV files of a DeepMatcher use case and expands the pair files.

  The collector works on a single directory (`data_dir`) holding the two
  source tables (tableA.csv, tableB.csv) and the three pair files (train.csv,
  valid.csv, test.csv). After a download, each pair file is rewritten in place
  so that the `ltable_id` / `rtable_id` references are replaced by the full
  records, whose columns are prefixed with `left_` and `right_`.
  """

  # Files that make up a complete use case inside `data_dir`.
  _TABLE_FILES = ("tableA.csv", "tableB.csv")
  _PAIR_FILES = ("train.csv", "test.csv", "valid.csv")

  def __init__(self, data_dir):
    """
    Store the working directory and create it when it does not exist yet.

    Args:
      data_dir (str): directory where the CSV files are read and written.

    Raises:
      TypeError: if `data_dir` is not a string.
    """

    if not isinstance(data_dir, str):
      raise TypeError("Wrong data directory type.")

    # Pure-Python equivalent of `mkdir -p`: creates intermediate directories
    # and is a no-op when the directory already exists. An empty string means
    # "current directory", which needs no creation.
    if data_dir:
      os.makedirs(data_dir, exist_ok=True)

    self.data_dir = data_dir
    self.available_use_cases = ["Structured/DBLP-GoogleScholar",
                                "Structured/DBLP-ACM",
                                "Structured/Amazon-Google",
                                "Structured/Walmart-Amazon",
                                "Structured/Beer", "Structured/iTunes-Amazon",
                                "Structured/Fodors-Zagats", "Textual/Abt-Buy",
                                "Textual/Company", "Dirty/iTunes-Amazon",
                                "Dirty/DBLP-ACM", "Dirty/DBLP-GoogleScholar",
                                "Dirty/Walmart-Amazon"]

  def _get_complete_dataset(self, dataset):
    """
    Expand the dataset with the records from source A and source B.

    Args:
      dataset (str): name of a pair file inside `data_dir`; it must contain
        the `ltable_id` and `rtable_id` columns.

    Returns:
      pandas.DataFrame: the pair file joined (inner join) with tableA on
      `ltable_id` and with tableB on `rtable_id`. Table columns carry the
      `left_` / `right_` prefix; `ltable_id` and `rtable_id` are dropped.
    """

    ds = pd.read_csv(os.path.join(self.data_dir, dataset))
    ds_a = pd.read_csv(os.path.join(self.data_dir, "tableA.csv"))
    ds_b = pd.read_csv(os.path.join(self.data_dir, "tableB.csv"))

    assert 'ltable_id' in ds
    assert 'rtable_id' in ds
    assert 'id' in ds_b
    assert 'id' in ds_a

    ds_a = ds_a.add_prefix('left_')
    ds_b = ds_b.add_prefix('right_')

    # suffixes=(False, False) makes pandas raise instead of silently renaming
    # columns if the pair file and a prefixed table ever share a column name.
    ds = pd.merge(ds, ds_a, how='inner', left_on='ltable_id', right_on='left_id', suffixes=(False, False))
    ds = pd.merge(ds, ds_b, how='inner', left_on='rtable_id', right_on='right_id', suffixes=(False, False))

    return ds.drop(columns=["ltable_id", "rtable_id"])

  def _save_complete_dataset(self, dataset):
    """
    Expand the integrated dataset and overwrite the pair file with the result.

    Args:
      dataset (str): name of the pair file inside `data_dir` to rewrite.
    """

    ds = self._get_complete_dataset(dataset)
    ds.to_csv(os.path.join(self.data_dir, dataset), index=False)

  def _download_data(self, use_case):
    """
    Download the datasets associated to the provided DeepMatcher use case.

    The five CSV files are saved in `data_dir` under their original names and
    the three pair files are then expanded in place.

    Args:
      use_case (str): use case path, e.g. "Textual/Abt-Buy".

    Raises:
      urllib.error.URLError: if a file cannot be downloaded. Failing here is
        deliberate: continuing would only fail later on a missing CSV file.
    """

    # Local import keeps the class self-contained without touching the
    # notebook's import cell.
    from urllib.request import urlretrieve

    base_url = "http://pages.cs.wisc.edu/~anhai/data1/deepmatcher_data/{}/exp_data".format(use_case)
    for file_name in self._TABLE_FILES + self._PAIR_FILES:
      file_url = "{}/{}".format(base_url, file_name)
      urlretrieve(file_url, os.path.join(self.data_dir, file_name))

    # extend datasets
    for file_name in self._PAIR_FILES:
      self._save_complete_dataset(file_name)

  def get_data(self, use_case):
    """
    Make sure the files of `use_case` are available in `data_dir`.

    When at least one of the five expected files is missing, every leftover
    file is removed and the whole use case is downloaded again; otherwise
    nothing is downloaded.

    Args:
      use_case (str): one of `self.available_use_cases`.

    Raises:
      TypeError: if `use_case` is not a string.
      ValueError: if `use_case` is not an available use case.
    """

    if not isinstance(use_case, str):
      raise TypeError("Wrong use case type.")

    if use_case not in self.available_use_cases:
      raise ValueError("Wrong use case name.")

    # check data existence
    expected_paths = [os.path.join(self.data_dir, file_name)
                      for file_name in self._TABLE_FILES + self._PAIR_FILES]

    if all(os.path.exists(file_path) for file_path in expected_paths):
      print("Data already downloaded.")
      return

    print("Starting downloading the data...")
    # Drop partial leftovers so stale (possibly already expanded) files are
    # never mixed with freshly downloaded ones.
    for file_path in expected_paths:
      if os.path.exists(file_path):
        os.remove(file_path)

    self._download_data(use_case)

In [3]:
# use_case = "Textual/Abt-Buy"
# data_dir = "data/{}/original/".format(use_case)

# data_collector = DataCollector(data_dir)
# data_collector.get_data(use_case)

# Data Containers

In [4]:
class DataContainer(object):
  """
  Holds one expanded EM dataset (pairs of left/right records plus a label).

  The container loads a CSV file, optionally draws a label-stratified sample
  (only for the dataset whose id is 'train'), separates the labels from the
  features and keeps slots for the artifacts produced later in the pipeline
  (tokenized data, encoded data, embeddings).
  """

  def __init__(self, data_dir, dataset_id, dataset_file, columns, label_col,
               sample_size=None, left_prefix='left_', right_prefix='right_',
               seed=24):
    """
    Load and validate the dataset.

    Args:
      data_dir (str): directory containing `dataset_file`.
      dataset_id (str): identifier of the dataset; sampling is applied only
        when it equals 'train'.
      dataset_file (str): CSV file name inside `data_dir`.
      columns (list): un-prefixed attribute names; each must exist in the file
        with both `left_prefix` and `right_prefix`.
      label_col (str): name of the label column (1 = match, 0 = non-match).
      sample_size (int, optional): number of rows to sample from the train set.
      left_prefix (str): prefix of the left-record columns.
      right_prefix (str): prefix of the right-record columns.
      seed (int): random state used for sampling and shuffling.

    Raises:
      TypeError: if a parameter has the wrong type.
      ValueError: if the directory, the file or a column is missing, if no
        column is provided or if `sample_size` is not in [1, number of rows].
    """

    # check parameter types
    if not isinstance(data_dir, str):
      raise TypeError("Wrong data dir type.")

    if not isinstance(dataset_id, str):
      raise TypeError("Wrong dataset id type.")

    if not isinstance(dataset_file, str):
      raise TypeError("Wrong dataset file type.")

    if not isinstance(columns, list):
      raise TypeError("Wrong columns type.")

    if not isinstance(label_col, str):
      raise TypeError("Wrong label column type.")

    if sample_size is not None:
      if not isinstance(sample_size, int):
        raise TypeError("Wrong sample size type.")

    if not isinstance(left_prefix, str):
      raise TypeError("Wrong left prefix type.")

    if not isinstance(right_prefix, str):
      raise TypeError("Wrong right prefix type.")

    # check parameter values
    if not os.path.exists(data_dir):
      raise ValueError("Data directory not found.")

    dataset_path = os.path.join(data_dir, dataset_file)
    if not os.path.exists(dataset_path):
      raise ValueError("Dataset file not found.")

    data = pd.read_csv(dataset_path)

    if len(columns) == 0:
      # Must be raised: returning a value from __init__ is itself an error.
      raise ValueError("No columns provided.")

    for col in columns:
      for prefix in [left_prefix, right_prefix]:
        c = "{}{}".format(prefix, col)
        if c not in data.columns.values:
          raise ValueError("No column {} found.".format(c))

    if label_col not in data.columns.values:
      raise ValueError("No column {} found.".format(label_col))

    if sample_size is not None and dataset_id == 'train':

      print("Getting {} data sample with {} elements.".format(dataset_id, sample_size))

      # Validate before any use: a non-positive or oversized sample would
      # otherwise fail later with an unrelated error (e.g. division by zero).
      if sample_size <= 0 or sample_size > len(data):
        raise ValueError("Wrong sample size value.")

      match = data[data[label_col] == 1]
      non_match = data[data[label_col] == 0]
      match_ratio = len(match) / len(data)

      # Keep the original match / non-match proportion in the sample.
      match_sample_size = int(round(match_ratio * sample_size))
      match_sample = match.sample(n=match_sample_size, random_state=seed)
      non_match_sample = non_match.sample(n=sample_size - len(match_sample),
                                          random_state=seed)
      data_sample = pd.concat([match_sample, non_match_sample])
      # frac=1 shuffles the rows so matches and non-matches are interleaved.
      data = data_sample.sample(frac=1, random_state=seed)
      print("Data sample match ratio: {}".format((len(data[data[label_col]==1]) / len(data))*100))

    self.nrows = data.shape[0]
    self.ncols = len(columns)
    # Percentage of rows labelled as match.
    self.match_ratio = (len(data[data[label_col]==1]) / len(data))*100

    self.dataset_id = dataset_id
    self.label_col = label_col
    self.all_left_cols = ["{}{}".format(left_prefix, c) for c in columns]
    self.all_right_cols = ["{}{}".format(right_prefix, c) for c in columns]
    self.all_cols = self.all_left_cols + self.all_right_cols
    # Independent copy restricted to the requested left/right attributes.
    self.complete_data = data[self.all_cols].copy()

    data = data.drop(["{}id".format(left_prefix), "{}id".format(right_prefix)], axis=1)

    self.labels = data[label_col]
    # Features: every remaining column except the label.
    self.data = data.drop([label_col], axis=1)
    self.columns = columns.copy()
    self.left_cols = self.all_left_cols
    self.right_cols = self.all_right_cols
    self.left_prefix = left_prefix
    self.right_prefix = right_prefix

    # Filled in later through the corresponding setters.
    self.tokenized_data = None
    self.encoded_data = None
    self.embeddings = None

  def get_complete_data(self):
    """Return the left/right attribute columns selected by `columns`."""
    return self.complete_data

  def get_data(self):
    """Return the features (loaded data without id and label columns)."""
    return self.data

  def get_stats(self):
    """Return a dict with the row count, attribute count and match percentage."""
    return {"nrows": self.nrows, "ncols": self.ncols, "match_ratio": self.match_ratio}

  def get_labels(self):
    """Return the label column as a pandas Series."""
    return self.labels

  def get_label_col(self):
    """Return the name of the label column."""
    return self.label_col

  def get_dataset_id(self):
    """Return the dataset identifier given at construction time."""
    return self.dataset_id

  def get_columns(self):
    """Return the un-prefixed attribute names."""
    return self.columns

  def get_left_columns(self):
    """Return the attribute names prefixed with the left prefix."""
    return self.left_cols

  def get_right_columns(self):
    """Return the attribute names prefixed with the right prefix."""
    return self.right_cols

  def set_tokenized_data(self, tokenized_data):
    """Store the tokenized version of the dataset."""
    self.tokenized_data = tokenized_data

  def get_tokenized_data(self):
    """Return the stored tokenized data (None when not set)."""
    return self.tokenized_data

  def set_encoded_data(self, encoded_data):
    """Store the encoded version of the dataset."""
    self.encoded_data = encoded_data

  def get_encoded_data(self):
    """Return the stored encoded data (None when not set)."""
    return self.encoded_data

  def set_embeddings(self, embeddings):
    """Store the final embeddings of the dataset."""
    self.embeddings = embeddings

  def get_embeddings(self):
    """Return the stored embeddings (None when not set)."""
    return self.embeddings

  def check_em_adapter_application(self):
    """Return True only when tokenized data, encoded data and embeddings are all set."""
    return self.tokenized_data is not None and self.encoded_data is not None and self.embeddings is not None

In [5]:
class DataCollectionContainer(object):
  """
  Groups the train, valid and test DataContainer objects of one collection.

  The three containers are built from files living in the same directory and
  share the same attribute columns, label column and column prefixes.
  """

  def __init__(self, data_dir, collection_id, train_file, valid_file, test_file,
               columns, label_col, sample_size=None, left_prefix='left_',
               right_prefix='right_'):
    """
    Build one DataContainer per split.

    Args:
      data_dir (str): directory containing the three CSV files.
      collection_id (str): identifier of the collection.
      train_file, valid_file, test_file (str): file names of the splits.
      columns (list): un-prefixed attribute names.
      label_col (str): name of the label column.
      sample_size (int, optional): forwarded to every DataContainer.
      left_prefix, right_prefix (str): prefixes of the left/right columns.

    Raises:
      TypeError: if `collection_id` is not a string (other parameters are
        validated by DataContainer).
    """

    if not isinstance(collection_id, str):
      raise TypeError("Wrong collection id type.")

    self.collection_id = collection_id

    def build_split(split_id, split_file):
      # Every split shares the same schema and sampling configuration.
      return DataContainer(data_dir, split_id, split_file, columns, label_col,
                           sample_size, left_prefix, right_prefix)

    self.train = build_split("train", train_file)
    self.valid = build_split("valid", valid_file)
    self.test = build_split("test", test_file)
    self.files = {"train": self.train, "valid": self.valid, "test": self.test}

    left_names, right_names = [], []
    for attribute in columns:
      left_names.append("{}{}".format(left_prefix, attribute))
      right_names.append("{}{}".format(right_prefix, attribute))

    self.left_cols = left_names
    self.right_cols = right_names
    self.all_cols = left_names + right_names + [label_col]
    self.columns = columns
    self.label_col = label_col

  def get_data(self, data_id):
    """
    Return the container of the requested split.

    Args:
      data_id (str): "train", "valid" or "test".

    Raises:
      TypeError: if `data_id` is not a string.
      ValueError: if `data_id` is not a known split.
    """

    if not isinstance(data_id, str):
      raise TypeError("Wrong data_id type.")

    if data_id in self.files:
      return self.files[data_id]

    raise ValueError("Wrong data_id value.")

  def get_stats(self):
    """
    Return the statistics of every split in a single flat dict.

    Keys have the form "<split>_<statistic>", e.g. "train_nrows".
    """

    return {
      "{}_{}".format(split_id, stat_name): stat_value
      for split_id, container in self.files.items()
      for stat_name, stat_value in container.get_stats().items()
    }

  def get_columns(self):
    """Return the un-prefixed attribute names."""
    return self.columns

  def get_label_col(self):
    """Return the name of the label column."""
    return self.label_col

  def get_columns_and_label(self):
    """Return left columns, right columns and the label column, in this order."""
    return self.all_cols

  def get_left_columns(self):
    """Return the attribute names prefixed with the left prefix."""
    return self.left_cols

  def get_right_columns(self):
    """Return the attribute names prefixed with the right prefix."""
    return self.right_cols

In [6]:
# cols = list(pd.read_csv(os.path.join(data_dir, "tableA.csv")).columns.values)[1:]
# label_col = "label"
# sample_size = 500
# data_container_col = DataCollectionContainer(data_dir, use_case, "train.csv", "valid.csv", "test.csv", cols, label_col, sample_size)

In [7]:
# cols = list(pd.read_csv(os.path.join(data_dir, "tableA.csv")).columns.values)[1:]
# label_col = "label"
# sample_size = 200
# data_container = DataContainer(data_dir, "test", "test.csv", cols, label_col, sample_size)

# EM Adapter

## Tokenizer

In [8]:
class EM_Tokenizer(object):
  """
  Turns pairs of entity records into the inputs of a Huggingface tokenizer.

  Several strategies are available to convert the two records into sentence
  pairs (one sentence per record, one per attribute, incremental prefixes of
  attributes, attribute-tagged sentences); the resulting sentence pairs are
  then passed to the wrapped tokenizer.
  """

  def __init__(self, name, max_length=512):
    """
    Load the pre-trained Huggingface tokenizer associated with `name`.

    Args:
      name (str): "bert", "distilbert", "albert", "roberta" or "xlnet".
      max_length (int): maximum sequence length handed to the tokenizer.

    Raises:
      TypeError: if `name` is not a string.
      ValueError: if `name` is not a supported tokenizer.
    """

    if not isinstance(name, str):
      raise TypeError("Wrong tokenizer name type.")

    pretrained = {
      "bert": (BertTokenizer, 'bert-base-uncased'),
      "distilbert": (DistilBertTokenizer, 'distilbert-base-uncased'),
      "albert": (AlbertTokenizer, 'albert-base-v2'),
      "roberta": (RobertaTokenizer, 'roberta-base'),
      "xlnet": (XLNetTokenizer, 'xlnet-base-cased'),
    }
    if name not in pretrained:
      raise ValueError("Tokenizer not found.")

    tokenizer_cls, checkpoint = pretrained[name]

    self.tokenizer = tokenizer_cls.from_pretrained(checkpoint)
    self.max_length = max_length
    # Attribute tags registered in the tokenizer; None until first needed.
    self.additional_special_tokens = None

  def set_max_length(self, max_length):
    """Replace the maximum sequence length handed to the tokenizer."""
    self.max_length = max_length

  def _tokenize_pair_sent(self, entity1, entity2):
    """Build one sentence per record: its values joined by single spaces."""

    left_sentence = ' '.join(map(str, entity1.to_list()))
    right_sentence = ' '.join(map(str, entity2.to_list()))

    return [left_sentence], [right_sentence]

  def _tokenize_pair_attr(self, entity1, entity2):
    """Build one sentence per attribute, pairing the attributes by position."""

    left_sentences, right_sentences = [], []
    for left_col, right_col in zip(list(entity1.index), list(entity2.index)):
      left_sentences.append(str(entity1[left_col]))
      right_sentences.append(str(entity2[right_col]))

    return left_sentences, right_sentences

  def _tokenize_pair_incremental_attr(self, entity1, entity2):
    """
    Build growing sentences: the i-th sentence joins the first i attributes.

    While the accumulated sentence is still empty the next value replaces it
    instead of being appended after a space.
    """

    left_sentences, right_sentences = [], []
    left_acc, right_acc = "", ""

    for left_col, right_col in zip(list(entity1.index), list(entity2.index)):
      left_value = str(entity1[left_col])
      right_value = str(entity2[right_col])

      left_acc = left_value if left_acc == "" else left_acc + " {}".format(left_value)
      right_acc = right_value if right_acc == "" else right_acc + " {}".format(right_value)

      left_sentences.append(left_acc)
      right_sentences.append(right_acc)

    return left_sentences, right_sentences

  def _tokenize_pair_sent_attr(self, entity1, entity2):
    """
    Build one sentence per record where every value is wrapped by a
    "[<attribute>]" tag on both sides.

    On the first call (no tags registered yet) the tags of the first record
    are added to the wrapped tokenizer's vocabulary.
    """

    def tagged_sentence(entity):
      pieces = []
      for col in entity.index:
        tag = "[{}]".format(col)
        pieces.append("{} {} {}".format(tag, entity[col], tag))
      return " ".join(pieces)

    new_tags = []
    if self.additional_special_tokens is None:
      new_tags = ["[{}]".format(col) for col in entity1.index]

    left_sentence = tagged_sentence(entity1)

    if len(new_tags) > 0:
      print("Adding new special tokens: {}".format(new_tags))
      self.additional_special_tokens = new_tags
      self.tokenizer.add_tokens(self.additional_special_tokens)
      print("Now the tokens are: {} (+{})".format(len(self.tokenizer), len(new_tags)))

    right_sentence = tagged_sentence(entity2)

    return [left_sentence], [right_sentence]

  def tokenize(self, entity1, entity2, method):
    """
    This method tokenizes a pair of entity mentions.

    Missing values are replaced by the tokenizer's unknown token, the records
    are converted into sentence pairs according to `method` and the pairs are
    tokenized with padding/truncation to `self.max_length`.

    Args:
      entity1, entity2 (pandas.Series): the two records, indexed by attribute.
      method (str): tokenization strategy name.

    Returns:
      The output of the wrapped tokenizer (PyTorch tensors requested).
    """

    missing_marker = self.tokenizer.unk_token
    left = entity1.fillna(missing_marker)
    right = entity2.fillna(missing_marker)

    # NOTE(review): "pair-sent-incremental-attr" combines the plain
    # per-attribute sentences with the tagged sentence; the incremental helper
    # is not used here — confirm this is intended.
    strategies = {
      'pair-sent': (self._tokenize_pair_sent,),
      'pair-attr': (self._tokenize_pair_attr,),
      'pair-incremental-attr': (self._tokenize_pair_incremental_attr,),
      'pair-sent-attr': (self._tokenize_pair_sent_attr,),
      "pair-sent-incremental-attr": (self._tokenize_pair_attr,
                                     self._tokenize_pair_sent_attr),
    }

    sentences1, sentences2 = [], []
    for build_sentences in strategies.get(method, ()):
      left_part, right_part = build_sentences(left, right)
      sentences1.extend(left_part)
      sentences2.extend(right_part)

    return self.tokenizer(sentences1, sentences2,
                          padding='max_length',
                          truncation=True,
                          return_tensors="pt",
                          max_length=self.max_length,
                          add_special_tokens=True,
                          pad_to_max_length=True,
                          return_attention_mask=True)

  def tokenize_dataset(self, data_container, method, force=True):
    """
    This method tokenizes all rows of a EM dataset.

    Args:
      data_container (DataContainer): dataset to tokenize.
      method (str): tokenization strategy name.
      force (bool): when False and the container already stores tokenized
        data, that data is returned without tokenizing again.

    Returns:
      list: one tokenizer output per dataset row.

    Raises:
      TypeError: if `data_container` or `method` has the wrong type.
      ValueError: if `method` is not a supported strategy.
    """

    if not isinstance(data_container, DataContainer):
      raise TypeError("Wrong data container type.")

    if not isinstance(method, str):
      raise TypeError("Wrong method type.")

    supported = ("pair-sent", "pair-attr", "pair-sent-attr",
                 "pair-incremental-attr", "pair-sent-incremental-attr")
    if method not in supported:
      raise ValueError("Tokenization method not found.")

    if not force:
      cached = data_container.get_tokenized_data()
      if cached:
        return cached

    # NOTE(review): this overrides the max_length given to the constructor or
    # to set_max_length(); the 512 branch is taken only when the dataset id
    # starts with "Textual".
    if method == 'pair-attr':
      self.max_length = 128
    elif data_container.get_dataset_id().startswith("Textual"):
      self.max_length = 512
    else:
      self.max_length = 256

    frame = data_container.get_data()
    left_columns = data_container.get_left_columns()
    right_columns = data_container.get_right_columns()
    columns = data_container.get_columns()

    tokenized_rows = []
    for _, row in tqdm(frame.iterrows()):
      # Re-index both records with the un-prefixed names so that left and
      # right attributes share the same labels.
      left_record = row[left_columns]
      left_record.index = columns
      right_record = row[right_columns]
      right_record.index = columns
      tokenized_rows.append(self.tokenize(left_record, right_record, method))

    return tokenized_rows

In [9]:
# # tokenize_method = 'pair-sent'
# #tokenize_method = 'pair-attr'
# # tokenize_method = 'pair-incremental-attr'
# tokenize_method = 'pair-sent-attr'
# #tokenize_method = 'pair-sent-incremental-attr'

# em_tokenizer = EM_Tokenizer('bert')
# tokenized_data = em_tokenizer.tokenize_dataset(data_container, tokenize_method)
# data_container.set_tokenized_data(tokenized_data)
# tokenized_data

## Embedder

In [10]:
class Embedder(object):
  """
  Component that encodes tokenized data into pre-trained embeddings.

  It wraps a Huggingface pre-trained model configured to return all hidden
  states and exposes the mean (over the token dimension) of the last hidden
  state as the encoding of each sentence pair.
  """

  # Models whose forward pass is fed with token_type_ids.
  _TOKEN_TYPE_MODELS = ("bert", "albert", "xlnet")

  def __init__(self, name, em_tokenizer=None):
    """
    This method initializes the Embedder by selecting the Huggingface's
    pre-trained models.

    Args:
      name (str): "bert", "distilbert", "albert", "roberta" or "xlnet".
      em_tokenizer (EM_Tokenizer, optional): when given, the model's embedding
        matrix is resized to the size of its wrapped tokenizer.

    Raises:
      TypeError: if `name` is not a string.
      ValueError: if `name` is not a supported model.
    """

    if not isinstance(name, str):
      raise TypeError("Wrong model name type.")

    pretrained = {
      "bert": (BertModel, 'bert-base-uncased'),
      "distilbert": (DistilBertModel, 'distilbert-base-uncased'),
      "albert": (AlbertModel, 'albert-base-v2'),
      "roberta": (RobertaModel, 'roberta-base'),
      "xlnet": (XLNetModel, 'xlnet-base-cased'),
    }
    if name not in pretrained:
      raise ValueError("Model not found.")

    self.name = name

    model_cls, checkpoint = pretrained[name]
    self.embedder = model_cls.from_pretrained(checkpoint, output_hidden_states=True)

    # resize the model in order to insert in the embedding matrix also new vectors
    # for the new added special tokens
    # the vectors of these special tokens will be randomly initialized
    if em_tokenizer:
      print("Embedding matrix resize: the new dimension is {}.".format(len(em_tokenizer.tokenizer)))
      self.embedder.resize_token_embeddings(len(em_tokenizer.tokenizer))

    # Prefer the first GPU, but keep working on CPU-only machines.
    self.device = self._select_device()

    self.embedder.eval()
    self.embedder = self.embedder.to(self.device)

  @staticmethod
  def _select_device():
    """Return the first CUDA device when available, the CPU otherwise."""
    if torch.cuda.is_available():
      return torch.device("cuda:0")
    return torch.device("cpu")

  def get_encoded_data(self, data):
    """
    Encode one tokenizer output with the pre-trained model.

    Args:
      data (BatchEncoding): must contain "input_ids" and "attention_mask",
        plus "token_type_ids" for bert, albert and xlnet.

    Returns:
      torch.Tensor: last hidden state averaged over dimension 1 and squeezed.
      Because of the squeeze, a batch holding a single sentence pair yields a
      tensor without the leading batch dimension.

    Raises:
      TypeError: if `data` is not a BatchEncoding.
      ValueError: if a required entry is missing.
    """

    if not isinstance(data, BatchEncoding):
      raise TypeError("Wrong data type.")

    for param in ["input_ids", "attention_mask"]:
      if param not in data:
        raise ValueError("Wrong data value.")

    uses_token_types = self.name in self._TOKEN_TYPE_MODELS
    if uses_token_types and "token_type_ids" not in data:
      raise ValueError("Wrong data value.")

    model_inputs = {
      "input_ids": data["input_ids"].to(self.device),
      "attention_mask": data["attention_mask"].to(self.device),
    }
    if uses_token_types:
      model_inputs["token_type_ids"] = data["token_type_ids"].to(self.device)

    with torch.no_grad():
      outputs = self.embedder(**model_inputs)

    # Position of the hidden-states tuple in the model output: distilbert and
    # xlnet expose it at index 1, the other models at index 2.
    if self.name == 'distilbert' or self.name == 'xlnet':
      hidden_states = outputs[1]
    else:
      hidden_states = outputs[2]

    # get last layer and take the mean over the token dimension
    # (an alternative would be concatenating the last four layers first)
    encoded_data = torch.mean(hidden_states[-1], dim=1).squeeze()

    return encoded_data

  def get_encoded_batch_data(self, batch_data):
    """
    Encode a list of tokenizer outputs, one at a time.

    Args:
      batch_data (list): items accepted by `get_encoded_data`.

    Returns:
      list: one encoded tensor per input item, in the same order.

    Raises:
      TypeError: if `batch_data` is not a list.
    """

    if not isinstance(batch_data, list):
      raise TypeError("Wrong batch data type.")

    encoded_batch_data = []
    for data in tqdm(batch_data):
      encoded_batch_data.append(self.get_encoded_data(data))

    return encoded_batch_data

In [11]:
# emb = Embedder("bert", em_tokenizer=em_tokenizer)
# encoded_data = emb.get_encoded_batch_data(tokenized_data)
# data_container.set_encoded_data(encoded_data)
# encoded_data

## Combiner

In [12]:
class Combiner(object):
  """
  Collapses the encoded sentences of a record pair into a single embedding.
  """

  def __init__(self):
    pass

  def combine(self, embeddings, method):
    """
    Combine the embeddings of one record pair into a numpy array.

    Args:
      embeddings (torch.Tensor): either a 2-D tensor with one row per encoded
        sentence, or a 1-D tensor that already is a single embedding vector.
      method (str): combination method; only 'avg' is available.

    Returns:
      numpy.ndarray: the row-wise average for a 2-D tensor with more than one
      row; otherwise the input converted to numpy with its shape unchanged.

    Raises:
      TypeError: if `method` is not a string.
      ValueError: if `method` is unknown or `embeddings` is not a tensor.
    """

    if not isinstance(method, str):
      raise TypeError("Wrong method type.")

    methods = ['avg']
    if method not in methods:
      raise ValueError("Wrong method value.")

    if not torch.is_tensor(embeddings):
      raise ValueError("Wrong embeddings data value.")

    shape = list(embeddings.size())

    # Only a tensor with at least two dimensions holds several rows. A 1-D
    # tensor is one embedding vector: averaging it over dimension 0 would
    # collapse the whole vector into a single scalar.
    if len(shape) > 1 and shape[0] > 1:

      if method == 'avg':
        summary_vec = torch.mean(embeddings, dim=0)

      return summary_vec.cpu().detach().numpy()

    # no combiner is needed, but a new data format is generated
    return embeddings.cpu().detach().numpy()

  def combine_batch_data(self, list_embeddings, method):
    """
    Combine the embeddings of every record pair of a dataset.

    Args:
      list_embeddings (list): one tensor per record pair (see `combine`).
      method (str): combination method; only 'avg' is available.

    Returns:
      numpy.ndarray: matrix with one row per record pair. The number of
      columns is taken from the first tensor of the list.

    Raises:
      TypeError: if `list_embeddings` is not a list.
      ValueError: if `list_embeddings` is empty.
    """

    if not isinstance(list_embeddings, list):
      raise TypeError("Wrong embeddings type.")

    if len(list_embeddings) == 0:
      raise ValueError("Empty embedding list provided.")

    nrows = len(list_embeddings)
    emb_size = list(list_embeddings[0].size())
    ncols = emb_size[0]
    if len(emb_size) > 1:
      ncols = emb_size[1]
    out_embeddings = np.empty((nrows, ncols))

    for idx, embeddings in enumerate(list_embeddings):

      out_embedding = self.combine(embeddings, method)
      # Flatten so that a (1, ncols) result fits the row as well.
      out_embeddings[idx, :] = out_embedding.reshape(-1)

    return out_embeddings

In [13]:
# combiner_method = 'avg'
# combiner = Combiner()
# embeddings = combiner.combine_batch_data(encoded_data, combiner_method)
# data_container.set_embeddings(embeddings)
# embeddings

## EM adapter

In [14]:
class EM_Adapter(object):
  """
  Runs the tokenize -> encode -> combine pipeline on one data container.

  Every stage stores its output in the data container and records its elapsed
  time (in seconds) in `self.times`.
  """

  def __init__(self, data_container, model):
    """
    Args:
      data_container: container of the dataset to transform.
      model: name of the pre-trained model used by tokenizer and embedder.
    """

    self.data_container = data_container
    self.model = model
    # Pipeline components, created lazily by the corresponding stage.
    self.em_tokenizer = None
    self.embedder = None
    self.combiner = None
    self.times = {}

  def _tokenize_data(self, tokenize_method):
    """Tokenize the container's data, store and return the result."""

    print("\nApplying {} tokenization...".format(tokenize_method))

    started_at = time.time()
    self.em_tokenizer = EM_Tokenizer(self.model)
    tokenized = self.em_tokenizer.tokenize_dataset(self.data_container, tokenize_method)
    self.times["tokenization"] = time.time() - started_at

    print("Adding tokenized data to data container.")
    self.data_container.set_tokenized_data(tokenized)
    print("Tokenization {} applied successfully.\n".format(tokenize_method))

    return tokenized

  def _encode_data(self, tokenized_data):
    """Encode the tokenized data, store and return the result."""

    print("\nEncoding data with {}...".format(self.model))

    started_at = time.time()
    # The tokenizer is handed over so the embedder can resize its embedding
    # matrix to the tokenizer's vocabulary.
    self.embedder = Embedder(self.model, em_tokenizer=self.em_tokenizer)
    encoded = self.embedder.get_encoded_batch_data(tokenized_data)
    self.times["embedding"] = time.time() - started_at

    print("Adding encoded data to data container.")
    self.data_container.set_encoded_data(encoded)
    print("Data encoded successfully.\n")

    return encoded

  def _create_embeddings(self, encoded_data, combiner_method):
    """Combine the encoded data into the embedding matrix, store and return it."""

    print("\nCreating {} embeddings...".format(self.model))

    started_at = time.time()
    self.combiner = Combiner()
    matrix = self.combiner.combine_batch_data(encoded_data, combiner_method)
    self.times["combining"] = time.time() - started_at

    print("Embedding matrix: {}".format(matrix.shape))
    print("Adding embeddings to data container.")
    self.data_container.set_embeddings(matrix)
    print("{} embeddings created successfully.\n".format(self.model))

    return matrix

  def transform_data(self, tokenize_method, combiner_method):
    """
    Run the three stages in sequence.

    Args:
      tokenize_method: tokenization strategy passed to the tokenizer.
      combiner_method: combination method passed to the combiner.

    Returns:
      The embeddings produced by the combiner.
    """

    return self._create_embeddings(
      self._encode_data(self._tokenize_data(tokenize_method)),
      combiner_method,
    )

  def get_times(self):
    """Return the elapsed time of each executed stage, keyed by stage name."""
    return self.times

In [15]:
# em_adapter = EM_Adapter(data_container, "bert")

# # tokenize_method = 'pair-sent'
# #tokenize_method = 'pair-attr'
# # tokenize_method = 'pair-incremental-attr'
# #tokenize_method = 'pair-sent-attr'
# tokenize_method = 'pair-sent-incremental-attr'

# combiner_method = 'avg'

# adapted_data = em_adapter.transform_data(tokenize_method, combiner_method)
# adapted_data

In [16]:
class EMAdapterCollection(object):
  """
  Applies the same EM adapter configuration to the train and test splits of
  a data collection.
  """

  def __init__(self, container_collection, model):
    """
    Create one EM_Adapter for the train split and one for the test split.

    Args:
      container_collection (DataCollectionContainer): source of the splits.
      model (str): name of the pre-trained model used by both adapters.

    Raises:
      TypeError: if a parameter has the wrong type.
    """

    if not isinstance(container_collection, DataCollectionContainer):
      raise TypeError("Wrong container collection data type.")

    if not isinstance(model, str):
      raise TypeError("Wrong model data type.")

    print("\nInitializing EM adapter based on {} model...".format(model))

    self.train_em_adapter = EM_Adapter(container_collection.get_data("train"), model)
    self.test_em_adapter = EM_Adapter(container_collection.get_data("test"), model)

    # Per-split timings, filled in by transform_data().
    self.times = {}

    print("EM adapter initialized successfully.\n")

  def transform_data(self, tokenize_method, combiner_method):
    """
    Transform the train split first and the test split afterwards.

    Args:
      tokenize_method: tokenization strategy passed to both adapters.
      combiner_method: combination method passed to both adapters.

    Returns:
      tuple: (train embeddings, test embeddings).
    """

    stages = (
      ("train", self.train_em_adapter,
       "\nAdapting train data with {} tokenization and {} combiner...",
       "Train data adapted successfully.\n"),
      ("test", self.test_em_adapter,
       "\nAdapting test data with {} tokenization and {} combiner...",
       "Test data adapted successfully.\n"),
    )

    produced = []
    for split_id, adapter, start_message, done_message in stages:
      print(start_message.format(tokenize_method, combiner_method))
      produced.append(adapter.transform_data(tokenize_method, combiner_method))
      self.times[split_id] = adapter.get_times()
      print(done_message)

    train_embeddings, test_embeddings = produced
    return train_embeddings, test_embeddings

  def get_times(self):
    """Return the timings of each split, keyed by "train" and "test"."""
    return self.times

In [17]:
# em_adapter_col = EMAdapterCollection(data_container_col, "bert")

# #tokenize_method = 'pair-sent'
# #tokenize_method = 'pair-attr'
# tokenize_method = 'pair-incremental-attr'
# #tokenize_method = 'pair-sent-attr'
# #tokenize_method = 'pair-sent-incremental-attr'

# combiner_method = 'avg'

# train_adapted_data, test_adapted_data = em_adapter_col.transform_data(tokenize_method, combiner_method)

# AutoML

In [18]:
import time
from sklearn.metrics import f1_score

class AutoML(object):
  """Train and evaluate a classifier with one of several AutoML libraries.

  Features are the embeddings stored in the "train" and "test" containers of
  a DataCollectionContainer and targets are their labels. Supported back-ends
  are "autosklearn", "h2o" and "autogluon". The library objects used here
  (autosklearn, AutoSklearn2Classifier, h2o, H2OAutoML) are resolved as
  globals when the matching back-end is selected, so they must already be
  imported at that point; AutoGluon is imported lazily inside fit().
  """

  # The only AutoGluon ValueError that fit() tolerates: the model is then left
  # unset (None) and predict()/get_f1_score() return None.
  _AUTOGLUON_NO_MODELS_MSG = "AutoGluon did not successfully train any models"

  def __init__(self, name, container_collection, train_time=None, seed=24):
    """Collect the data and, for autosklearn/h2o, build the AutoML object.

    Args:
      name: back-end name, one of "autosklearn", "h2o", "autogluon".
      container_collection: DataCollectionContainer with "train" and "test"
        containers that already carry embeddings.
      train_time: optional time budget forwarded to the back-end; a falsy
        value means that no limit is passed.
      seed: random seed forwarded to the back-end.

    Raises:
      TypeError: if `name` or `container_collection` has the wrong type.
      ValueError: if `name` is not a supported back-end.
      AssertionError: if the test labels are all 0 or all 1.
    """

    if not isinstance(name, str):
      raise TypeError("Wrong name data type.")

    if not isinstance(container_collection, DataCollectionContainer):
      raise TypeError("Wrong container collection type.")

    automl_names = ["autosklearn", "h2o", "autogluon"]
    if name not in automl_names:
      raise ValueError("Wrong name value.")

    self.name = name
    self.label_col = container_collection.get_label_col()
    self.train = container_collection.get_data("train")
    self.test = container_collection.get_data("test")
    # NOTE(review): presumably fails when the EM adapter has not been applied
    # to the container yet -- confirm against the container implementation.
    self.train.check_em_adapter_application()
    self.test.check_em_adapter_application()
    self.library = None

    self.x_train = self.train.get_embeddings()
    self.y_train = self.train.get_labels()
    self.x_train_tabular = pd.DataFrame(self.x_train)
    self.y_train_tabular = pd.DataFrame(self.y_train)
    # Features followed by the label column(s), side by side.
    self.train_tabular = pd.concat([self.x_train_tabular, self.y_train_tabular], axis=1)

    self.x_test = self.test.get_embeddings()
    self.y_test = self.test.get_labels()
    # The test labels must contain both classes.
    assert not (self.y_test == 0).all()
    assert not (self.y_test == 1).all()
    self.x_test_tabular = pd.DataFrame(self.x_test)
    self.y_test_tabular = pd.DataFrame(self.y_test)
    self.test_tabular = pd.concat([self.x_test_tabular, self.y_test_tabular], axis=1)

    self.train_time = train_time
    self.seed = seed
    # For "autogluon" the model is only created by fit().
    self.model = None

    if name == 'autosklearn':

      model_args = {"n_jobs": -1, "metric": autosklearn.metrics.f1,
                    "seed": seed}
      if train_time:
        model_args["time_left_for_this_task"] = self.train_time
      self.model = AutoSklearn2Classifier(**model_args)

    elif name == 'h2o':

      self.library = h2o

      # H2O works on its own frame type; the label is turned into a factor.
      self.train_tabular = h2o.H2OFrame(self.train_tabular)
      self.train_tabular[self.label_col] = self.train_tabular[self.label_col].asfactor()

      self.test_tabular = h2o.H2OFrame(self.test_tabular)
      self.test_tabular[self.label_col] = self.test_tabular[self.label_col].asfactor()

      model_args = {"seed": seed, "stopping_metric": "AUC",
                    "sort_metric": "AUC"}
      if train_time:
        model_args["max_runtime_secs"] = self.train_time
      self.model = H2OAutoML(**model_args)

  def _fit_autogluon(self):
    """Train an AutoGluon predictor and store it in self.model.

    Only the "did not successfully train any models" ValueError is tolerated
    (self.model becomes None); any other ValueError is re-raised instead of
    being silently dropped.
    """

    from autogluon.tabular import TabularPrediction as task

    hyperparameters = {  # hyperparameters of each model type
                 'NN': {'seed_value': self.seed},
                 'GBM': {'seed': self.seed},
                 'CAT': {'random_seed': self.seed},
                 'RF': {'random_state': self.seed},
                 'XT': {'random_state': self.seed},
                 'KNN': {'n_jobs': -1},
                 'LR': {'random_state': self.seed}
                }

    fit_args = {"train_data": self.train_tabular, "label": self.label_col,
                "eval_metric": "f1", "presets": 'best_quality',
                "auto_stack": True, "random_seed": self.seed,
                "hyperparameters": hyperparameters}
    if self.train_time:
      fit_args["time_limits"] = self.train_time

    try:
      self.model = task.fit(**fit_args)
    except ValueError as e:
      if str(e) != self._AUTOGLUON_NO_MODELS_MSG:
        raise
      self.model = None

  def fit(self):
    """Train the selected back-end on the train data.

    An autosklearn failure is printed and leaves self.model set to None; the
    AutoGluon "no models trained" failure also leaves it None.

    Returns:
      Elapsed wall-clock training time in seconds.

    Raises:
      ValueError: for AutoGluon errors other than the tolerated one.
    """

    print("Strating training process...")

    ts = time.time()

    if self.name == 'autosklearn':

      # Deliberate best-effort: report the problem and continue without model.
      try:
        self.model.fit(self.x_train, self.y_train)
      except Exception as e:
        print(e)
        self.model = None

    elif self.name == 'h2o':

      # Every column except the last one is used as a predictor.
      self.model.train(x=self.train_tabular.columns[:-1], y=self.label_col,
                       training_frame=self.train_tabular)

    elif self.name == 'autogluon':

      self._fit_autogluon()

    ts2 = time.time()

    elapsed_time = ts2 - ts

    print("Training process completed in {}s.".format(elapsed_time))

    return elapsed_time

  def predict(self):
    """Predict labels for the test data with the trained model.

    Returns:
      Tuple (y_pred, elapsed_time). y_pred is None when no model is
      available; for h2o it is the first column of the prediction frame
      converted with the library's as_list().
    """

    print("Strating prediction process...")

    ts = time.time()

    # Each back-end receives the test data in the form it was trained on.
    if self.name == 'h2o':
      x_test = self.test_tabular
    elif self.name == 'autosklearn':
      x_test = self.x_test_tabular.values
    else:
      x_test = self.x_test_tabular

    y_pred = None
    if self.model is not None:
      y_pred = self.model.predict(x_test)

      if self.name == 'h2o':
        y_pred = self.library.as_list(y_pred).iloc[:, 0]

    ts2 = time.time()
    elapsed_time = ts2 - ts

    print("Prediction process completed in {}s.".format(elapsed_time))

    return y_pred, elapsed_time

  def get_f1_score(self, y_pred):
    """Return the F1 score of `y_pred` against the test labels.

    Returns None when `y_pred` is None (no model was available).
    """

    if y_pred is None:
      return None

    return f1_score(self.y_test, y_pred)
    

In [19]:
# # model = 'autosklearn'
# # model = 'h2o'
# model = 'autogluon'
# automl = AutoML(model, data_container_col)
# automl.fit()
# y_pred, _ = automl.predict()
# f1 = automl.get_f1_score(y_pred)
# f1

# Executor

In [20]:
class ExecutorConfiguration(object):
  """Holds the parameters of one experiment in a single dictionary.

  Besides the constructor arguments the dictionary contains two derived
  entries: "data_dir" (built from the use case) and "label_col" (always
  "label").
  """

  def __init__(self, use_case, sample_size, embedding_model, tokenizer_name,
               combiner_method, automl_model, automl_train_time):
    """Store every parameter under the key named after the argument."""

    self.data = {
        "use_case": use_case,
        "data_dir": "data/{}/original/".format(use_case),
        "label_col": "label",
        "sample_size": sample_size,
        "embedding_model": embedding_model,
        "tokenizer_name": tokenizer_name,
        "combiner_method": combiner_method,
        "automl_model": automl_model,
        "automl_train_time": automl_train_time,
    }

  def get_data(self):
    """Return the underlying parameter dictionary (not a copy)."""
    return self.data

  def get_param(self, name):
    """Return the value stored under `name`; raises KeyError if unknown."""
    return self.data[name]

  def __str__(self):
    """Render the user-supplied parameters as a one-line summary."""

    template = "USE CASE: {}, SAMPLE SIZE: {}, EMBEDDING: {}, TOKENIZER: {}, COMBINER: {}, AUTOML: {}, AUTOML TRAIN TIME: {}"
    shown_keys = ("use_case", "sample_size", "embedding_model",
                  "tokenizer_name", "combiner_method", "automl_model",
                  "automl_train_time")

    return template.format(*[self.data[key] for key in shown_keys])

In [21]:
class Executor(object):
  """Runs one experiment described by an ExecutorConfiguration.

  Typical use: build the executor, call adapt_data_for_em() to create the
  embeddings, then call run() to train/evaluate the AutoML model and get a
  one-row DataFrame with configuration, timings, score and data statistics.
  """

  def __init__(self, conf):
    """Read the configuration and prepare the data containers.

    Args:
      conf: ExecutorConfiguration describing the experiment.

    Raises:
      TypeError: if `conf` is not an ExecutorConfiguration.
    """

    if not isinstance(conf, ExecutorConfiguration):
      raise TypeError("Wrong configuration data type.")

    self.conf = conf

    data_dir = self.conf.get_param("data_dir")
    use_case = self.conf.get_param("use_case")
    label_col = self.conf.get_param("label_col")
    sample_size = self.conf.get_param("sample_size")
    self.embedding_model = self.conf.get_param("embedding_model")
    self.tokenizer_name = self.conf.get_param("tokenizer_name")
    self.combiner_method = self.conf.get_param("combiner_method")
    self.automl_model = self.conf.get_param("automl_model")
    self.automl_train_time = self.conf.get_param("automl_train_time")

    # NOTE(review): presumably makes the use-case files available in
    # data_dir -- confirm against DataCollector.get_data.
    data_collector = DataCollector(data_dir)
    data_collector.get_data(use_case)

    # Attribute names are the tableA.csv columns without the first one.
    cols = list(pd.read_csv(os.path.join(data_dir, "tableA.csv")).columns.values)[1:]
    self.data = DataCollectionContainer(data_dir, use_case, "train.csv",
                                        "valid.csv", "test.csv", cols,
                                        label_col, sample_size)
    self.em_adapter_times = {}

    print(str(self.conf))

  def get_conf(self):
    """Return the configuration rendered as a string."""
    return str(self.conf)

  def adapt_data_for_em(self):
    """Create the embeddings for the train and test data.

    The returned embeddings are not used here. The per-split timings reported
    by the adapter collection are flattened into self.em_adapter_times with
    keys of the form "<split>_<task>".
    """

    em_adapter_col = EMAdapterCollection(self.data, self.embedding_model)
    em_adapter_col.transform_data(self.tokenizer_name, self.combiner_method)

    em_adapter_times = em_adapter_col.get_times()
    self.em_adapter_times = {
        "{}_{}".format(data_id, task): elapsed
        for data_id, task_times in em_adapter_times.items()
        for task, elapsed in task_times.items()
    }

  def run(self):
    """Train the AutoML model, predict on the test data and collect results.

    Returns:
      One-row pandas DataFrame containing the configuration values,
      "train_time", "test_time", "score", the EM adapter timings and the
      data statistics.
    """

    automl = AutoML(self.automl_model, self.data, self.automl_train_time)
    train_time = automl.fit()
    y_pred, pred_time = automl.predict()
    f1 = automl.get_f1_score(y_pred)

    # Work on a copy: get_data() hands out the configuration's own dict, and
    # result fields must not leak back into the configuration.
    results = dict(self.conf.get_data())
    results["train_time"] = train_time
    results["test_time"] = pred_time
    results["score"] = f1
    results.update(self.em_adapter_times)
    results.update(self.data.get_stats())

    return pd.DataFrame([results])
    

In [22]:
# ----------------------------------------------------------------------------
# Experiment configuration: pick one value per setting.
# ----------------------------------------------------------------------------

# Use case. Options:
#   "Structured/DBLP-GoogleScholar", "Structured/DBLP-ACM",
#   "Structured/Amazon-Google", "Structured/Walmart-Amazon",
#   "Structured/Beer", "Structured/iTunes-Amazon", "Structured/Fodors-Zagats",
#   "Textual/Abt-Buy", "Textual/Company",
#   "Dirty/iTunes-Amazon", "Dirty/DBLP-ACM", "Dirty/DBLP-GoogleScholar",
#   "Dirty/Walmart-Amazon"
use_case = "Structured/Beer"

# Sample size. Options: None, or a number such as 200
sample_size = None

# EM adapter -- embedding model.
# Options: "bert", "distilbert", "albert", "roberta", "xlnet"
embedding_model = "bert"

# EM adapter -- tokenizer.
# Options: "pair-sent", "pair-attr", "pair-incremental-attr",
#          "pair-sent-attr", "pair-sent-incremental-attr"
tokenizer_name = "pair-sent-attr"

# EM adapter -- combiner
combiner_method = "avg"

# AutoML library. Options: "autosklearn", "h2o", "autogluon"
automl_model = "h2o"

# AutoML train time. Options: None, or a number such as 600
automl_train_time = None

In [23]:
target_automl = automl_model
if target_automl == 'autosklearn':

  !sudo apt-get install build-essential swig
  !curl https://raw.githubusercontent.com/automl/auto-sklearn/master/requirements.txt | xargs -n 1 -L 1 pip install
  !pip install auto-sklearn

  for _ in range(3):
      try:
          import autosklearn.classification
          break
      except:
          pass
  else:
      raise ImportError("failed to import from autosklearn")

  from autosklearn.experimental.askl2 import AutoSklearn2Classifier
                
elif target_automl == 'h2o':

  !pip install h2o

  import h2o
  from h2o.automl import H2OAutoML

  h2o.init()

elif target_automl == 'autogluon':

  !pip install dask==2020.12
  !pip install --upgrade setuptools
  !pip install --upgrade "mxnet<2.0.0"
  !pip install --pre autogluon

Requirement already up-to-date: setuptools in /usr/local/lib/python3.6/dist-packages (51.1.1)
Requirement already up-to-date: mxnet<2.0.0 in /usr/local/lib/python3.6/dist-packages (1.7.0.post1)


In [24]:
# If a package error is raised, try to restart the runtime

In [25]:
# Assemble the experiment configuration from the settings chosen above.
conf = ExecutorConfiguration(
    use_case=use_case,
    sample_size=sample_size,
    embedding_model=embedding_model,
    tokenizer_name=tokenizer_name,
    combiner_method=combiner_method,
    automl_model=automl_model,
    automl_train_time=automl_train_time,
)

# Create the embeddings first, then train and evaluate the AutoML model.
executor = Executor(conf)
executor.adapt_data_for_em()
results = executor.run()

32it [00:00, 315.02it/s]

Data already downloaded.
USE CASE: Structured/Beer, SAMPLE SIZE: None, EMBEDDING: bert, TOKENIZER: pair-sent-attr, COMBINER: avg, AUTOML: autogluon, AUTOML TRAIN TIME: 600

Initializing EM adapter based on bert model...
EM adapter initialized successfully.


Adapting train data with pair-sent-attr tokenization and avg combiner...

Applying pair-sent-attr tokenization...
Adding new special tokens: ['[Beer_Name]', '[Brew_Factory_Name]', '[Style]', '[ABV]']
Now the tokens are: 30526 (+4)


268it [00:00, 314.11it/s]


Adding tokenized data to data container.
Tokenization pair-sent-attr applied successfully.


Encoding data with bert...
Embedding matrix resize: the new dimension is 30526.


100%|██████████| 268/268 [00:04<00:00, 60.26it/s]
33it [00:00, 328.54it/s]

Adding encoded data to data container.
Data encoded successfully.


Creating bert embeddings...
Embedding matrix: (268, 768)
Adding embeddings to data container.
bert embeddings created successfully.

Train data adapted successfully.


Adapting test data with pair-sent-attr tokenization and avg combiner...

Applying pair-sent-attr tokenization...
Adding new special tokens: ['[Beer_Name]', '[Brew_Factory_Name]', '[Style]', '[ABV]']
Now the tokens are: 30526 (+4)


91it [00:00, 319.37it/s]


Adding tokenized data to data container.
Tokenization pair-sent-attr applied successfully.


Encoding data with bert...
Embedding matrix resize: the new dimension is 30526.


100%|██████████| 91/91 [00:01<00:00, 57.59it/s]


Adding encoded data to data container.
Data encoded successfully.


Creating bert embeddings...
Embedding matrix: (91, 768)
Adding embeddings to data container.
bert embeddings created successfully.

Test data adapted successfully.

Strating training process...


No output_directory specified. Models will be saved in: AutogluonModels/ag-20210107_133844/
Beginning AutoGluon training ... Time limit = 600s
AutoGluon will save models to AutogluonModels/ag-20210107_133844/
AutoGluon Version:  0.0.16b20210107
Train Data Rows:    268
Train Data Columns: 768
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [0, 1]
	If 'binary' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    9743.17 MB
	Train Data (Original)  Memory Usage: 1.65 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in t

Training process completed in 406.54304122924805s.
Strating prediction process...
Prediction process completed in 0.42626523971557617s.


Unnamed: 0,use_case,data_dir,label_col,sample_size,embedding_model,tokenizer_name,combiner_method,automl_model,automl_train_time,train_time,test_time,score,train_tokenization,train_embedding,train_combining,test_tokenization,test_embedding,test_combining,train_nrows,train_ncols,train_match_ratio,valid_nrows,valid_ncols,valid_match_ratio,test_nrows,test_ncols,test_match_ratio
0,Structured/Beer,data/Structured/Beer/original/,label,,bert,pair-sent-attr,avg,autogluon,600,406.543041,0.426265,0.0,0.924592,13.237058,0.017738,0.375757,4.880841,0.011133,268,4,14.925373,91,4,15.384615,91,4,15.384615


In [None]:
# Display the one-row results DataFrame returned by executor.run().
results