In [152]:
import sys
import os
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
%load_ext autoreload
%autoreload 2
%reload_ext autoreload
import sys
from util.constants import  DATA_DIR, TASKS, NERS
from util.preprocessing import readCoNLL, get_label_name
from collections import defaultdict


def build_vocab(domains):
    word_count = defaultdict(int)
    for domain in domains:
        sentences_in_domain = readCoNLL(os.path.join(DATA_DIR, domain, "train.txt.ori"), {0: 'tokens', 1: 'labels'})
        print(" Number of sentences in {} : {}".format(domain, len(sentences_in_domain)))
        for sentence_idx in range(len(sentences_in_domain)):
            #if sentence_idx % 1000 == 0:
            #    print("{}".format(sentence_idx))
            tokens = sentences_in_domain[sentence_idx]['tokens']
            labels = sentences_in_domain[sentence_idx]['labels']
            for token_idx in range(len(tokens)):
                token = tokens[token_idx].lower()
                word_count[token] += 1
    return word_count

def build_indexes_from_domains(domains, word_count, threshold = 5):
    word2idx = {}
    label2idx = {}
    domain2label = {}
    for domain in domains:
        sentences_in_domain = readCoNLL(os.path.join(DATA_DIR, domain, "train.txt.ori"), {0: 'tokens', 1: 'labels'})
        print(" Number of sentences in {} : {}".format(domain, len(sentences_in_domain)))
        labels_in_domain = set()
        for sentence_idx in range(len(sentences_in_domain)):
            tokens = sentences_in_domain[sentence_idx]['tokens']
            labels = sentences_in_domain[sentence_idx]['labels']            
            for token_idx in range(len(tokens)):
                token = tokens[token_idx].lower()
                label = get_label_name(labels[token_idx])
                if label not in label2idx.keys() and label != "O":
                    label2idx[label] = len(label2idx)
                    labels_in_domain.add(label)
                if token not in word2idx.keys() and word_count[token] >= threshold:
                    word2idx[token] = len(word2idx)
        domain2label[domain] = list(labels_in_domain)


    idx2label = { v : k for k, v in label2idx.items()}
    idx2word  = { v : k for k, v in word2idx.items()}

    idx = {'word2idx' : word2idx, 'idx2word': idx2word, 'label2idx' : label2idx, 'idx2label' : idx2label, 'domain2label':domain2label}
    print("Word : {} Label :{}".format(len(word2idx), len(label2idx)))
    return idx

from collections import defaultdict
import numpy as np
import math

def build_matrix_from_domains(domains, idx, word_count, k=50) :

    label_count = defaultdict(int)
    label_word_count = np.zeros((len(idx['label2idx']), len(idx['word2idx'])))
    for domain in domains:
        sentences_in_domain = readCoNLL(os.path.join(DATA_DIR, domain, "train.txt.ori"), {0: 'tokens', 1: 'labels'})
        for sentence_idx in range(len(sentences_in_domain)):
            tokens = sentences_in_domain[sentence_idx]['tokens']
            labels = sentences_in_domain[sentence_idx]['labels']
            for token_idx in range(len(tokens)):
                token = tokens[token_idx].lower()
                label = get_label_name(labels[token_idx])
                if token in idx['word2idx'].keys() and label != "O" and label != "LAW":
                    label_count[label] += 1
                    label_word_count[idx['label2idx'][label], idx['word2idx'][token]] += 1

    original_matrix = np.zeros((len(idx['label2idx']), len(idx['word2idx'])))

    for i in range(label_word_count.shape[0]):
        for j in range(label_word_count.shape[1]):
            if math.sqrt(label_count[idx['idx2label'][i]] * word_count[idx['idx2word'][j]]) != 0 :
                original_matrix[i, j] = label_word_count[i,j] / math.sqrt(label_count[idx['idx2label'][i]] * word_count[idx['idx2word'][j]])
    
    from scipy.linalg import svd
    M1, M2, M3 = svd(original_matrix)
    ranked_k = M1[:, :k]
    ranked_k_normalized = preprocessing.normalize(ranked_k, norm='l2')
    #row_sums = M1.sum(axis=1)
    #normalized_matrix = M1 / row_sums[:, np.newaxis]

    return ranked_k_normalized

def get_label_mapping(domain1, domain2, matrix, idxs) :
    for label1 in idxs['domain2label'][domain1]:
        highest_sim_score = -1000000000000
        nearest_neighbor = None
        for label2 in idxs['domain2label'][domain2]:
            if ranked_k[idxs['label2idx'][label1]] == 0 or ranked_k[idxs['label2idx'][label2]] == 0:
                continue
            score = get_similarity(ranked_k[idxs['label2idx'][label1]], ranked_k[idxs['label2idx'][label2]])
            if score > highest_sim_score:
                highest_sim_score = score
                nearest_neighbor = label2
        print("The nearest neighbor for {} is {} with the score of {}".format(label1, nearest_neighbor, highest_sim_score))

from scipy.spatial.distance import cosine, euclidean

def get_similarity(repr1, repr2):
    return 1 - cosine(repr1, repr2)

def get_distance(repr1, repr2) :
    return euclidean(repr1, repr2)

def get_nearest_labels(target_task, aux_tasks, matrix, idxs, sim_threshold = 0.1):
    nearest_labels = {}
    
    for aux_task in aux_tasks:
        unique_labels = set()
        print("This is the mapping between {} and {}".format(target_task, aux_task))
        for label1 in idxs['domain2label'][target_task]:
            highest_sim_score = -1000000000000
            nearest_neighbor = None
            for label2 in idxs['domain2label'][aux_task] :
                if not np.any(matrix[idxs['label2idx'][label1]])  or not np.any(matrix[idxs['label2idx'][label2]]):
                    continue
                score = get_similarity(matrix[idxs['label2idx'][label1]], matrix[idxs['label2idx'][label2]])
                #print("Score between {} and {} is {}".format(label1, label2, score))
                if score > highest_sim_score:
                    highest_sim_score = score
                    nearest_neighbor = label2
            #print("The nearest neighbor for {} is {} with the score of {}".format(label1, nearest_neighbor, highest_sim_score))
            if highest_sim_score >= sim_threshold:
                unique_labels.add(nearest_neighbor)
        nearest_labels[aux_task] = unique_labels
        print("Nearest labels from {}  is {}".format(aux_task, str(unique_labels)))
    
    return nearest_labels

def compute_label_embeddings (target_task, aux_tasks):
    
    word_count = build_vocab(target_task + aux_tasks)
    idxs = build_indexes_from_domains(target_task + aux_tasks, word_count)
    matrix = build_matrix_from_domains(target_task + aux_tasks, idxs, word_count)
    get_nearest_labels(target_task[0], aux_tasks, matrix, idxs, sim_threshold=0.1)
    
    return matrix

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [153]:
word_count = build_vocab(TASKS + NERS)
idxs = build_indexes_from_domains(TASKS + NERS, word_count)
matrix = build_matrix_from_domains(TASKS + NERS, idxs, word_count)

 Number of sentences in ATIS : 4478
 Number of sentences in MIT_Restaurant : 6128
 Number of sentences in MIT_Movie : 7820
 Number of sentences in CONLL_2003_NER : 14987
 Number of sentences in OntoNotes_NW : 34970
 Number of sentences in ATIS : 4478
 Number of sentences in MIT_Restaurant : 6128
 Number of sentences in MIT_Movie : 7820
 Number of sentences in CONLL_2003_NER : 14987
 Number of sentences in OntoNotes_NW : 34970
Word : 14033 Label :119


In [155]:
get_nearest_labels('ATIS', ['MIT_Restaurant','MIT_Movie', 'OntoNotes_NW','CONLL_2003_NER'], matrix, idxs, sim_threshold = 0.1)

This is the mapping between ATIS and MIT_Restaurant
Nearest labels from MIT_Restaurant  is {'Location', 'Rating', 'Amenity', 'Price', 'Hours'}
This is the mapping between ATIS and MIT_Movie
Nearest labels from MIT_Movie  is {'YEAR', 'SONG', 'PLOT', 'TITLE', 'ACTOR'}
This is the mapping between ATIS and OntoNotes_NW
Nearest labels from OntoNotes_NW  is {'ORDINAL', 'PRODUCT', 'EVENT', 'GPE', 'DATE', 'CARDINAL', 'WORK_OF_ART', 'FAC', 'TIME'}
This is the mapping between ATIS and CONLL_2003_NER
Nearest labels from CONLL_2003_NER  is {'MISC', 'ORG'}


{'CONLL_2003_NER': {'MISC', 'ORG'},
 'MIT_Movie': {'ACTOR', 'PLOT', 'SONG', 'TITLE', 'YEAR'},
 'MIT_Restaurant': {'Amenity', 'Hours', 'Location', 'Price', 'Rating'},
 'OntoNotes_NW': {'CARDINAL',
  'DATE',
  'EVENT',
  'FAC',
  'GPE',
  'ORDINAL',
  'PRODUCT',
  'TIME',
  'WORK_OF_ART'}}

In [126]:
from sklearn import preprocessing