In [None]:
!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
!unzip uncased_L-12_H-768_A-12.zip
!wget https://nlp.stanford.edu/projects/snli/snli_1.0.zip
!unzip snli_1.0.zip
!wget https://www.nyu.edu/projects/bowman/multinli/multinli_1.0.zip
!unzip multinli_1.0.zip

!git clone https://github.com/brmson/dataset-sts
!git clone https://github.com/gaphex/bert_experimental

In [1]:
def load_toxic_data(tox_path):
  """Read the toxicity CSV and return ``(texts, labels)`` as NumPy arrays.

  The file at ``tox_path`` must provide ``text`` and ``sentiment`` columns.
  Every text is coerced to ``str`` and stripped of surrounding whitespace.
  A label is 0 when the sentiment is 'positive' or 'neutral' and 1 for any
  other value.
  """
  non_toxic = ('positive', 'neutral')

  def _clean(raw):
    # str() first so non-string cells (e.g. NaN) do not break strip().
    return str(raw).strip()

  def _to_label(sentiment):
    return 1 if sentiment not in non_toxic else 0

  frame = pd.read_csv(tox_path)
  frame['text'] = frame['text'].map(_clean)
  frame['sentiment'] = frame['sentiment'].map(_to_label)
  return frame['text'].values, frame['sentiment'].values

In [2]:
def load_ner_data(ner_path, seq_len=24):
    """Load a token-per-row NER CSV and turn it into sentence-level training data.

    The CSV is read with columns 'Sentence #', 'Word' and 'Tag'. Missing cells
    are forward-filled, rows are grouped by 'Sentence #', and only sentences
    whose word count and tag count are both at most ``seq_len`` are kept.

    Args:
        ner_path: path of the CSV file.
        seq_len: maximum sentence length; also the padded label length.

    Returns:
        ptexts:   NumPy array of the kept sentences (space-joined words).
        ptargs:   list with one one-hot array per sentence, built from the
                  tag ids padded to ``seq_len`` with the id of tag "O".
        num_tags: number of distinct tags in the file.
        tag2idx:  dict mapping tag string -> integer id.
        ner_tr:   DataFrame with columns 'sentence' and 'tag' (list of tags).

    Raises:
        KeyError: if the data has no "O" tag (it is used as the padding label).
    """
    data = pd.read_csv(ner_path, encoding= 'unicode_escape', sep=',')
    # `fillna(method='ffill')` is deprecated in recent pandas; `ffill()` is the
    # direct equivalent.
    data = data.ffill()
    grouped_s = data.groupby('Sentence #', as_index=True)['Word'].apply(lambda g: ' '.join(g))
    grouped_t = data.groupby('Sentence #',  as_index=True)['Tag'].apply(lambda g: ' '.join(g))

    # Both series come from the same grouping, so they share one key order.
    # Filter them as pairs: two independent length filters could drop
    # different rows and leave sentences paired with the wrong tags.
    kept = [
        (st, tg.split())
        for st, tg in zip(grouped_s.values, grouped_t.values)
        if len(st.split()) <= seq_len and len(tg.split()) <= seq_len
    ]

    ner_tr = pd.DataFrame({}, columns=['sentence', 'tag'] )
    ner_tr['sentence'] = [st for st, _ in kept]
    ner_tr['tag'] = [tags for _, tags in kept]

    tag2idx = {t: i for i,t in enumerate(data.Tag.unique())}
    num_tags = len(tag2idx)

    y = [[tag2idx[w] for w in s] for s in ner_tr['tag']]
    # Positions past the end of a sentence are labelled with the "O" tag.
    y = pad_sequences(maxlen = seq_len, sequences=y, padding='post', value=tag2idx["O"])

    ptargs = [to_categorical(i, num_classes=num_tags) for i in y]
    ptexts = np.array(ner_tr['sentence'])

    return ptexts, ptargs, num_tags, tag2idx, ner_tr

In [3]:
class TripletGenerator:
    """Endless source of (anchor, positive, negative) batches.

    ``datadict`` is a mapping whose keys are the candidate anchor ids. Each
    batch draws ``batch_size`` distinct ids; the first ``hard_frac`` share of
    them gets hard negatives and the rest gets random negatives.

    The lookups ``get_anchors``, ``get_positives``, ``get_hard_negatives`` and
    ``get_random_negatives`` are called on ``self`` but are not defined in this
    class body; they have to be supplied elsewhere (for example by a subclass).

    Attributes:
        generator: the batch generator. This public name matches
            ``MulticlassGenerator`` and is what ``MultitaskDataGenerator``
            reads from each of its generators.
        _generator: the same object, kept for existing callers.
    """

    def __init__(self, datadict, hard_frac = 0.2, batch_size=256):
        self.datadict = datadict
        self._anchor_idx = np.array(list(self.datadict.keys()))
        self._hard_frac = hard_frac
        self._generator = self.generate_batch(batch_size)
        # Public alias so this class can be consumed through `.generator`
        # like the other generators in this notebook.
        self.generator = self._generator

    def generate_batch(self, size):
        """Yield ``([anchors, positives, negatives], labels)`` forever.

        ``labels`` is an all-ones array of shape ``(size, 1)``.

        Raises:
            ValueError: from ``np.random.choice`` when ``size`` exceeds the
                number of anchors (sampling is without replacement).
        """
        while True:

            # Number of leading samples that receive hard negatives.
            hards = int(size*self._hard_frac)
            anchor_ids = np.random.choice(self._anchor_idx, size, replace=False)

            anchors = self.get_anchors(anchor_ids)
            positives = self.get_positives(anchor_ids)
            negatives = np.hstack([self.get_hard_negatives(anchor_ids[:hards]),
                                   self.get_random_negatives(anchor_ids[hards:])])
            labels = np.ones((size,1))

            assert len(anchors) == len(positives) == len(negatives) == len(labels) == size

            yield [anchors, positives, negatives], labels

In [None]:
class MulticlassGenerator:
    """Infinite random mini-batch sampler over index-aligned arrays.

    ``data_tuple`` holds one or more arrays that share their first axis; the
    last entry is the label array. ``generator`` yields ``(inputs, targets)``
    where ``inputs`` is the sampled feature arrays followed by the sampled
    labels, and ``targets`` is a plain list of ``batch_size`` ones.
    """

    def __init__(self, data_tuple, batch_size=256):
        self._data = data_tuple
        n_rows = len(data_tuple[-1])
        self._idx = np.arange(n_rows)
        self.generator = self.generate_batch(batch_size)

    def generate_batch(self, size):
        """Yield batches of ``size`` distinct rows forever."""
        while True:
            # Rows are drawn without replacement within a single batch.
            picked = np.random.choice(self._idx, size, replace=False)

            batch_inputs = []
            for column in self._data[:-1]:
                batch_inputs.append(column[picked])
            # The labels travel with the inputs as the final entry.
            batch_inputs.append(self._data[-1][picked])

            yield batch_inputs, [1] * size

In [4]:
class MultitaskDataGenerator:
    """Merge batches from several task generators into one multi-input batch.

    Each element of ``generators`` must expose a ``generator`` attribute that
    yields ``(data, labels)``. One merged batch concatenates the ``data`` of
    every generator, in order, into a single flat list. The returned labels
    are those of the LAST generator only; labels of the others are dropped.

    The object is its own iterator (``iter(obj) is obj``), and ``generator``
    is an equivalent plain Python generator.
    """

    def __init__(self, generators):
        self.generators = generators
        self.generator = self.generate_batch()

    def generate_batch(self, batch_size=None):
        """Yield merged batches forever. ``batch_size`` is accepted but unused."""
        while True:
            yield self.__next__()

    def __next__(self):
        """Draw one batch from every generator and merge them.

        Returns:
            (data_arrays, labels): flat list of all input arrays, and the
            labels produced by the last generator.

        Raises:
            ValueError: if ``generators`` is empty.
        """
        data_arrays = []
        gen_labels = None
        drew_batch = False
        for gen in self.generators:
            gen_data, gen_labels = next(gen.generator)
            drew_batch = True

            # A generator may yield a single array instead of a list of inputs.
            if not isinstance(gen_data, (list, tuple)):
                gen_data = [gen_data]

            data_arrays += gen_data

        if not drew_batch:
            raise ValueError("MultitaskDataGenerator needs at least one generator")

        return data_arrays, gen_labels

    def __iter__(self):
        # The iterator protocol requires an iterator here, not a batch.
        return self

In [6]:
class SBERT:
    """Multi-task model assembled around one shared ``BertLayer`` encoder.

    Construction builds the encoder and then, depending on the ``config`` flags
    ``use_par_head``, ``use_toxic_head`` and ``use_ner_head``, up to three task
    heads. Every head appends its Keras ``Input`` layers to ``self.inputs`` and
    may add a weighted loss term to ``self.loss``. ``compile_model`` finally
    wraps everything into ``self.train_model``, whose only output is that
    accumulated loss.

    Config attributes read by this class: module_path, ctx_len, n_tune,
    tune_embs, train_bert, lr, dim, head_dropout_rate, n_tags, n_toxic_tags,
    tagger_loss_weight, paraphrase_loss_weight, toxic_loss_weight.

    NOTE(review): ``self.build_mlp`` is called by the tagger and toxic heads but
    is not defined in the class body shown here - confirm where it comes from.
    """
    def __init__(self, config):
        """Store ``config``, reset the accumulators and build the model at once."""
        self.loss = 0  # running sum of weighted head losses; starts as the int 0
        self.metrics = []  # handed to compile(); nothing in this class appends to it
        self.inputs = []  # Input layers of all heads, in the order the heads are built
        self.config = config
        self.build()
        
    def build(self):
        """Build the encoder, then the enabled heads, then compile.

        Head order is fixed: paraphrase (NLI), toxic, tagger. That order also
        determines the order of ``self.inputs``.
        """
        
        self.saver_dict = {}
        self.build_body()
        
        if self.config.use_par_head:
            self.build_nli_head()

        if self.config.use_toxic_head:
            self.build_toxic_head()

        if self.config.use_ner_head:
            self.build_tag_head()
        
        self.compile_model()
        
    def compile_model(self):
        """Create ``self.train_model`` (inputs -> accumulated loss) and compile it with Adam."""
        
        log_this("Compiling")
        
        # The model's output IS the loss tensor assembled by the heads.
        self.train_model = tf.keras.models.Model(inputs=self.inputs, outputs=[self.loss])

        opt = tf.keras.optimizers.Adam(learning_rate=self.config.lr)
        # `average_loss` is defined outside this class.
        self.train_model.compile(
            optimizer=opt,
            loss=average_loss,
            metrics=self.metrics)
        log_this("The model is built")
    
    def build_body(self):
        """Instantiate the shared encoder as ``self.nlu_encoder``.

        All arguments come from ``config`` except ``do_preprocessing=True`` and
        ``pooling='mean'``, which are fixed here.
        """
        self.nlu_encoder = BertLayer(
            self.config.module_path, self.config.ctx_len,
            n_tune_layers=self.config.n_tune, do_preprocessing=True,
          pooling='mean', tune_embeddings=self.config.tune_embs,
           trainable=self.config.train_bert)

    def build_tag_head(self):
        """Build the token-tagging (NER) head.

        Adds a string input and a one-hot label input of shape
        ``(ctx_len, n_tags)``, creates ``self.tag_model`` (text -> per-token
        predictions) and adds the weighted tagging loss to ``self.loss``.
        """
        
        log_this("Building tagger head")
        
        tag_input = layers.Input(shape=(1, ),  dtype=tf.string)
        tag_label = layers.Input(shape=(self.config.ctx_len, self.config.n_tags,), dtype=tf.float32)
        
        # Temporarily switch the shared encoder to dict output to read its
        # 'token_output' entry, then restore the flag for the other heads.
        self.nlu_encoder.as_dict = True
        inp_tok_encoded =  self.nlu_encoder(tag_input)['token_output']
        self.nlu_encoder.as_dict = False
        
        tag_mlp = self.build_mlp(
            2, self.config.dim, self.config.dim, self.config.n_tags, 
            name="ner", dropout_rate=self.config.head_dropout_rate)
        # The same MLP is applied to every token position.
        tag_pred = tf.keras.layers.TimeDistributed(tag_mlp)(inp_tok_encoded)
        # NOTE(review): unlike tox_loss in build_toxic_head, this loss is not
        # reshaped to (-1, 1) - confirm the terms combine as intended.
        tag_loss = tf.keras.losses.categorical_crossentropy(tag_label, tag_pred)
        
        self.tag_model = tf.keras.models.Model(inputs=[tag_input], outputs=[tag_pred], name=f'tagger_model')
        self.inputs += [tag_input, tag_label]
        self.loss += self.config.tagger_loss_weight * tag_loss

    def build_nli_head(self):
        """Build the paraphrase (triplet) head.

        Creates ``self.nli_encoder_model`` (text -> encoding) and
        ``self.sim_model`` (two texts -> ``cosine_similarity`` of their
        encodings). The triplet loss is added only when ``config.train_bert``
        is true; the three inputs are appended to ``self.inputs`` either way.
        """
        
        log_this("Building paraphraser head")
        
        anc_input = layers.Input(shape=(1,), dtype=tf.string)
        pos_input = layers.Input(shape=(1,), dtype=tf.string)
        neg_input = layers.Input(shape=(1,), dtype=tf.string)

        anc_encoded = self.nlu_encoder(anc_input)
        pos_encoded = self.nlu_encoder(pos_input)
                
        # Without a trainable encoder the negative is never encoded and this
        # head contributes no loss term.
        if self.config.train_bert:
          neg_encoded = self.nlu_encoder(neg_input)
          par_loss = tf.keras.layers.Lambda(softmax_loss)([anc_encoded, pos_encoded, neg_encoded])
          self.loss += self.config.paraphrase_loss_weight * par_loss
        
        self.nli_encoder_model = tf.keras.models.Model(inputs=[pos_input], outputs=[pos_encoded])

        sim = tf.keras.layers.Lambda(cosine_similarity, name='similarity')([anc_encoded, pos_encoded])
        self.sim_model = tf.keras.models.Model(inputs=[anc_input, pos_input], outputs=[sim])
        self.inputs += [anc_input, pos_input, neg_input]
        
        
    def build_toxic_head(self):
        """Build the sentence-level toxicity classification head.

        Adds a string input and a label input of shape ``(n_toxic_tags,)``,
        creates ``self.tox_model`` (text -> prediction) and adds the weighted
        classification loss to ``self.loss``.
        """
        
        log_this("Building toxic head")
        
        sent_input = layers.Input(shape=(1, ),  dtype=tf.string)
        sent_label = layers.Input(shape=(self.config.n_toxic_tags, ), dtype=tf.float32)
        
        sents_encoded = self.nlu_encoder(sent_input)
        
        tox_mlp = self.build_mlp(
            2, self.config.dim, self.config.dim, self.config.n_toxic_tags, 
            name="toxic", dropout_rate=self.config.head_dropout_rate)
        pred = tox_mlp(sents_encoded)
        
        tox_loss = tf.keras.losses.categorical_crossentropy(sent_label, pred)
        # Reshape the loss into a single trailing column.
        tox_loss = tf.reshape(tox_loss, (-1, 1))
        
        self.tox_model = tf.keras.models.Model(inputs=[sent_input], outputs=[pred], name=f'toxic_model')
        self.inputs += [sent_input, sent_label]
        self.loss += self.config.toxic_loss_weight * tox_loss

In [7]:
# Binds the SBERT class object itself to `ss`; no model is instantiated here
# (SBERT.__init__ requires a `config` argument).
ss = SBERT

In [None]:
# NOTE(review): `s` is not assigned in any cell shown here, so on a fresh kernel
# this line raises NameError. It looks like an unfinished cell - confirm or remove.
s