In [1]:
# Generic
import argparse
import yaml
import time
import gc
import datetime
import os
import sys

# Data & Math
import math
import pandas as pd
import torch
from torch.utils.tensorboard import SummaryWriter

# Transformers
from transformers import BertModel, BertTokenizer

# Custom imports
from model.model import End2EndModel

# Neural Network Builder functions
from model.builder.transformer import make_transformer
from model.builder.bracketing import make_bracketer
from model.builder.multitask import make_multitask_net
from model.builder.generators import make_generator

from model.utils import (
    eval_model_on_DF,
    str2bool,
    str2list,
    txt2list,
    abs_max_pooling,
    mean_pooling
)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [192]:
# Load the dataset, model and optimizer configuration files.
def _load_yaml(path):
    """Parse the YAML file at ``path`` and return the resulting Python object."""
    with open(path, "r") as file:
        # NOTE(review): yaml.Loader can construct arbitrary Python objects.
        # It is kept because the config files may rely on Python-specific
        # tags; switch to yaml.safe_load if they are plain YAML.
        return yaml.load(file, Loader=yaml.Loader)


config = _load_yaml("./config/datasets.yml")
model_config = _load_yaml("./config/model.yml")
optimizer_config = _load_yaml("./config/optimizer.yml")

#############################################################################
############################### LOAD MODELS #################################
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device being used: {device}")


# Settings handed to the builder functions below. A plain argparse.Namespace
# gives the same attribute access as the command-line parser's result would.
# NOTE(review): which of these fields each builder actually reads is not
# visible from this notebook — confirm against model.builder.*.
args = argparse.Namespace(
    trf_out_layer=10,
    log_dir="./tensorboard",
    checkpoint_id=None,
    modules_to_load=['multitasknet'],
    modules_to_save=['multitasknet'],
    modules_to_train=['multitasknet'],
    train_comp=False,
    eval_comp=False,
    # Options: ["NNSimilarity", "agglomerative", "hard", "fixed", "freq"]
    chunker="agglomerative",
    span=11,
    out_num=1,
    log_threshold=-52,
    sim_threshold=0,
    dist_threshold=1.71,
    max_skip=1,
    # Options: ["abs_max_pooling", "mean_pooling", "freq_pooling", "conv_att"]
    pooling="mean_pooling",
)

transformer_net = make_transformer(args, device=device)
bracketing = make_bracketer(args, device=device)

Device being used: cpu


In [193]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Alternative example inputs, kept for quick experimentation:
#sequence = "the film would work much better as a video installation in a museum ,\
#            where viewers would be free to leave"
#sequence = "On 25 February Cyborgo was stepped up in class for the Grade 2 Rendlesham Hurdle over three miles on heavy at Kempton Park Racecourse and started 8/11 favourite in a five-runner field."
#sequence = "On 25 February Cyborgo was stepped up in class for the Grade 2 Rendlesham Hurdle over three miles on heavy at Kempton Park Racecourse and started."
sequence = "It was easy to spot her. All you needed to do was look at her socks. They were never a matching pair. One would be green while the other would be blue. One would reach her knee while the other barely touched her ankle. Every other part of her was perfect, but never the socks. They were her micro act of rebellion."

# Tokenize separately from the model so we can (a) count the regular,
# non-special tokens and (b) map the bracketer's indices back to word pieces.
tokens = tokenizer.encode_plus(sequence,
                               add_special_tokens=True,
                               return_special_tokens_mask=True,
                               return_token_type_ids=True)
s = sum(tokens['special_tokens_mask'])  # number of special tokens
n_reg_tokens = len(tokens['input_ids']) - s

# Run the transformer, then let the bracketer group token positions into chunks.
tensor, mask, token_ids = transformer_net([sequence], return_extras=True)
indices = bracketing.forward(tensor, masks_dict=mask, token_ids=token_ids)


print(f"Original sentence: {sequence}")
# The "- 2" presumably discounts the chunks holding the two special tokens
# (the first decoded chunk printed below is '[CLS]') — TODO confirm.
print(f"\nCompression of {(len(indices[0]) - 2) / n_reg_tokens}")

# Decode each chunk (a collection of positions into tokens['input_ids']) back
# to text. Built as a comprehension so that `token_ids` keeps referring to the
# transformer output instead of being overwritten by a per-chunk id list.
string_list = [
    tokenizer.decode([tokens['input_ids'][i] for i in idx],
                     skip_special_tokens=False,
                     clean_up_tokenization_spaces=False)
    for idx in indices[0]
]
print(tokenizer.tokenize(sequence))
print(string_list)

Original sentence: It was easy to spot her. All you needed to do was look at her socks. They were never a matching pair. One would be green while the other would be blue. One would reach her knee while the other barely touched her ankle. Every other part of her was perfect, but never the socks. They were her micro act of rebellion.

Compression of 0.08571428571428572
['it', 'was', 'easy', 'to', 'spot', 'her', '.', 'all', 'you', 'needed', 'to', 'do', 'was', 'look', 'at', 'her', 'socks', '.', 'they', 'were', 'never', 'a', 'matching', 'pair', '.', 'one', 'would', 'be', 'green', 'while', 'the', 'other', 'would', 'be', 'blue', '.', 'one', 'would', 'reach', 'her', 'knee', 'while', 'the', 'other', 'barely', 'touched', 'her', 'ankle', '.', 'every', 'other', 'part', 'of', 'her', 'was', 'perfect', ',', 'but', 'never', 'the', 'socks', '.', 'they', 'were', 'her', 'micro', 'act', 'of', 'rebellion', '.']
['[CLS]', 'it was easy to spot her .', 'all you needed to do was', 'look at her socks . they wer

In [194]:
def write_colors(chunks, colors=['olive', 'purple', 'teal', 'grey']):
    r"""Wrap each text chunk in a LaTeX ``\textcolor`` command, cycling colors.

    Args:
        chunks: Iterable of strings, one per chunk to colorize.
        colors: Sequence of LaTeX color names. Chunk ``k`` gets
            ``colors[k % len(colors)]``, so any non-empty length works.
            The default list is never mutated.

    Returns:
        A single string in which every chunk appears as
        ``" \textcolor{<color>}{<chunk>}"`` (note the leading space), in the
        original order. Empty input yields ``""``.

    Raises:
        ValueError: If ``colors`` is empty.
    """
    if not colors:
        raise ValueError("colors must contain at least one color name")

    pieces = []
    for position, chunk in enumerate(chunks):
        color = colors[position % len(colors)]
        # '#' is special in LaTeX (and frequent in '##' word-piece prefixes).
        escaped = chunk.replace("#", "\\#")
        # Braces are written directly ('{{' / '}}' in an f-string) so that
        # square brackets inside the chunk, e.g. '[UNK]', are left untouched.
        pieces.append(f" \\textcolor{{{color}}}{{{escaped}}}")
    return "".join(pieces)

In [195]:
# Colorize every chunk except the first and last entries of string_list
# (presumably the chunks holding the special tokens — TODO confirm) and show
# the resulting LaTeX snippet.
#out = write_colors(tokenizer.tokenize(sequence))
inner_chunks = string_list[1:-1]
out = write_colors(inner_chunks)
print(out)

 \textcolor{olive}{it was easy to spot her .} \textcolor{purple}{all you needed to do was} \textcolor{teal}{look at her socks . they were never a matching pair . one would be green} \textcolor{grey}{while the other would be blue . one would reach her knee while the other barely touched her ankle} \textcolor{olive}{. every other part of her was perfect , but never the socks .} \textcolor{purple}{they were her micro act of rebellion .}


In [59]:
# NOTE(review): scratch cell relying on hidden kernel state. The recorded
# output below is a string, whereas the tokenization cell assigns `s` an
# integer sum, so this `s` came from a cell that is no longer in the notebook.
print(s)

\colors{stuff}


In [106]:
# Scratch check: '#' characters pass through f-string interpolation unchanged.
# Note that this rebinds the notebook-level name `s`.
s = "##stuff" 
print(f"this is a string {s}")

this is a string ##stuff


In [113]:
# Scratch check: str.replace substitutes every occurrence of the pattern,
# not just the first one.
s = 'patatta'.replace('a', 'bbbb')
print(s)

pbbbbtbbbbttbbbb


array([[0, 1, 1, 0, 0],
       [1, 0, 1, 1, 0],
       [1, 1, 0, 1, 1],
       [0, 1, 1, 0, 1],
       [0, 0, 1, 1, 0]])