In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import os
import json
import pickle
import torch
import numpy as np
import re
from tqdm.notebook import tqdm
from sklearn.utils import shuffle
from transformers import AutoTokenizer
import warnings
warnings.filterwarnings("ignore")

In [2]:
model_type = 'bert-base-uncased' #albert-base-v1, bert-base-cased, bert-base-uncased
data_path = "D:/Data/neural-punctuator/ted-talks/"

with open(data_path + 'train_texts.txt', 'r', encoding='utf-8') as f:
    train_text = f.readlines()
with open(data_path + 'dev_texts.txt', 'r', encoding='utf-8') as f:
    valid_text = f.readlines()
with open(data_path + 'test_texts_2012.txt', 'r', encoding='utf-8') as f:
    test_text = f.readlines()

In [3]:
datasets = train_text, valid_text, test_text

In [4]:
[len(ds) for ds in datasets]

[1029, 8, 11]

In [5]:
def clean_text(text):
    text = text.replace('!', '.')
    text = text.replace(':', ',')
    text = text.replace('--', ',')
    
    reg = "(?<=[a-zA-Z])-(?=[a-zA-Z]{2,})"
    r = re.compile(reg, re.DOTALL)
    text = r.sub(' ', text)
    
    text = re.sub(r'\s-\s', ' , ', text)
    
#     text = text.replace('-', ',')
    text = text.replace(';', '.')
    text = text.replace(' ,', ',')
    text = text.replace('♫', '')
    text = text.replace('...', '')
    text = text.replace('.\"', ',')
    text = text.replace('"', ',')

    text = re.sub(r'--\s?--', '', text)
    text = re.sub(r'\s+', ' ', text)
    
    text = re.sub(r',\s?,', ',', text)
    text = re.sub(r',\s?\.', '.', text)
    text = re.sub(r'\?\s?\.', '?', text)
    text = re.sub(r'\s+', ' ', text)
    
    text = re.sub(r'\s+\?', '?', text)
    text = re.sub(r'\s+,', ',', text)
    text = re.sub(r'\.[\s+\.]+', '. ', text)
    text = re.sub(r'\s+\.', '.', text)
    
    return text.strip().lower()

In [18]:
datasets = [[clean_text(text) for text in ds] for ds in datasets]

In [19]:
[len([t for t in ds if len(t)>0]) for ds in datasets]

[1029, 8, 11]

In [20]:
[len(' '.join(ds).split(' ')) for ds in datasets]

[2339461, 17346, 18474]

In [21]:
tokenizer = AutoTokenizer.from_pretrained(model_type)

In [24]:
target_ids = tokenizer.encode(".?,")[1:-1]
target_ids

[1012, 1029, 1010]

In [25]:
target_token2id = {t: tokenizer.encode(t)[-2] for t in ".?,"}
target_token2id

{'.': 1012, '?': 1029, ',': 1010}

In [26]:
target_ids = list(target_token2id.values())
target_ids

[1012, 1029, 1010]

In [27]:
id2target = {
    0: 0,
    -1: -1,
}
for i, ti in enumerate(target_ids):
    id2target[ti] = i+1
target2id = {value: key for key, value in id2target.items()}

def create_target(text):
    encoded_words, targets = [], []
    
    words = text.split(' ')

    for word in words:
        target = 0
        for target_token, target_id in target_token2id.items():
            if word.endswith(target_token):
                word = word.rstrip(target_token)
                target = id2target[target_id]

        encoded_word = tokenizer.encode(word, add_special_tokens=False)
        
        for w in encoded_word:
            encoded_words.append(w)
        for _ in range(len(encoded_word)-1):
            targets.append(-1)
        targets.append(target)
        
#         print([tokenizer._convert_id_to_token(ew) for ew in encoded_word], target)
        assert(len(encoded_word)>0)

    encoded_words = [tokenizer.cls_token_id or tokenizer.bos_token_id] +\
                    encoded_words +\
                    [tokenizer.sep_token_id or tokenizer.eos_token_id]
    targets = [-1] + targets + [-1]
    
    return encoded_words, targets

In [33]:
s = "Tyranosaurus: kill me? Not enough, -- said the co-pilot -- ..."
print(s)
s = clean_text(s)
print(s)
data, targets = create_target(s)
print(targets)
[tokenizer._convert_id_to_token(d) for d in data[1:-1]]

Tyranosaurus: kill me? Not enough, -- said the co-pilot -- ...
tyranosaurus, kill me? not enough, said the co pilot,
[-1, -1, -1, 3, 0, 2, 0, 3, 0, 0, 0, 3, -1]


['ty',
 '##rano',
 '##saurus',
 'kill',
 'me',
 'not',
 'enough',
 'said',
 'the',
 'co',
 'pilot']

In [14]:
# encoded_texts, targets = create_target(transcripts[164])

In [15]:
# print(datasets[0][0])

In [16]:
encoded_texts, targets = [], []

for ds in datasets:
    x = list(zip(*(create_target(ts) for ts in tqdm(ds))))
    encoded_texts.append(x[0])
    targets.append(x[1])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=11.0), HTML(value='')))




In [17]:
# encoded_words, targets
for te, ta in zip(encoded_texts[0][0], targets[0][0]):
    print(f"{tokenizer._convert_id_to_token(te):15}\t{ta}")

[CLS]          	-1
it             	0
can            	0
be             	0
a              	0
very           	0
complicated    	0
thing          	3
the            	0
ocean          	1
and            	0
it             	0
can            	0
be             	0
a              	0
very           	0
complicated    	0
thing          	3
what           	0
human          	0
health         	0
is             	1
and            	0
bringing       	0
those          	0
two            	0
together       	0
might          	0
seem           	0
a              	0
very           	0
da             	-1
##unt          	-1
##ing          	0
task           	3
but            	0
what           	0
i              	-1
'              	-1
m              	0
going          	0
to             	0
try            	0
to             	0
say            	0
is             	0
that           	0
even           	0
in             	0
that           	0
complexity     	3
there          	-1
'              	-1
s              	0
some           	0
sim

north          	0
carolina       	1
they           	0
get            	0
into           	0
the            	0
food           	0
chain          	1
the            	0
dolphins       	0
eat            	0
the            	0
fish           	0
that           	0
have           	0
pc             	-1
##bs           	0
from           	0
the            	0
plank          	-1
##ton          	3
and            	0
those          	0
pc             	-1
##bs           	3
being          	0
fat            	0
soluble        	3
accumulate     	0
in             	0
these          	0
dolphins       	1
now            	3
a              	0
dolphin        	3
mother         	0
dolphin        	3
any            	0
dolphin        	3
there          	-1
'              	-1
s              	0
only           	0
one            	0
way            	0
that           	0
a              	0
pc             	-1
##b            	0
can            	0
get            	0
out            	0
of             	0
a              	0
dolphin        	1
and 

happens        	0
because        	0
we             	0
have           	0
jammed         	0
so             	0
much           	0
into           	0
the            	0
base           	0
of             	0
the            	0
natural        	0
ocean          	0
pyramid        	0
that           	0
these          	0
bacteria       	0
cl             	-1
##og           	0
it             	0
up             	0
and            	0
over           	-1
##fi           	-1
##ll           	0
onto           	0
our            	0
beaches        	1
often          	0
what           	0
jam            	-1
##s            	0
us             	0
up             	0
is             	0
sewage         	1
now            	0
how            	0
many           	0
of             	0
you            	0
have           	0
ever           	0
gone           	0
to             	0
a              	0
state          	0
park           	0
or             	0
a              	0
national       	0
park           	0
where          	0
you            	0
had   

planet         	3
and            	0
we             	0
think          	0
of             	0
ourselves      	0
as             	0
a              	0
terrestrial    	0
species        	3
but            	0
the            	0
pyramid        	0
of             	0
life           	0
in             	0
the            	0
ocean          	0
and            	0
our            	0
own            	0
lives          	0
on             	0
land           	0
are            	0
intricate      	-1
##ly           	0
connected      	1
and            	0
it             	-1
'              	-1
s              	0
only           	0
through        	0
having         	0
the            	0
ocean          	0
being          	0
healthy        	0
that           	0
we             	0
can            	0
remain         	0
healthy        	0
ourselves      	1
thank          	0
you            	0
very           	0
much           	1
[SEP]          	-1


In [18]:
os.makedirs(data_path + model_type, exist_ok=True)

for i, name in enumerate(('train', 'valid', 'test')):
    with open(data_path + f'{model_type}/{name}_data.pkl', 'wb') as f:
        pickle.dump((encoded_texts[i], targets[i]), f)

In [19]:
from collections import Counter

for ds_targets in targets:
    c = Counter((target for t in ds_targets for target in t))
    print('\t'.join([str(c[i]) for i in (1,2,3,0,-1)]))

139619	10215	188165	2001462	267423
909	71	1225	15141	1899
1100	46	1120	16208	2072


In [20]:
e = []
i = 0

raw_words = datasets[1][2].split(' ')

for te, ta in zip(encoded_texts[1][2], targets[1][2]):
    if ta == -1:
        e.append(te)
    else:
        e.append(te)
        print(f"{tokenizer.decode(e):15}\t{tokenizer.decode(target2id[ta]):10}\t{raw_words[i]}")
        e = []
        i += 1
print(f"{tokenizer.decode(e):15}\t{tokenizer.decode(target2id[ta]):10}\t")

[CLS] you      	[ P A D ] 	you
know           	,         	know,
i've           	[ P A D ] 	i've
talked         	[ P A D ] 	talked
about          	[ P A D ] 	about
some           	[ P A D ] 	some
of             	[ P A D ] 	of
these          	[ P A D ] 	these
projects       	[ P A D ] 	projects
before         	,         	before,
about          	[ P A D ] 	about
the            	[ P A D ] 	the
human          	[ P A D ] 	human
genome         	[ P A D ] 	genome
and            	[ P A D ] 	and
what           	[ P A D ] 	what
that           	[ P A D ] 	that
might          	[ P A D ] 	might
mean           	,         	mean,
and            	[ P A D ] 	and
discovering    	[ P A D ] 	discovering
new            	[ P A D ] 	new
sets           	[ P A D ] 	sets
of             	[ P A D ] 	of
genes          	.         	genes.
we're          	[ P A D ] 	we're
actually       	[ P A D ] 	actually
starting       	[ P A D ] 	starting
at             	[ P A D ] 	at
a              	[ P A D ] 	a
new            	[ 

the            	[ P A D ] 	the
genetic        	[ P A D ] 	genetic
code           	[ P A D ] 	code
to             	[ P A D ] 	to
write          	[ P A D ] 	write
out            	[ P A D ] 	out
words          	,         	words,
sentences      	,         	sentences,
thoughts       	.         	thoughts.
initially      	,         	initially,
all            	[ P A D ] 	all
we             	[ P A D ] 	we
did            	[ P A D ] 	did
was            	[ P A D ] 	was
autograph      	[ P A D ] 	autograph
it             	.         	it.
some           	[ P A D ] 	some
people         	[ P A D ] 	people
were           	[ P A D ] 	were
disappointed   	[ P A D ] 	disappointed
there          	[ P A D ] 	there
was            	[ P A D ] 	was
not            	[ P A D ] 	not
poetry         	.         	poetry.
we             	[ P A D ] 	we
designed       	[ P A D ] 	designed
these          	[ P A D ] 	these
pieces         	[ P A D ] 	pieces
so             	[ P A D ] 	so
we             	[ P A D ] 	we
can      

born           	[ P A D ] 	born
in             	[ P A D ] 	in
1946           	.         	1946.
there's        	[ P A D ] 	there's
now            	[ P A D ] 	now
three          	[ P A D ] 	three
people         	[ P A D ] 	people
on             	[ P A D ] 	on
the            	[ P A D ] 	the
planet         	[ P A D ] 	planet
for            	[ P A D ] 	for
every          	[ P A D ] 	every
one            	[ P A D ] 	one
of             	[ P A D ] 	of
us             	[ P A D ] 	us
that           	[ P A D ] 	that
existed        	[ P A D ] 	existed
in             	[ P A D ] 	in
1946           	.         	1946.
within         	[ P A D ] 	within
40             	[ P A D ] 	40
years          	,         	years,
there'll       	[ P A D ] 	there'll
be             	[ P A D ] 	be
four           	.         	four.
we             	[ P A D ] 	we
have           	[ P A D ] 	have
trouble        	[ P A D ] 	trouble
feeding        	,         	feeding,
providing      	[ P A D ] 	providing
fresh          	,        

In [21]:
print(tokenizer.decode(encoded_texts[1][2]))

[CLS] you know i've talked about some of these projects before about the human genome and what that might mean and discovering new sets of genes we're actually starting at a new point we've been digitizing biology and now we're trying to go from that digital code into a new phase of biology with designing and synthesizing life so we've always been trying to ask big questions., what is life? is something that i think many biologists have been trying to understand at various levels we've tried various approaches paring it down to minimal components we've been digitizing it now for almost 20 years when we sequenced the human genome it was going from the analog world of biology into the digital world of the computer now we're trying to ask can we regenerate life or can we create new life out of this digital universe this is the map of a small organism mycoplasma genitalium that has the smallest genome for a species that can self replicate in the laboratory and we've been trying to just see