In [1]:
import pandas as pd
import re
import numpy as np
from nltk.tokenize import word_tokenize
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch
from transformers import DistilBertTokenizer, DistilBertForTokenClassification

In [2]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [3]:
df = pd.read_csv("/home/ubuntu/Development/punctuation/transcripts.csv")
# len(df): 2467

num_train = int(len(df)*0.8)+2
#num_train = int(len(df)*0.01)+2

num_val = int(len(df)*0.1)
num_test = int(len(df)*0.1)

id_all = np.random.choice(len(df), len(df), replace=False)
id_train = id_all[0:num_train]
id_val = id_all[num_train : num_val+num_train]
id_test = id_all[num_val+num_train : num_val+num_train+num_test]

train_set = df.iloc[id_train]
val_set = df.iloc[id_val]
test_set = df.iloc[id_test]

train_set = train_set[~train_set['transcript'].str.contains('♫')]
val_set = val_set[~val_set['transcript'].str.contains('♫')]
test_set = test_set[~test_set['transcript'].str.contains('♫')]

# Train Set Cleanup

train_set = train_set.drop('url',axis=1)
train_set = train_set['transcript']
train_set = train_set.str.replace("\(.*?\)", " ")\
.str.replace("\[.*?\]", " ")\
.str.replace(";", ". ")\
.str.replace(":", ". ")\
.str.replace('"', ' ')\
.str.replace('!', '. ')\
.str.replace(" — (?=[a-z])", ", ")\
.str.replace(" — (?=[A-Z])", ". ")\
.str.replace("(?<=[a-z])\.(?=[A-Z])", ". ")\
.str.replace("(?<=[a-z])\?(?=[A-Z])", ". ")\
.str.replace("(?<= )'(?=[a-zA-Z])", " ")\
.str.replace("(?<=[a-z])\'(?= )", " ")\
.str.replace("\'(?= )", " ")\
.str.replace(" — ", " ")\
.str.replace('\.+', '.')\
.str.replace(' +', ' ')\
.str.lower()

# one - on - one tutoring works best so that's what we tried to emulate like with me and my mom even though we knew it would be one - on - thousands 


temp_list_1 = []
for sentences in train_set:
    temp_list_1 += re.split('(?<=\.)|(?<=\?)',sentences)

temp_list_2 = []
for item in temp_list_1:
    temp_list_2.append(re.sub('^ ','',item))
    
temp_list_3 = []
for s in temp_list_2:
    try:
        if s[-1] == ".":
            temp_list_3.append(s)
        elif s[-1] == "?":
            temp_list_3.append(s)
        else:
            pass
    except:
        pass
    
del train_set
del temp_list_1
del temp_list_2

total_words = 0
combined_text = ""
outer_list = []

for s in temp_list_3:
    if total_words + len(word_tokenize(s)) < 400:
        combined_text += (s + " ")
        total_words += len(word_tokenize(s))
    else:
        outer_list.append(combined_text)
        combined_text = ""
        total_words = 0        
        
# outer_list is a list of sentences that don't go beyond 400 words

In [4]:
encoded_data_train = tokenizer.batch_encode_plus(outer_list, max_length=450, padding='max_length', truncation=True, return_tensors='pt')

punc_mask_outer = []
ids_no_punc_outer = []
attention_mask_outer = []

for j in range(len(encoded_data_train['input_ids'])):

    # punctuation mask for sentences
    punc_mask = []
    for i in encoded_data_train['input_ids'][j]:
        if i == 1012:
            punc_mask.pop()
            punc_mask.append(1) # period
        elif i == 1029:
            punc_mask.pop()
            punc_mask.append(2) # question mark
        elif i == 1010:
            punc_mask.pop()
            punc_mask.append(3) # comma
        else:
            punc_mask.append(0)
    punc_mask_outer.append(torch.tensor(punc_mask))

    # sentences converted to word ids excluding punctuations
    # len(punc_mask) should be the same as len(ids_no_punc)
    ids_no_punc = []
    for i in encoded_data_train['input_ids'][j]:
        if i == 1012:
            pass
        elif i == 1029:
            pass
        elif i == 1010:
            pass
        else:
            ids_no_punc.append(i)
    ids_no_punc_outer.append(torch.tensor(ids_no_punc))
     
    # attention_mask with subwords set to 0 except for the last one
    attention_mask = []
    first_hash = True
    for i in encoded_data_train['input_ids'][j]:
        if (i == 101 or i == 102 or i == 0): # CLS, SEP, PAD
            attention_mask.append(0)
        elif (i == 1029 or i == 1010 or i == 1012):
            pass
        else:
            if re.match(r'^##', tokenizer.decode([i])):         
                if first_hash == True:
                    attention_mask.pop()
                    attention_mask.append(0)
                    first_hash == False
                attention_mask.append(1)
            else:
                if first_hash == False:
                    attention_mask.pop()
                attention_mask.append(1)                
    attention_mask_outer.append(torch.tensor(attention_mask))

In [5]:
token_lengths = []
for i in range(len(punc_mask_outer)):
    token_lengths.append(len(punc_mask_outer[i]))
token_length_max = np.max(token_lengths)

for i in range(len(punc_mask_outer)):
    # add PAD again because length is not equal after removing punctuations
    zeros = [0] * (token_length_max - len(punc_mask_outer[i]))
        
    punc_mask = torch.cat((punc_mask_outer[i], torch.tensor(zeros)), 0)
    ids_no_punc = torch.cat((ids_no_punc_outer[i], torch.tensor(zeros)), 0)
    attention_mask = torch.cat((attention_mask_outer[i], torch.tensor(zeros)), 0)
  
    if i != 0:
        pass
        punc_mask_outer_adjusted = torch.cat((punc_mask_outer_adjusted, punc_mask.view(1,-1)),0)
        ids_no_punc_outer_adjusted = torch.cat((ids_no_punc_outer_adjusted, ids_no_punc.view(1,-1)),0)
        attention_mask_outer_adjusted = torch.cat((attention_mask_outer_adjusted, attention_mask.view(1,-1)),0)
    else:
        punc_mask_outer_adjusted = punc_mask.view(1,-1)
        ids_no_punc_outer_adjusted = ids_no_punc.view(1,-1)
        attention_mask_outer_adjusted = attention_mask.view(1,-1)

In [9]:
dataset_train = TensorDataset(ids_no_punc_outer_adjusted, attention_mask_outer_adjusted, punc_mask_outer_adjusted)

batch_size = 3
dataloader_train = DataLoader(dataset_train, sampler=RandomSampler(dataset_train), batch_size=batch_size)

In [10]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [27]:
model = DistilBertForTokenClassification.from_pretrained("distilbert-base-uncased",
                                                    num_labels=4,  
                                                    output_attentions=False,
                                                    output_hidden_states=False)


from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(),
                 lr=1e-5,
                 eps=1e-8)

#epochs = 20
epochs = 3

scheduler = get_linear_schedule_with_warmup(optimizer,
                                           num_warmup_steps=0,
                                           num_training_steps=len(dataloader_train)*epochs)



model.train()

for epoch in range(epochs):

    nume = 0
    deno = 0
    loss_total = 0
    
    for batch in dataloader_train:
        
        model.zero_grad()
        
        batch = [b.to(device) for b in batch]
        
        model.to(device)
    
        inputs = {
            'input_ids': batch[0].to(torch.long),
            'attention_mask': batch[1].to(torch.long),
            'labels': batch[2].to(torch.long)
        }

        outputs = model(**inputs)
        
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        scheduler.step()

        loss_total += loss

        for j in range(outputs['logits'].shape[0]):
        
            preds = np.argmax(outputs['logits'][j].to('cpu').detach().numpy(),axis=1)
            labels = batch[2].to(torch.long)[j].to('cpu').detach().numpy()

            # if attention mask is not 0, check if prediction matchs label
            for i in range(len(batch[1][j])):
                if batch[1][j][i] == 0:
                    pass
                else:
                    if preds[i] == labels[i]:
                        nume += 1
                    deno += 1
       
    loss = loss_total/len(dataloader_train)
    acc = nume/deno
        
    print(f'epoch: {epoch+1}, tr_loss: {loss.item():.3f}, tr_acc: {acc:.3f}')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN t

epoch: 1, tr_loss: 0.524, tr_acc: 0.856
epoch: 2, tr_loss: 0.365, tr_acc: 0.872
epoch: 3, tr_loss: 0.292, tr_acc: 0.890


In [111]:
batch[1] # 51th is 0

tensor([[0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1.,

In [105]:
# preds
np.argmax(outputs['logits'][0].to('cpu').detach().numpy(),axis=1)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0,
       1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0,
       0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [110]:
# labels
batch[2].to(torch.long)[0].to('cpu').detach().numpy()

array([0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 3, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0,
       2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 3, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 3, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0,
       0, 3, 3, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3,
       0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0,

In [120]:
for i in dataset_train[0][0]:
    print(tokenizer.decode(int(i)))

[ C L S ]
e v e r y o n e
i s
b o t h
a
l e a r n
# # e r
a n d
a
t e a c h e r
t h i s
i s
m e
b e i n g
i n s p i r e d
b y
m y
f i r s t
t u t o r
m y
m o m
a n d
t h i s
i s
m e
t e a c h i n g
i n t r o d u c t i o n
t o
a r t i f i c i a l
i n t e l l i g e n c e
t o
2 0 0
s t u d e n t s
a t
s t a n f o r d
u n i v e r s i t y
n o w
t h e
s t u d e n t s
a n d
i
e n j o y e d
t h e
c l a s s
b u t
i t
o c c u r r e d
t o
m e
t h a t
w h i l e
t h e
s u b j e c t
m a t t e r
o f
t h e
c l a s s
i s
a d v a n c e d
a n d
m o d e r n
t h e
t e a c h i n g
t e c h n o l o g y
i s n
'
t
i n
f a c t
i
u s e
b a s i c a l l y
t h e
s a m e
t e c h n o l o g y
a s
t h i s
1 4 t h
-
c e n t u r y
c l a s s r o o m
n o t e
t h e
t e x t b o o k
t h e
s a g e
o n
t h e
s t a g e
a n d
t h e
s l e e p i n g
g u y
i n
t h e
b a c k
j u s t
l i k e
t o d a y
s o
m y
c o
-
t e a c h e r
s e b a s t i a n
t h r u
# # n
a n d
i
t h o u g h t
t h e r e
m u s t
b e
a
b e t t e r
w a y
w e
c h a l 

In [115]:
tokenizer.decode(dataset_train[0][0])

"[CLS] everyone is both a learner and a teacher this is me being inspired by my first tutor my mom and this is me teaching introduction to artificial intelligence to 200 students at stanford university now the students and i enjoyed the class but it occurred to me that while the subject matter of the class is advanced and modern the teaching technology isn't in fact i use basically the same technology as this 14th - century classroom note the textbook the sage on the stage and the sleeping guy in the back just like today so my co - teacher sebastian thrun and i thought there must be a better way we challenged ourselves to create an online class that would be equal or better in quality to our stanford class but to bring it to anyone in the world for free we announced the class on july 29th and within two weeks 50 000 people had signed up for it and that grew to 160 000 students from 209 countries we were thrilled to have that kind of audience and just a bit terrified that we hadn't fini

In [102]:
batch[2].shape

torch.Size([1, 419])