In [30]:
import pandas as pd
import re
import numpy as np
from nltk.tokenize import word_tokenize
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch
from transformers import DistilBertTokenizer, DistilBertForTokenClassification

In [31]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [57]:
df = pd.read_csv("/home/ubuntu/Development/punctuation/transcripts.csv")
# len(df): 2467

#num_train = int(len(df)*0.8)+2
num_train = int(len(df)*0.01)+2

num_val = int(len(df)*0.1)
num_test = int(len(df)*0.1)

id_all = np.random.choice(len(df), len(df), replace=False)
id_train = id_all[0:num_train]
id_val = id_all[num_train : num_val+num_train]
id_test = id_all[num_val+num_train : num_val+num_train+num_test]

train_set = df.iloc[id_train]
val_set = df.iloc[id_val]
test_set = df.iloc[id_test]

train_set = train_set[~train_set['transcript'].str.contains('♫')]
val_set = val_set[~val_set['transcript'].str.contains('♫')]
test_set = test_set[~test_set['transcript'].str.contains('♫')]

# Train Set Cleanup

train_set = train_set.drop('url',axis=1)
train_set = train_set['transcript']
train_set = train_set.str.replace("\(.*?\)", " ")\
.str.replace("\[.*?\]", " ")\
.str.replace(";", ". ")\
.str.replace(":", ". ")\
.str.replace('"', ' ')\
.str.replace('!', '. ')\
.str.replace(" — (?=[a-z])", ", ")\
.str.replace(" — (?=[A-Z])", ". ")\
.str.replace("(?<=[a-z])\.(?=[A-Z])", ". ")\
.str.replace("(?<=[a-z])\?(?=[A-Z])", ". ")\
.str.replace("(?<= )'(?=[a-zA-Z])", " ")\
.str.replace("(?<=[a-z])\'(?= )", " ")\
.str.replace("\'(?= )", " ")\
.str.replace(" — ", " ")\
.str.replace('\.+', '.')\
.str.replace(' +', ' ')\
.str.lower()

# one - on - one tutoring works best so that's what we tried to emulate like with me and my mom even though we knew it would be one - on - thousands 


temp_list_1 = []
for sentences in train_set:
    temp_list_1 += re.split('(?<=\.)|(?<=\?)',sentences)

temp_list_2 = []
for item in temp_list_1:
    temp_list_2.append(re.sub('^ ','',item))
    
temp_list_3 = []
for s in temp_list_2:
    try:
        if s[-1] == ".":
            temp_list_3.append(s)
        elif s[-1] == "?":
            temp_list_3.append(s)
        else:
            pass
    except:
        pass
    
del train_set
del temp_list_1
del temp_list_2

total_words = 0
combined_text = ""
outer_list = []

for s in temp_list_3:
    if total_words + len(word_tokenize(s)) < 400:
        combined_text += (s + " ")
        total_words += len(word_tokenize(s))
    else:
        outer_list.append(combined_text)
        combined_text = ""
        total_words = 0        
        
# outer_list is a list of sentences that don't go beyond 400 words

In [58]:
encoded_data_train = tokenizer.batch_encode_plus(outer_list, max_length=450, padding='max_length', truncation=True, return_tensors='pt')

punc_mask_outer = []
ids_no_punc_outer = []
attention_mask_outer = []

for j in range(len(encoded_data_train['input_ids'])):

    # punctuation mask for sentences
    punc_mask = []
    for i in encoded_data_train['input_ids'][j]:
        if i == 1012:
            punc_mask.pop()
            punc_mask.append(1) # period
        elif i == 1029:
            punc_mask.pop()
            punc_mask.append(2) # question mark
        elif i == 1010:
            punc_mask.pop()
            punc_mask.append(3) # comma
        else:
            punc_mask.append(0)
    punc_mask_outer.append(torch.tensor(punc_mask))

    # sentences converted to word ids excluding punctuations
    # len(punc_mask) should be the same as len(ids_no_punc)
    ids_no_punc = []
    for i in encoded_data_train['input_ids'][j]:
        if i == 1012:
            pass
        elif i == 1029:
            pass
        elif i == 1010:
            pass
        else:
            ids_no_punc.append(i)
    ids_no_punc_outer.append(torch.tensor(ids_no_punc))
     
    # attention_mask with subwords set to 0 except for the last one
    attention_mask = []
    first_hash = True
    for i in encoded_data_train['input_ids'][j]:
        if (i == 101 or i == 102 or i == 0): # CLS, SEP, PAD
            attention_mask.append(0)
        elif (i == 1029 or i == 1010 or i == 1012):
            pass
        else:
            if re.match(r'^##', tokenizer.decode([i])):         
                if first_hash == True:
                    attention_mask.pop()
                    attention_mask.append(0)
                    first_hash == False
                attention_mask.append(1)
            else:
                if first_hash == False:
                    attention_mask.pop()
                attention_mask.append(1)                
    attention_mask_outer.append(torch.tensor(attention_mask))

In [59]:
token_lengths = []
for i in range(len(punc_mask_outer)):
    token_lengths.append(len(punc_mask_outer[i]))
token_length_max = np.max(token_lengths)

for i in range(len(punc_mask_outer)):
    # add PAD again because length is not equal after removing punctuations
    zeros = [0] * (token_length_max - len(punc_mask_outer[i]))
        
    punc_mask = torch.cat((punc_mask_outer[i], torch.tensor(zeros)), 0)
    ids_no_punc = torch.cat((ids_no_punc_outer[i], torch.tensor(zeros)), 0)
    attention_mask = torch.cat((attention_mask_outer[i], torch.tensor(zeros)), 0)
  
    if i != 0:
        pass
        punc_mask_outer_adjusted = torch.cat((punc_mask_outer_adjusted, punc_mask.view(1,-1)),0)
        ids_no_punc_outer_adjusted = torch.cat((ids_no_punc_outer_adjusted, ids_no_punc.view(1,-1)),0)
        attention_mask_outer_adjusted = torch.cat((attention_mask_outer_adjusted, attention_mask.view(1,-1)),0)
    else:
        punc_mask_outer_adjusted = punc_mask.view(1,-1)
        ids_no_punc_outer_adjusted = ids_no_punc.view(1,-1)
        attention_mask_outer_adjusted = attention_mask.view(1,-1)

In [98]:
dataset_train = TensorDataset(ids_no_punc_outer_adjusted, attention_mask_outer_adjusted, punc_mask_outer_adjusted)

batch_size = 3
dataloader_train = DataLoader(dataset_train, sampler=RandomSampler(dataset_train), batch_size=batch_size)

In [99]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [100]:
model = DistilBertForTokenClassification.from_pretrained("distilbert-base-uncased",
                                                    num_labels=4,  
                                                    output_attentions=False,
                                                    output_hidden_states=False)


from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(),
                 lr=1e-5,
                 eps=1e-8)

epochs = 4
#epochs = 3

scheduler = get_linear_schedule_with_warmup(optimizer,
                                           num_warmup_steps=0,
                                           num_training_steps=len(dataloader_train)*epochs)



for epoch in range(epochs):

    nume = 0
    deno = 0
    loss_total = 0
    
    
    for batch in dataloader_train:
    
        model.train()
        model.zero_grad()
        
        batch = [b.to(device) for b in batch]
        
        model.to(device)
    
        inputs = {
            'input_ids': batch[0].to(torch.long),
            'attention_mask': batch[1].to(torch.long),
            'labels': batch[2].to(torch.long)
        }

        outputs = model(**inputs)
        
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        scheduler.step()

        model.eval()
        
        loss_total += loss

        for j in range(outputs['logits'].shape[0]):
        # for jth sample in a batch
        
            preds = np.argmax(outputs['logits'][j].to('cpu').detach().numpy(),axis=1)
            labels = batch[2].to(torch.long)[j].to('cpu').detach().numpy()

            # for ith token in a jth sample
            # if attention mask is not 0, check if prediction matchs label
            for i in range(len(batch[1][j])):
                if batch[1][j][i] != 0:
                    if preds[i] == labels[i]:
                        nume += 1
                    deno += 1

#             print(preds.sum())

    
    loss = loss_total/len(dataloader_train)
    acc = nume/deno
        
    print(f'epoch: {epoch+1}, tr_loss: {loss.item():.3f}, tr_acc: {acc:.3f}, nume: {nume}, deno: {deno}')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN t

epoch: 1, tr_loss: 0.591, tr_acc: 0.837, nume: 43799, deno: 52329
epoch: 2, tr_loss: 0.496, tr_acc: 0.859, nume: 44932, deno: 52329
epoch: 3, tr_loss: 0.445, tr_acc: 0.859, nume: 44932, deno: 52329
epoch: 4, tr_loss: 0.395, tr_acc: 0.859, nume: 44962, deno: 52329
