In [1]:
!pip install transformers



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import torch
from transformers import *
import numpy as np
import scipy as scipy
import pandas as pd
import os
import ast
import tqdm as tqdm


DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [0]:
MODEL_CLASS, TOKENIZER_CLASS, PRETRAINED = (DistilBertForSequenceClassification, DistilBertTokenizer, 'distilbert-base-cased')
PAWS_QQP = True

# **Generate QQP**

In [0]:
data_path = '/content/drive/My Drive/Colab/data_qqp/'

all_data = pd.read_table(os.path.expanduser(data_path + 'quora_duplicate_questions.tsv'), header=0)
all_data = all_data.rename(columns={'is_duplicate': 'label', 'question1': 'sentence1', 'question2': 'sentence2'})

class_1 = all_data[all_data['label']==1]
class_0 = all_data[all_data['label']==0]

class_1_dev_test = class_1.sample(n=10000)
class_1_train = class_1.drop(class_1_dev_test.index)

class_0_dev_test = class_0.sample(n=10000)
class_0_train = class_0.drop(class_0_dev_test.index)

train_data = pd.concat([class_1_train, class_0_train])
train_data = train_data.sample(frac=1).reset_index(drop=True)
test_data = pd.concat([class_1_dev_test[:5000], class_0_dev_test[:5000]])
test_data = test_data.sample(frac=1).reset_index(drop=True)
dev_data = pd.concat([class_1_dev_test[5000:], class_0_dev_test[5000:]])
dev_data = dev_data.sample(frac=1).reset_index(drop=True)

In [0]:
train_data.to_csv(data_path + 'train.tsv', sep='\t')
dev_data.to_csv(data_path + 'dev.tsv', sep='\t')
test_data.to_csv(data_path + 'test.tsv', sep='\t')

# **Preprocess Data** 

In [0]:
DATA_FOLDER = 'data_qqp'
FILE_NAMES = ['dev.tsv']

In [0]:
class Tokenizer:
    # init
    def __init__(self, tokenizer_class, pretrained_weights, group_sent=False):
        self.tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
        self.group_sent = group_sent
        
    # tokenize
    def tokenize_data(self, data, file_path):
        bar = tqdm.notebook.tqdm(total=len(data))
        tokenized = pd.DataFrame(columns = ['i1', 's1', 'i2', 's2', 'i', 's', 'y'])
        count = 0
        flag = True
        for index, row in data.iterrows():
            token_row = {
                'i1' : [],
                's1' : [],
                'i2' : [],
                's2' : [],
                'i' : [],
                's' : [],
                'y': 0
            }
            # print(row)
            # QQP Dataset
            sent1 = str(row['question1'])
            sent2 = str(row['question2'])
            token_row['y'] = int(row['is_duplicate'])
            
            # PAWS Dataset
            # sent1 = str(row['sentence1'])
            # sent2 = str(row['sentence2'])
            # token_row['y'] = int(row['label\r'])
            
            
            encoding1 = self.tokenizer.encode_plus(sent1, return_token_type_ids=True, max_length=128, pad_to_max_length=True)

            token_row['i1'] = encoding1['input_ids']
            token_row['s1'] = encoding1['token_type_ids']

            # indexed1 = self.tokenizer.encode(sent1, max_length=128, pad_to_max_length=True)
            # print(token)
            # indexed1 = self.tokenizer.convert_tokens_to_ids(token)
            # indexed_tensor = torch.tensor([indexed1])
            # segment = np.zeros(len(indexed1), dtype=int)
            # segment_tensor = torch.tensor([segment])
            # token_row['s1'] = segment_tensor

            encoding2 = self.tokenizer.encode_plus(sent2, return_token_type_ids=True, max_length=128, pad_to_max_length=True)

            token_row['i2'] = encoding2['input_ids']
            token_row['s2'] = encoding2['token_type_ids']
            # indexed2 = self.tokenizer.encode(sent2, max_length=128, pad_to_max_length=True)
            # # indexed2 = self.tokenizer.convert_tokens_to_ids(token)
            # indexed_tensor = torch.tensor([indexed2])
            # token_row['i2'] = indexed_tensor
            # segment = np.zeros(len(indexed2), dtype=int)
            # segment_tensor = torch.tensor([segment])
            # token_row['s2'] = segment_tensor
            
            if self.group_sent == True:
                encoding = self.tokenizer.encode_plus(sent1, sent2, return_token_type_ids=True, max_length=256, pad_to_max_length=True)
                print(encoding)
                
                token_row['i'] = encoding['input_ids']
                token_row['s'] = encoding['token_type_ids']

                # print(encoding)
                # indexed = self.tokenizer.build_inputs_with_special_tokens(indexed1, indexed2)
                # token_row['i'] = torch.tensor([indexed])
                # # print(torch.tensor([indexed]).shape)

                # max_length = max(len(indexed), max_length)

                # if len(indexed) > 500:
                #     print(sent1 + sent2)
                # segment = self.tokenizer.create_token_type_ids_from_sequences(indexed1, indexed2)
                # token_row['s'] = torch.tensor([segment])
            
            tokenized = tokenized.append(token_row, ignore_index=True)
            bar.update()
            count += 1
            if count % 10000 == 0:
                if flag == True:
                    tokenized.to_csv(file_path)
                    flag = False
                else:
                    tokenized.to_csv(file_path, mode='a', header=False)
                tokenized = tokenized.iloc[0:0]
        if count % 10000 != 0:
            if flag == True:
                tokenized.to_csv(file_path)
                flag = False
            else:
                tokenized.to_csv(file_path, mode='a', header=False)
        bar.close()
        return tokenized

In [0]:
data_path = '/content/drive/My Drive/Colab/' + DATA_FOLDER + '/'
tokenizer = Tokenizer(TOKENIZER_CLASS, PRETRAINED, True)

for file_name in FILE_NAMES:
    data = pd.read_csv(os.path.expanduser(data_path+file_name), sep='\t', lineterminator='\n', error_bad_lines=False)
    data = data.dropna()
    tokens = tokenizer.tokenize_data(data, data_path + file_name.split('.')[0] + '.csv')
    # tokens.to_csv(data_path + file_name.split('.')[0] + '.csv')
    # display(tokens)

HBox(children=(IntProgress(value=0, max=40371), HTML(value='')))

{'input_ids': [101, 2009, 1132, 2170, 118, 4038, 1177, 2712, 136, 102, 2009, 1132, 1117, 10224, 4724, 1177, 2712, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

KeyboardInterrupt: ignored

# **Load Data**
- TBD: Create DataLoader

In [0]:
BASE_QQP_PATH = '/content/drive/My Drive/Colab/data_qqp/'
BASE_PAWS_QQP_PATH = '/content/drive/My Drive/Colab/data_PAWS_qqp/'

In [0]:
# Load QQP sets
data = pd.read_csv(BASE_QQP_PATH + 'train.csv')
data['i'] = data['i'].apply(lambda x: ast.literal_eval(x))
data['s'] = data['s'].apply(lambda x: ast.literal_eval(x))
train_set = data
# display(train_set)

In [0]:
# Load PAWS QQP if generated
if PAWS_QQP == True:    
    data = pd.read_csv(BASE_PAWS_QQP_PATH + 'train.csv')
    data['i'] = data['i'].apply(lambda x: ast.literal_eval(x))
    data['s'] = data['s'].apply(lambda x: ast.literal_eval(x))
    train_set = pd.concat([train_set, data])


In [0]:
data = pd.read_csv(BASE_QQP_PATH + 'dev.csv')
data['i'] = data['i'].apply(lambda x: ast.literal_eval(x))
data['s'] = data['s'].apply(lambda x: ast.literal_eval(x))
dev_set = data
orig_dev_set = data

# data = pd.read_csv(BASE_QQP_PATH + 'test.csv')
# data['i'] = data['i'].apply(lambda x: ast.literal_eval(x))
# data['s'] = data['s'].apply(lambda x: ast.literal_eval(x))
# test_set = data

In [0]:
paws_qqp_dev_set = None
if PAWS_QQP == True: 
    data = pd.read_csv(BASE_PAWS_QQP_PATH + 'dev.csv')
    data['i'] = data['i'].apply(lambda x: ast.literal_eval(x))
    data['s'] = data['s'].apply(lambda x: ast.literal_eval(x))
    paws_qqp_dev_set = data
    dev_set = pd.concat([dev_set, data])

# **BERT Model**

## DataLoader

In [0]:
class CustomDataset():
    def __init__(self, data):
        self.inputs = data['i'].values
        self.sentence_tokens = data['s'].values
        self.labels = data['y'].values
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return tuple([torch.tensor(self.inputs[idx]), torch.tensor(self.sentence_tokens[idx]), torch.tensor(self.labels[idx])])

In [11]:
train_dataset = CustomDataset(train_set)
dev_dataset = CustomDataset(dev_set)
print(dev_dataset.__getitem__(-1))

(tensor([ 101, 2825,  146, 1202, 1243,  170, 3283,  112,  188, 1107, 6676, 3752,
        1121,  170, 1363, 2755, 1107, 1103, 1646,  120, 1860, 1191,  146, 1138,
         170,  139,  119, 7882, 2178, 1121,  170,  124,  118, 8081, 2755, 1107,
        1726,  136,  102, 2825,  146, 1202, 1106, 1243,  170, 3283,  112,  188,
        1107, 6676, 3752, 1121,  170,  139,  119, 7882, 2755, 1107, 1103, 1646,
         120, 1860, 1191,  146, 1138,  170, 1363, 2178, 1121,  170,  124,  118,
        8081, 2755, 1107, 1726,  136,  102,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,  

## Model Code

In [0]:
class Model_Classifier:
    # initalize model
    def __init__(self, model_class, pretrained_weights, pretrained_path=None, no_iterations=10):
        if pretrained_path is not None:
            self.model = model_class.from_pretrained(pretrained_path).cuda()
        else:
            self.model = model_class.from_pretrained(pretrained_weights).cuda()

        self.TRAINING_ITERATIONS = no_iterations
        self.WARMUP = 1000
        self.BATCH_SIZE = 32
        self.REPORT_FREQUENCY = 100
        self.CHKPT_FREQUENCY = 500
#         self.model.eval()

    def train_batch(self, train_data):
        train_ids = train_data[0].cuda()
        train_segments = train_data[1].cuda()
        train_labels = train_data[2].cuda()
        # train_ids = torch.tensor(train_data['i'].values.tolist()).cuda()
        # train_segments = torch.tensor(train_data['s'].values.tolist()).cuda()
        # train_labels = torch.tensor(train_data['y'].values.tolist()).cuda()
        # print(train_labels)

        self.model.train()
        # Uncomment this for BERT
        # outputs = self.model(input_ids = train_ids, token_type_ids = train_segments, labels = train_labels)

        # Uncomment this for DistilBERT
        outputs = self.model(input_ids = train_ids, labels = train_labels)
        loss = outputs[0].cuda()
        loss.backward()
        self.optimizer.step()
        self.scheduler.step()
        self.model.zero_grad()
        return loss

    # train classifier
    def train(self, data, validation_set):
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
            {"params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
        ]
        self.optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5, eps=1e-8)
        # self.scheduler = get_custom_schedule(self.optimizer))
        train_loader = torch.utils.data.DataLoader(data, batch_size=self.BATCH_SIZE, sampler=torch.utils.data.RandomSampler(data))
        dev_loader = torch.utils.data.DataLoader(validation_set, batch_size=self.BATCH_SIZE, sampler=torch.utils.data.RandomSampler(validation_set))
        self.scheduler = get_linear_schedule_with_warmup(self.optimizer, num_warmup_steps=len(train_loader)//10, num_training_steps=(self.TRAINING_ITERATIONS+1)*len(train_loader))
        prev_acc = 0.0
        for i in range(1, self.TRAINING_ITERATIONS+1):
            count = 1
            bar = tqdm.notebook.tqdm(total=len(train_loader))
            for batch in train_loader:
                train_data = batch
                count += 1
                loss = self.train_batch(train_data)
                bar.update(1)
                if count%self.CHKPT_FREQUENCY == 0:
                    bar.write("%d: Loss - %s" %(i, loss))
                    acc = self.test(dev_loader)
                    bar.write("%d: Validation - %s" %(i, str(acc)))
                    # if abs(prev_acc-acc) < 0.0001 and loss[0] < 0.001:
                    #     bar.write('Finish')
                    #     bar.close()
                    #     self.save(directory='final')
                    #     break
                    prev_acc = acc
                    bar.write("%d Saved" %(i))
                    self.save(directory='CHKPT_')
            bar.close()
        return

    # save model
    def save(self, directory=None):
      if directory is not None:
          path = '/content/drive/My Drive/Colab/' + directory + '/'
          if not os.path.exists(path):
              os.makedirs(path)
          self.model.save_pretrained('/content/drive/My Drive/Colab/' + directory + '/')
      else:
          self.model.save_pretrained('/content/drive/My Drive/Colab/model/')
    
    # predict values
    def test(self, data, is_print=False, is_loader=True, print_file_name=None, sentence_base='sentence', print_file=None):
        dev_loader = None
        if is_loader==True:
            dev_loader = data
        else:
            dev_loader = torch.utils.data.DataLoader(data, batch_size=self.BATCH_SIZE, sampler=torch.utils.data.SequentialSampler(data))

        if is_print == True and print_file_name is not None:
            test_samples = pd.read_table(print_file_name,header=0)
        self.model.eval()
        correct = 0
        total = 0
        i = 0
        for batch in dev_loader:
            input_ids = batch[0].cuda()
            input_segments = batch[1].cuda()
            labels = batch[2]

            # Uncomment this for BERT
            # outputs = self.model(input_ids = input_ids, token_type_ids = input_segments)

            # Uncomment this for DistilBERT
            outputs = self.model(input_ids = input_ids)
            probs = outputs[0]
            softmax = torch.nn.functional.softmax(probs, dim=1)
            prediction = torch.argmax(softmax, dim=1)
            # print(prediction)
            # print(len(prediction))
            for j in range(len(labels)):
                if labels[j] == prediction[j]:
                    correct += 1
                elif is_print==True:
                    print_file.write("Sentence 1: %s \n Sentence 2: %s \n Label (%d) vs Prediction (%d) \n \n ----- \n \n" %(test_samples.iloc[total][sentence_base+'1'], test_samples.iloc[total][sentence_base+'2'], labels[j], prediction[j]))
                total += 1
        return correct/total

## Train Model

In [0]:
bert_class = Model_Classifier(MODEL_CLASS, PRETRAINED, no_iterations=2, pretrained_path='/content/drive/My Drive/Colab/CHKPT_/')

In [17]:
bert_class.train(train_dataset, dev_dataset)

HBox(children=(IntProgress(value=0, max=11724), HTML(value='')))

1: Loss - tensor(0.2853, device='cuda:0', grad_fn=<NllLossBackward>)
1: Validation - 0.8754141492886377
1 Saved
1: Loss - tensor(0.0966, device='cuda:0', grad_fn=<NllLossBackward>)
1: Validation - 0.8751705320600273
1 Saved
1: Loss - tensor(0.3156, device='cuda:0', grad_fn=<NllLossBackward>)
1: Validation - 0.8723202104852855
1 Saved
1: Loss - tensor(0.1968, device='cuda:0', grad_fn=<NllLossBackward>)
1: Validation - 0.875048723445722
1 Saved
1: Loss - tensor(0.0946, device='cuda:0', grad_fn=<NllLossBackward>)
1: Validation - 0.8776554277918535
1 Saved
1: Loss - tensor(0.1770, device='cuda:0', grad_fn=<NllLossBackward>)
1: Validation - 0.8772900019489378
1 Saved
1: Loss - tensor(0.3710, device='cuda:0', grad_fn=<NllLossBackward>)
1: Validation - 0.8752192555057494
1 Saved
1: Loss - tensor(0.2701, device='cuda:0', grad_fn=<NllLossBackward>)
1: Validation - 0.8762911713116351
1 Saved
1: Loss - tensor(0.1806, device='cuda:0', grad_fn=<NllLossBackward>)
1: Validation - 0.8731485090625609
1

KeyboardInterrupt: ignored

In [0]:
bert_class.save('QQP_PAWS_trained_1')

## Test Model


In [0]:
orig_dev_dataset = CustomDataset(orig_dev_set)
paws_qqp_dev_dataset=None
if PAWS_QQP == True:
    paws_qqp_dev_dataset = CustomDataset(paws_qqp_dev_set)

In [12]:
# bert_class = Model_Classifier(MODEL_CLASS, PRETRAINED, pretrained_path='/content/drive/My Drive/Colab/QQP_trained_1/')
# acc = bert_class.test(orig_dev_dataset, is_loader=False)
# print("DistilBERT 1 QQP: %lf" %(acc))
# bert_class = Model_Classifier(MODEL_CLASS, PRETRAINED, pretrained_path='/content/drive/My Drive/Colab/QQP_trained_1/')
# acc = bert_class.test(paws_qqp_dev_dataset, is_loader=False)
# print("DistilBERT 1 QQP PAWS: %lf" %(acc))
print_file = open('/content/drive/My Drive/Colab/QQP_PAWS_trained_1/qqp_false.txt', 'w')
bert_class = Model_Classifier(MODEL_CLASS, PRETRAINED, pretrained_path='/content/drive/My Drive/Colab/QQP_PAWS_trained_1/')
acc = bert_class.test(orig_dev_dataset, is_loader=False, is_print=True, print_file_name='/content/drive/My Drive/Colab/data_qqp/dev.tsv', sentence_base='question', print_file=print_file)
print("DistilBERT 2 QQP: %lf" %(acc))
print_file.close()
print_file = open('/content/drive/My Drive/Colab/QQP_PAWS_trained_1/qqp_paws_false.txt', 'w')
bert_class = Model_Classifier(MODEL_CLASS, PRETRAINED, pretrained_path='/content/drive/My Drive/Colab/QQP_PAWS_trained_1/')
acc = bert_class.test(paws_qqp_dev_dataset, is_loader=False, is_print=True, print_file_name='/content/drive/My Drive/Colab/data_PAWS_qqp/dev.tsv', sentence_base='sentence', print_file=print_file)
print("DistilBERT 2 QQP PAWS: %lf" %(acc))
print_file.close()

DistilBERT 2 QQP: 0.881053
DistilBERT 2 QQP PAWS: 0.805022


# **Custom Model**

In [0]:
class Model:
    # initalize model
    def __init__(self, model_class, tokenizer_class, pretrained_weights):
        self.model = model_class.from_pretrained(pretrained_weights)
        
    # Generate embeddings
    def get_embeddings(self, data):
        return
        
    # 
    def train(self, data):
        return
    
    # 
    def test(self, data):
        return