In [1]:
!pip install transformers



In [2]:
# Mount Google Drive inside the Colab VM; every data/checkpoint path used
# below lives under /content/drive/My Drive/Colab/.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import torch
from transformers import *
import numpy as np
import scipy as scipy
import pandas as pd
import os
import ast
import tqdm as tqdm
import spacy


# Preferred torch device string. NOTE(review): the code visible in this part of
# the notebook calls .cuda() directly and does not read DEVICE.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [0]:
# Model class, tokenizer class and pretrained weight name used by the
# "BERT Model" section (re-bound later for the custom model).
MODEL_CLASS, TOKENIZER_CLASS, PRETRAINED = (DistilBertForSequenceClassification, DistilBertTokenizer, 'distilbert-base-cased')
# When True, the PAWS-QQP shards are appended to the QQP train/dev sets.
PAWS_QQP = True

In [0]:
# spaCy pipeline used by Tokenizer.tokenize_data to extract noun chunks.
SPACY_CORE = spacy.load("en_core_web_sm")

# **Generate QQP**

In [0]:
data_path = '/content/drive/My Drive/Colab/data_qqp/'

# Fixed seed so that re-running this cell reproduces the same train/dev/test
# partition instead of silently moving examples between the splits (which
# would leak dev/test rows into training data tokenized from an earlier run).
SPLIT_SEED = 42
# Rows per class held out of training: the first half becomes the test split,
# the second half the dev split.
HELD_OUT_PER_CLASS = 10000
HALF_HELD_OUT = HELD_OUT_PER_CLASS // 2

all_data = pd.read_table(os.path.expanduser(data_path + 'quora_duplicate_questions.tsv'), header=0)
# Use the PAWS column names so both corpora share one schema downstream.
all_data = all_data.rename(columns={'is_duplicate': 'label', 'question1': 'sentence1', 'question2': 'sentence2'})

class_1 = all_data[all_data['label']==1]
class_0 = all_data[all_data['label']==0]

# Hold out the same number of rows from each class; everything else is training data.
class_1_dev_test = class_1.sample(n=HELD_OUT_PER_CLASS, random_state=SPLIT_SEED)
class_1_train = class_1.drop(class_1_dev_test.index)

class_0_dev_test = class_0.sample(n=HELD_OUT_PER_CLASS, random_state=SPLIT_SEED)
class_0_train = class_0.drop(class_0_dev_test.index)

# Shuffle each split so the two classes are interleaved.
train_data = pd.concat([class_1_train, class_0_train])
train_data = train_data.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)
test_data = pd.concat([class_1_dev_test[:HALF_HELD_OUT], class_0_dev_test[:HALF_HELD_OUT]])
test_data = test_data.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)
dev_data = pd.concat([class_1_dev_test[HALF_HELD_OUT:], class_0_dev_test[HALF_HELD_OUT:]])
dev_data = dev_data.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)

In [0]:
# Persist the three splits as tab-separated files next to the raw QQP dump.
# The DataFrame index is written as an unnamed leading column.
train_data.to_csv(data_path + 'train.tsv', sep='\t')
dev_data.to_csv(data_path + 'dev.tsv', sep='\t')
test_data.to_csv(data_path + 'test.tsv', sep='\t')

# **Preprocess Data** 

In [0]:
# Drive sub-folder (under Colab/) whose TSV files are tokenized below.
DATA_FOLDER = 'data_PAWS_qqp'
# TSV files inside DATA_FOLDER to tokenize.
FILE_NAMES = ['dev.tsv']

In [0]:
class Tokenizer:
    """Turn sentence-pair rows into token features stored as parquet shards.

    Wraps a pretrained tokenizer created through
    ``tokenizer_class.from_pretrained`` and, for the un-grouped layout, the
    module-level spaCy pipeline ``SPACY_CORE`` that supplies noun chunks.
    """

    # Output column layouts of the two encodings produced by tokenize_data.
    GROUPED_COLUMNS = ['i', 's', 'a', 'y']
    SEPARATE_COLUMNS = ['i1', 's1', 'a1', 'n11', 'n12', 'm1', 'i2', 's2', 'a2', 'n21', 'n22', 'm2', 'y']
    # Number of rows written to each parquet shard.
    SHARD_ROWS = 10000
    # Fixed number of noun-chunk slots stored per sentence.
    MAX_NOUN_CHUNKS = 10
    # Accepted input column names, in order of preference. 'label\r' appears
    # when a CRLF file is read with lineterminator='\n'; the QQP names are the
    # raw Quora column names.
    SENTENCE1_COLUMNS = ('sentence1', 'question1')
    SENTENCE2_COLUMNS = ('sentence2', 'question2')
    LABEL_COLUMNS = ('label\r', 'label', 'is_duplicate')

    # init
    def __init__(self, tokenizer_class, pretrained_weights):
        """Load the pretrained tokenizer named by ``pretrained_weights``."""
        self.tokenizer = tokenizer_class.from_pretrained(pretrained_weights)

    @staticmethod
    def _resolve_column(data, candidates):
        """Return the first name in ``candidates`` that is a column of ``data``.

        Raises:
            KeyError: if none of the candidates is present.
        """
        for name in candidates:
            if name in data.columns:
                return name
        raise KeyError("none of the columns %s found in %s" % (list(candidates), list(data.columns)))

    def _encode_single(self, sentence):
        """Encode one sentence on its own, padded to 128 positions."""
        return self.tokenizer.encode_plus(sentence, return_token_type_ids=True, max_length=128, pad_to_max_length=True)

    def _noun_chunk_slots(self, sentence):
        """Return (starts, ends, mask) arrays describing up to MAX_NOUN_CHUNKS noun chunks.

        Unused slots keep start=0, end=1 and mask=0.
        NOTE(review): chunk.start / chunk.end come from spaCy, so they are
        presumably spaCy token offsets rather than wordpiece positions --
        confirm the alignment expected by code that slices model embeddings.
        """
        starts = np.zeros(self.MAX_NOUN_CHUNKS, dtype=int)
        ends = np.ones(self.MAX_NOUN_CHUNKS, dtype=int)
        mask = np.zeros(self.MAX_NOUN_CHUNKS, dtype=int)
        # zip stops after MAX_NOUN_CHUNKS chunks; any further chunks are dropped.
        for slot, chunk in zip(range(self.MAX_NOUN_CHUNKS), SPACY_CORE(sentence).noun_chunks):
            starts[slot] = chunk.start
            ends[slot] = chunk.end
            mask[slot] = 1
        return starts, ends, mask

    def _write_shard(self, records, columns, file_path, shard_index):
        """Write ``records`` to ``<file_path><shard_index>.parquet`` and return the frame."""
        shard = pd.DataFrame(records, columns=columns)
        shard.to_parquet(file_path + str(shard_index) + '.parquet')
        return shard

    # tokenize
    def tokenize_data(self, data, file_path, group_sent=True):
        """Tokenize every row of ``data`` and write the result as parquet shards.

        Args:
            data: DataFrame with a sentence-1 column, a sentence-2 column and a
                label column (see SENTENCE1_COLUMNS / SENTENCE2_COLUMNS /
                LABEL_COLUMNS for the accepted names).
            file_path: path prefix of the shards; shard ``k`` is written to
                ``file_path + str(k) + '.parquet'``.
            group_sent: when true, both sentences are encoded together
                (columns i/s/a/y, 256 positions). Otherwise each sentence is
                encoded separately (128 positions) and noun-chunk slots are
                added.

        Returns:
            DataFrame holding only the rows of the final, partial shard; it is
            empty when the row count is a multiple of SHARD_ROWS (or zero).

        Raises:
            KeyError: if a required input column is missing from non-empty data.
        """
        columns = self.GROUPED_COLUMNS if group_sent else self.SEPARATE_COLUMNS
        if len(data) > 0:
            sent1_col = self._resolve_column(data, self.SENTENCE1_COLUMNS)
            sent2_col = self._resolve_column(data, self.SENTENCE2_COLUMNS)
            label_col = self._resolve_column(data, self.LABEL_COLUMNS)

        bar = tqdm.notebook.tqdm(total=len(data))
        # Rows are buffered in a plain list and converted once per shard;
        # appending to a DataFrame row by row copies the frame every time.
        records = []
        shard_index = 0
        try:
            for _, row in data.iterrows():
                sent1 = str(row[sent1_col])
                sent2 = str(row[sent2_col])
                token_row = {'y': int(row[label_col])}

                if group_sent:
                    encoding = self.tokenizer.encode_plus(sent1, sent2, return_token_type_ids=True, max_length=256, pad_to_max_length=True)
                    token_row['i'] = encoding['input_ids']
                    token_row['s'] = encoding['token_type_ids']
                    token_row['a'] = encoding['attention_mask']
                else:
                    for suffix, sentence in (('1', sent1), ('2', sent2)):
                        encoding = self._encode_single(sentence)
                        token_row['i' + suffix] = encoding['input_ids']
                        token_row['s' + suffix] = encoding['token_type_ids']
                        token_row['a' + suffix] = encoding['attention_mask']
                        starts, ends, mask = self._noun_chunk_slots(sentence)
                        token_row['n' + suffix + '1'] = starts
                        token_row['n' + suffix + '2'] = ends
                        token_row['m' + suffix] = mask

                records.append(token_row)
                bar.update()
                if len(records) == self.SHARD_ROWS:
                    self._write_shard(records, columns, file_path, shard_index)
                    records = []
                    shard_index += 1

            if records:
                tokenized = self._write_shard(records, columns, file_path, shard_index)
            else:
                tokenized = pd.DataFrame(records, columns=columns)
        finally:
            # Close the progress bar even when the run is interrupted.
            bar.close()
        return tokenized

In [26]:
# Tokenize every file listed in FILE_NAMES from DATA_FOLDER and write the
# "separate" (one encoding per sentence + noun chunks) parquet shards.
data_path = '/content/drive/My Drive/Colab/' + DATA_FOLDER + '/'
tokenizer = Tokenizer(TOKENIZER_CLASS, PRETRAINED)

for file_name in FILE_NAMES:
    # NOTE(review): error_bad_lines is deprecated in newer pandas releases
    # (on_bad_lines='skip' is the replacement) -- confirm the pinned version.
    data = pd.read_csv(os.path.expanduser(data_path+file_name), sep='\t', lineterminator='\n', error_bad_lines=False)
    # Rows with any missing field are dropped before tokenization.
    data = data.dropna()
    # Shards are written to <data_path><file stem>_seperate<k>.parquet.
    tokens = tokenizer.tokenize_data(data, data_path + file_name.split('.')[0] + '_seperate', group_sent=False)
    # tokens = tokenizer.tokenize_data(data, data_path + file_name.split('.')[0] + '_grouped')
    # tokens.to_csv(data_path + file_name.split('.')[0] + '.csv')
    # display(tokens)

HBox(children=(IntProgress(value=0, max=677), HTML(value='')))

What were the major effects of the cambodia earthquake , and how do these effects compare to the Iquique earthquake in 1877 ?
0
What
the major effects
the cambodia earthquake
these effects
the Iquique earthquake
What were the major effects of the Iquique earthquake , and how do these effects compare to the cambodia earthquake in 1877 ?
What
the major effects
the Iquique earthquake
these effects
the cambodia earthquake
The guy I 'm dating never texts me and I feel like he does n't care about me but when I see him he shows me he likes me and wants me . Why do I feel this way ?
0
The guy
I
me
I
he
me
I
him
he
me
The guy I 'm dating never wants me and I feel like he does n't care about me but when I see him , he shows me he likes me and texts me . Why do I feel this way ?
The guy
me
I
he
me
I
him
he
me
he
How do I make my new phone number as group admin when I do n't have access to my old number ( which is the group admin at present ) on whatsapp ?
0
I
my new phone number
group admin
I
acc

KeyboardInterrupt: ignored

In [0]:
# Sanity check: read back the first PAWS-QQP dev shard and display it.
data = pd.read_parquet('/content/drive/My Drive/Colab/data_PAWS_qqp/dev_seperate0.parquet')
display(data)

Unnamed: 0,i1,s1,a1,n11,n12,m1,i2,s2,a2,n21,n22,m2,y
0,"[101, 1327, 1127, 1103, 1558, 3154, 1104, 1103...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 2, 6, 13, 17, 0, 0, 0, 0, 0]","[1, 5, 9, 15, 20, 1, 1, 1, 1, 1]","[1, 1, 1, 1, 1, 0, 0, 0, 0, 0]","[101, 1327, 1127, 1103, 1558, 3154, 1104, 1103...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 2, 6, 13, 17, 0, 0, 0, 0, 0]","[1, 5, 9, 15, 20, 1, 1, 1, 1, 1]","[1, 1, 1, 1, 1, 0, 0, 0, 0, 0]",0
1,"[101, 1109, 2564, 146, 112, 182, 4676, 1309, 6...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 2, 8, 10, 13, 18, 21, 23, 24, 26]","[2, 3, 9, 11, 14, 19, 22, 24, 25, 27]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[101, 1109, 2564, 146, 112, 182, 4676, 1309, 3...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 8, 10, 13, 18, 21, 23, 25, 27, 28]","[2, 9, 11, 14, 19, 22, 24, 26, 28, 29]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",0
2,"[101, 1731, 1202, 146, 1294, 1139, 1207, 2179,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2, 4, 9, 12, 16, 18, 24, 31, 0, 0]","[3, 8, 11, 13, 17, 21, 27, 32, 1, 1]","[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]","[101, 1731, 1202, 146, 1294, 1139, 1385, 2179,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2, 4, 9, 12, 16, 18, 24, 31, 0, 0]","[3, 8, 11, 13, 17, 21, 27, 32, 1, 1]","[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]",0
3,"[101, 2009, 11019, 183, 112, 189, 2182, 8658, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 5, 13, 18, 19, 0, 0, 0, 0, 0]","[4, 8, 17, 19, 20, 1, 1, 1, 1, 1]","[1, 1, 1, 1, 1, 0, 0, 0, 0, 0]","[101, 2009, 11019, 183, 112, 189, 2182, 8658, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 5, 15, 20, 21, 0, 0, 0, 0, 0]","[4, 10, 19, 21, 22, 1, 1, 1, 1, 1]","[1, 1, 1, 1, 1, 0, 0, 0, 0, 0]",0
4,"[101, 2091, 4112, 1535, 1176, 1689, 3141, 1441...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 4, 7, 0, 0, 0, 0, 0, 0, 0]","[3, 7, 13, 1, 1, 1, 1, 1, 1, 1]","[1, 1, 1, 0, 0, 0, 0, 0, 0, 0]","[101, 2091, 1689, 3141, 1535, 1176, 4112, 1441...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 5, 7, 0, 0, 0, 0, 0, 0, 0]","[4, 7, 13, 1, 1, 1, 1, 1, 1, 1]","[1, 1, 1, 0, 0, 0, 0, 0, 0, 0]",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
672,"[101, 2825, 170, 2900, 1497, 7349, 1587, 1165,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 9, 12, 15, 19, 27, 0, 0, 0, 0]","[5, 10, 13, 18, 26, 28, 1, 1, 1, 1]","[1, 1, 1, 1, 1, 1, 0, 0, 0, 0]","[101, 2825, 170, 2900, 1137, 1664, 118, 2900, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 9, 15, 18, 21, 25, 0, 0, 0, 0]","[8, 10, 16, 19, 24, 29, 1, 1, 1, 1]","[1, 1, 1, 1, 1, 1, 0, 0, 0, 0]",0
673,"[101, 2421, 112, 188, 1474, 146, 1202, 183, 11...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 3, 7, 14, 16, 21, 0, 0, 0, 0]","[2, 4, 9, 15, 18, 22, 1, 1, 1, 1]","[1, 1, 1, 1, 1, 1, 0, 0, 0, 0]","[101, 2421, 112, 188, 1474, 146, 1225, 183, 11...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 3, 7, 14, 16, 21, 0, 0, 0, 0]","[2, 4, 9, 15, 18, 22, 1, 1, 1, 1]","[1, 1, 1, 1, 1, 1, 0, 0, 0, 0]",0
674,"[101, 1731, 1110, 6275, 10678, 4013, 1107, 847...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2, 6, 9, 11, 0, 0, 0, 0, 0, 0]","[4, 7, 10, 14, 1, 1, 1, 1, 1, 1]","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0]","[101, 1731, 1110, 6275, 10678, 4013, 1107, 651...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2, 6, 9, 11, 0, 0, 0, 0, 0, 0]","[4, 7, 10, 14, 1, 1, 1, 1, 1, 1]","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0]",0
675,"[101, 2181, 144, 24723, 2312, 12166, 1239, 178...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 5, 9, 12, 16, 0, 0, 0, 0, 0]","[4, 7, 11, 14, 18, 1, 1, 1, 1, 1]","[1, 1, 1, 1, 1, 0, 0, 0, 0, 0]","[101, 2181, 144, 24723, 2312, 12166, 1239, 178...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 5, 9, 12, 16, 0, 0, 0, 0, 0]","[4, 7, 11, 14, 18, 1, 1, 1, 1, 1]","[1, 1, 1, 1, 1, 0, 0, 0, 0, 0]",1


# **BERT Model**

## **Load Data**
- TBD: Create DataLoader

In [0]:
# Drive folders holding the tokenized parquet shards of each corpus.
BASE_QQP_PATH = '/content/drive/My Drive/Colab/data_qqp/'
BASE_PAWS_QQP_PATH = '/content/drive/My Drive/Colab/data_PAWS_qqp/'

In [0]:
# Load QQP sets
data_list = [ pd.read_parquet(BASE_QQP_PATH + 'train_grouped' + str(i) + '.parquet') for i in range(0, 37) ]
data = pd.concat(data_list)
# data = lit_eval(data)
train_set = data
# display(train_set)

In [0]:
# Load PAWS QQP if generated
# Optionally extend the training set with the two grouped PAWS-QQP shards.
if PAWS_QQP == True:    
    data_list = [ pd.read_parquet(BASE_PAWS_QQP_PATH + 'train_grouped' + str(i) + '.parquet') for i in range(0, 2) ]
    data = pd.concat(data_list)
    train_set = pd.concat([train_set, data])

# print(train_set.shape)
# df = train_set.loc[train_set['y']==1]
# print(df.shape)

In [0]:
# Load the five grouped QQP dev shards. orig_dev_set keeps the QQP-only dev
# data; dev_set may be extended with PAWS-QQP in the next cell.
data_list = [ pd.read_parquet(BASE_QQP_PATH + 'dev_grouped' + str(i) + '.parquet') for i in range(0, 5) ]
data = pd.concat(data_list)
dev_set = data
orig_dev_set = data


# data = pd.read_csv(BASE_QQP_PATH + 'test.csv')
# data['i'] = data['i'].apply(lambda x: ast.literal_eval(x))
# data['s'] = data['s'].apply(lambda x: ast.literal_eval(x))
# test_set = data

In [0]:
# Optionally load the single grouped PAWS-QQP dev shard; it is kept on its own
# (paws_qqp_dev_set) and also appended to the combined dev_set.
paws_qqp_dev_set = None
if PAWS_QQP == True: 
    data_list = [ pd.read_parquet(BASE_PAWS_QQP_PATH + 'dev_grouped' + str(i) + '.parquet') for i in range(0, 1) ]
    data = pd.concat(data_list)
    # data = lit_eval(data)
    paws_qqp_dev_set = data
    dev_set = pd.concat([dev_set, data])
    # display(dev_set)


# print(dev_set.shape)
# df = dev_set.loc[dev_set['y']==1]
# print(df.shape)

## DataLoader

In [0]:
class CustomDataset():
    """Index-addressable view over a grouped token frame.

    Only the columns 'i' (input ids), 's' (token type ids) and 'y' (label) are
    read from ``data``; each item is a 3-tuple of tensors in that order.
    """

    def __init__(self, data):
        # Attribute name -> source column of the frame.
        column_of = {'inputs': 'i', 'sentence_tokens': 's', 'labels': 'y'}
        for attribute, column in column_of.items():
            setattr(self, attribute, data[column].values)

    def __len__(self):
        """Number of examples, taken from the label column."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return (input ids, token type ids, label) for example ``idx`` as tensors."""
        fields = (self.inputs, self.sentence_tokens, self.labels)
        return tuple(torch.tensor(field[idx]) for field in fields)

In [0]:
# Wrap the grouped train/dev frames for use with torch DataLoader.
train_dataset = CustomDataset(train_set)
dev_dataset = CustomDataset(dev_set)
# print(dev_dataset.__getitem__(-1))

## Model Code

In [0]:
class Model_Classifier:
    """Fine-tune and evaluate a sequence-classification model.

    Batches are expected as (input ids, token type ids, labels); only the
    input ids and labels are fed to the model (DistilBERT configuration).
    NOTE(review): no attention mask is passed, so padded positions are visible
    to the model -- confirm this is intended before comparing with runs that
    mask padding.
    """

    # initialize model
    def __init__(self, model_class, pretrained_weights, pretrained_path=None, no_iterations=10):
        """Load the model and set the training hyper-parameters.

        Args:
            model_class: class exposing ``from_pretrained``.
            pretrained_weights: weight name used when ``pretrained_path`` is None.
            pretrained_path: optional directory of a saved checkpoint to load instead.
            no_iterations: number of passes over the training data.
        """
        # Use the GPU when one is visible, otherwise fall back to the CPU
        # instead of failing inside .cuda().
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        source = pretrained_path if pretrained_path is not None else pretrained_weights
        self.model = model_class.from_pretrained(source).to(self.device)

        self.TRAINING_ITERATIONS = no_iterations
        self.WARMUP = 1000
        self.BATCH_SIZE = 32
        self.REPORT_FREQUENCY = 100
        self.CHKPT_FREQUENCY = 500

    def train_batch(self, train_data):
        """Run one optimisation step on ``train_data`` and return the loss tensor.

        Requires ``self.optimizer`` and ``self.scheduler``, which ``train`` creates.
        """
        train_ids = train_data[0].to(self.device)
        train_labels = train_data[2].to(self.device)

        self.model.train()
        # For BERT, additionally pass token_type_ids=train_data[1].to(self.device).
        outputs = self.model(input_ids=train_ids, labels=train_labels)
        loss = outputs[0]
        loss.backward()
        self.optimizer.step()
        self.scheduler.step()
        self.model.zero_grad()
        return loss

    # train classifier
    def train(self, data, validation_set):
        """Fine-tune on ``data``; every CHKPT_FREQUENCY batches validate and checkpoint.

        Args:
            data: training dataset accepted by torch DataLoader.
            validation_set: dataset evaluated at each checkpoint.
        """
        no_decay = ["bias", "LayerNorm.weight"]
        # Both parameter groups currently use weight_decay=0.0; the split is
        # kept so a decay value can be set for the first group only.
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
            {"params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
        ]
        self.optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5, eps=1e-8)
        train_loader = torch.utils.data.DataLoader(data, batch_size=self.BATCH_SIZE, sampler=torch.utils.data.RandomSampler(data))
        dev_loader = torch.utils.data.DataLoader(validation_set, batch_size=self.BATCH_SIZE, sampler=torch.utils.data.RandomSampler(validation_set))
        # Warm up over the first tenth of one epoch, then decay linearly.
        self.scheduler = get_linear_schedule_with_warmup(self.optimizer, num_warmup_steps=len(train_loader)//10, num_training_steps=(self.TRAINING_ITERATIONS+1)*len(train_loader))
        for i in range(1, self.TRAINING_ITERATIONS+1):
            # count starts at 1 and is incremented before the check, so the
            # first checkpoint of an epoch happens after CHKPT_FREQUENCY-1 batches.
            count = 1
            bar = tqdm.notebook.tqdm(total=len(train_loader))
            for batch in train_loader:
                count += 1
                loss = self.train_batch(batch)
                bar.update(1)
                if count%self.CHKPT_FREQUENCY == 0:
                    bar.write("%d: Loss - %s" %(i, loss))
                    acc = self.test(dev_loader)
                    bar.write("%d: Validation - %s" %(i, str(acc)))
                    bar.write("%d Saved" %(i))
                    self.save(directory='CHKPT_')
            bar.close()
        return

    # save model
    def save(self, directory=None):
        """Save the model under Colab/<directory>/ (Colab/model/ when None), creating it if needed."""
        folder = directory if directory is not None else 'model'
        path = '/content/drive/My Drive/Colab/' + folder + '/'
        os.makedirs(path, exist_ok=True)
        self.model.save_pretrained(path)

    # predict values
    def test(self, data, is_print=False, is_loader=True, print_file_name=None, sentence_base='sentence', print_file=None):
        """Return the accuracy of the model on ``data``.

        Args:
            data: an iterable of batches when ``is_loader`` is true, otherwise
                a dataset that is wrapped in a sequential DataLoader.
            is_print: when true, every misclassified example is written to
                ``print_file``. Row ``k`` of ``print_file_name`` must describe
                example ``k``, so use sequential iteration.
            is_loader: see ``data``.
            print_file_name: TSV with columns ``<sentence_base>1`` / ``<sentence_base>2``.
            sentence_base: column name prefix of the two sentences.
            print_file: writable text file object receiving the error report.

        Returns:
            Fraction of correctly classified examples; 0.0 when there are none.

        Raises:
            ValueError: if ``is_print`` is set without both report arguments.
        """
        if is_loader:
            dev_loader = data
        else:
            dev_loader = torch.utils.data.DataLoader(data, batch_size=self.BATCH_SIZE, sampler=torch.utils.data.SequentialSampler(data))

        test_samples = None
        if is_print:
            if print_file_name is None or print_file is None:
                raise ValueError("is_print=True requires both print_file_name and print_file")
            test_samples = pd.read_table(print_file_name, header=0)

        self.model.eval()
        correct = 0
        total = 0
        # Evaluation needs no gradients; skipping the autograd graph saves memory.
        with torch.no_grad():
            for batch in dev_loader:
                input_ids = batch[0].to(self.device)
                labels = batch[2].cpu()

                # For BERT, additionally pass token_type_ids=batch[1].to(self.device).
                outputs = self.model(input_ids=input_ids)
                # argmax over the logits equals argmax over their softmax.
                prediction = torch.argmax(outputs[0], dim=1).cpu()
                matches = prediction == labels
                correct += int(matches.sum().item())
                if is_print:
                    for j in torch.nonzero(~matches).flatten().tolist():
                        sample = test_samples.iloc[total + j]
                        print_file.write("Sentence 1: %s \n Sentence 2: %s \n Label (%d) vs Prediction (%d) \n \n ----- \n \n" %(sample[sentence_base+'1'], sample[sentence_base+'2'], int(labels[j]), int(prediction[j])))
                total += len(labels)
        return correct/total if total else 0.0

## Train Model

In [0]:
# Resume from the checkpoint directory that Model_Classifier.train writes to
# (CHKPT_); trains for two passes over the data.
bert_class = Model_Classifier(MODEL_CLASS, PRETRAINED, no_iterations=2, pretrained_path='/content/drive/My Drive/Colab/CHKPT_/')

In [0]:
# Fine-tune on the training set, validating on the combined dev set at every checkpoint.
bert_class.train(train_dataset, dev_dataset)

HBox(children=(IntProgress(value=0, max=11724), HTML(value='')))

1: Loss - tensor(0.2853, device='cuda:0', grad_fn=<NllLossBackward>)
1: Validation - 0.8754141492886377
1 Saved
1: Loss - tensor(0.0966, device='cuda:0', grad_fn=<NllLossBackward>)
1: Validation - 0.8751705320600273
1 Saved
1: Loss - tensor(0.3156, device='cuda:0', grad_fn=<NllLossBackward>)
1: Validation - 0.8723202104852855
1 Saved
1: Loss - tensor(0.1968, device='cuda:0', grad_fn=<NllLossBackward>)
1: Validation - 0.875048723445722
1 Saved
1: Loss - tensor(0.0946, device='cuda:0', grad_fn=<NllLossBackward>)
1: Validation - 0.8776554277918535
1 Saved
1: Loss - tensor(0.1770, device='cuda:0', grad_fn=<NllLossBackward>)
1: Validation - 0.8772900019489378
1 Saved
1: Loss - tensor(0.3710, device='cuda:0', grad_fn=<NllLossBackward>)
1: Validation - 0.8752192555057494
1 Saved
1: Loss - tensor(0.2701, device='cuda:0', grad_fn=<NllLossBackward>)
1: Validation - 0.8762911713116351
1 Saved
1: Loss - tensor(0.1806, device='cuda:0', grad_fn=<NllLossBackward>)
1: Validation - 0.8731485090625609
1

KeyboardInterrupt: ignored

In [0]:
# Save the fine-tuned model to Colab/QQP_PAWS_trained_1/ on Drive.
bert_class.save('QQP_PAWS_trained_1')

## Test Model


In [0]:
# Separate evaluation datasets: QQP-only dev data and, when enabled, PAWS-QQP dev data.
orig_dev_dataset = CustomDataset(orig_dev_set)
paws_qqp_dev_dataset=None
if PAWS_QQP == True:
    paws_qqp_dev_dataset = CustomDataset(paws_qqp_dev_set)

In [16]:
# Compare two saved checkpoints on the PAWS-QQP dev set.
# NOTE(review): the checkpoints are read from Colab/BERT_Trained/..., whereas
# save() above writes to Colab/<name>/ -- presumably they were moved by hand.
bert_class = Model_Classifier(MODEL_CLASS, PRETRAINED, pretrained_path='/content/drive/My Drive/Colab/BERT_Trained/QQP_trained_1/')
acc = bert_class.test(paws_qqp_dev_dataset, is_loader=False)
print("DistilBERT Train=QQP; Test=PAWS: %lf" %(acc))
bert_class = Model_Classifier(MODEL_CLASS, PRETRAINED, pretrained_path='/content/drive/My Drive/Colab/BERT_Trained/QQP_PAWS_trained_1/')
acc = bert_class.test(paws_qqp_dev_dataset, is_loader=False)
print("DistilBERT Train=QQP+PAWS; Test=PAWS: %lf" %(acc))

# If you want to print the errors
# print_file = open('/content/drive/My Drive/Colab/BERT_Trained/QQP_PAWS_trained_1/qqp_false.txt', 'w')
# bert_class = Model_Classifier(MODEL_CLASS, PRETRAINED, pretrained_path='/content/drive/My Drive/Colab/BERT_Trained/QQP_PAWS_trained_1/')
# acc = bert_class.test(orig_dev_dataset, is_loader=False, is_print=True, print_file_name='/content/drive/My Drive/Colab/data_qqp/dev.tsv', sentence_base='question', print_file=print_file)
# print("DistilBERT 2 QQP: %lf" %(acc))
# print_file.close()
# print_file = open('/content/drive/My Drive/Colab/BERT_Trained/QQP_PAWS_trained_1/qqp_paws_false.txt', 'w')
# # bert_class = Model_Classifier(MODEL_CLASS, PRETRAINED, pretrained_path='/content/drive/My Drive/Colab/BERT_Trained/QQP_PAWS_trained_1/')
# acc = bert_class.test(paws_qqp_dev_dataset, is_loader=False, is_print=True, print_file_name='/content/drive/My Drive/Colab/data_PAWS_qqp/dev.tsv', sentence_base='sentence', print_file=print_file)
# print("DistilBERT 2 QQP PAWS: %lf" %(acc))
# print_file.close()



DistilBERT Train=QQP; Test=PAWS: 0.317578
DistilBERT Train=QQP+PAWS; Test=PAWS: 0.805022


# **Custom Model**

## Load Data

In [0]:
# Drive folders holding the tokenized parquet shards (same values as in the
# "BERT Model" section; repeated so this section can be run on its own).
BASE_QQP_PATH = '/content/drive/My Drive/Colab/data_qqp/'
BASE_PAWS_QQP_PATH = '/content/drive/My Drive/Colab/data_PAWS_qqp/'

In [0]:
# Load QQP sets
data_list = [ pd.read_parquet(BASE_QQP_PATH + 'train_seperate' + str(i) + '.parquet') for i in range(0, 37) ]
data = pd.concat(data_list)
# data = lit_eval(data)
train_set = data
# display(train_set)

In [0]:
# Load PAWS QQP if generated
# Optionally load the two separate-layout PAWS-QQP training shards; they are
# kept on their own (paws_train_set) and appended to train_set.
paws_train_set = None
if PAWS_QQP == True:    
    data_list = [ pd.read_parquet(BASE_PAWS_QQP_PATH + 'train_seperate' + str(i) + '.parquet') for i in range(0, 2) ]
    data = pd.concat(data_list)
    paws_train_set = data
    train_set = pd.concat([train_set, data])

# print(train_set.shape)
# df = train_set.loc[train_set['y']==1]
# print(df.shape)

In [0]:
# Load the five separate-layout QQP dev shards; orig_dev_set keeps the
# QQP-only dev data while dev_set may be extended in the next cell.
data_list = [ pd.read_parquet(BASE_QQP_PATH + 'dev_seperate' + str(i) + '.parquet') for i in range(0, 5) ]
data = pd.concat(data_list)
dev_set = data
orig_dev_set = data


# data = pd.read_csv(BASE_QQP_PATH + 'test.csv')
# data['i'] = data['i'].apply(lambda x: ast.literal_eval(x))
# data['s'] = data['s'].apply(lambda x: ast.literal_eval(x))
# test_set = data

In [0]:
# Optionally load the single separate-layout PAWS-QQP dev shard; it is kept on
# its own (paws_qqp_dev_set) and also appended to the combined dev_set.
paws_qqp_dev_set = None
if PAWS_QQP == True: 
    data_list = [ pd.read_parquet(BASE_PAWS_QQP_PATH + 'dev_seperate' + str(i) + '.parquet') for i in range(0, 1) ]
    data = pd.concat(data_list)
    # data = lit_eval(data)
    paws_qqp_dev_set = data
    dev_set = pd.concat([dev_set, data])
    # display(dev_set)


# print(dev_set.shape)
# df = dev_set.loc[dev_set['y']==1]
# print(df.shape)

## Data Loader

In [0]:
class CustomDataset():
    """Index-addressable view over a separate-layout token frame.

    Reads, for each of the two sentences, the input ids (i*), token type ids
    (s*), attention mask (a*), noun-chunk starts (n*1), ends (n*2) and slot
    mask (m*), plus the label column 'y'. This definition replaces the
    grouped-layout CustomDataset defined earlier in the notebook.
    """

    def __init__(self, data):
        self.inputs1, self.inputs2 = data['i1'].values, data['i2'].values
        self.sentence_tokens1, self.sentence_tokens2 = data['s1'].values, data['s2'].values
        self.attention_masks1, self.attention_masks2 = data['a1'].values, data['a2'].values
        self.starts1, self.starts2 = data['n11'].values, data['n21'].values
        self.ends1, self.ends2 = data['n12'].values, data['n22'].values
        self.masks1, self.masks2 = data['m1'].values, data['m2'].values
        self.labels = data['y'].values

    def __len__(self):
        """Number of examples, taken from the label column."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the 13 tensors of example ``idx``.

        Order: ids1, segments1, attention1, ids2, segments2, attention2,
        starts1, ends1, masks1, starts2, ends2, masks2, label.
        """
        feature_arrays = (
            self.inputs1, self.sentence_tokens1, self.attention_masks1,
            self.inputs2, self.sentence_tokens2, self.attention_masks2,
            self.starts1, self.ends1, self.masks1,
            self.starts2, self.ends2, self.masks2,
        )
        item = [torch.tensor(array[idx]) for array in feature_arrays]
        item.append(torch.tensor(self.labels[idx], dtype=int))
        return tuple(item)

In [0]:
# Wrap the separate-layout train/dev frames; these re-bind the dataset names
# used by the "BERT Model" section.
train_dataset = CustomDataset(train_set)
dev_dataset = CustomDataset(dev_set)
# print(dev_dataset.__getitem__(-1))

## Model

In [0]:
# The custom model uses the bare DistilBERT encoder (no classification head).
MODEL_CLASS, TOKENIZER_CLASS, PRETRAINED = (DistilBertModel, DistilBertTokenizer, 'distilbert-base-cased')

### Cross_Attn_Model_with_+


In [0]:
class Model(torch.nn.Module):
    """Sentence-pair classifier combining sentence-level and noun-chunk-level cross attention.

    Each sentence is embedded by the shared encoder ``self.model``. Two
    feature vectors are then built: one from cross-attended, mask-averaged
    sentence embeddings and one from cross-attended noun-chunk pools. Their
    concatenation is mapped to two class probabilities.

    NOTE(review): comparator3 ends in Softmax, so forward returns
    probabilities; if the training loop (not visible here) uses
    CrossEntropyLoss the softmax would be applied twice -- confirm.
    """

    def __init__(self, model_class, pretrained_weights):
        """Load the encoder via ``model_class.from_pretrained`` and create the comparison heads."""
        super(Model, self).__init__()
        self.model = model_class.from_pretrained(pretrained_weights)

        # 1538 = 2 * 768 sentence features + euclidean distance + cosine similarity.
        self.sentence_comparator = torch.nn.Sequential(torch.nn.Linear(1538, 1024), torch.nn.GELU(), torch.nn.LayerNorm(1024), torch.nn.Dropout(p=0.2))
        # 15360 = 2 sentences * 10 chunk slots * 768 features.
        self.phrase_comparator1 = torch.nn.Sequential(torch.nn.Linear(15360, 4096), torch.nn.GELU(), torch.nn.Dropout(p=0.2))
        self.phrase_comparator2 = torch.nn.Sequential(torch.nn.Linear(4096, 1024), torch.nn.GELU(), torch.nn.LayerNorm(1024))
        # self.phrase_comparator = torch.nn.Sequential(torch.nn.Linear(15360, 1024), torch.nn.GELU(), torch.nn.LayerNorm(1024))
        self.comparator1 = torch.nn.Sequential(torch.nn.Linear(2048, 512), torch.nn.GELU(), torch.nn.LayerNorm(512))
        self.comparator2 = torch.nn.Sequential(torch.nn.Linear(512, 128))
        self.comparator3 = torch.nn.Sequential(torch.nn.Linear(128, 2), torch.nn.Softmax(dim=1))
        self.cos_fn = torch.nn.CosineSimilarity(dim=1)
        self.dist_fn = torch.nn.PairwiseDistance(keepdim=True)
        # Projections + attention used for noun-chunk pooling.
        self.q_linear = torch.nn.Linear(768, 768)
        self.k_linear = torch.nn.Linear(768, 768)
        self.v_linear = torch.nn.Linear(768, 768)
        self.attention_head = torch.nn.MultiheadAttention(768, 8)
        # Projections + attention used for whole-sentence pooling.
        self.q_linear2 = torch.nn.Linear(768, 768)
        self.k_linear2 = torch.nn.Linear(768, 768)
        self.v_linear2 = torch.nn.Linear(768, 768)
        self.attention_head2 = torch.nn.MultiheadAttention(768, 8)

    def embed_data(self, tokens, segments, attentions):
        """Run the encoder on one sentence batch and return its raw outputs.

        ``segments`` is accepted for interface parity with BERT but is not
        passed to the encoder (DistilBERT configuration).
        """
        # For BERT use: self.model(input_ids = tokens, token_type_ids = segments)
        outputs = self.model(input_ids = tokens, attention_mask=attentions)
        return outputs

    def pool_embeds(self, embeds, other_embeds, attentions):
        """Cross-attend ``embeds`` with queries from ``other_embeds`` and mask-average over positions.

        Args:
            embeds: token embeddings of this sentence, indexed (batch, position, feature).
            other_embeds: token embeddings of the other sentence, same shape.
            attentions: (batch, position) attention mask of this sentence.

        Returns:
            (batch, feature) tensor: residual-attended embeddings averaged
            over the positions selected by ``attentions``.
        """
        # MultiheadAttention here is position-first, hence the transposes.
        q = self.q_linear2(other_embeds).transpose(0, 1)
        k = self.k_linear2(embeds).transpose(0, 1)
        v = self.v_linear2(embeds).transpose(0, 1)

        new_embeds, weights = self.attention_head2(q, k, v)
        new_embeds = new_embeds.transpose(0, 1) + embeds

        masks = attentions.unsqueeze(-1).expand(embeds.size()).float()
        sum_embeds = torch.sum(new_embeds*masks, 1)
        normal_factor = torch.sum(masks, 1)
        pool = sum_embeds/normal_factor
        return pool

    def pool_phrase_chunks(self, embeds, other_embeds, starts, ends, masks):
        """Build one pooled vector per noun-chunk slot and concatenate them.

        For every slot the embeddings outside ``[start, end)`` are zeroed, the
        result is cross-attended with queries from ``other_embeds``, and the
        positions inside the span are averaged. Slots whose mask is 0
        contribute zeros.

        Args:
            embeds: (batch, position, feature) embeddings of this sentence.
            other_embeds: embeddings of the other sentence, same shape.
            starts, ends: (batch, slots) integer span boundaries; expected to
                be non-negative.
            masks: (batch, slots) 1 for real chunks, 0 for unused slots.

        Returns:
            (batch, slots * feature) tensor on the device of ``embeds``.
        """
        device = embeds.device
        batch_size, seq_len, _ = embeds.shape
        n_slots = starts.shape[1]
        if n_slots == 0:
            return embeds.new_zeros((batch_size, 0))

        starts = starts.to(device)
        ends = ends.to(device)
        slot_masks = masks.to(device=device, dtype=embeds.dtype)
        positions = torch.arange(seq_len, device=device).unsqueeze(0)

        # Queries and keys do not depend on the slot, so project them once.
        q = self.q_linear(other_embeds).transpose(0, 1)
        k = self.k_linear(embeds).transpose(0, 1)

        pooled_slots = []
        for j in range(n_slots):
            # 1.0 at positions inside [start, end) of slot j, shape (batch, position, 1).
            inside = (positions >= starts[:, j:j+1]) & (positions < ends[:, j:j+1])
            filters = inside.unsqueeze(-1).to(embeds.dtype)

            spl = embeds*filters
            v = self.v_linear(spl).transpose(0, 1)

            new_embeds, weights = self.attention_head(q, k, v)
            new_embeds = new_embeds.transpose(0, 1) + spl

            sum_embeds = torch.sum(new_embeds*filters, 1)
            # A span lying entirely outside the sequence selects no position;
            # clamp so it pools to zeros instead of 0/0 = NaN.
            normal_factor = torch.sum(filters, 1).clamp(min=1)
            pooled_slots.append(sum_embeds/normal_factor*slot_masks[:, j:j+1])

        return torch.cat(pooled_slots, 1)

    def compare_chunk_pools(self, pools1, pools2):
        """Map the concatenated chunk pools of both sentences to a 1024-d feature."""
        pools = torch.cat((pools1, pools2), 1)
        comp1 = self.phrase_comparator1(pools)
        return self.phrase_comparator2(comp1)

    def compare_sentences(self, sentence1, sentence2):
        """Map both sentence vectors plus their euclidean and cosine scores to a 1024-d feature."""
        dist_euc = self.dist_fn(sentence1, sentence2)
        dist_cos = self.cos_fn(sentence1, sentence2).unsqueeze(-1)
        features = torch.cat((sentence1, sentence2, dist_euc, dist_cos), 1)
        return self.sentence_comparator(features)

    def compute_siamese(self, feature_up, feature_down):
        """Concatenate the two comparison features and map them to class probabilities."""
        features = torch.cat((feature_up, feature_down), 1)
        # The former body called a non-existent self.comparator; the head is
        # the comparator1 -> comparator2 -> comparator3 chain.
        return self.comparator3(self.comparator2(self.comparator1(features)))

    def forward(self, tokens1, tokens2, segments1, segments2, attentions1, attentions2, starts1, starts2, ends1, ends2, masks1, masks2):
        """Return (batch, 2) class probabilities for a batch of sentence pairs."""
        embeds1 = self.embed_data(tokens1, segments1, attentions1)
        embeds2 = self.embed_data(tokens2, segments2, attentions2)

        # Index 0 of the encoder output holds the per-token embeddings.
        pools1 = self.pool_phrase_chunks(embeds1[0], embeds2[0], starts1, ends1, masks1)
        pools2 = self.pool_phrase_chunks(embeds2[0], embeds1[0], starts2, ends2, masks2)

        sentences1 = self.pool_embeds(embeds1[0], embeds2[0], attentions1)
        sentences2 = self.pool_embeds(embeds2[0], embeds1[0], attentions2)

        compare1 = self.compare_sentences(sentences1, sentences2)
        compare2 = self.compare_chunk_pools(pools1, pools2)

        return self.compute_siamese(compare1, compare2)

### Cross_Attn_Model

In [0]:
class Model(torch.nn.Module):
    """Sentence-pair classifier using cross-attention (Cross_Attn variant).

    Each sentence is encoded by the wrapped transformer. Sentence-level and
    phrase-chunk-level pools are then built with multi-head attention whose
    queries come from the *other* sentence, and the resulting features are
    compared and classified into 2 classes.

    Layer widths: ``sentence_comparator`` takes 1538 = 2*768 + 2 features
    (two sentence pools, one distance, one cosine); ``phrase_comparator1``
    takes 15360 = 2 * NUM_CHUNKS * 768; ``comparator1`` takes 2048 = 1024 + 1024.

    NOTE(review): the attention calls pass no ``key_padding_mask``, so padded
    positions take part in attention. ``comparator3`` ends in a Softmax, so
    ``forward`` returns probabilities, not logits.
    """

    # Number of phrase-chunk slots pooled per sentence.
    NUM_CHUNKS = 10

    def __init__(self, model_class, pretrained_weights):
        """Load the pretrained encoder and create the comparison heads.

        Args:
            model_class: class exposing ``from_pretrained``.
            pretrained_weights: identifier passed to ``from_pretrained``.
        """
        super(Model, self).__init__()
        self.model = model_class.from_pretrained(pretrained_weights)

        self.sentence_comparator = torch.nn.Sequential(torch.nn.Linear(1538, 1024), torch.nn.GELU(), torch.nn.LayerNorm(1024), torch.nn.Dropout(p=0.2))
        self.phrase_comparator1 = torch.nn.Sequential(torch.nn.Linear(15360, 4096), torch.nn.GELU(), torch.nn.Dropout(p=0.2))
        self.phrase_comparator2 = torch.nn.Sequential(torch.nn.Linear(4096, 1024), torch.nn.GELU(), torch.nn.LayerNorm(1024))
        self.comparator1 = torch.nn.Sequential(torch.nn.Linear(2048, 512), torch.nn.GELU(), torch.nn.LayerNorm(512))
        self.comparator2 = torch.nn.Sequential(torch.nn.Linear(512, 128))
        self.comparator3 = torch.nn.Sequential(torch.nn.Linear(128, 2), torch.nn.Softmax(dim=1))
        self.cos_fn = torch.nn.CosineSimilarity(dim=1)
        self.dist_fn = torch.nn.PairwiseDistance(keepdim=True)
        # Projections + attention used for phrase-chunk pooling.
        self.q_linear = torch.nn.Linear(768, 768)
        self.k_linear = torch.nn.Linear(768, 768)
        self.v_linear = torch.nn.Linear(768, 768)
        self.attention_head = torch.nn.MultiheadAttention(768, 8)
        # Projections + attention used for sentence pooling.
        self.q_linear2 = torch.nn.Linear(768, 768)
        self.k_linear2 = torch.nn.Linear(768, 768)
        self.v_linear2 = torch.nn.Linear(768, 768)
        self.attention_head2 = torch.nn.MultiheadAttention(768, 8)

    def embed_data(self, tokens, segments, attentions):
        """Run the wrapped encoder; ``segments`` is unused in the DistilBERT path."""
        # For BERT use instead:
        # outputs = self.model(input_ids = tokens, token_type_ids = segments)

        # DistilBERT
        outputs = self.model(input_ids = tokens, attention_mask=attentions)
        return outputs

    def pool_embeds(self, embeds, other_embeds, attentions):
        """Cross-attend (queries from ``other_embeds``) and mask-average over dim 1.

        The attention output has the sequence length of ``other_embeds`` while
        the mask is expanded to ``embeds.size()``, so both sentences must be
        padded to the same length.
        """
        # MultiheadAttention is used in its default (seq, batch, dim) layout.
        q = self.q_linear2(other_embeds).transpose(0, 1)
        k = self.k_linear2(embeds).transpose(0, 1)
        v = self.v_linear2(embeds).transpose(0, 1)

        new_embeds, _ = self.attention_head2(q, k, v)
        new_embeds = new_embeds.transpose(0, 1)

        masks = attentions.unsqueeze(-1).expand(embeds.size()).float()
        sum_embeds = torch.sum(new_embeds * masks, 1)
        # clamp: an all-zero attention mask would otherwise give 0/0 = NaN.
        normal_factor = torch.sum(masks, 1).clamp(min=1)
        return sum_embeds / normal_factor

    def pool_phrase_chunks(self, embeds, other_embeds, starts, ends, masks):
        """Pool ``NUM_CHUNKS`` token spans per sentence into one flat vector.

        For chunk ``j`` of sample ``i`` the span is ``starts[i][j]:ends[i][j]``.
        Values are the projected span-masked embeddings, keys the projected
        embeddings, queries the projected ``other_embeds``. The attention
        output is averaged over the span positions and multiplied by
        ``masks[i][j]``.

        Returns:
            Tensor of shape ``(batch, NUM_CHUNKS * hidden)`` on ``embeds.device``.
        """
        batch_size, seq_len, hidden = embeds.shape
        pools = torch.zeros((batch_size, self.NUM_CHUNKS * hidden), device=embeds.device)

        # q and k do not depend on the chunk index, so project them once.
        q = self.q_linear(other_embeds).transpose(0, 1)
        k = self.k_linear(embeds).transpose(0, 1)

        for j in range(self.NUM_CHUNKS):
            filters = torch.zeros((batch_size, seq_len), device=embeds.device)
            for i in range(batch_size):
                filters[i, starts[i][j]:ends[i][j]] = 1
            filters = filters.unsqueeze(-1).expand(embeds.size()).float()

            v = self.v_linear(embeds * filters).transpose(0, 1)
            new_embeds, _ = self.attention_head(q, k, v)
            new_embeds = new_embeds.transpose(0, 1)

            sum_embeds = torch.sum(new_embeds * filters, 1)
            # clamp: an empty span (start == end) would give 0/0 = NaN, and
            # NaN * 0 stays NaN, so the mask below could not hide it.
            normal_factor = torch.sum(filters, 1).clamp(min=1)
            pool = sum_embeds / normal_factor

            pools[:, hidden * j:hidden * (j + 1)] = pool * masks[:, j].unsqueeze(-1)

        return pools

    def compare_chunk_pools(self, pools1, pools2):
        """Concatenate both chunk pools (dim 1) and apply the phrase comparators."""
        pools = torch.cat((pools1, pools2), 1)
        comp1 = self.phrase_comparator1(pools)
        return self.phrase_comparator2(comp1)

    def compare_sentences(self, sentence1, sentence2):
        """Concatenate both sentence pools, their distance and cosine; apply sentence_comparator."""
        sentences = torch.cat((sentence1, sentence2), 1)
        dist_euc = self.dist_fn(sentence1, sentence2)
        sentences_fin = torch.cat((sentences, dist_euc), 1)
        dist_cos = self.cos_fn(sentence1, sentence2).unsqueeze(-1)
        sentences_fin2 = torch.cat((sentences_fin, dist_cos), 1)
        return self.sentence_comparator(sentences_fin2)

    def compute_siamese(self, feature_up, feature_down):
        """Unused by ``forward``.

        NOTE(review): ``self.comparator`` is not created in ``__init__``, so
        calling this raises AttributeError as written.
        """
        features = torch.cat((feature_up, feature_down), 1)
        return self.comparator(features)

    def forward(self, tokens1, tokens2, segments1, segments2, attentions1, attentions2, starts1, starts2, ends1, ends2, masks1, masks2):
        """Return the ``(batch, 2)`` softmax output for a batch of sentence pairs.

        Only element ``[0]`` of each encoder output is used (presumably the
        token-level hidden states; confirm against the wrapped model).
        """
        embeds1 = self.embed_data(tokens1, segments1, attentions1)
        embeds2 = self.embed_data(tokens2, segments2, attentions2)

        pools1 = self.pool_phrase_chunks(embeds1[0], embeds2[0], starts1, ends1, masks1)
        pools2 = self.pool_phrase_chunks(embeds2[0], embeds1[0], starts2, ends2, masks2)

        sentences1 = self.pool_embeds(embeds1[0], embeds2[0], attentions1)
        sentences2 = self.pool_embeds(embeds2[0], embeds1[0], attentions2)

        compare1 = self.compare_sentences(sentences1, sentences2)
        compare2 = self.compare_chunk_pools(pools1, pools2)

        conc = torch.cat((compare1, compare2), 1)
        res1 = self.comparator1(conc)
        res2 = self.comparator2(res1)

        return self.comparator3(res2)

### Self_Attn_Model

In [0]:
class Model(torch.nn.Module):
    """Sentence-pair classifier using self-attention (Self_Attn variant).

    Each sentence is encoded by the wrapped transformer. Sentence-level and
    phrase-chunk-level pools are built with multi-head attention whose
    queries, keys and values all come from the same sentence; the
    ``other_embeds`` arguments are accepted but not used. The resulting
    features are compared and classified into 2 classes.

    Layer widths: ``sentence_comparator`` takes 1538 = 2*768 + 2 features;
    ``phrase_comparator1`` takes 15360 = 2 * NUM_CHUNKS * 768;
    ``comparator1`` takes 2048 = 1024 + 1024.

    NOTE(review): the attention calls pass no ``key_padding_mask``, so padded
    positions take part in attention. ``comparator3`` ends in a Softmax, so
    ``forward`` returns probabilities, not logits.
    """

    # Number of phrase-chunk slots pooled per sentence.
    NUM_CHUNKS = 10

    def __init__(self, model_class, pretrained_weights):
        """Load the pretrained encoder and create the comparison heads.

        Args:
            model_class: class exposing ``from_pretrained``.
            pretrained_weights: identifier passed to ``from_pretrained``.
        """
        super(Model, self).__init__()
        self.model = model_class.from_pretrained(pretrained_weights)

        self.sentence_comparator = torch.nn.Sequential(torch.nn.Linear(1538, 1024), torch.nn.GELU(), torch.nn.LayerNorm(1024), torch.nn.Dropout(p=0.2))
        self.phrase_comparator1 = torch.nn.Sequential(torch.nn.Linear(15360, 4096), torch.nn.GELU(), torch.nn.Dropout(p=0.2))
        self.phrase_comparator2 = torch.nn.Sequential(torch.nn.Linear(4096, 1024), torch.nn.GELU(), torch.nn.LayerNorm(1024))
        self.comparator1 = torch.nn.Sequential(torch.nn.Linear(2048, 512), torch.nn.GELU(), torch.nn.LayerNorm(512))
        self.comparator2 = torch.nn.Sequential(torch.nn.Linear(512, 128))
        self.comparator3 = torch.nn.Sequential(torch.nn.Linear(128, 2), torch.nn.Softmax(dim=1))
        self.cos_fn = torch.nn.CosineSimilarity(dim=1)
        self.dist_fn = torch.nn.PairwiseDistance(keepdim=True)
        # Projections + attention used for phrase-chunk pooling.
        self.q_linear = torch.nn.Linear(768, 768)
        self.k_linear = torch.nn.Linear(768, 768)
        self.v_linear = torch.nn.Linear(768, 768)
        self.attention_head = torch.nn.MultiheadAttention(768, 8)
        # Projections + attention used for sentence pooling.
        self.q_linear2 = torch.nn.Linear(768, 768)
        self.k_linear2 = torch.nn.Linear(768, 768)
        self.v_linear2 = torch.nn.Linear(768, 768)
        self.attention_head2 = torch.nn.MultiheadAttention(768, 8)

    def embed_data(self, tokens, segments, attentions):
        """Run the wrapped encoder; ``segments`` is unused in the DistilBERT path."""
        # For BERT use instead:
        # outputs = self.model(input_ids = tokens, token_type_ids = segments)

        # DistilBERT
        outputs = self.model(input_ids = tokens, attention_mask=attentions)
        return outputs

    def pool_embeds(self, embeds, other_embeds, attentions):
        """Self-attend over ``embeds`` and mask-average over dim 1.

        ``other_embeds`` is unused; it is kept so all model variants share
        one call signature.
        """
        # MultiheadAttention is used in its default (seq, batch, dim) layout.
        q = self.q_linear2(embeds).transpose(0, 1)
        k = self.k_linear2(embeds).transpose(0, 1)
        v = self.v_linear2(embeds).transpose(0, 1)

        new_embeds, _ = self.attention_head2(q, k, v)
        new_embeds = new_embeds.transpose(0, 1)

        masks = attentions.unsqueeze(-1).expand(embeds.size()).float()
        sum_embeds = torch.sum(new_embeds * masks, 1)
        # clamp: an all-zero attention mask would otherwise give 0/0 = NaN.
        normal_factor = torch.sum(masks, 1).clamp(min=1)
        return sum_embeds / normal_factor

    def pool_phrase_chunks(self, embeds, other_embeds, starts, ends, masks):
        """Pool ``NUM_CHUNKS`` token spans per sentence into one flat vector.

        For chunk ``j`` of sample ``i`` the span is ``starts[i][j]:ends[i][j]``.
        Values are the projected span-masked embeddings; queries and keys are
        the projected full embeddings. The attention output is averaged over
        the span positions and multiplied by ``masks[i][j]``.
        ``other_embeds`` is unused.

        Returns:
            Tensor of shape ``(batch, NUM_CHUNKS * hidden)`` on ``embeds.device``.
        """
        batch_size, seq_len, hidden = embeds.shape
        pools = torch.zeros((batch_size, self.NUM_CHUNKS * hidden), device=embeds.device)

        # q and k do not depend on the chunk index, so project them once.
        q = self.q_linear(embeds).transpose(0, 1)
        k = self.k_linear(embeds).transpose(0, 1)

        for j in range(self.NUM_CHUNKS):
            filters = torch.zeros((batch_size, seq_len), device=embeds.device)
            for i in range(batch_size):
                filters[i, starts[i][j]:ends[i][j]] = 1
            filters = filters.unsqueeze(-1).expand(embeds.size()).float()

            v = self.v_linear(embeds * filters).transpose(0, 1)
            new_embeds, _ = self.attention_head(q, k, v)
            new_embeds = new_embeds.transpose(0, 1)

            sum_embeds = torch.sum(new_embeds * filters, 1)
            # clamp: an empty span (start == end) would give 0/0 = NaN, and
            # NaN * 0 stays NaN, so the mask below could not hide it.
            normal_factor = torch.sum(filters, 1).clamp(min=1)
            pool = sum_embeds / normal_factor

            pools[:, hidden * j:hidden * (j + 1)] = pool * masks[:, j].unsqueeze(-1)

        return pools

    def compare_chunk_pools(self, pools1, pools2):
        """Concatenate both chunk pools (dim 1) and apply the phrase comparators."""
        pools = torch.cat((pools1, pools2), 1)
        comp1 = self.phrase_comparator1(pools)
        return self.phrase_comparator2(comp1)

    def compare_sentences(self, sentence1, sentence2):
        """Concatenate both sentence pools, their distance and cosine; apply sentence_comparator."""
        sentences = torch.cat((sentence1, sentence2), 1)
        dist_euc = self.dist_fn(sentence1, sentence2)
        sentences_fin = torch.cat((sentences, dist_euc), 1)
        dist_cos = self.cos_fn(sentence1, sentence2).unsqueeze(-1)
        sentences_fin2 = torch.cat((sentences_fin, dist_cos), 1)
        return self.sentence_comparator(sentences_fin2)

    def compute_siamese(self, feature_up, feature_down):
        """Unused by ``forward``.

        NOTE(review): ``self.comparator`` is not created in ``__init__``, so
        calling this raises AttributeError as written.
        """
        features = torch.cat((feature_up, feature_down), 1)
        return self.comparator(features)

    def forward(self, tokens1, tokens2, segments1, segments2, attentions1, attentions2, starts1, starts2, ends1, ends2, masks1, masks2):
        """Return the ``(batch, 2)`` softmax output for a batch of sentence pairs.

        Only element ``[0]`` of each encoder output is used (presumably the
        token-level hidden states; confirm against the wrapped model).
        """
        embeds1 = self.embed_data(tokens1, segments1, attentions1)
        embeds2 = self.embed_data(tokens2, segments2, attentions2)

        pools1 = self.pool_phrase_chunks(embeds1[0], embeds2[0], starts1, ends1, masks1)
        pools2 = self.pool_phrase_chunks(embeds2[0], embeds1[0], starts2, ends2, masks2)

        sentences1 = self.pool_embeds(embeds1[0], embeds2[0], attentions1)
        sentences2 = self.pool_embeds(embeds2[0], embeds1[0], attentions2)

        compare1 = self.compare_sentences(sentences1, sentences2)
        compare2 = self.compare_chunk_pools(pools1, pools2)

        conc = torch.cat((compare1, compare2), 1)
        res1 = self.comparator1(conc)
        res2 = self.comparator2(res1)

        return self.comparator3(res2)

### Phrase_Only_Attn_Model

In [0]:
class Model(torch.nn.Module):
    """Sentence-pair classifier with attention on phrase chunks only (Phrase_Only_Attn variant).

    Each sentence is encoded by the wrapped transformer. Phrase-chunk pools
    are built with self-attention; sentence pools are a plain mask-weighted
    mean of the encoder output. The ``other_embeds`` arguments are accepted
    but not used. The resulting features are compared and classified into
    2 classes.

    Layer widths: ``sentence_comparator`` takes 1538 = 2*768 + 2 features;
    ``phrase_comparator1`` takes 15360 = 2 * NUM_CHUNKS * 768;
    ``comparator1`` takes 2048 = 1024 + 1024.

    NOTE(review): the attention call passes no ``key_padding_mask``, so padded
    positions take part in attention. ``comparator3`` ends in a Softmax, so
    ``forward`` returns probabilities, not logits.
    """

    # Number of phrase-chunk slots pooled per sentence.
    NUM_CHUNKS = 10

    def __init__(self, model_class, pretrained_weights):
        """Load the pretrained encoder and create the comparison heads.

        Args:
            model_class: class exposing ``from_pretrained``.
            pretrained_weights: identifier passed to ``from_pretrained``.
        """
        super(Model, self).__init__()
        self.model = model_class.from_pretrained(pretrained_weights)

        self.sentence_comparator = torch.nn.Sequential(torch.nn.Linear(1538, 1024), torch.nn.GELU(), torch.nn.LayerNorm(1024), torch.nn.Dropout(p=0.2))
        self.phrase_comparator1 = torch.nn.Sequential(torch.nn.Linear(15360, 4096), torch.nn.GELU(), torch.nn.Dropout(p=0.2))
        self.phrase_comparator2 = torch.nn.Sequential(torch.nn.Linear(4096, 1024), torch.nn.GELU(), torch.nn.LayerNorm(1024))
        self.comparator1 = torch.nn.Sequential(torch.nn.Linear(2048, 512), torch.nn.GELU(), torch.nn.LayerNorm(512))
        self.comparator2 = torch.nn.Sequential(torch.nn.Linear(512, 128))
        self.comparator3 = torch.nn.Sequential(torch.nn.Linear(128, 2), torch.nn.Softmax(dim=1))
        self.cos_fn = torch.nn.CosineSimilarity(dim=1)
        self.dist_fn = torch.nn.PairwiseDistance(keepdim=True)
        # Projections + attention used for phrase-chunk pooling.
        self.q_linear = torch.nn.Linear(768, 768)
        self.k_linear = torch.nn.Linear(768, 768)
        self.v_linear = torch.nn.Linear(768, 768)
        self.attention_head = torch.nn.MultiheadAttention(768, 8)

    def embed_data(self, tokens, segments, attentions):
        """Run the wrapped encoder; ``segments`` is unused in the DistilBERT path."""
        # For BERT use instead:
        # outputs = self.model(input_ids = tokens, token_type_ids = segments)

        # DistilBERT
        outputs = self.model(input_ids = tokens, attention_mask=attentions)
        return outputs

    def pool_embeds(self, embeds, other_embeds, attentions):
        """Mask-weighted mean of ``embeds`` over dim 1 (no attention in this variant).

        ``other_embeds`` is unused; it is kept so all model variants share
        one call signature.
        """
        masks = attentions.unsqueeze(-1).expand(embeds.size()).float()
        sum_embeds = torch.sum(embeds * masks, 1)
        # clamp: an all-zero attention mask would otherwise give 0/0 = NaN.
        normal_factor = torch.sum(masks, 1).clamp(min=1)
        return sum_embeds / normal_factor

    def pool_phrase_chunks(self, embeds, other_embeds, starts, ends, masks):
        """Pool ``NUM_CHUNKS`` token spans per sentence into one flat vector.

        For chunk ``j`` of sample ``i`` the span is ``starts[i][j]:ends[i][j]``.
        Values are the projected span-masked embeddings; queries and keys are
        the projected full embeddings. The attention output is averaged over
        the span positions and multiplied by ``masks[i][j]``.
        ``other_embeds`` is unused.

        Returns:
            Tensor of shape ``(batch, NUM_CHUNKS * hidden)`` on ``embeds.device``.
        """
        batch_size, seq_len, hidden = embeds.shape
        pools = torch.zeros((batch_size, self.NUM_CHUNKS * hidden), device=embeds.device)

        # q and k do not depend on the chunk index, so project them once.
        q = self.q_linear(embeds).transpose(0, 1)
        k = self.k_linear(embeds).transpose(0, 1)

        for j in range(self.NUM_CHUNKS):
            filters = torch.zeros((batch_size, seq_len), device=embeds.device)
            for i in range(batch_size):
                filters[i, starts[i][j]:ends[i][j]] = 1
            filters = filters.unsqueeze(-1).expand(embeds.size()).float()

            v = self.v_linear(embeds * filters).transpose(0, 1)
            new_embeds, _ = self.attention_head(q, k, v)
            new_embeds = new_embeds.transpose(0, 1)

            sum_embeds = torch.sum(new_embeds * filters, 1)
            # clamp: an empty span (start == end) would give 0/0 = NaN, and
            # NaN * 0 stays NaN, so the mask below could not hide it.
            normal_factor = torch.sum(filters, 1).clamp(min=1)
            pool = sum_embeds / normal_factor

            pools[:, hidden * j:hidden * (j + 1)] = pool * masks[:, j].unsqueeze(-1)

        return pools

    def compare_chunk_pools(self, pools1, pools2):
        """Concatenate both chunk pools (dim 1) and apply the phrase comparators."""
        pools = torch.cat((pools1, pools2), 1)
        comp1 = self.phrase_comparator1(pools)
        return self.phrase_comparator2(comp1)

    def compare_sentences(self, sentence1, sentence2):
        """Concatenate both sentence pools, their distance and cosine; apply sentence_comparator."""
        sentences = torch.cat((sentence1, sentence2), 1)
        dist_euc = self.dist_fn(sentence1, sentence2)
        sentences_fin = torch.cat((sentences, dist_euc), 1)
        dist_cos = self.cos_fn(sentence1, sentence2).unsqueeze(-1)
        sentences_fin2 = torch.cat((sentences_fin, dist_cos), 1)
        return self.sentence_comparator(sentences_fin2)

    def compute_siamese(self, feature_up, feature_down):
        """Unused by ``forward``.

        NOTE(review): ``self.comparator`` is not created in ``__init__``, so
        calling this raises AttributeError as written.
        """
        features = torch.cat((feature_up, feature_down), 1)
        return self.comparator(features)

    def forward(self, tokens1, tokens2, segments1, segments2, attentions1, attentions2, starts1, starts2, ends1, ends2, masks1, masks2):
        """Return the ``(batch, 2)`` softmax output for a batch of sentence pairs.

        Only element ``[0]`` of each encoder output is used (presumably the
        token-level hidden states; confirm against the wrapped model).
        """
        embeds1 = self.embed_data(tokens1, segments1, attentions1)
        embeds2 = self.embed_data(tokens2, segments2, attentions2)

        pools1 = self.pool_phrase_chunks(embeds1[0], embeds2[0], starts1, ends1, masks1)
        pools2 = self.pool_phrase_chunks(embeds2[0], embeds1[0], starts2, ends2, masks2)

        sentences1 = self.pool_embeds(embeds1[0], embeds2[0], attentions1)
        sentences2 = self.pool_embeds(embeds2[0], embeds1[0], attentions2)

        compare1 = self.compare_sentences(sentences1, sentences2)
        compare2 = self.compare_chunk_pools(pools1, pools2)

        conc = torch.cat((compare1, compare2), 1)
        res1 = self.comparator1(conc)
        res2 = self.comparator2(res1)

        return self.comparator3(res2)

### No_Attn_Model

In [0]:
class Model(torch.nn.Module):
    """Sentence-pair classifier without extra attention (No_Attn variant).

    Each sentence is encoded by the wrapped transformer. Sentence pools are
    a mask-weighted mean of the encoder output and phrase-chunk pools are
    the mean over each chunk's token span. The ``other_embeds`` arguments are
    accepted but not used. The resulting features are compared and
    classified into 2 classes.

    Layer widths: ``sentence_comparator`` takes 1538 = 2*768 + 2 features;
    ``phrase_comparator1`` takes 15360 = 2 * NUM_CHUNKS * 768;
    ``comparator1`` takes 2048 = 1024 + 1024.

    NOTE(review): ``comparator3`` ends in a Softmax, so ``forward`` returns
    probabilities, not logits.
    """

    # Number of phrase-chunk slots pooled per sentence.
    NUM_CHUNKS = 10

    def __init__(self, model_class, pretrained_weights):
        """Load the pretrained encoder and create the comparison heads.

        Args:
            model_class: class exposing ``from_pretrained``.
            pretrained_weights: identifier passed to ``from_pretrained``.
        """
        super(Model, self).__init__()
        self.model = model_class.from_pretrained(pretrained_weights)

        self.sentence_comparator = torch.nn.Sequential(torch.nn.Linear(1538, 1024), torch.nn.GELU(), torch.nn.LayerNorm(1024), torch.nn.Dropout(p=0.2))
        self.phrase_comparator1 = torch.nn.Sequential(torch.nn.Linear(15360, 4096), torch.nn.GELU(), torch.nn.Dropout(p=0.2))
        self.phrase_comparator2 = torch.nn.Sequential(torch.nn.Linear(4096, 1024), torch.nn.GELU(), torch.nn.LayerNorm(1024))
        self.comparator1 = torch.nn.Sequential(torch.nn.Linear(2048, 512), torch.nn.GELU(), torch.nn.LayerNorm(512))
        self.comparator2 = torch.nn.Sequential(torch.nn.Linear(512, 128))
        self.comparator3 = torch.nn.Sequential(torch.nn.Linear(128, 2), torch.nn.Softmax(dim=1))
        self.cos_fn = torch.nn.CosineSimilarity(dim=1)
        self.dist_fn = torch.nn.PairwiseDistance(keepdim=True)

    def embed_data(self, tokens, segments, attentions):
        """Run the wrapped encoder; ``segments`` is unused in the DistilBERT path."""
        # For BERT use instead:
        # outputs = self.model(input_ids = tokens, token_type_ids = segments)

        # DistilBERT
        outputs = self.model(input_ids = tokens, attention_mask=attentions)
        return outputs

    def pool_embeds(self, embeds, other_embeds, attentions):
        """Mask-weighted mean of ``embeds`` over dim 1.

        ``other_embeds`` is unused; it is kept so all model variants share
        one call signature.
        """
        masks = attentions.unsqueeze(-1).expand(embeds.size()).float()
        sum_embeds = torch.sum(embeds * masks, 1)
        # clamp: an all-zero attention mask would otherwise give 0/0 = NaN.
        normal_factor = torch.sum(masks, 1).clamp(min=1)
        return sum_embeds / normal_factor

    def pool_phrase_chunks(self, embeds, other_embeds, starts, ends, masks):
        """Average the embeddings of ``NUM_CHUNKS`` token spans per sentence.

        For chunk ``j`` of sample ``i`` the span is ``starts[i][j]:ends[i][j]``;
        its mean embedding is multiplied by ``masks[i][j]`` and written to
        columns ``hidden*j : hidden*(j+1)`` of the result.
        ``other_embeds`` is unused.

        Returns:
            Tensor of shape ``(batch, NUM_CHUNKS * hidden)`` on ``embeds.device``.
        """
        batch_size, seq_len, hidden = embeds.shape
        pools = torch.zeros((batch_size, self.NUM_CHUNKS * hidden), device=embeds.device)

        for j in range(self.NUM_CHUNKS):
            filters = torch.zeros((batch_size, seq_len), device=embeds.device)
            for i in range(batch_size):
                filters[i, starts[i][j]:ends[i][j]] = 1
            filters = filters.unsqueeze(-1).expand(embeds.size()).float()

            # filters is 0/1, so a single multiply selects the span.
            sum_embeds = torch.sum(embeds * filters, 1)
            # clamp: an empty span (start == end) would give 0/0 = NaN, and
            # NaN * 0 stays NaN, so the mask below could not hide it.
            normal_factor = torch.sum(filters, 1).clamp(min=1)
            pool = sum_embeds / normal_factor

            pools[:, hidden * j:hidden * (j + 1)] = pool * masks[:, j].unsqueeze(-1)

        return pools

    def compare_chunk_pools(self, pools1, pools2):
        """Concatenate both chunk pools (dim 1) and apply the phrase comparators."""
        pools = torch.cat((pools1, pools2), 1)
        comp1 = self.phrase_comparator1(pools)
        return self.phrase_comparator2(comp1)

    def compare_sentences(self, sentence1, sentence2):
        """Concatenate both sentence pools, their distance and cosine; apply sentence_comparator."""
        sentences = torch.cat((sentence1, sentence2), 1)
        dist_euc = self.dist_fn(sentence1, sentence2)
        sentences_fin = torch.cat((sentences, dist_euc), 1)
        dist_cos = self.cos_fn(sentence1, sentence2).unsqueeze(-1)
        sentences_fin2 = torch.cat((sentences_fin, dist_cos), 1)
        return self.sentence_comparator(sentences_fin2)

    def compute_siamese(self, feature_up, feature_down):
        """Unused by ``forward``.

        NOTE(review): ``self.comparator`` is not created in ``__init__``, so
        calling this raises AttributeError as written.
        """
        features = torch.cat((feature_up, feature_down), 1)
        return self.comparator(features)

    def forward(self, tokens1, tokens2, segments1, segments2, attentions1, attentions2, starts1, starts2, ends1, ends2, masks1, masks2):
        """Return the ``(batch, 2)`` softmax output for a batch of sentence pairs.

        Only element ``[0]`` of each encoder output is used (presumably the
        token-level hidden states; confirm against the wrapped model).
        """
        embeds1 = self.embed_data(tokens1, segments1, attentions1)
        embeds2 = self.embed_data(tokens2, segments2, attentions2)

        pools1 = self.pool_phrase_chunks(embeds1[0], embeds2[0], starts1, ends1, masks1)
        pools2 = self.pool_phrase_chunks(embeds2[0], embeds1[0], starts2, ends2, masks2)

        sentences1 = self.pool_embeds(embeds1[0], embeds2[0], attentions1)
        sentences2 = self.pool_embeds(embeds2[0], embeds1[0], attentions2)

        compare1 = self.compare_sentences(sentences1, sentences2)
        compare2 = self.compare_chunk_pools(pools1, pools2)

        conc = torch.cat((compare1, compare2), 1)
        res1 = self.comparator1(conc)
        res2 = self.comparator2(res1)

        return self.comparator3(res2)

### Model Util

In [0]:
class Model_util():
    """Training / evaluation / checkpoint wrapper around ``Model``.

    Batches are indexable sequences with this layout (by index):
    0 tokens1, 1 segments1, 2 attentions1, 3 tokens2, 4 segments2,
    5 attentions2, 6 starts1, 7 ends1, 8 masks1, 9 starts2, 10 ends2,
    11 masks2, 12 labels.

    NOTE(review): ``loss_func`` is ``CrossEntropyLoss``, which applies
    log-softmax itself, while the ``Model`` variants in this notebook already
    end in ``Softmax``. The loss therefore sees probabilities rather than
    logits. Left unchanged so existing checkpoints keep their training
    objective; confirm whether this is intended.
    """

    # initalize model
    def __init__(self, model_class, pretrained_weights, pretrained_path=None, no_iterations=10):
        """Build the model, optionally restore a state dict, and set hyper-parameters.

        Args:
            model_class: forwarded to ``Model``.
            pretrained_weights: forwarded to ``Model``.
            pretrained_path: optional path of a saved ``state_dict`` to load.
            no_iterations: number of passes over the training data in ``train``.
        """
        # Use the GPU when present, otherwise fall back to CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.model = Model(model_class, pretrained_weights)
        if pretrained_path is not None:
            # map_location lets a GPU-saved checkpoint load on a CPU-only host.
            self.model.load_state_dict(torch.load(pretrained_path, map_location=self.device))

        self.model = self.model.to(self.device)
        self.loss_func = torch.nn.CrossEntropyLoss()

        self.TRAINING_ITERATIONS = no_iterations
        self.WARMUP = 1000
        self.BATCH_SIZE = 32
        self.REPORT_FREQUENCY = 100
        self.CHKPT_FREQUENCY = 1000

    def _model_inputs(self, batch):
        """Move the 12 model inputs of ``batch`` to ``self.device`` in ``forward`` argument order."""
        # forward(tokens1, tokens2, segments1, segments2, attentions1, attentions2,
        #         starts1, starts2, ends1, ends2, masks1, masks2)
        order = (0, 3, 1, 4, 2, 5, 6, 9, 7, 10, 8, 11)
        return tuple(batch[idx].to(self.device) for idx in order)

    def train_batch(self, train_data):
        """Run one optimisation step on a batch and return its loss tensor.

        Requires ``self.optimizer`` to exist, which ``train`` creates.
        """
        labels = train_data[12].to(self.device)

        self.model.train()
        outputs = self.model(*self._model_inputs(train_data))
        loss = self.loss_func(outputs, labels)
        loss.backward()
        self.optimizer.step()
        # self.scheduler.step()
        self.model.zero_grad()
        return loss

    # train classifier
    def train(self, data, validation_set):
        """Train for ``TRAINING_ITERATIONS`` epochs.

        After every epoch the validation accuracy is printed, a checkpoint is
        written to the ``CHKPT_`` directory, and the StepLR scheduler
        (step size 1, gamma 0.9) is advanced.
        """
        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=1e-5, eps=1e-8)
        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, 1, 0.9)
        train_loader = torch.utils.data.DataLoader(data, batch_size=self.BATCH_SIZE, sampler=torch.utils.data.RandomSampler(data))
        dev_loader = torch.utils.data.DataLoader(validation_set, batch_size=self.BATCH_SIZE, sampler=torch.utils.data.RandomSampler(validation_set))

        for i in range(1, self.TRAINING_ITERATIONS+1):
            count = 0
            for batch in train_loader:
                count += 1
                loss = self.train_batch(batch)
                if count%self.REPORT_FREQUENCY == 0:
                    print("%d: Loss - %s" %(count, loss))
            acc = self.test(dev_loader)
            # NOTE: the prefix printed here is the batch count of the epoch, not the epoch index.
            print("%d: Validation - %s" %(count, str(acc)))
            self.save(directory='CHKPT_')
            print("%d Saved" %(count))
            self.scheduler.step()
        return

    # save model
    def save(self, directory=None):
        """Write the model ``state_dict`` to ``<Colab root>/<directory>/1.pt``.

        With ``directory=None`` the file goes to the ``model`` directory. The
        target directory is created if it does not exist.
        """
        if directory is not None:
            path = '/content/drive/My Drive/Colab/' + directory
        else:
            path = '/content/drive/My Drive/Colab/model'
        os.makedirs(path, exist_ok=True)
        torch.save(self.model.state_dict(), path + '/1.pt')

    # predict values
    def test(self, data, is_print=False, is_loader=True, print_file_name=None, sentence_base='sentence', print_file=None):
        """Return the accuracy of the model on ``data``.

        Args:
            data: an iterable of batches when ``is_loader`` is true, otherwise
                a dataset that is wrapped in a sequential ``DataLoader``.
            is_print: when true, every misclassified pair is written to
                ``print_file``. Sentences are looked up by running sample
                index in the table at ``print_file_name``, so ``data`` must be
                iterated in the same order as that table.
            is_loader: see ``data``.
            print_file_name: TSV file holding the sentence columns.
            sentence_base: column-name prefix; ``'1'`` and ``'2'`` are appended.
            print_file: writable text file object for the error log.

        Returns:
            ``correct / total`` as a float, or ``0.0`` when ``data`` is empty.

        Raises:
            ValueError: if ``is_print`` is true but ``print_file_name`` or
                ``print_file`` is missing.
        """
        if is_loader:
            dev_loader = data
        else:
            dev_loader = torch.utils.data.DataLoader(data, batch_size=self.BATCH_SIZE, sampler=torch.utils.data.SequentialSampler(data))

        test_samples = None
        if is_print:
            # Fail early instead of hitting a NameError/AttributeError on the
            # first misclassified sample.
            if print_file_name is None or print_file is None:
                raise ValueError("is_print=True requires both print_file_name and print_file")
            test_samples = pd.read_table(print_file_name,header=0)

        self.model.eval()
        correct = 0
        total = 0
        # Evaluation needs no gradients; skipping autograd saves memory and time.
        with torch.no_grad():
            for batch in dev_loader:
                labels = batch[12].tolist()
                outputs = self.model(*self._model_inputs(batch))
                prediction = torch.argmax(outputs.cpu(), dim=1).tolist()
                for label, predicted in zip(labels, prediction):
                    if label == predicted:
                        correct += 1
                    elif is_print:
                        print_file.write("Sentence 1: %s \n Sentence 2: %s \n Label (%d) vs Prediction (%d) \n \n ----- \n \n" %(test_samples.iloc[total][sentence_base+'1'], test_samples.iloc[total][sentence_base+'2'], label, predicted))
                    total += 1
        if total == 0:
            return 0.0
        return correct/total

### Train Model

In [16]:
# Resume from the saved cross-attention checkpoint and fine-tune for 10 epochs.
classifier = Model_util(
    MODEL_CLASS,
    PRETRAINED,
    pretrained_path='/content/drive/My Drive/Colab/Custom_Trained/Attn_Both_Cross_+.pt',
    no_iterations=10,
)
classifier.train(train_dataset, dev_dataset)
# Alternative data: classifier.train(CustomDataset(paws_train_set), CustomDataset(paws_qqp_dev_set))

KeyboardInterrupt: ignored

### Test Model

In [16]:
# Evaluate the saved checkpoint on the PAWS-QQP dev split and on the full dev set.
classifier = Model_util(MODEL_CLASS, PRETRAINED, pretrained_path='/content/drive/My Drive/Colab/Custom_Trained/Attn_Both_Cross_+.pt')
# classifier = Model_util(MODEL_CLASS, PRETRAINED, pretrained_path='/content/drive/My Drive/Colab/CHKPT_/1.pt')
test_data = CustomDataset(paws_qqp_dev_set)
# acc = classifier.test(test_data, is_loader=False)
# The context manager closes (and flushes) the error log even if evaluation raises.
with open('/content/drive/My Drive/Colab/temp.txt', 'w') as error_log:
    acc = classifier.test(test_data, is_loader=False, print_file_name='/content/drive/My Drive/Colab/data_PAWS_qqp/dev.tsv', print_file=error_log, is_print=True)
print("HERE1")
print("PAWS_QQP test set: %lf" %(acc))
print("HERE2")
acc = classifier.test(dev_dataset, is_loader=False)
print("Full test set: %lf" %(acc))



HERE1
PAWS_QQP test set: 0.577548
HERE2
Full test set: 0.872613
