In [13]:
import pandas as pd
import numpy as np
# LOAD SST2 DATASET
DATA_SST_TRAIN = pd.read_csv('./assets/datasets/SST2/train.tsv', sep='\t')
DATA_SST_TEST = pd.read_csv('./assets/datasets/SST2/test.tsv', sep='\t')
DATA_SST_DEV = pd.read_csv('./assets/datasets/SST2/dev.tsv', sep='\t')

# LOAD QUORA QUESTION PAIRS
columns = ['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate']
types_dict = {'id': int, 'qid1': int, 'qid2': int , 'question1': str, 'question2': str, 'is_duplicate': int}
DATA_QQP_ALL = pd.read_csv('./assets/datasets/QQP/QQP_dataset_all.tsv', sep='\t', error_bad_lines=True, dtype=types_dict)
DATA_QQP_ALL['is_duplicate'] = DATA_QQP_ALL['is_duplicate'].fillna(value=0)
DATA_QQP_ALL = DATA_QQP_ALL.fillna(value='sentence missing')

In [18]:
random_signal = np.random.rand(len(DATA_QQP_ALL))
mask_train = random_signal < 0.9
mask_test = (random_signal > 0.9) & (random_signal < 0.997)
mask_dev = random_signal > 0.997

train = DATA_QQP_ALL[mask_train]
test = DATA_QQP_ALL[mask_test]
dev = DATA_QQP_ALL[mask_dev]
print(len(train), len(test), len(dev))

364031 39122 1137


In [24]:
#train.to_csv('./assets/datasets/QQP/train.tsv', sep='\t')
#test.to_csv('./assets/datasets/QQP/test.tsv', sep='\t')
#dev.to_csv('./assets/datasets/QQP/dev.tsv', sep='\t')

In [25]:
columns = ['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate']
types_dict = {'id': int, 'qid1': int, 'qid2': int , 
              'question1': str, 'question2': str, 'is_duplicate': int}
DATA_QQP_TRAIN = pd.read_csv('./assets/datasets/QQP/train.tsv', sep='\t', dtype=types_dict)
DATA_QQP_TEST = pd.read_csv('./assets/datasets/QQP/test.tsv', sep='\t', dtype=types_dict)
DATA_QQP_DEV = pd.read_csv('./assets/datasets/QQP/dev.tsv', sep='\t', dtype=types_dict)


In [26]:
DATA_QQP_TRAIN.iloc[0:10]

Unnamed: 0.1,Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
5,5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1
6,6,6,13,14,Should I buy tiago?,What keeps childern active and far from phone ...,0
7,7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1
8,8,8,17,18,When do you use シ instead of し?,"When do you use ""&"" instead of ""and""?",0
9,9,9,19,20,Motorola (company): Can I hack my Charter Moto...,How do I hack Motorola DCX3400 for free internet?,0


In [2]:
import torch
import argparse
from transformers import BertModel, BertTokenizer

# Custom imports
from model.utils import *
from model.data_utils import *
from model.transformer import Transformer
from model.bracketing import IdentityChunker, NNSimilarityChunker, cos
from model.generators import IdentityGenerator, EmbeddingGenerator
from model.classifiers import AttentionClassifier, SeqPairAttentionClassifier, NaivePoolingClassifier, SeqPairFancyClassifier
from model.model import MultiTaskNet, End2EndModel

In [3]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Device being used: {device}")


########### LOAD MODELS AND OPTIMIZER ###########
transformer_net = Transformer(model_class=BertModel,
                              tokenizer_class=BertTokenizer,
                              pre_trained_weights='bert-base-uncased',
                              device=device)

bracketing_net = NNSimilarityChunker(sim_function=cos,
                                     threshold=0.7,
                                     exclude_special_tokens=False,
                                     combinatorics='sequential',
                                     device=device)

generator_net = EmbeddingGenerator(pool_function=abs_max_pooling, 
                                   device=device)

seq_classifier = AttentionClassifier(embedding_dim=768,
                                     sentset_size=2,
                                     dropout=0.3,
                                     n_sentiments=2,
                                     pool_mode='concat',
                                     device=device).to(device)

seq_pair_classifier = SeqPairFancyClassifier(embedding_dim=768,
                                             num_classes=2,
                                             dropout=0.3,
                                             n_attention_vecs=2,
                                             device=device)#.to(device)

naive_classifier = NaivePoolingClassifier(embedding_dim=768, 
                                          num_classes=2, 
                                          dropout=0., 
                                          pool_mode='max_pooling', 
                                          device=device).to(device)

multitask_net = MultiTaskNet(seq_classifier,
                             seq_pair_classifier,
                             device=device).to(device)

bracketing_net = IdentityChunker().to(device)
generator_net = IdentityGenerator().to(device)

model = End2EndModel(transformer=transformer_net,
                     bracketer=bracketing_net,
                     generator=generator_net,
                     multitasknet=multitask_net,
                     device=device).to(device)

Device being used: cpu


In [7]:
import math
import gc
import time
from torch.utils.tensorboard import SummaryWriter
from model.data_utils import get_batch_SST2_from_indices, get_batch_QQP_from_indices

DATA_QQP_TRAIN = DATA_QQP_ALL.iloc[0:300000]
DATA_QQP_DEV = DATA_QQP_ALL.iloc[300000:300100] # 1000 samples to evaluate the performance
torch.manual_seed(0)

# Writer will output to ./runs/ directory by default
writer = SummaryWriter(log_dir='./tensorboard/', comment='Test run to see how this works!')

eval_periodicity = 20

counter = {'SST2': 0, 'QQP': 0}
batch_size = {'SST2': 24, 'QQP': 24}
n_batches = {'SST2': math.floor(len(DATA_SST_TRAIN)/24), 'QQP': math.floor(len(DATA_QQP_TRAIN)/24)}
get_batch_function = {'SST2': get_batch_SST2_from_indices, 'QQP': get_batch_QQP_from_indices}
dataframe = {'SST2': DATA_SST_TRAIN, 'QQP': DATA_QQP_TRAIN}
dev_dataframes_dict = {'SST2': DATA_SST_DEV, 'QQP': DATA_QQP_DEV}
datasets = ['SST2','QQP'] # here the datasets in training
batch_indices = {}


global_counter, losseval, acceval = 0, 0, 0

In [8]:
def eval_model_on_DF(model, dataframes_dict, batch_size=16, global_counter=0):    
    k=0
    metrics_dict = {}
    for dataset, df in dataframes_dict.items():
        n_batches = math.floor(len(df)/batch_size)
        batch_splits = [-1]*(len(dataframes_dict)+1)
        batch_splits[k] = 0 # [-1, -1, 0, -1, -1]
        batch_splits[k+1] = len(df)
        k += 1
        dev_acc = 0
        for i in range(n_batches):
            batch_targets, batch_sequences = [], []
            indices = list(range(i*batch_size, (i+1)*batch_size))
            dataset_batch = get_batch_function[dataset](df, indices)
            # construct targets
            batch_targets.append(torch.tensor([data[1] for data in dataset_batch], 
                                              dtype=torch.int64, 
                                              device=device))
            # construct sequences
            batch_sequences.extend([data[0] for data in dataset_batch])
            batch_predictions = model.forward(batch_sequences, batch_splits=batch_splits)
            L = model.loss(batch_predictions, batch_targets, weights=None)
            m = model.metrics(batch_predictions, batch_targets)
            dev_acc += m[0]
            # Log to tensorboard
        acc = dev_acc/n_batches
        metrics_dict[dataset] = acc
        #writer.add_scalars(f'Metrics/{dataset}/dev', {dataset: acc}, global_counter)
        
    return metrics_dict

In [9]:
torch.cuda.empty_cache()


optimizer = torch.optim.Adam(multitask_net.parameters(), 
                             lr=0.0001,
                             betas=(0.9, 0.999), 
                             eps=1e-08, 
                             weight_decay=0.0001, 
                             amsgrad=False)

finished_training = False
t = time.time()
while not finished_training:
    for dataset in datasets:
        if counter[dataset] >= n_batches[dataset] or global_counter == 0:
            counter[dataset] = 0
            # Re-shuffle the training batches data
            batch_indices[dataset] = torch.randperm(n_batches[dataset]*batch_size[dataset],
                                                    device=torch.device('cpu')).reshape(-1, batch_size[dataset])
    
    batch_sequences, batch_targets, batch_splits = [], [], [0]
    for dataset in datasets:
        idx = counter[dataset]
        dataset_batch = get_batch_function[dataset](dataframe[dataset], 
                                                    batch_indices[dataset][idx, :])
        # List of tensors, one for each task
        try:
            batch_targets.append(torch.tensor([data[1] for data in dataset_batch], 
                                            dtype=torch.int64, 
                                            device=device))
        except:
            L = [data[1] for data in dataset_batch]
            
            raise ValueError(f'This thing failed when the target tensor was in dataset {dataset}: {L}, indices: {batch_indices[dataset][idx, :]}')
        
        # Big list combining the input sequences/ tuple of sequences because the batch needs
        # to be at the same "depth" level
        batch_sequences.extend([data[0] for data in dataset_batch])
        batch_splits.append(batch_splits[-1] + len(dataset_batch))
        counter[dataset] += 1

    model.train()
    batch_predictions = model.forward(batch_sequences, batch_splits=batch_splits)
    L = model.loss(batch_predictions, batch_targets, weights=None)
    metrics = model.metrics(batch_predictions, batch_targets)
    # Log to tensorboard
    writer.add_scalar('Loss/train', L.item(), global_counter)
    writer.add_scalars('Metrics/train', {datasets[i]: metrics[i] for i in range(len(datasets))}, global_counter)
    # Update net
    optimizer.zero_grad()
    L.backward()
    optimizer.step()
    losseval += L.item()
    if global_counter % eval_periodicity == 0:
        ## evaluate stuff
        model.eval()
        metrics_dict = eval_model_on_DF(model, dev_dataframes_dict, batch_size=16, global_counter=global_counter)
        print('eval metrics are:', metrics_dict)
        writer.add_scalars('Metrics/dev', metrics_dict, global_counter)
        
    if global_counter % eval_periodicity == 0:
        #print(f'Accuracies SST2: {metrics[0]} ---- QQP: {metrics[1]}')
        print(f'################### GLOBAL COUNTER {global_counter} ###################')
        print(f'Iterations per second: {eval_periodicity/(time.time()-t)}')
        t = time.time()
        print(f'Accuracies QQP: {acceval/eval_periodicity}')
        print(f'Global Loss: {losseval/eval_periodicity}')
        losseval, acceval = 0, 0
    global_counter += 1

0
eval metrics are: {'SST2': 0.5092592592592593, 'QQP': 0.5520833333333334}
################### GLOBAL COUNTER 0 ###################
Iterations per second: 0.09311179208030342
Accuracies QQP: 0.0
Global Loss: 0.030136942863464355
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
eval metrics are: {'SST2': 0.6215277777777778, 'QQP': 0.5520833333333334}
################### GLOBAL COUNTER 20 ###################
Iterations per second: 0.021367616518102133
Accuracies QQP: 0.0
Global Loss: 0.6701055437326431
21
22
23
24
25


KeyboardInterrupt: 

In [None]:
import itertools

limit = 3
indices = list(range(50))
idx_combinations = [indices[s:e] for s, e in itertools.combinations(range(len(indices)+1), 2)]
#print(idx_combinations)



print(len(idx_combinations))
print(idx_combinations)

In [None]:
import numpy as np
from sklearn.cluster import DBSCAN, mean_shift

DBSCAN_algorithm = DBSCAN(eps=0.7, min_samples=2)
inp = np.random.rand(48, 768)
clustered = DBSCAN_algorithm.fit_predict(inp)

ms = mean_shift(inp)
print(ms[1])

In [None]:
import torch
T = torch.tensor([[1, 2, 3],[4, 5, 6],[7, 8, 9]])
Y = T[1:1,:]
U = torch.Tensor(1, 2)
I = torch.cat([T, U], dim=0)

In [None]:
list(range(4, 8))

In [None]:
T = torch.tensor([1, 2])
U = torch.tensor([3, 4])
H = torch.Tensor(1, 3)
L = [U, H]
I = torch.cat(L, dim=0)
print(I)

In [19]:
import datetime;
ts = str(datetime.datetime.now())
print(str(ts))
print(type(str(ts)))

2020-02-06 15:18:10.030791
<class 'str'>


In [27]:
import os
os.path.join('firstpart', 'second')

'firstpart/second'

In [52]:
import torch
import torch.nn.functional as nn
from sklearn import cluster
T1 = torch.rand(3, 100)
T1 = nn.normalize(T1, p=2, dim=-1)
T2 = torch.rand(4, 100) - torch.rand(4, 100)
T2 = nn.normalize(T2, p=2, dim=-1)
T = torch.cat([T1, T2], dim=0)
cl = cluster.MeanShift(bandwidth=0.5) # n_samples, n_features
clustering = cl.fit(T)
clustering.labels_

array([1, 3, 2, 5, 4, 6, 0])

In [3]:
import yaml
import math
import pandas as pd

config = yaml.safe_load('./config/datasets.yml')
with open('./config/datasets.yml', 'r') as file:
    config = yaml.load(file, Loader=yaml.Loader)
#print(config)
print(config['SST2']['get_batch_fn'])

<function get_batch_SST2_from_indices at 0x11dfa1e18>


In [5]:
dataframes = {}
for dataset in config['datasets']:
    dataframes[dataset] = {}
    for kind in ['train', 'test', 'dev']:
        dataframes[dataset][kind] = pd.read_csv(config[dataset]['path'][kind], sep='\t')

counter = {dataset: config[dataset]['counter'] for dataset in config['datasets']}
batch_size = {dataset: config[dataset]['batch_size'] for dataset in config['datasets']}
n_batches = {dataset: math.floor(len(dataframes[dataset]['train'])/batch_size[dataset]) 
             for dataset in config['datasets']}
get_batch_function = {dataset: config[dataset]['get_batch_fn'] for dataset in config['datasets']}
dev_dataframes_dict = {dataset: dataframe[dataset]['dev'] for dataset in config['datasets']}

In [12]:
#print(counter)
#print(batch_size)
#print(n_batches)
#print(get_batch_function)
print(dataframes['QQP']['test'])

       Unnamed: 0      id    qid1    qid2  \
0              16      16      33      34   
1              18      18      37      38   
2              24      24      49      50   
3              27      27      55      56   
4              38      38      77      78   
5              45      45      91      92   
6              57      57     115     116   
7              59      59     119     120   
8              60      60     121     122   
9              62      62     125     126   
10             79      79     159     160   
11             96      96     193     194   
12            166     166     333     334   
13            182     182     365     366   
14            188     188     377     378   
15            191     191     383     384   
16            197     197     395     396   
17            211     211     423     424   
18            212     212     425     426   
19            216     216     433     434   
20            220     220     441     442   
21        

In [17]:
model.data_utils.get_batch_SST2_from_indices

Unnamed: 0                                                      4
id                                                              4
qid1                                                            9
qid2                                                           10
question1       Which one dissolve in water quikly sugar, salt...
question2                 Which fish would survive in salt water?
is_duplicate                                                    0
Name: 4, dtype: object
