# Libraries


In [1]:
from transformers import DistilBertTokenizer, DistilBertForMaskedLM
from transformers import TrainingArguments
from transformers import Trainer
import torch
import pandas as pd
from torch.optim import AdamW
from tqdm import tqdm
import time
import numpy as np
from torch import cuda
from sklearn.model_selection import train_test_split
import json

# Functions

In [2]:
class EarlyStopping:
    """Stop training when the validation loss has not improved for `patience` calls.

    Call the instance once per epoch with the current validation loss and the
    model. Every time the loss improves by more than `delta`, the model's
    ``state_dict`` is written to `path`; otherwise an internal counter is
    incremented and, once it reaches `patience`, ``early_stop`` becomes True.
    """

    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt', trace_func=print):
        """
        Args:
            patience (int): How many consecutive non-improving calls to tolerate
                            before setting ``early_stop``.
                            Default: 7
            verbose (bool): If True, report every checkpoint save through `trace_func`.
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): Path for the checkpoint to be saved to.
                            Default: 'checkpoint.pt'
            trace_func (function): Callable used to emit messages.
                            Default: print
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # `np.Inf` was removed in NumPy 2.0; `np.inf` is valid on every NumPy version.
        self.val_loss_min = np.inf
        self.delta = delta
        self.path = path
        self.trace_func = trace_func

    def __call__(self, val_loss, model):
        """Record one validation result and update the stopping state.

        Args:
            val_loss (float): Validation loss of the epoch that just finished.
            model: Object exposing ``state_dict()``; saved whenever the loss improves.
        """
        # Scores are negated losses so that "higher is better".
        score = -val_loss

        if self.best_score is None:
            # First call: always checkpoint.
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            # Not better than the best score by more than `delta`.
            self.counter += 1
            self.trace_func(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        """Save the model's ``state_dict`` to ``self.path`` and remember `val_loss` as the minimum."""
        if self.verbose:
            self.trace_func(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [3]:
# Torch Dataset wrapper that helps feed the tokenized data to model training
class Dataset(torch.utils.data.Dataset):
    """Index into a mapping of per-field sequences and return one sample as tensors.

    `encodings` must offer ``.items()`` (field name -> indexable values) and an
    ``input_ids`` attribute whose length gives the number of samples.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        token_ids = self.encodings.input_ids
        return len(token_ids)

    def __getitem__(self, idx):
        # Build the sample field by field, converting each entry to a tensor.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        return sample

In [4]:
def bertInput_clean(sentences):
    """Join the given strings into a single text, separated by the literal "[SEP]" marker."""
    separator = "[SEP]"
    return separator.join(sentences)


def input_clean(sentences):
    """Join the given strings into a single text, separated by one space."""
    separator = " "
    return separator.join(sentences)

In [None]:
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
model_checkpoint = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_checkpoint)

SENTENCE_SIZE = 50   # a same-type run with at least this many pairs is split into two halves
EPOCHS = 15
BATCH_SIZE = 1
MAX_LEN = 512        # tokenizer max_length (inputs are truncated / padded to this)
MASK_PERC = 0.15     # probability of a token being picked as a masking seed
LR = 5e-5
patience = 3         # early-stopping patience, in epochs
use_amp = True       # mixed precision; only effective when CUDA is selected
weights = [0.2, 0.3, 0.4, 0.6, 0.7, 0.8, 0.9]

osm_tables = ['bins_points_information', 'bins_polygons_information', 'bins_roads_information', 'bins_lines_information']

# Token ids used by the masking step: 101 = [CLS], 102 = [SEP], 103 = [MASK].
# Id 0 is treated as padding (presumably [PAD] in this vocabulary - confirm).
CLS_TOKEN_ID = 101
SEP_TOKEN_ID = 102
MASK_TOKEN_ID = 103
PAD_TOKEN_ID = 0


def _build_subtexts(rows, poi_col, type_col, label, max_pairs):
    """Group consecutive rows with the same geographic type into '[CLS]'-joined subtexts.

    Each row contributes one "poi[SEP]type" pair. A new subtext starts whenever
    the value in `type_col` changes. When a finished run holds `max_pairs` or
    more pairs it is reported and emitted as two halves; the trailing run is
    always emitted as a single subtext.

    Args:
        rows (list): Row lists, in the order they should be grouped.
        poi_col (int): Index of the POI text inside each row.
        type_col (int): Index of the geographic-type text inside each row.
        label (str): Prefix of the progress message printed for oversized runs.
        max_pairs (int): Run length from which a run is split in two.

    Returns:
        list[str]: The generated subtexts (empty when `rows` is empty).
    """
    subtexts = []
    if not rows:
        return subtexts

    current_pairs = []
    current_type = rows[0][type_col]
    for i, row in enumerate(rows):
        pair = bertInput_clean([row[poi_col], row[type_col]])

        if row[type_col] == current_type:
            current_pairs.append(pair)
            continue

        # The geographic type changed: flush the finished run.
        if len(current_pairs) >= max_pairs:
            print(label + ' ultrapassou em:', i, 'tam:', len(current_pairs))
            half = len(current_pairs) // 2
            subtexts.append('[CLS]'.join(current_pairs[:half]))
            subtexts.append('[CLS]'.join(current_pairs[half:]))
        else:
            subtexts.append('[CLS]'.join(current_pairs))

        # Start the new context with the current row.
        current_type = row[type_col]
        current_pairs = [pair]

    # Flush the trailing run.
    if current_pairs:
        subtexts.append('[CLS]'.join(current_pairs))
    return subtexts


def _mask_whole_segments(input_ids, mask_prob):
    """Replace randomly chosen tokens, and the segment around them, with [MASK] in place.

    Every token that is not [CLS], [SEP] or padding becomes a seed with
    probability `mask_prob`. Each seed is then expanded backwards and forwards
    until a [CLS], [SEP] or already-masked token is met (forwards also stops at
    padding), so the whole span between special tokens gets masked.

    Args:
        input_ids (torch.Tensor): 2-D tensor of token ids, modified in place.
        mask_prob (float): Seed probability per eligible token.
    """
    stop_ids = (CLS_TOKEN_ID, SEP_TOKEN_ID, MASK_TOKEN_ID)
    seed_mask = (
        (torch.rand(input_ids.shape) < mask_prob)
        & (input_ids != CLS_TOKEN_ID)
        & (input_ids != SEP_TOKEN_ID)
        & (input_ids != PAD_TOKEN_ID)
    )
    seq_len = input_ids.shape[1]

    for i in range(input_ids.shape[0]):
        seeds = torch.flatten(seed_mask[i].nonzero()).tolist()
        if not seeds:
            continue

        # Work on a plain list: per-element tensor indexing is very slow.
        row = input_ids[i].tolist()
        for j in seeds:
            row[j] = MASK_TOKEN_ID

        for j in seeds:
            back = j - 1
            while back > 0 and row[back] not in stop_ids:
                row[back] = MASK_TOKEN_ID
                back -= 1

            front = j + 1
            while (front < seq_len
                   and row[front] != PAD_TOKEN_ID
                   and row[front] not in stop_ids):
                row[front] = MASK_TOKEN_ID
                front += 1

        input_ids[i] = torch.tensor(row, dtype=input_ids.dtype)


# ---------------------------------------------------------------------------
# One fine-tuning run per (n, weight, OSM table) combination
# ---------------------------------------------------------------------------
for n in range(1, 2):
    for w in weights:
        # Normalise the weight parameter to one decimal place
        wgt = round(w, 1)
        for osm_table in osm_tables:

            if osm_table == 'bins_points_information':
                # Special case: the points data has no weight in its file name,
                # so it is trained only once, on the wgt == 0.0 pass.
                if wgt != 0.0:
                    continue
                file_name = f'./geographic/GEOC2VEC/austin-sl-tuple-geoc2vec-{n}{osm_table}-pfp-c.parquet'
                model_name = f'./geographic/GEOC2VECBERT15TKT-03/austin-sl-tuple-geoc2vec-distilbert-MLM-{n}{osm_table}-pfp-c'
            else:
                file_name = f'./geographic/GEOC2VEC/austin-sl-tuple-geoc2vec-{n}{osm_table}-wgt{wgt}pfp-c.parquet'
                model_name = f'./geographic/GEOC2VECBERT15TKT-03/austin-sl-tuple-geoc2vec-distilbert-MLM-{n}{osm_table}-wgt{wgt}-pfp-c'

            start_time = time.time()

            # Start every run from the untouched pretrained model
            model = DistilBertForMaskedLM.from_pretrained(model_checkpoint)

            # Load the dataset
            print("Carregando dados...")
            print(file_name)
            sentences = pd.read_parquet(file_name)
            validation = sentences[['center_poi', 'context_osm']].drop_duplicates()
            validation = validation.values.tolist()
            sentences = sentences.values.tolist()
            print('Quantidade de sentenças:', len(sentences))

            # Training subtexts: column 1 is used as the POI text and column 3 as
            # the geographic type (presumably center_poi / context_osm - confirm).
            print("Gerando subtextos com foco nos tipos de POI (Treino)...")
            train_sentences = _build_subtexts(sentences, 1, 3, 'Treino', SENTENCE_SIZE)
            del sentences  # free memory

            # Validation subtexts come from the de-duplicated (center_poi, context_osm)
            # pairs. Split halves go to the validation set (the previous version
            # appended them to the training set by mistake).
            print("Gerando subtextos com foco nos tipos de POI (Validação)...")
            val_sentences = _build_subtexts(validation, 0, 1, 'Validação', SENTENCE_SIZE)
            del validation  # free memory

            print('Conjunto de Treino: ', len(train_sentences))
            print('Conjunto de Validação: ', len(val_sentences))

            # Tokenize and keep an unmasked copy of the ids as the labels
            inputs_train = tokenizer(train_sentences, return_tensors='pt', max_length=MAX_LEN, truncation=True, padding='max_length')
            inputs_train['labels'] = inputs_train.input_ids.detach().clone()

            inputs_val = tokenizer(val_sentences, return_tensors='pt', max_length=MAX_LEN, truncation=True, padding='max_length')
            inputs_val['labels'] = inputs_val.input_ids.detach().clone()

            del train_sentences, val_sentences  # free memory

            # Mask the inputs, e.g. [CLS]Bar[SEP]Park[SEP] => [CLS]Bar[SEP]#####[SEP]
            print("Mascarando dados...")
            _mask_whole_segments(inputs_train.input_ids, MASK_PERC)
            _mask_whole_segments(inputs_val.input_ids, MASK_PERC)

            # Wrap the encodings in torch datasets / loaders
            print("Preparando para o treinamento...")
            dataset_train = Dataset(inputs_train)
            loader_train = torch.utils.data.DataLoader(dataset_train, batch_size=BATCH_SIZE, shuffle=True)
            dataset_val = Dataset(inputs_val)
            loader_val = torch.utils.data.DataLoader(dataset_val, batch_size=BATCH_SIZE, shuffle=True)

            del inputs_train, inputs_val  # the datasets keep their own references

            # Select the device and move the model there
            device = 'cuda' if cuda.is_available() else 'cpu'
            torch.cuda.empty_cache()
            model.to(device)

            optim = AdamW(model.parameters(), lr=LR)

            # Per-batch losses of the current epoch
            train_losses = []
            valid_losses = []
            # Per-epoch averages over the whole run
            avg_train_losses = []
            avg_valid_losses = []

            early_stopping = EarlyStopping(patience=patience, verbose=True)
            # The autocast contexts below target CUDA, so AMP is only enabled there.
            amp_enabled = use_amp and device == 'cuda'
            scaler = torch.cuda.amp.GradScaler(enabled=amp_enabled)
            has_early_stopping = False

            for epoch in range(EPOCHS):
                # ---- training ----
                model.train()
                loop_train = tqdm(loader_train, leave=True)
                for batch_train in loop_train:
                    optim.zero_grad()
                    input_ids = batch_train['input_ids'].to(device)
                    attention_mask = batch_train['attention_mask'].to(device)
                    labels = batch_train['labels'].to(device)

                    with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=amp_enabled):
                        outputs = model(input_ids, attention_mask=attention_mask,
                                        labels=labels)
                        loss = outputs.loss

                    scaler.scale(loss).backward()
                    scaler.step(optim)
                    scaler.update()

                    loop_train.set_description(f'Epoch {epoch}')
                    loop_train.set_postfix(loss=loss.item())
                    train_losses.append(loss.item())

                # ---- validation ----
                model.eval()
                loop_val = tqdm(loader_val, leave=True)
                # No gradients are needed here; skipping them saves memory and time.
                with torch.no_grad():
                    for batch_val in loop_val:
                        input_ids = batch_val['input_ids'].to(device)
                        attention_mask = batch_val['attention_mask'].to(device)
                        labels = batch_val['labels'].to(device)

                        with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=amp_enabled):
                            outputs = model(input_ids, attention_mask=attention_mask,
                                            labels=labels)
                            loss = outputs.loss

                        valid_losses.append(loss.item())

                # Average losses of this epoch (plain floats so they serialise to JSON)
                train_loss = float(np.average(train_losses))
                valid_loss = float(np.average(valid_losses))
                avg_train_losses.append(train_loss)
                avg_valid_losses.append(valid_loss)

                # Checkpoints the model whenever the validation loss improves
                early_stopping(valid_loss, model)

                if early_stopping.early_stop:
                    print("Early stopping")
                    final_time = (time.time() - start_time)
                    has_early_stopping = True
                    break

                # Reset the per-batch trackers for the next epoch
                train_losses = []
                valid_losses = []

            # Restore the best checkpoint written by early stopping
            model.load_state_dict(torch.load(early_stopping.path))

            # Without early stopping the elapsed time is taken after the reload
            if not has_early_stopping:
                final_time = (time.time() - start_time)

            training_dictionary = {'epoch': epoch + 1,
                                   'epochs': EPOCHS,
                                   'patience': patience,
                                   'train_loss': train_loss,
                                   'valid_loss': valid_loss,
                                   'avg_train_losses': avg_train_losses,
                                   'avg_valid_losses': avg_valid_losses,
                                   'time': final_time}

            # Save the fine-tuned model (Trainer is used only for its save_model)
            print("Salvando o modelo...")
            args = TrainingArguments(
                output_dir=model_name,
                per_device_train_batch_size=BATCH_SIZE,
                num_train_epochs=EPOCHS
            )

            trainer = Trainer(
                model=model,
                args=args,
                train_dataset=dataset_train,
                eval_dataset=dataset_val
            )

            trainer.save_model()

            # Save the training statistics next to the model
            stats_path = model_name + '/training_dictionary.json'
            with open(stats_path, "w") as outfile:
                json.dump(training_dictionary, outfile)

            # Free memory before the next run
            del loader_train
            del loader_val
            del dataset_train
            del dataset_val
            del train_losses
            del valid_losses
            del avg_train_losses
            del avg_valid_losses
            del trainer

Carregando dados...
./geographic/GEOC2VEC/austin-sl-tuple-geoc2vec-1bins_polygons_information-wgt0.2pfp-c.parquet
Quantidade de sentenças: 2452843
Gerando subtextos com foco nos tipos de POI (Treino)...
Treino ultrapassou em: 147 tam: 63
Treino ultrapassou em: 364 tam: 91
Treino ultrapassou em: 519 tam: 65
Treino ultrapassou em: 743 tam: 64
Treino ultrapassou em: 893 tam: 54
Treino ultrapassou em: 1178 tam: 66
Treino ultrapassou em: 1657 tam: 145
Treino ultrapassou em: 1876 tam: 51
Treino ultrapassou em: 1976 tam: 60
Treino ultrapassou em: 2204 tam: 148
Treino ultrapassou em: 2300 tam: 60
Treino ultrapassou em: 2594 tam: 72
Treino ultrapassou em: 2709 tam: 75
Treino ultrapassou em: 2801 tam: 64
Treino ultrapassou em: 2885 tam: 64
Treino ultrapassou em: 2965 tam: 68
Treino ultrapassou em: 3070 tam: 90
Treino ultrapassou em: 3185 tam: 60
Treino ultrapassou em: 3447 tam: 55
Treino ultrapassou em: 3735 tam: 68
Treino ultrapassou em: 3827 tam: 52
Treino ultrapassou em: 4011 tam: 120
Treino 

Treino ultrapassou em: 243173 tam: 182
Treino ultrapassou em: 243288 tam: 65
Treino ultrapassou em: 243376 tam: 76
Treino ultrapassou em: 243501 tam: 90
Treino ultrapassou em: 243854 tam: 78
Treino ultrapassou em: 244002 tam: 52
Treino ultrapassou em: 244523 tam: 72
Treino ultrapassou em: 244731 tam: 136
Treino ultrapassou em: 244876 tam: 65
Treino ultrapassou em: 245075 tam: 52
Treino ultrapassou em: 245244 tam: 169
Treino ultrapassou em: 245313 tam: 51
Treino ultrapassou em: 245513 tam: 60
Treino ultrapassou em: 245747 tam: 135
Treino ultrapassou em: 246002 tam: 91
Treino ultrapassou em: 246210 tam: 96
Treino ultrapassou em: 246372 tam: 72
Treino ultrapassou em: 246487 tam: 80
Treino ultrapassou em: 246683 tam: 85
Treino ultrapassou em: 246897 tam: 112
Treino ultrapassou em: 247163 tam: 70
Treino ultrapassou em: 247499 tam: 130
Treino ultrapassou em: 247724 tam: 144
Treino ultrapassou em: 247793 tam: 51
Treino ultrapassou em: 247859 tam: 51
Treino ultrapassou em: 247991 tam: 102
Trei

Treino ultrapassou em: 486684 tam: 80
Treino ultrapassou em: 486818 tam: 54
Treino ultrapassou em: 486989 tam: 81
Treino ultrapassou em: 487052 tam: 57
Treino ultrapassou em: 487176 tam: 52
Treino ultrapassou em: 487276 tam: 72
Treino ultrapassou em: 487386 tam: 75
Treino ultrapassou em: 487556 tam: 60
Treino ultrapassou em: 487656 tam: 80
Treino ultrapassou em: 487908 tam: 224
Treino ultrapassou em: 487968 tam: 51
Treino ultrapassou em: 488142 tam: 90
Treino ultrapassou em: 488609 tam: 126
Treino ultrapassou em: 488808 tam: 94
Treino ultrapassou em: 488888 tam: 64
Treino ultrapassou em: 489269 tam: 55
Treino ultrapassou em: 489472 tam: 63
Treino ultrapassou em: 489535 tam: 54
Treino ultrapassou em: 489619 tam: 76
Treino ultrapassou em: 489887 tam: 94
Treino ultrapassou em: 490025 tam: 54
Treino ultrapassou em: 490256 tam: 165
Treino ultrapassou em: 490361 tam: 85
Treino ultrapassou em: 490491 tam: 80
Treino ultrapassou em: 490731 tam: 112
Treino ultrapassou em: 490869 tam: 102
Treino 

Treino ultrapassou em: 697546 tam: 65
Treino ultrapassou em: 697762 tam: 56
Treino ultrapassou em: 697828 tam: 57
Treino ultrapassou em: 698179 tam: 182
Treino ultrapassou em: 698283 tam: 56
Treino ultrapassou em: 698481 tam: 63
Treino ultrapassou em: 698754 tam: 66
Treino ultrapassou em: 699157 tam: 56
Treino ultrapassou em: 699609 tam: 112
Treino ultrapassou em: 699871 tam: 70
Treino ultrapassou em: 699959 tam: 64
Treino ultrapassou em: 700239 tam: 69
Treino ultrapassou em: 700533 tam: 72
Treino ultrapassou em: 700946 tam: 170
Treino ultrapassou em: 701076 tam: 55
Treino ultrapassou em: 701286 tam: 75
Treino ultrapassou em: 701493 tam: 144
Treino ultrapassou em: 701598 tam: 80
Treino ultrapassou em: 701876 tam: 68
Treino ultrapassou em: 701992 tam: 52
Treino ultrapassou em: 702420 tam: 50
Treino ultrapassou em: 702552 tam: 90
Treino ultrapassou em: 702757 tam: 195
Treino ultrapassou em: 703025 tam: 112
Treino ultrapassou em: 703341 tam: 128
Treino ultrapassou em: 703407 tam: 51
Trein

Treino ultrapassou em: 921259 tam: 102
Treino ultrapassou em: 921503 tam: 51
Treino ultrapassou em: 921655 tam: 56
Treino ultrapassou em: 921899 tam: 64
Treino ultrapassou em: 922009 tam: 95
Treino ultrapassou em: 922500 tam: 150
Treino ultrapassou em: 922732 tam: 168
Treino ultrapassou em: 923183 tam: 121
Treino ultrapassou em: 923267 tam: 68
Treino ultrapassou em: 923417 tam: 90
Treino ultrapassou em: 923587 tam: 145
Treino ultrapassou em: 923679 tam: 76
Treino ultrapassou em: 923861 tam: 66
Treino ultrapassou em: 924245 tam: 300
Treino ultrapassou em: 924390 tam: 65
Treino ultrapassou em: 924446 tam: 56
Treino ultrapassou em: 924670 tam: 224
Treino ultrapassou em: 924730 tam: 51
Treino ultrapassou em: 924884 tam: 52
Treino ultrapassou em: 924953 tam: 57
Treino ultrapassou em: 925162 tam: 75
Treino ultrapassou em: 925316 tam: 85
Treino ultrapassou em: 925496 tam: 132
Treino ultrapassou em: 925686 tam: 70
Treino ultrapassou em: 925991 tam: 67
Treino ultrapassou em: 926199 tam: 51
Trei

Treino ultrapassou em: 1127412 tam: 80
Treino ultrapassou em: 1127552 tam: 80
Treino ultrapassou em: 1127687 tam: 65
Treino ultrapassou em: 1127771 tam: 56
Treino ultrapassou em: 1127840 tam: 51
Treino ultrapassou em: 1127950 tam: 95
Treino ultrapassou em: 1128055 tam: 90
Treino ultrapassou em: 1128146 tam: 91
Treino ultrapassou em: 1128250 tam: 104
Treino ultrapassou em: 1128341 tam: 52
Treino ultrapassou em: 1128425 tam: 68
Treino ultrapassou em: 1128687 tam: 104
Treino ultrapassou em: 1128812 tam: 55
Treino ultrapassou em: 1128927 tam: 80
Treino ultrapassou em: 1129552 tam: 238
Treino ultrapassou em: 1129672 tam: 70
Treino ultrapassou em: 1129756 tam: 56
Treino ultrapassou em: 1129924 tam: 154
Treino ultrapassou em: 1130044 tam: 75
Treino ultrapassou em: 1130258 tam: 70
Treino ultrapassou em: 1130490 tam: 80
Treino ultrapassou em: 1130600 tam: 85
Treino ultrapassou em: 1131080 tam: 72
Treino ultrapassou em: 1131146 tam: 54
Treino ultrapassou em: 1131423 tam: 78
Treino ultrapassou em

Treino ultrapassou em: 1318126 tam: 54
Treino ultrapassou em: 1318286 tam: 92
Treino ultrapassou em: 1318646 tam: 335
Treino ultrapassou em: 1319096 tam: 250
Treino ultrapassou em: 1319226 tam: 80
Treino ultrapassou em: 1319551 tam: 215
Treino ultrapassou em: 1319859 tam: 81
Treino ultrapassou em: 1320083 tam: 98
Treino ultrapassou em: 1320188 tam: 95
Treino ultrapassou em: 1320318 tam: 65
Treino ultrapassou em: 1320532 tam: 84
Treino ultrapassou em: 1320686 tam: 65
Treino ultrapassou em: 1321025 tam: 196
Treino ultrapassou em: 1321420 tam: 196
Treino ultrapassou em: 1321601 tam: 92
Treino ultrapassou em: 1322259 tam: 143
Treino ultrapassou em: 1322415 tam: 50
Treino ultrapassou em: 1322559 tam: 96
Treino ultrapassou em: 1322907 tam: 98
Treino ultrapassou em: 1323063 tam: 96
Treino ultrapassou em: 1323179 tam: 64
Treino ultrapassou em: 1323379 tam: 112
Treino ultrapassou em: 1323490 tam: 51
Treino ultrapassou em: 1323578 tam: 64
Treino ultrapassou em: 1323746 tam: 78
Treino ultrapassou

Treino ultrapassou em: 1503226 tam: 100
Treino ultrapassou em: 1503326 tam: 52
Treino ultrapassou em: 1503675 tam: 70
Treino ultrapassou em: 1504003 tam: 60
Treino ultrapassou em: 1504225 tam: 64
Treino ultrapassou em: 1504516 tam: 66
Treino ultrapassou em: 1504818 tam: 122
Treino ultrapassou em: 1505126 tam: 165
Treino ultrapassou em: 1505421 tam: 70
Treino ultrapassou em: 1505721 tam: 91
Treino ultrapassou em: 1505868 tam: 140
Treino ultrapassou em: 1506030 tam: 54
Treino ultrapassou em: 1506285 tam: 60
Treino ultrapassou em: 1506411 tam: 96
Treino ultrapassou em: 1506549 tam: 96
Treino ultrapassou em: 1506679 tam: 65
Treino ultrapassou em: 1507375 tam: 264
Treino ultrapassou em: 1507579 tam: 72
Treino ultrapassou em: 1507932 tam: 60
Treino ultrapassou em: 1508072 tam: 65
Treino ultrapassou em: 1508242 tam: 102
Treino ultrapassou em: 1508444 tam: 65
Treino ultrapassou em: 1508596 tam: 60
Treino ultrapassou em: 1508728 tam: 60
Treino ultrapassou em: 1509285 tam: 260
Treino ultrapassou

Treino ultrapassou em: 1699976 tam: 112
Treino ultrapassou em: 1700176 tam: 119
Treino ultrapassou em: 1700756 tam: 103
Treino ultrapassou em: 1701242 tam: 136
Treino ultrapassou em: 1701362 tam: 108
Treino ultrapassou em: 1701579 tam: 112
Treino ultrapassou em: 1701717 tam: 78
Treino ultrapassou em: 1702121 tam: 65
Treino ultrapassou em: 1702217 tam: 60
Treino ultrapassou em: 1702445 tam: 51
Treino ultrapassou em: 1702711 tam: 64
Treino ultrapassou em: 1702903 tam: 52
Treino ultrapassou em: 1703028 tam: 65
Treino ultrapassou em: 1703336 tam: 280
Treino ultrapassou em: 1703441 tam: 100
Treino ultrapassou em: 1703607 tam: 154
Treino ultrapassou em: 1703699 tam: 64
Treino ultrapassou em: 1704013 tam: 70
Treino ultrapassou em: 1704119 tam: 94
Treino ultrapassou em: 1704511 tam: 170
Treino ultrapassou em: 1704679 tam: 144
Treino ultrapassou em: 1705023 tam: 80
Treino ultrapassou em: 1705361 tam: 208
Treino ultrapassou em: 1705890 tam: 60
Treino ultrapassou em: 1706035 tam: 70
Treino ultrap

Treino ultrapassou em: 1887650 tam: 60
Treino ultrapassou em: 1887990 tam: 54
Treino ultrapassou em: 1888224 tam: 72
Treino ultrapassou em: 1888416 tam: 78
Treino ultrapassou em: 1888467 tam: 51
Treino ultrapassou em: 1888756 tam: 289
Treino ultrapassou em: 1888876 tam: 80
Treino ultrapassou em: 1888956 tam: 56
Treino ultrapassou em: 1889081 tam: 65
Treino ultrapassou em: 1889448 tam: 264
Treino ultrapassou em: 1889517 tam: 51
Treino ultrapassou em: 1890134 tam: 136
Treino ultrapassou em: 1890222 tam: 68
Treino ultrapassou em: 1890453 tam: 91
Treino ultrapassou em: 1890709 tam: 60
Treino ultrapassou em: 1890885 tam: 128
Treino ultrapassou em: 1890995 tam: 100
Treino ultrapassou em: 1891121 tam: 96
Treino ultrapassou em: 1891259 tam: 102
Treino ultrapassou em: 1891692 tam: 64
Treino ultrapassou em: 1891985 tam: 247
Treino ultrapassou em: 1892273 tam: 272
Treino ultrapassou em: 1892355 tam: 78
Treino ultrapassou em: 1892644 tam: 156
Treino ultrapassou em: 1892740 tam: 56
Treino ultrapass

Treino ultrapassou em: 2075792 tam: 56
Treino ultrapassou em: 2075855 tam: 57
Treino ultrapassou em: 2076056 tam: 105
Treino ultrapassou em: 2076472 tam: 171
Treino ultrapassou em: 2077066 tam: 135
Treino ultrapassou em: 2077192 tam: 96
Treino ultrapassou em: 2077460 tam: 208
Treino ultrapassou em: 2077684 tam: 128
Treino ultrapassou em: 2077750 tam: 51
Treino ultrapassou em: 2077855 tam: 95
Treino ultrapassou em: 2078202 tam: 139
Treino ultrapassou em: 2078308 tam: 70
Treino ultrapassou em: 2078464 tam: 84
Treino ultrapassou em: 2078940 tam: 144
Treino ultrapassou em: 2079265 tam: 208
Treino ultrapassou em: 2079665 tam: 240
Treino ultrapassou em: 2079859 tam: 56
Treino ultrapassou em: 2080027 tam: 160
Treino ultrapassou em: 2080304 tam: 128
Treino ultrapassou em: 2080505 tam: 78
Treino ultrapassou em: 2080680 tam: 70
Treino ultrapassou em: 2080898 tam: 136
Treino ultrapassou em: 2081078 tam: 78
Treino ultrapassou em: 2081290 tam: 54
Treino ultrapassou em: 2081626 tam: 210
Treino ultra

Treino ultrapassou em: 2266935 tam: 51
Treino ultrapassou em: 2267085 tam: 108
Treino ultrapassou em: 2267364 tam: 117
Treino ultrapassou em: 2267535 tam: 65
Treino ultrapassou em: 2267598 tam: 60
Treino ultrapassou em: 2267926 tam: 70
Treino ultrapassou em: 2268270 tam: 100
Treino ultrapassou em: 2268502 tam: 98
Treino ultrapassou em: 2268740 tam: 112
Treino ultrapassou em: 2269054 tam: 100
Treino ultrapassou em: 2269288 tam: 96
Treino ultrapassou em: 2269388 tam: 91
Treino ultrapassou em: 2269523 tam: 63
Treino ultrapassou em: 2269749 tam: 127
Treino ultrapassou em: 2269889 tam: 112
Treino ultrapassou em: 2269991 tam: 51
Treino ultrapassou em: 2270239 tam: 50
Treino ultrapassou em: 2270299 tam: 51
Treino ultrapassou em: 2270473 tam: 60
Treino ultrapassou em: 2270697 tam: 96
Treino ultrapassou em: 2270928 tam: 84
Treino ultrapassou em: 2271094 tam: 52
Treino ultrapassou em: 2271334 tam: 192
Treino ultrapassou em: 2271801 tam: 160
Treino ultrapassou em: 2271945 tam: 96
Treino ultrapass

Treino ultrapassou em: 2451209 tam: 55
Treino ultrapassou em: 2451469 tam: 128
Treino ultrapassou em: 2451897 tam: 90
Treino ultrapassou em: 2452065 tam: 128
Treino ultrapassou em: 2452337 tam: 176
Treino ultrapassou em: 2452735 tam: 96
Gerando subtextos com foco nos tipos de POI (Validação)...
Conjunto de Treino:  176216
Conjunto de Validação:  35624
Mascarando dados...
Preparando para o treinamento...


  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 176216/176216 [1:42:38<00:00, 28.61it/s, loss=0.0412]  
100%|██████████| 35624/35624 [04:26<00:00, 133.44it/s]


Validation loss decreased (inf --> 0.028851).  Saving model ...


Epoch 1: 100%|██████████| 176216/176216 [1:42:58<00:00, 28.52it/s, loss=0.000267]
 56%|█████▋    | 20106/35624 [02:30<01:56, 132.85it/s]

## Split by Geographic Feature Version

- This version separates the data according to the replicated geographic data types.
- Whenever the geographic data type changes, a new replication sequence starts.
- There is also a sentence-size control to prevent tokenization overflow.
- In this case, small documents are generated based on these changes.
- The validation set is created using an 80-20 split.
- It doesn't work with Early Stop.

In [6]:
# Fine-tunes one DistilBERT masked-language model per (weight, OSM table) pair.
# For every pair the cell: loads the tuple CSV, groups consecutive rows that
# share the same geographic data type into one '[SEP]'-joined document, splits
# the documents into training/validation sets, masks random tokens, trains for
# a fixed number of epochs, then saves the model and its training statistics.
model_checkpoint = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_checkpoint)

SENTENCE_SIZE = 200
EPOCHS = 3
BATCH_SIZE = 4
MAX_LEN = 512
MASK_PERC = 0.15
LR = 5e-5
valid_size = 0.2
patience = 5
use_amp = True

weights = [0.1, 0.2, 0.3, 0.4, 0.6, 0.7, 0.8, 0.9]
osm_tables = ['bins_points_information', 'bins_polygons_information', 'bins_roads_information', 'bins_lines_information']

for n in range(0, 1):

    for w in weights:
        #Adjusting parameter w
        wgt = round(w, 1)
        for osm_table in osm_tables:

            #Flag to allow correct training
            do_training = False

            #Special case to load point data and train only once
            #NOTE(review): `weights` contains no 0.0, so this branch never runs in
            #this cell and no points model is trained here - confirm it is
            #produced elsewhere.
            if(osm_table == 'bins_points_information' and wgt == 0.0):
                file_name = './austin-sl-tuple-geoc2vec-' + str(n) + osm_table + '-pfp-c.csv'
                model_name = './austin-sl-tuple-geoc2vec-distilbert-MLM-' + str(n) + osm_table + '-pfp-c'

                do_training = True

            elif(osm_table != 'bins_points_information'):

                file_name = './austin-sl-tuple-geoc2vec-' + str(n) + osm_table + '-wgt' + str(wgt) + 'pfp-c.csv'
                model_name = './austin-sl-tuple-geoc2vec-distilbert-MLM-' + str(n) + osm_table + '-wgt' + str(wgt) + '-pfp-c'
                do_training = True

            if(do_training):

                start_time = time.time()

                #Loading the empty model for fine-tuning
                model = DistilBertForMaskedLM.from_pretrained(model_checkpoint)

                #Load dataset
                print("Loading data...")
                print(file_name)
                sentences = pd.read_csv(file_name)
                sentences = sentences.values.tolist()
                print('Number of sentences:', len(sentences))


                print("Generating subtexts focusing on POI types...")
                #Creating smaller texts with the set of two words from POIs and geographical data
                #This method considers the change of the geographical data TYPE to create a new subtext
                bert_sentences = []
                local_sentences = []
                actual_type = sentences[0][3] # => Geographical data
                for sentence in sentences:

                    if(sentence[3] != actual_type):
                        #The type changed: close the current document and start a new one
                        bert_sentences.append('[SEP]'.join(local_sentences))
                        actual_type = sentence[3]
                        local_sentences = []

                    local_sentences.append(input_clean([sentence[1], sentence[3]]))

                #Adding last segment
                #It must be built from the rows still pending in local_sentences;
                #re-appending the previously joined document would drop the last
                #group and fail when the type never changes.
                if(local_sentences):
                    bert_sentences.append('[SEP]'.join(local_sentences))

                #Clearing memory
                del sentences, local_sentences

                print("Generating Training and Validation sets...")
                train, validation = train_test_split(bert_sentences, test_size=valid_size, random_state=42)
                print('Training set: ', len(train))
                print('Validation set: ', len(validation))


                #Clearing memory
                del bert_sentences

                #Tokenizing and saving a copy of the tokens to represent the labels
                #NOTE(review): the labels are a full copy of input_ids and are never
                #set to -100, so the loss appears to cover every position (unmasked
                #tokens and padding included) - confirm this is intended.
                inputs_train = tokenizer(train, return_tensors='pt', max_length=MAX_LEN, truncation = True, padding=True)
                inputs_train['labels'] = inputs_train.input_ids.detach().clone()

                inputs_val = tokenizer(validation, return_tensors='pt', max_length=MAX_LEN, truncation = True, padding=True)
                inputs_val['labels'] = inputs_val.input_ids.detach().clone()


                #Clearing memory
                del train, validation

                #Selecting roughly MASK_PERC of the maskable tokens at random
                print("Masking data...")
                rand_train = torch.rand(inputs_train.input_ids.shape)
                rand_val = torch.rand(inputs_val.input_ids.shape)

                #Generating random masking positions (special tokens are never masked)
                #101 = [CLS]
                #102 = [SEP]
                #0 = padding
                mask_arr_train = (rand_train < MASK_PERC) & (inputs_train.input_ids != 101) & \
                           (inputs_train.input_ids != 102) & (inputs_train.input_ids != 0)

                mask_arr_val = (rand_val < MASK_PERC) & (inputs_val.input_ids != 101) & \
                           (inputs_val.input_ids != 102) & (inputs_val.input_ids != 0)

                #Changing tokens in place with a single boolean-mask assignment
                #103 = [MASK]
                inputs_train.input_ids[mask_arr_train] = 103
                inputs_val.input_ids[mask_arr_val] = 103

                #Clearing memory
                del rand_train, mask_arr_train, rand_val, mask_arr_val

                #Transforming data into torch dataset object
                print("Preparing for training...")
                dataset_train = Dataset(inputs_train)
                loader_train = torch.utils.data.DataLoader(dataset_train, batch_size=BATCH_SIZE, shuffle=True)
                dataset_val = Dataset(inputs_val)
                loader_val = torch.utils.data.DataLoader(dataset_val, batch_size=BATCH_SIZE, shuffle=True)

                #Clearing memory
                del inputs_train, inputs_val

                #Preparing the device for training
                device = 'cuda' if cuda.is_available() else 'cpu' # CPU OR GPU
                torch.cuda.empty_cache()
                # and move our model over to the selected device
                model.to(device)

                #Mixed precision is only meaningful on the GPU; disable it on CPU
                amp_enabled = use_amp and device == 'cuda'

                optim = AdamW(model.parameters(), lr=LR)

                # to track the training loss as the model trains
                train_losses = []
                # to track the validation loss as the model trains
                valid_losses = []
                # to track the average training loss per epoch as the model trains
                avg_train_losses = []
                # to track the average validation loss per epoch as the model trains
                avg_valid_losses = []

                #Early stopping is intentionally not used in this version: every
                #run trains for exactly EPOCHS epochs.
                scaler = torch.cuda.amp.GradScaler(enabled=amp_enabled)

                for epoch in range(EPOCHS):
                    # setup loop with TQDM and dataloader
                    # activate training mode
                    model.train()
                    loop_train = tqdm(loader_train, leave=True, desc=f'Epoch {epoch}')
                    for batch_train in loop_train:

                        optim.zero_grad()
                        # pull all tensor batches required for training
                        input_ids = batch_train['input_ids'].to(device)
                        attention_mask = batch_train['attention_mask'].to(device)
                        labels = batch_train['labels'].to(device)

                        with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=amp_enabled):
                            outputs = model(input_ids, attention_mask=attention_mask,
                                        labels=labels)
                            loss = outputs.loss

                        scaler.scale(loss).backward()
                        scaler.step(optim)
                        scaler.update()

                        # read the loss once and reuse the value
                        loss_value = loss.item()
                        loop_train.set_postfix(loss=loss_value)
                        train_losses.append(loss_value)

                    ######################
                    # validate the model #
                    ######################
                    model.eval() # prep model for evaluation
                    loop_val = tqdm(loader_val, leave=True)
                    # no gradients are needed for validation
                    with torch.no_grad():
                        for batch_val in loop_val:

                            # forward pass: compute predicted outputs by passing inputs to the model
                            input_ids = batch_val['input_ids'].to(device)
                            attention_mask = batch_val['attention_mask'].to(device)
                            labels = batch_val['labels'].to(device)

                            # process
                            with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=amp_enabled):
                                outputs = model(input_ids, attention_mask=attention_mask,
                                            labels=labels)
                                loss = outputs.loss

                            # record validation loss
                            valid_losses.append(loss.item())

                    # calculate average loss over an epoch (plain floats for JSON)
                    train_loss = float(np.average(train_losses))
                    valid_loss = float(np.average(valid_losses))
                    avg_train_losses.append(train_loss)
                    avg_valid_losses.append(valid_loss)

                    # clear lists to track next epoch
                    train_losses = []
                    valid_losses = []

                #Saving the trained model
                #The Trainer is only used here to write the model to `model_name`
                print("Saving the model...")
                args = TrainingArguments(
                    output_dir=model_name,
                    per_device_train_batch_size=BATCH_SIZE,
                    num_train_epochs=EPOCHS
                )

                trainer = Trainer(
                    model=model,
                    args=args,
                    train_dataset=dataset_train,
                    eval_dataset=dataset_val
                )

                trainer.save_model()

                final_time = (time.time() - start_time)
                #train_loss / valid_loss hold the averages of the last epoch
                training_dictionary = {'epoch': (EPOCHS+1),
                                       'epochs': EPOCHS,
                                       'patience': patience,
                                       'train_loss': train_loss,
                                       'valid_loss': valid_loss,
                                       'avg_train_losses':avg_train_losses,
                                       'avg_valid_losses': avg_valid_losses,
                                       'time': final_time}

                #Saving training statistics
                file_name = model_name + '/training_dictionary.json'
                with open(file_name, "w") as outfile:
                    json.dump(training_dictionary, outfile)

                #Clearing memory
                del loader_train
                del loader_val
                del dataset_train
                del dataset_val
                del train_losses
                del valid_losses
                del avg_train_losses
                del avg_valid_losses
                del trainer

Carregando dados...
./geographic/GEOC2VEC/austin-sl-tuple-geoc2vec-0bins_polygons_information-wgt0.8pfp-c.parquet
Quantidade de sentenças: 1822380
Gerando subtextos com foco nos tipos de POI...
16
Gerando conjunto de Treino e Validação...
Conjunto de Treino:  70503
Conjunto de Validação:  17626


# Getting Embeddings Example

In [None]:
# Obtains the embedding of a whole sentence (first-token vector) instead of per-word embeddings
def executeModel_CLS(model, tokenizer, poi_type):
    """Return the sentence embedding of `poi_type`, taken at the first token.

    The text is tokenized, passed through `model` with hidden states enabled,
    and the hidden states of the last four layers are averaged (mean, not sum)
    for the first token only. With a BERT-style tokenizer the first token is
    expected to be [CLS].

    Args:
        model: Callable that accepts the tokenizer output as keyword arguments
            plus `output_hidden_states=True` and returns a mapping with a
            'hidden_states' sequence of tensors (assumed to be shaped
            [batch, tokens, hidden] each).
        tokenizer: Callable that turns `poi_type` into a mapping of tensors.
        poi_type: A single text. Several texts at once are not supported: the
            batch axis is squeezed away and the permute below fails otherwise.

    Returns:
        The first token's vector as a list of floats.
    """
    # Tokenizes the text
    tokenized_text = tokenizer(poi_type, return_tensors='pt', max_length=512, truncation=True, padding=True)

    # Forward pass only; no gradients are needed to read the hidden states
    with torch.no_grad():
        encoded_layers = model(**tokenized_text, output_hidden_states=True)

    # Stack the per-layer outputs on a new leading "layer" axis
    hidden_states = torch.stack(encoded_layers['hidden_states'], dim=0)

    # Drop the batch axis (size 1 for a single text)
    hidden_states = torch.squeeze(hidden_states, dim=1)

    # Reorder to [tokens x layers x hidden] so a token's layers are contiguous rows
    per_token_layers = hidden_states.permute(1, 0, 2)

    # Only the first token is returned, so only that one is reduced
    first_token_layers = per_token_layers[0]

    # Average the vectors of the last four layers
    return torch.mean(first_token_layers[-4:], dim=0).tolist()

In [None]:
# Example: embed one POI type with the four fine-tuned models (points,
# polygons, roads, lines) and concatenate the four sentence vectors.
n = 0
wgt = 0.5

# Directories of the saved models; the points model carries no weight suffix
file_name_point = f'./austin-sl-tuple-geoc2vec-distilbert-MLM-{n}bins_points_information-pfp-c'
file_name_polygon = f'./austin-sl-tuple-geoc2vec-distilbert-MLM-{n}bins_polygons_information-wgt{wgt}-pfp-c'
file_name_roads = f'./austin-sl-tuple-geoc2vec-distilbert-MLM-{n}bins_roads_information-wgt{wgt}-pfp-c'
file_name_lines = f'./austin-sl-tuple-geoc2vec-distilbert-MLM-{n}bins_lines_information-wgt{wgt}-pfp-c'

# One fine-tuned model per type of geographic information
model_points = DistilBertForMaskedLM.from_pretrained(file_name_point)
model_polygons = DistilBertForMaskedLM.from_pretrained(file_name_polygon)
model_roads = DistilBertForMaskedLM.from_pretrained(file_name_roads)
model_lines = DistilBertForMaskedLM.from_pretrained(file_name_lines)

# Switch every model to evaluation mode before extracting embeddings
for _loaded_model in (model_points, model_lines, model_roads, model_polygons):
    _loaded_model.eval()

poi_type = 'Park'

# Sentence embedding of the POI type under each model
sentence_embedding_points = executeModel_CLS(model_points, tokenizer, poi_type)
sentence_embedding_lines = executeModel_CLS(model_lines, tokenizer, poi_type)
sentence_embedding_roads = executeModel_CLS(model_roads, tokenizer, poi_type)
sentence_embedding_polygons = executeModel_CLS(model_polygons, tokenizer, poi_type)

# Final vector: points, lines, roads and polygons embeddings, in that order
p1_sentence_embedding = [
    *sentence_embedding_points,
    *sentence_embedding_lines,
    *sentence_embedding_roads,
    *sentence_embedding_polygons,
]