# Libraries


In [1]:
from transformers import DistilBertTokenizer, DistilBertForMaskedLM
from transformers import TrainingArguments
from transformers import Trainer
import torch
import pandas as pd
from torch.optim import AdamW
from tqdm import tqdm
import time
import numpy as np
from torch import cuda
from sklearn.model_selection import train_test_split
import json

# Functions

In [2]:
class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience.

    Call the instance once per epoch with the current validation loss and the
    model. The model's ``state_dict`` is written to ``path`` whenever the loss
    counts as an improvement; ``early_stop`` becomes True once ``patience``
    consecutive calls bring no improvement.
    """

    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt', trace_func=print):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement.
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): Path for the checkpoint to be saved to.
                            Default: 'checkpoint.pt'
            trace_func (function): trace print function.
                            Default: print
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # `np.inf` instead of the `np.Inf` alias, which no longer exists in NumPy 2.0.
        self.val_loss_min = np.inf
        self.delta = delta
        self.path = path
        self.trace_func = trace_func

    def __call__(self, val_loss, model):
        """Record one validation result and update the stopping state.

        Args:
            val_loss (float): Validation loss of the current epoch.
            model: Object exposing ``state_dict()``; saved on improvement.
        """
        # Scores are negated losses, so "higher is better".
        score = -val_loss

        if self.best_score is None:
            # First call: always becomes the reference point.
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            # Not better than the best score by at least `delta`.
            self.counter += 1
            self.trace_func(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            # Improvement (a tie counts as one when delta == 0): checkpoint and reset.
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            self.trace_func(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [3]:
class Dataset(torch.utils.data.Dataset):
    """Torch dataset wrapper around tokenizer encodings, used to feed model training.

    `encodings` must provide ``items()`` yielding (field name, indexable values)
    pairs and an ``input_ids`` attribute whose length is the number of samples.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # One sample per row of `input_ids`.
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        # Build the sample field by field, each one converted to a tensor.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        return sample

In [4]:
def bertInput_clean(sentences):
    """Join a list of sentences into one BERT input string.

    Args:
        sentences: Iterable of strings.

    Returns:
        str: The sentences concatenated with the literal "[SEP]" marker
        between them (no surrounding spaces); "" for an empty input.
    """
    return "[SEP]".join(sentences)


def input_clean(sentences):
    """Join a list of words/sentences into a single space-separated string.

    Args:
        sentences: Iterable of strings.

    Returns:
        str: The items concatenated with a single space between them;
        "" for an empty input.
    """
    return " ".join(sentences)

## Split by Geographic Feature Version

- This version separates the data according to the replicated geographic data types.
- Whenever the geographic data type changes, it signifies that another replication sequence will start.
- There's also a sentence size control to prevent tokenization overflow.
- In this case, small documents based on these changes are generated.
- The validation set is created using an 80-20 split.
- It doesn't work with Early Stop.

In [6]:
# Fine-tunes one DistilBERT masked-language model per (weight, OSM table) input file
# and saves each model together with a JSON file of training statistics.
model_checkpoint = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_checkpoint)

SENTENCE_SIZE = 200
EPOCHS = 3
BATCH_SIZE = 4
MAX_LEN = 512
MASK_PERC = 0.15
LR = 5e-5
valid_size = 0.2
patience = 5
use_amp = True

# NOTE(review): 0.5 is absent from this list -- confirm whether that is intentional.
weights = [0.0, 0.1, 0.2, 0.3, 0.4, 0.6, 0.7, 0.8, 0.9, 1.0]
osm_tables = ['bins_points_information', 'bins_polygons_information', 'bins_roads_information', 'bins_lines_information']

# Special-token ids taken from the tokenizer instead of hard-coded numbers
# (for distilbert-base-uncased: [CLS]=101, [SEP]=102, [PAD]=0, [MASK]=103).
cls_token_id = tokenizer.cls_token_id
sep_token_id = tokenizer.sep_token_id
pad_token_id = tokenizer.pad_token_id
mask_token_id = tokenizer.mask_token_id

for n in range(0, 1):

    for w in weights:
        # Adjusting parameter w
        wgt = round(w, 1)
        for osm_table in osm_tables:

            if osm_table == 'bins_points_information':
                # The point table's file/model names carry no weight suffix,
                # so it is loaded and trained only once (when wgt == 0.0).
                if wgt != 0.0:
                    continue
                file_name = './austin-sl-tuple-geoc2vec-' + str(n) + osm_table + '-pfp-c.csv'
                model_name = './austin-sl-tuple-geoc2vec-distilbert-MLM-' + str(n) + osm_table + '-pfp-c'
            else:
                file_name = './austin-sl-tuple-geoc2vec-' + str(n) + osm_table + '-wgt' + str(wgt) + 'pfp-c.csv'
                model_name = './austin-sl-tuple-geoc2vec-distilbert-MLM-' + str(n) + osm_table + '-wgt' + str(wgt) + '-pfp-c'

            start_time = time.time()

            # Loading the pretrained checkpoint that will be fine-tuned
            model = DistilBertForMaskedLM.from_pretrained(model_checkpoint)

            # Load dataset
            print("Loading data...")
            print(file_name)
            sentences = pd.read_csv(file_name)
            sentences = sentences.values.tolist()
            print('Number of sentences:', len(sentences))

            print("Generating subtexts focusing on POI types...")
            # Creating smaller texts with the set of two words from POIs and geographical data.
            # A new subtext starts every time the geographical data TYPE changes.
            # Presumably column 1 is the POI text and column 3 the geographical
            # data type -- verify against the CSV schema.
            bert_sentences = []
            local_sentences = []
            actual_type = sentences[0][3]  # => Geographical data
            for sentence in sentences:
                if sentence[3] != actual_type:
                    # Type changed: close the current segment and start a new one.
                    bert_sentences.append('[SEP]'.join(local_sentences))
                    actual_type = sentence[3]
                    local_sentences = []
                local_sentences.append(input_clean([sentence[1], sentence[3]]))

            # Adding last segment. It must be built from the rows still pending in
            # `local_sentences`; re-appending the previously closed segment would
            # duplicate it and drop the final one.
            if local_sentences:
                bert_sentences.append('[SEP]'.join(local_sentences))

            # Clearing memory
            del sentences, local_sentences

            print("Generating Training and Validation sets...")
            train, validation = train_test_split(bert_sentences, test_size=valid_size, random_state=42)
            print('Training set: ', len(train))
            print('Validation set: ', len(validation))

            # Clearing memory
            del bert_sentences

            # Tokenizing and saving a copy of the tokens to represent the labels.
            # NOTE(review): labels are a full copy of the unmasked ids (no position is
            # excluded), so the loss presumably covers every token -- confirm intended.
            inputs_train = tokenizer(train, return_tensors='pt', max_length=MAX_LEN, truncation = True, padding=True)
            inputs_train['labels'] = inputs_train.input_ids.detach().clone()

            inputs_val = tokenizer(validation, return_tensors='pt', max_length=MAX_LEN, truncation = True, padding=True)
            inputs_val['labels'] = inputs_val.input_ids.detach().clone()

            # Clearing memory
            del train, validation

            # Randomly selecting about MASK_PERC of the tokens to be replaced by [MASK],
            # never touching [CLS], [SEP] or padding positions.
            print("Masking data...")
            rand_train = torch.rand(inputs_train.input_ids.shape)
            rand_val = torch.rand(inputs_val.input_ids.shape)

            mask_arr_train = (rand_train < MASK_PERC) & (inputs_train.input_ids != cls_token_id) & \
                       (inputs_train.input_ids != sep_token_id) & (inputs_train.input_ids != pad_token_id)

            mask_arr_val = (rand_val < MASK_PERC) & (inputs_val.input_ids != cls_token_id) & \
                       (inputs_val.input_ids != sep_token_id) & (inputs_val.input_ids != pad_token_id)

            # Changing the selected tokens in place (boolean-mask assignment)
            inputs_train.input_ids[mask_arr_train] = mask_token_id
            inputs_val.input_ids[mask_arr_val] = mask_token_id

            # Clearing memory
            del rand_train, mask_arr_train, rand_val, mask_arr_val

            # Transforming data into torch dataset object
            print("Preparing for training...")
            dataset_train = Dataset(inputs_train)
            loader_train = torch.utils.data.DataLoader(dataset_train, batch_size=BATCH_SIZE, shuffle=True)
            dataset_val = Dataset(inputs_val)
            loader_val = torch.utils.data.DataLoader(dataset_val, batch_size=BATCH_SIZE, shuffle=True)

            # Clearing memory
            del inputs_train, inputs_val

            # Preparing the device for training
            device = 'cuda' if cuda.is_available() else 'cpu'  # CPU OR GPU
            # float16 autocast and gradient scaling are only requested when running on CUDA.
            amp_enabled = use_amp and device == 'cuda'
            torch.cuda.empty_cache()
            # and move our model over to the selected device
            model.to(device)

            optim = AdamW(model.parameters(), lr=LR)

            # to track the training loss as the model trains
            train_losses = []
            # to track the validation loss as the model trains
            valid_losses = []
            # to track the average training loss per epoch as the model trains
            avg_train_losses = []
            # to track the average validation loss per epoch as the model trains
            avg_valid_losses = []

            # The early-stopping object is created, but neither it nor
            # `has_early_stopping` is consulted by the loop below: all EPOCHS run.
            early_stopping = EarlyStopping(patience=patience, verbose=False)
            scaler = torch.cuda.amp.GradScaler(enabled=amp_enabled)
            has_early_stopping = False

            for epoch in range(EPOCHS):
                # setup loop with TQDM and dataloader
                # activate training mode
                model.train()
                loop_train = tqdm(loader_train, leave=True)
                for batch_train in loop_train:

                    optim.zero_grad()
                    # pull all tensor batches required for training
                    input_ids = batch_train['input_ids'].to(device)
                    attention_mask = batch_train['attention_mask'].to(device)
                    labels = batch_train['labels'].to(device)

                    with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=amp_enabled):
                        outputs = model(input_ids, attention_mask=attention_mask,
                                    labels=labels)
                        loss = outputs.loss

                    scaler.scale(loss).backward()
                    scaler.step(optim)
                    scaler.update()

                    loop_train.set_description(f'Epoch {epoch}')
                    loop_train.set_postfix(loss=loss.item())

                    train_losses.append(loss.item())

                ######################
                # validate the model #
                ######################
                model.eval()  # prep model for evaluation
                loop_val = tqdm(loader_val, leave=True)
                # No gradients are needed for validation; skipping them saves memory.
                with torch.no_grad():
                    for batch_val in loop_val:

                        # forward pass: compute predicted outputs by passing inputs to the model
                        input_ids = batch_val['input_ids'].to(device)
                        attention_mask = batch_val['attention_mask'].to(device)
                        labels = batch_val['labels'].to(device)

                        # process
                        with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=amp_enabled):
                            outputs = model(input_ids, attention_mask=attention_mask,
                                        labels=labels)
                            loss = outputs.loss

                        # record validation loss
                        valid_losses.append(loss.item())

                # calculate average loss over an epoch
                train_loss = np.average(train_losses)
                valid_loss = np.average(valid_losses)
                avg_train_losses.append(train_loss)
                avg_valid_losses.append(valid_loss)

                # clear lists to track next epoch
                train_losses = []
                valid_losses = []

            # Saving the trained model (the Trainer is only used for save_model())
            print("Saving the model...")
            args = TrainingArguments(
                output_dir=model_name,
                per_device_train_batch_size=BATCH_SIZE,
                num_train_epochs=EPOCHS
            )

            trainer = Trainer(
                model=model,
                args=args,
                train_dataset=dataset_train,
                eval_dataset=dataset_val
            )

            trainer.save_model()

            final_time = (time.time() - start_time)
            # 'train_loss' / 'valid_loss' are the averages of the last epoch.
            training_dictionary = {'epoch': (EPOCHS+1),
                                   'epochs': EPOCHS,
                                   'patience': patience,
                                   'train_loss': train_loss,
                                   'valid_loss': valid_loss,
                                   'avg_train_losses':avg_train_losses,
                                   'avg_valid_losses': avg_valid_losses,
                                   'time': final_time}

            # Saving training statistics next to the saved model
            file_name = model_name + '/training_dictionary.json'
            with open(file_name, "w") as outfile:
                json.dump(training_dictionary, outfile)

            # Clearing memory
            del loader_train
            del loader_val
            del dataset_train
            del dataset_val
            del train_losses
            del valid_losses
            del avg_train_losses
            del avg_valid_losses
            del trainer

Carregando dados...
./geographic/GEOC2VEC/austin-sl-tuple-geoc2vec-0bins_polygons_information-wgt0.8pfp-c.parquet
Quantidade de sentenças: 1822380
Gerando subtextos com foco nos tipos de POI...
16
Gerando conjunto de Treino e Validação...
Conjunto de Treino:  70503
Conjunto de Validação:  17626


# Getting Embeddings Example

In [None]:
# Obtains embeddings of sentences instead of words
def executeModel_CLS(model, tokenizer, poi_type):
    """Return a sentence-level embedding for `poi_type` as a list of floats.

    The text is tokenized, run through `model` with
    ``output_hidden_states=True``, and the hidden states of the last four
    layers are averaged (mean, not sum) at the first token position.

    Args:
        model: Callable that accepts the tokenizer output as keyword arguments
            plus ``output_hidden_states=True`` and returns a mapping with a
            ``'hidden_states'`` sequence of per-layer tensors. Each tensor is
            indexed as (batch, token, hidden).
        tokenizer: Callable that turns `poi_type` into the model inputs.
        poi_type: Text to embed.

    Returns:
        list[float]: Mean of the last four layers' vectors for the first token
        of the first sequence. Presumably this is the [CLS] position for
        BERT-style tokenizers -- verify if another tokenizer is used.
    """
    # Tokenizes the text
    tokenized_text = tokenizer(poi_type, return_tensors='pt', max_length=512, truncation=True, padding=True)

    # Predict hidden states features for each layer
    with torch.no_grad():
        encoded_layers = model(**tokenized_text, output_hidden_states=True)

    # Only the last four layers are needed: stack them into one
    # (layers, batch, tokens, hidden) tensor.
    last_layers = torch.stack(tuple(encoded_layers['hidden_states'][-4:]), dim=0)

    # First sequence, first token, across the stacked layers -> (layers, hidden).
    first_token_per_layer = last_layers[:, 0, 0, :]

    # Average over the layers; there is no need to build vectors for the
    # remaining tokens since only the first one is returned.
    return first_token_per_layer.mean(dim=0).tolist()

In [None]:
# Example: embed one POI type with the four fine-tuned models and concatenate the results.
n = 0
wgt = 0.5

# Saved-model directories for each type of geographic information.
# Only the points model has no weight component in its name.
model_dir_prefix = f'./austin-sl-tuple-geoc2vec-distilbert-MLM-{n}'
weighted_suffix = f'-wgt{wgt}-pfp-c'
file_name_point = f'{model_dir_prefix}bins_points_information-pfp-c'
file_name_polygon = f'{model_dir_prefix}bins_polygons_information{weighted_suffix}'
file_name_roads = f'{model_dir_prefix}bins_roads_information{weighted_suffix}'
file_name_lines = f'{model_dir_prefix}bins_lines_information{weighted_suffix}'

# Load the fine-tuned models
model_points = DistilBertForMaskedLM.from_pretrained(file_name_point)
model_polygons = DistilBertForMaskedLM.from_pretrained(file_name_polygon)
model_roads = DistilBertForMaskedLM.from_pretrained(file_name_roads)
model_lines = DistilBertForMaskedLM.from_pretrained(file_name_lines)

# Switch every model to evaluation mode
for loaded_model in (model_points, model_lines, model_roads, model_polygons):
    loaded_model.eval()

poi_type = 'Park'

# One sentence-level embedding per model
sentence_embedding_points = executeModel_CLS(model_points, tokenizer, poi_type)
sentence_embedding_lines = executeModel_CLS(model_lines, tokenizer, poi_type)
sentence_embedding_roads = executeModel_CLS(model_roads, tokenizer, poi_type)
sentence_embedding_polygons = executeModel_CLS(model_polygons, tokenizer, poi_type)

# Concatenate the four embedding lists in the order points, lines, roads, polygons
poi_type_embedding = [
    *sentence_embedding_points,
    *sentence_embedding_lines,
    *sentence_embedding_roads,
    *sentence_embedding_polygons,
]