In [None]:
import pandas as pd
import torch
from torch import nn
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW,DataCollatorWithPadding, AutoTokenizer
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import classification_report
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizer, BertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments,AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from datasets import Dataset as HFDataset, load_metric
from datasets import load_dataset
import numpy as np
import random
from sklearn.preprocessing import LabelEncoder
import wandb

In [None]:
# Authenticate with Weights & Biases so the runs opened later with wandb.init() can be recorded.
wandb.login()

In [None]:
# Use one fixed seed for every random number generator the notebook touches
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
# The CUDA generator is only seeded when a CUDA device is present
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)

In [None]:
# Custom PyTorch Dataset over already-tokenized inputs
class EncodedDataset(Dataset):
    """Dataset that pairs tokenizer output with one label per example.

    ``encodings`` is a mapping from feature name to an indexable collection;
    ``labels`` is an indexable collection. Both are indexed with the same
    position when an item is requested.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of labels defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return every feature at ``idx`` plus a ``labels`` int64 tensor."""
        sample = {}
        for feature_name, column in self.encodings.items():
            sample[feature_name] = column[idx]
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

In [None]:
from torch.optim import RAdam, AdamW
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup, \
    get_cosine_with_hard_restarts_schedule_with_warmup, get_constant_schedule_with_warmup

# Build the optimizer and the learning-rate scheduler that goes with it
def getOptimizer(model, total_train_steps, scheduler_type="linear",
                  lr=1e-4, weight_decay=0.01, warmup_steps=0):
    """Create an AdamW optimizer and a warmup learning-rate scheduler.

    Args:
        model: Module whose ``parameters()`` are optimized.
        total_train_steps: Total number of scheduler steps; used by the
            "linear" and "cosine" schedules only.
        scheduler_type: "linear" or "cosine"; any other value selects a
            constant schedule with warmup.
        lr: Learning rate (converted with ``float``).
        weight_decay: Weight decay passed to AdamW.
        warmup_steps: Number of warmup steps for the scheduler.

    Returns:
        Tuple ``(optimizer, lr_scheduler)``.
    """
    optimizer = AdamW(params=model.parameters(), lr=float(lr), weight_decay=weight_decay)

    # Schedules that need to know how long training lasts
    decaying_schedules = {
        "linear": get_linear_schedule_with_warmup,
        "cosine": get_cosine_schedule_with_warmup,
    }
    build_schedule = decaying_schedules.get(scheduler_type)

    if build_schedule is not None:
        lr_scheduler = build_schedule(
            optimizer=optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=total_train_steps,
        )
    else:
        # Fallback for every other scheduler_type: constant rate after warmup
        lr_scheduler = get_constant_schedule_with_warmup(
            optimizer=optimizer,
            num_warmup_steps=warmup_steps,
        )

    return optimizer, lr_scheduler

# DistilBERT: time-based split

In [None]:
# Load the raw CSV export (semicolon-separated)
dataFrame = pd.read_csv(
    'digitalizações_registadas.csv',
    delimiter=";",
    date_format="",
    parse_dates=['Data Emissão', 'Data vencimento indicada', "Data entrada"],
)

# Header names may carry stray whitespace; normalise them before use
dataFrame.columns = dataFrame.columns.str.strip()

# Collect the "Unnamed..." columns and drop them
unnamed_columns = list(filter(lambda col: col.startswith('Unnamed'), dataFrame.columns))
dataFrame = dataFrame.drop(columns=unnamed_columns)

# Keep only rows where both dates and the origin are present
dataFrame = dataFrame.dropna(subset=['Data vencimento indicada', 'Data Emissão', 'Origem'])

# Parse the entry date as day/month/year.
# The other two date columns are not converted in this cell.
dataFrame['Data entrada'] = pd.to_datetime(dataFrame['Data entrada'], format="%d/%m/%Y")



In [None]:
# Only "Contrato" vs "Manual" is predicted, so rows whose origin is "Requisição" are discarded.
# .copy() makes the filtered frame independent of the original, so the column
# assignment below writes to a real frame rather than to a slice of the old one.
dataFrame = dataFrame[dataFrame['Origem'] != "Requisição"].copy()

# Labels mirrors Origem; the separate column exists only for readability
dataFrame['Labels'] = dataFrame['Origem']

In [None]:
# (Disabled) Optional class balancing: group rows by the 'Labels' column
#grouped = dataFrame.groupby('Labels')

# Determine the size of the smaller group
#min_size = grouped.size().min()

# Sample the smaller group size from each group and concatenate the results
#dataFrame = grouped.apply(lambda x: x.sample(min_size)).reset_index(drop=True)

In [None]:
# Columns that format_row renders, in this order, into each row's text representation
feature_columns = ["Fornecedor","Data Emissão","Data entrada","Data vencimento indicada", "Valor com IVA"]
# Function to format the row data
def format_row(row, feature_columns, dataframe):
    """Render the selected columns of one row as ``"column: value"`` lines.

    Args:
        row: Row indexable by column name (e.g. the Series passed by
            ``DataFrame.apply(axis=1)``).
        feature_columns: Column names to render, in output order.
        dataframe: Frame the row belongs to; only the dtypes of its columns
            are inspected, to decide which values are dates.

    Returns:
        The rendered lines joined with ``"\\n "`` and terminated by ``"\\n"``.
        Values from datetime columns are formatted as ``dd/mm/YYYY``. A missing
        value in such a column is rendered with its default string form
        (``NaT``) instead of raising.
    """
    parts = []
    for column in feature_columns:
        value = row[column]
        # NaT does not support strftime, so only real timestamps are formatted.
        if pd.api.types.is_datetime64_any_dtype(dataframe[column]) and not pd.isna(value):
            value = value.strftime('%d/%m/%Y')
        parts.append(f"{column}: {value}")
    return '\n '.join(parts) + "\n"


def buildDocRepresentation(dataframe, feature_columns):
    """Add a ``FullText`` column to ``dataframe`` in place.

    Each row's value is the string produced by ``format_row`` for the given
    ``feature_columns``. Nothing is returned.
    """
    def render(row):
        return format_row(row, feature_columns, dataframe)

    dataframe['FullText'] = dataframe.apply(render, axis=1)


# Build the FullText column in place.
# NOTE(review): the next cell assigns dataFrame['FullText'] again, which replaces this result.
# Confirm which of the two text representations is intended.
buildDocRepresentation(dataFrame, feature_columns)

In [None]:
# Compose one text document per row: a prefix followed by the field value, field after field
full_text = "Fornecedor:" + dataFrame['Fornecedor']
for field_prefix, field_values in (
    ('\n Data emissão:', dataFrame['Data Emissão']),
    ('\n Data entrada:', dataFrame['Data entrada'].dt.strftime('%d/%m/%Y')),
    ('\n Data vencimento:', dataFrame['Data vencimento indicada']),
    ("\n Valor com IVA:", dataFrame["Valor com IVA"]),
):
    full_text = full_text + field_prefix + field_values
dataFrame['FullText'] = full_text + "\n"

In [None]:
# Init label encoder
label_encoder = LabelEncoder()

# Time-based split: train on older records, evaluate on the most recent ones.
# The boundary is 2024-02-01 (the variable names say "2024", but February 1st is the cutoff).
dataFrame_before_2024 = dataFrame[dataFrame['Data entrada'] < '2024-02-01']
dataFrame_after_2024 = dataFrame[dataFrame['Data entrada'] >= '2024-02-01']

# Name the train/test pieces explicitly for readability
train_texts = dataFrame_before_2024['FullText'].tolist()
test_texts = dataFrame_after_2024['FullText'].tolist()
train_labels = dataFrame_before_2024['Labels'].tolist()
test_labels = dataFrame_after_2024['Labels'].tolist()

# Encode labels - the model needs integer class ids, not label strings.
# The encoder is fitted on the training labels only and that same mapping is
# reused for the test labels. Refitting on the test split would rebuild the
# mapping from the test labels alone, so ids could differ between the two sets.
# A label that never occurs in training now raises ValueError here.
encoded_labels_train = label_encoder.fit_transform(train_labels)
encoded_labels_test = label_encoder.transform(test_labels)

In [None]:
# Number of training rows per class (0 when a class is absent)
train_label_counts = dataFrame_before_2024['Labels'].value_counts()
count_contrato = train_label_counts.get('Contrato', 0)
count_manual = train_label_counts.get('Manual', 0)
print(count_contrato, count_manual, sep="\n")

In [None]:
# Show the classes/labels held by the fitted label encoder
list(label_encoder.classes_)

In [None]:
# The tokenizer is loaded from the same checkpoint name as the model fine-tuned below
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Batched inputs must share one length, which requires a padding token;
# one is registered only when the tokenizer does not already define it.
special_tokens_dict = dict(pad_token="<PAD>")
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens(special_tokens_dict)

In [None]:
# Tokenize both splits with identical settings
tokenize_options = dict(truncation=True, padding=True, max_length=128)
encodings_train = tokenizer(train_texts, **tokenize_options)
encodings_test = tokenizer(test_texts, **tokenize_options)

In [None]:
# Wrap the tokenized inputs and the encoded labels as PyTorch datasets
model_input_keys = ('input_ids', 'attention_mask')

train_dataset = EncodedDataset(
    {key: encodings_train[key] for key in model_input_keys},
    encoded_labels_train,
)

val_dataset = EncodedDataset(
    {key: encodings_test[key] for key in model_input_keys},
    encoded_labels_test,
)

In [None]:
# DataLoaders build the batches; only the training loader shuffles
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
loader_options = dict(batch_size=32, collate_fn=data_collator)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)

In [None]:
# One entry per training run executed by the loop in the next cell
configs = [
    {
        "runName": "distillbert-mps-timesplit",  # used for the wandb run name and the save directory
        "lr": 1e-4,
        "epochs": 1,
        "weight_decay": 0.01,
        "save": True,                            # save the model after evaluation
        "scheduler_type": "cosine",
        "warmup_steps": 100,
    },
]

In [None]:
# Fine-tune and evaluate one DistilBERT classifier per entry in `configs`,
# logging the run to Weights & Biases and optionally saving the model.
for conf in configs:
    # Instantiate the model with one output per class known to the label encoder
    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(label_encoder.classes_))  # Adjust num_labels as needed

    # move model to device (Apple MPS when available, otherwise CPU)
    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
    model.to(device)

    # Calculate total train steps for scheduler: batches per epoch times epochs
    num_training_steps = len(train_loader) * conf['epochs']

    # Init optimizer and scheduler
    optimizer, scheduler = getOptimizer(model, num_training_steps, scheduler_type=conf['scheduler_type'],
                                        lr=conf['lr'], weight_decay=conf['weight_decay'],
                                        warmup_steps=conf['warmup_steps'])

    # Define the loss function
    criterion = torch.nn.CrossEntropyLoss()

    # Training loop state
    model.train()
    train_losses = []  # average training loss of each epoch
    steps = 0          # batch counter across all epochs, logged to wandb as "step"

    # Wandb init conf
    runName = f"experiment_{conf['runName']}"
    run = wandb.init(
        project="cob-demo",
        name=runName, 
        config={
            "learning_rate": conf['lr'],
            "epochs": conf['epochs'],
            "weight_decay":conf['weight_decay'],
            "scheduler_type":conf['scheduler_type'],
            "warmup_steps":conf['warmup_steps']
        },
    )

    # Train model
    for epoch in range(conf['epochs']):
        epoch_train_loss = 0.0
        for batch in train_loader:
            steps += 1

            # get inputs
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            # forward inputs through model
            outputs = model(input_ids, attention_mask)
            # calculate loss from the logits (labels are not passed to the model)
            loss = criterion(outputs.logits, labels)
            # perform backward pass, then update weights and learning rate once per batch
            loss.backward()
            optimizer.step()
            scheduler.step()
            # clear gradients so they do not accumulate into the next batch
            optimizer.zero_grad()

            epoch_train_loss += loss.item()
            wandb.log({"loss": loss.item(), "step":steps, "learning_rate":scheduler.get_last_lr()[0]})


        # Calculate average training loss for the epoch
        epoch_train_loss /= len(train_loader)
        train_losses.append(epoch_train_loss)
        
        print(f"Run: {conf['runName']}   |Epoch {epoch + 1}, Train Loss: {epoch_train_loss}")


    # Evaluate model (once, after the last epoch)
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    # NOTE(review): val_losses / val_accuracies are re-created for every config
    # and receive exactly one value each below.
    val_losses = []
    val_accuracies = []
    y_true = []
    y_pred = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            #outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            outputs = model(input_ids, attention_mask)

            # predicted class = index of the largest logit
            _, predicted = torch.max(outputs.logits, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            loss = criterion(outputs.logits, labels)

            val_loss += loss.item()
                
            # collect labels and predictions on the CPU for the classification report
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())

    # Calculate average validation loss (per batch) and accuracy (percentage)
    val_loss /= len(val_loader)
    val_losses.append(val_loss)
    
    accuracy = 100 * correct / total
    val_accuracies.append(accuracy)
    
    print(f"Validation Loss: {val_loss}, Accuracy: {accuracy}%")
    

    # Per-class metrics as a dict, stored in the wandb run summary
    report = classification_report(y_true, y_pred, output_dict=True)
    print("Classification Report:")
    print(report)

    wandb.run.summary["report"] = report
    wandb.finish()
    # The model is saved under distillbert/experiment_<runName>
    if conf['save']:
        model.save_pretrained("distillbert/"+runName)


In [None]:
# Text version of the per-class metrics for the most recently evaluated run
report = classification_report(y_true, y_pred)
print("Classification Report:", report, sep="\n")

In [None]:
# Reload a fine-tuned checkpoint from disk.
# NOTE(review): the training loop above saves to "distillbert/" + runName, while the
# Trainer-based section further down saves to "distillbert/". On a fresh kernel this
# directory is only populated if that later section was run beforehand.
# Confirm which checkpoint is meant here.
model = DistilBertForSequenceClassification.from_pretrained("distillbert/") 

In [None]:
# Function to make predictions on new data
def predict(texts):
    """Return the predicted class id for each input text.

    Uses the module-level ``tokenizer`` and ``model``. The model is moved to
    the selected device and switched to evaluation mode as a side effect.

    Args:
        texts: List of strings to classify.

    Returns:
        NumPy array with one integer class id (arg-max over the logits) per text.
    """
    # Tokenize the input texts
    encodings = tokenizer(texts, truncation=True, padding=True, return_tensors='pt')

    # Pick the device: CUDA first, then Apple MPS (the device the training cells use), else CPU
    if torch.cuda.is_available():
        device = torch.device('cuda')
    elif torch.backends.mps.is_available():
        device = torch.device('mps')
    else:
        device = torch.device('cpu')
    model.to(device)
    # Evaluation mode disables dropout, so repeated calls give the same predictions
    model.eval()
    encodings = {key: val.to(device) for key, val in encodings.items()}

    # Make predictions without tracking gradients
    with torch.no_grad():
        outputs = model(**encodings)

    # Get the predicted class labels
    predictions = torch.argmax(outputs.logits, dim=-1)

    return predictions.cpu().numpy()

In [None]:
# Smoke test: run predict on a single text that contains only line breaks
texts = ["""

"""]
print(predict(texts))

In [None]:
# Example usage
# NOTE(review): these samples are ticket descriptions ("Título/Tipo/Descrição"), not the
# "Fornecedor/Data emissão/..." records that FullText is built from above.
# Confirm they are the intended inputs for this model.
new_texts = [
    """
Título: REGT: Geração de relatórios
Tipo: Feature
Descrição: Na solução de registos de tempo, é necessario implementar a capacidade de imprimir 4 tipos de relatórios sobre os registos de tempo:

1º Relatório - tempo por pessoa, por empresa, por mês
- 3 colunas: Colaborador, Empresa, Percentagem de **tempo (sem ter em conta custo)** que essa pessoa gastou nessa empresa 

2º Relatório - tempo por direção, por empresa, por mês
- 3 colunas: Direção, Empresa, Percentagem de **tempo (sem ter em conta custo)** que os colaboradores dessa direção gastaram nessa empresa

3º Relatório - tempo por empresa, por mês
- 3 colunas: Empresa, Percentagem **tempo (sem ter em conta custo)** que no geral (ou seja, por todos os colaboradores) foi gasto nessa empresa

4º Relatório - Custo por pessoa até agora
- 5 colunas: Empresa, Direção, Colaborador, **tempo (sem ter em conta custo)** que essa pessoa gastou nessa empresa (igual ao 1º), **Custo** que essa percentagem vale.

Nota: Este ultimo relatório permite facilmente chegar ao salario de um colaborador, que era algo que eles não queriam. Se calhar vale a pena comentar isso com eles na próxima reunião?
Empresa Mello
""",
"""
Título: erro numa customização impede mostrar pesquisa de Documento
Tipo: Bug Customização
Descrição: Boa tarde,

Como administrador tenho notado uma lentidão ou por vezes fico sem resposta quando faço uma pesquisa específica, neste caso tentei pesquisar pelo ID 2056368 e nunca foi apresentada a listagem.

Obrigada,



![Screenshot 2024-01-26 131917.png](recordm/instances/364669/files/2822/Screenshot%202024-01-26%20131917.png)
![Screenshot 2024-01-26 171804.png](recordm/instances/364669/files/2822/Screenshot%202024-01-26%20171804.png)
![Screenshot 2024-01-26 171953.png](recordm/instances/364669/files/2822/Screenshot%202024-01-26%20171953.png)
Empresa: Mello
""",
"""
Título: Cores erradas no calendário
Tipo: Suporte
Descrição: As cores nos eventos dos calendarios estão erradas, e deviam aparecer conforme o campo "Cor"
que existe em cada instância.
Empresa: Mello
"""
    
]

# Classify all samples in one batch and show the predicted class ids
predicted_labels = predict(new_texts)
print(predicted_labels)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


# Function to plot weight distributions
def plot_weight_distributions(model):
    """Plot a histogram (with KDE) of all parameters whose name contains 'weight'.

    Args:
        model: Module exposing ``named_parameters()``.
    """
    # Keep the values in NumPy arrays and join them once. Extending a Python
    # list element by element would create one Python object per model weight.
    weight_arrays = [
        param.cpu().detach().numpy().flatten()
        for name, param in model.named_parameters()
        if 'weight' in name
    ]
    # np.concatenate rejects an empty list, so fall back to an empty array
    weights = np.concatenate(weight_arrays) if weight_arrays else np.empty(0)

    plt.figure(figsize=(10, 6))
    sns.histplot(weights, bins=100, kde=True)
    plt.title('Distribution of Weights')
    plt.xlabel('Weight Value')
    plt.ylabel('Frequency')
    plt.show()

# Function to plot weight magnitude by layer
def plot_weight_magnitude_by_layer(model):
    """Draw a horizontal bar chart of the mean absolute value of each parameter
    whose name contains 'weight'.
    """
    selected = [
        (name, param.abs().mean().item())
        for name, param in model.named_parameters()
        if 'weight' in name
    ]
    layer_names = [name for name, _ in selected]
    weight_magnitudes = [magnitude for _, magnitude in selected]

    plt.figure(figsize=(12, 6))
    plt.barh(layer_names, weight_magnitudes)
    plt.title('Average Absolute Weight Magnitude by Layer')
    plt.xlabel('Average Absolute Weight Magnitude')
    plt.ylabel('Layer')
    plt.show()

# Function to plot attention weights
def plot_attention_weights(model):
    """Heat-map the query projection matrix of the first transformer layer.

    The plotted array is ``model.distilbert.transformer.layer[0].attention.q_lin.weight``,
    i.e. learned parameters of the query projection, not per-token attention
    scores. The title and axis labels describe that matrix accordingly.
    """
    # Parameters of the query projection in the first layer's attention block
    attention_weights = model.distilbert.transformer.layer[0].attention.q_lin.weight.detach().cpu().numpy()

    plt.figure(figsize=(10, 8))
    sns.heatmap(attention_weights, cmap='viridis')
    plt.title('Query Projection Weights from First Layer')
    # The heatmap puts matrix columns on the x axis and matrix rows on the y axis
    plt.xlabel('Input Feature Index')
    plt.ylabel('Output Feature Index')
    plt.show()

# Render the three diagnostic plots for the current model, in order
for plot_fn in (plot_weight_distributions, plot_weight_magnitude_by_layer, plot_attention_weights):
    plot_fn(model)

# DistilBERT: random train/validation split

In [None]:
# Load the raw CSV export again (semicolon-separated) for the random-split experiment
dataFrame = pd.read_csv(
    'digitalizações_registadas.csv',
    delimiter=";",
    date_format="",
    parse_dates=['Data Emissão', 'Data vencimento indicada', "Data entrada"],
)

# Header names may carry stray whitespace; normalise them before use
dataFrame.columns = dataFrame.columns.str.strip()

# Collect the "Unnamed..." columns and drop them
unnamed_columns = list(filter(lambda col: col.startswith('Unnamed'), dataFrame.columns))
dataFrame = dataFrame.drop(columns=unnamed_columns)

# Keep only rows where both dates and the origin are present
dataFrame = dataFrame.dropna(subset=['Data vencimento indicada', 'Data Emissão', 'Origem'])

# Parse the entry date as day/month/year
dataFrame['Data entrada'] = pd.to_datetime(dataFrame['Data entrada'], format="%d/%m/%Y")


In [None]:
# Discard "Requisição" rows. .copy() makes the filtered frame independent of the
# original, so the column assignment below writes to a real frame rather than to a slice.
dataFrame = dataFrame[dataFrame['Origem'] != "Requisição"].copy()
# Labels mirrors Origem; the separate column exists only for readability
dataFrame['Labels'] = dataFrame['Origem']

In [None]:
# Compose one text document per row: a prefix followed by the field value, field after field
full_text = "Fornecedor:" + dataFrame['Fornecedor']
for field_prefix, field_values in (
    ('\n Data emissão:', dataFrame['Data Emissão']),
    ('\n Data entrada:', dataFrame['Data entrada'].dt.strftime('%d/%m/%Y')),
    ('\n Data vencimento:', dataFrame['Data vencimento indicada']),
    ("\n Valor com IVA:", dataFrame["Valor com IVA"]),
):
    full_text = full_text + field_prefix + field_values
dataFrame['FullText'] = full_text + "\n"

In [None]:
# Documents come from the 'FullText' column and targets from the 'Labels' column
labels = dataFrame['Labels'].tolist()
texts = dataFrame['FullText'].tolist()

# Map the label strings to integer class ids
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

In [None]:
# The tokenizer is loaded from the same checkpoint name as the model fine-tuned below
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Register a padding token only when the tokenizer does not already define one
special_tokens_dict = dict(pad_token="<PAD>")
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens(special_tokens_dict)

In [None]:
# Tokenize every document before splitting; truncation and padding are enabled with max_length=128
encodings = tokenizer(texts, truncation=True, padding=True, max_length=128 )

In [None]:
# Split dataset into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    encodings['input_ids'], encoded_labels, test_size=0.1, random_state=42
)

train_masks, val_masks = train_test_split(
    encodings['attention_mask'], test_size=0.1, random_state=42
)

train_dataset = EncodedDataset({'input_ids': train_texts, 'attention_mask': train_masks}, train_labels)
val_dataset = EncodedDataset({'input_ids': val_texts, 'attention_mask': val_masks}, val_labels)

In [None]:
# DataLoader
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=data_collator)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=data_collator)

In [None]:
# One entry per Trainer run executed by the loop in the next cell
configs = [
    {
        "runName": "distillbert-mps-v1",  # assigned to the wandb run name
        "lr": 1e-4,
        "epochs": 1,
        "weight_decay": 0.01,
        "save": True,                     # save the model after evaluation
    },
]

In [None]:
def compute_metrics(p):
    """Return ``{"accuracy": ...}`` for one evaluation pass.

    ``p`` is the object the Trainer passes to ``compute_metrics``; only its
    ``predictions`` and ``label_ids`` attributes are read. The predicted class
    is the arg-max over axis 1 of the predictions. Accuracy is computed with
    NumPy, so the deprecated ``datasets.load_metric`` loader is not needed.
    """
    predicted = np.argmax(p.predictions, axis=1)
    return {"accuracy": float(np.mean(predicted == p.label_ids))}


# Fine-tune and evaluate one DistilBERT classifier per entry in `configs` with the HF Trainer
for conf in configs:
    # Instantiate the model with one output per class known to the label encoder
    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(label_encoder.classes_))

    # move model to device (Apple MPS when available, otherwise CPU)
    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
    model.to(device)

    # Define the training arguments
    training_args = TrainingArguments(
        learning_rate=conf['lr'],
        output_dir='./results',             # output directory
        num_train_epochs=conf['epochs'],    # number of training epochs
        per_device_train_batch_size=32,     # batch size for training
        per_device_eval_batch_size=16,      # batch size for evaluation
        warmup_steps=0,                     # number of warmup steps for learning rate scheduler
        weight_decay=conf['weight_decay'],  # strength of weight decay
        logging_dir='./logs',               # directory for storing logs
        logging_steps=10,
        evaluation_strategy='epoch',
        report_to="wandb",
        #fp16=True,
        use_mps_device=True,
        seed=seed,
        data_seed=seed
    )

    # Initialize the Trainer
    # may require the usage of model_init to ensure
    # reproducibility
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    # Open the wandb run and name it after the config entry
    wandb.init(project="cob-demo")
    wandb.run.name = conf['runName']

    # Train the model
    trainer.train()

    # Evaluate the model
    eval_result = trainer.evaluate()
    print(f"Evaluation results: {eval_result}")

    wandb.finish()
    # Every config saves to the same "distillbert/" directory
    if conf['save']:
        model.save_pretrained("distillbert/")
