# RUN COMMON HERE 

In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import DataCollatorWithPadding, GPT2Tokenizer, DistilBertForSequenceClassification, DistilBertModel, DistilBertTokenizer, TrainingArguments, Trainer, TrainingArguments
from datasets import load_metric
from torch.utils.data import DataLoader
from torch.utils.data import DataLoader, Dataset
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
from torch import nn
import matplotlib.pyplot as plt
import seaborn as sns
import random
import wandb

In [None]:
# Log in to Weights & Biases; the training cells below start runs with wandb.init
wandb.login()

In [None]:
# Define LSTM architecture
class LSTMModel(nn.Module):
    """Embedding -> stacked LSTM -> linear classifier over token-id sequences.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        num_labels: Number of output logits per sample.
        layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_labels, layers=1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim,
                            num_layers=layers,
                            batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_labels)

    def forward(self, input_ids, attention_mask=None):
        """Return class logits of shape (batch, num_labels).

        Args:
            input_ids: Integer tensor of token ids, shape (batch, seq_len).
            attention_mask: Optional (batch, seq_len) tensor, 1 for real
                tokens and 0 for padding. Padding is assumed to be on the
                right. When omitted, the output at the final position is
                used, as before.
        """
        x = self.embedding(input_ids)
        x, _ = self.lstm(x)

        if attention_mask is None:
            # No mask: fall back to the last time step of every sequence
            x = x[:, -1, :]
        else:
            # Use the output at the last REAL token of each sequence; the
            # final position of a padded sequence only reflects PAD tokens.
            lengths = attention_mask.to(device=x.device, dtype=torch.long).sum(dim=1)
            last_token = (lengths - 1).clamp(min=0)  # guard all-padding rows
            rows = torch.arange(x.size(0), device=x.device)
            x = x[rows, last_token]

        return self.fc(x)

In [None]:
# Create a custom PyTorch Dataset
class EncodedDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to an indexable collection
    and ``labels`` an indexable collection with one entry per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # One sample per label
        return len(self.labels)

    def __getitem__(self, idx):
        # Pick the idx-th entry of every encoding field ...
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        # ... and attach the label as an int64 tensor
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

In [None]:
from torch.optim import RAdam, AdamW
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup, \
    get_cosine_with_hard_restarts_schedule_with_warmup, get_constant_schedule_with_warmup

# Choose optimizer and learning rate scheduler
def getOptimizer(model, total_train_steps, scheduler_type="linear",
                  lr=1e-4, weight_decay=0.01, warmup_steps=0):
    """Create an AdamW optimizer together with a learning-rate scheduler.

    Args:
        model: Module whose ``parameters()`` are optimized.
        total_train_steps: Total number of training steps; only passed to
            the linear and cosine schedules.
        scheduler_type: "linear" or "cosine"; any other value selects the
            constant-with-warmup schedule.
        lr: Learning rate (converted with ``float``).
        weight_decay: Weight decay handed to AdamW.
        warmup_steps: Number of warmup steps handed to the schedule.

    Returns:
        Tuple ``(optimizer, lr_scheduler)``.
    """
    optimizer = AdamW(params=model.parameters(), lr=float(lr), weight_decay=weight_decay)

    if scheduler_type not in ("linear", "cosine"):
        # Fallback: constant learning rate after warmup (takes no step total)
        return optimizer, get_constant_schedule_with_warmup(
            optimizer=optimizer,
            num_warmup_steps=warmup_steps
        )

    if scheduler_type == "linear":
        make_schedule = get_linear_schedule_with_warmup
    else:
        make_schedule = get_cosine_schedule_with_warmup

    lr_scheduler = make_schedule(
        optimizer=optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_train_steps
    )
    return optimizer, lr_scheduler

In [None]:
# Seed Python, NumPy and PyTorch with the same value
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# LSTM TIMESPLIT

In [None]:
# Read the semicolon-separated CSV export
# NOTE(review): date_format="" is passed as-is; the later string concatenation
# on the two date columns suggests they are expected to stay text - confirm.
dataFrame = pd.read_csv(
    'digitalizações_registadas.csv',
    delimiter=";",
    date_format="",
    parse_dates=['Data Emissão', 'Data vencimento indicada'],
)

# Column names may carry stray leading/trailing whitespace
dataFrame.columns = dataFrame.columns.str.strip()

# Collect the 'Unnamed...' columns and discard them
unnamed_columns = dataFrame.columns[dataFrame.columns.str.startswith('Unnamed')].tolist()
dataFrame = dataFrame.drop(columns=unnamed_columns)

# Keep only the rows where both dates and the origin are present
dataFrame = dataFrame.dropna(subset=['Data vencimento indicada', 'Data Emissão', 'Origem'])

# Parse the entry date (day/month/year) into datetimes
dataFrame['Data entrada'] = pd.to_datetime(dataFrame['Data entrada'], format="%d/%m/%Y")


In [None]:
# Since we only want to predict Contrato or Manual, discard rows with Requisição.
# .copy() turns the filtered result into an independent frame, so the column
# assignments below do not write into a slice of the original
# (avoids pandas' SettingWithCopyWarning / lost writes).
dataFrame = dataFrame[dataFrame['Origem'] != "Requisição"].copy()

# Labels column (a plain copy of Origem, kept for readability)
dataFrame['Labels'] = dataFrame['Origem']

# Build the text representation of each document, one field per line
dataFrame['FullText'] = (
"Fornecedor:"+dataFrame['Fornecedor'] 
+ '\n Data emissão:' + dataFrame['Data Emissão']  
+ '\n Data entrada:' + dataFrame['Data entrada'].dt.strftime('%d/%m/%Y')  
+ '\n Data vencimento:' + dataFrame['Data vencimento indicada']
+ "\n Valor com IVA:"+dataFrame["Valor com IVA"]
+ "\n"
)

In [None]:
# Init label encoder
label_encoder = LabelEncoder()

# Perform the time split - train on older samples, test on recent ones
dataFrame_before_2024 = dataFrame[dataFrame['Data entrada'] < '2024-02-01']
dataFrame_after_2024 = dataFrame[dataFrame['Data entrada'] >= '2024-02-01']

# Check the length of the splits
print(len(dataFrame))
print(len(dataFrame_before_2024))
print(len(dataFrame_after_2024))

# Name train/test explicitly for readability
train_texts = dataFrame_before_2024['FullText'].tolist()
test_texts = dataFrame_after_2024['FullText'].tolist()
train_labels = dataFrame_before_2024['Labels'].tolist()
test_labels = dataFrame_after_2024['Labels'].tolist()

# Encode labels - the model needs integers, not label text.
# The encoder is fitted ONCE on all labels and then only applied to each
# split. Calling fit_transform again on the test labels would re-fit the
# encoder, so train and test could end up with different label -> integer
# mappings and label_encoder.classes_ would reflect the test split only.
label_encoder.fit(train_labels + test_labels)
encoded_labels_train = label_encoder.transform(train_labels)
encoded_labels_test = label_encoder.transform(test_labels)

In [None]:
# PAD token to register only if the tokenizer comes without one
# (inputs are padded, so a pad token must exist)
special_tokens_dict = {"pad_token": "<PAD>"}

# Tokenizer that turns the document text into token ids
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens(special_tokens_dict)

In [None]:
# Tokenize both splits with identical settings (truncation and padding on,
# max_length of 128)
tokenize_options = {"truncation": True, "padding": True, "max_length": 128}
encodings_train = tokenizer(train_texts, **tokenize_options)
encodings_test = tokenizer(test_texts, **tokenize_options)

In [None]:
# Wrap token ids, attention masks and encoded labels into PyTorch datasets
model_input_keys = ('input_ids', 'attention_mask')
train_dataset = EncodedDataset(
    {key: encodings_train[key] for key in model_input_keys},
    encoded_labels_train,
)
val_dataset = EncodedDataset(
    {key: encodings_test[key] for key in model_input_keys},
    encoded_labels_test,
)

In [None]:
# Batch both datasets (32 samples per batch); only the training data is shuffled
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
loader_options = {"batch_size": 32, "collate_fn": data_collator}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)

In [None]:
# Sizes handed to LSTMModel
embed_dim, hidden_dim = 128, 256
vocab_size = len(tokenizer)                 # one embedding row per tokenizer entry
num_labels = len(label_encoder.classes_)    # one logit per encoded class

In [None]:
# Learning-rate sweep: the runs share every setting below and differ only in
# run name and learning rate
_shared_settings = {
    "epochs": 3,
    "weight_decay": 0.00,
    "layers": 3,
    "scheduler_type": "linear",
    "warmup_steps": 0,
}

configs = [
    {"runName": run_name, "lr": run_lr, **_shared_settings}
    for run_name, run_lr in (
        ("lstm-mps-timesplit-Ev1", 5e-4),  # DEFAULT
        ("lstm-mps-timesplit-Ev2", 5e-5),
        ("lstm-mps-timesplit-Ev3", 5e-6),
    )
]


In [None]:
# Train AND evaluate one model per configuration.
# Evaluation lives inside the loop so that every configuration gets its own
# validation metrics and its own finished wandb run; evaluating after the
# loop would only ever score the last configuration.

# Device: Apple-silicon GPU (mps) when available, otherwise CPU
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# Loss used for both training and validation
criterion = nn.CrossEntropyLoss()

for conf in configs:
    # Fresh model for this configuration, moved to the chosen device
    model = LSTMModel(vocab_size, embed_dim, hidden_dim, num_labels, conf['layers'])
    model.to(device)

    # Read some conf values
    num_epochs = conf['epochs']
    lr = conf['lr']
    # Total number of optimizer steps, needed by the lr scheduler
    total_train_steps = len(train_loader) * num_epochs

    # Init optimizer and lr scheduler
    optimizer, scheduler = getOptimizer(model, total_train_steps, scheduler_type=conf['scheduler_type'],
                                        lr=lr, weight_decay=conf['weight_decay'],
                                        warmup_steps=conf['warmup_steps'])

    # Wandb init - one run per configuration
    run = wandb.init(
        project="cob-demo",
        name=f"experiment_{conf['runName']}_layers_{conf['layers']}", 
        config={
            "learning_rate": lr,
            "epochs": num_epochs,
            "layers":conf['layers'],
            "weight_decay":conf['weight_decay'],
            "scheduler_type":conf['scheduler_type'],
            "warmup_steps":conf['warmup_steps']
        },
    )

    # ---- Train ----
    model.train()
    train_losses = []
    steps = 0

    for epoch in range(num_epochs):
        epoch_train_loss = 0.0
        for batch in train_loader:
            steps += 1

            # Get inputs
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            # Forward inputs
            outputs = model(input_ids, attention_mask)
            # Calculate loss
            loss = criterion(outputs, labels)
            # Backpropagate to compute gradients
            loss.backward()
            # Optimizer and scheduler steps
            optimizer.step()
            scheduler.step()
            # Reset gradients so they do not accumulate into the next batch
            optimizer.zero_grad()

            epoch_train_loss += loss.item()
            wandb.log({"loss": loss.item(), "step":steps, "learning_rate":scheduler.get_last_lr()[0]})

        # Average training loss for the epoch
        epoch_train_loss /= len(train_loader)
        train_losses.append(epoch_train_loss)

        print(f"Run: {conf['runName']}   |Epoch {epoch + 1}, Train Loss: {epoch_train_loss}")

    # ---- Evaluate ----
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    val_losses = []
    val_accuracies = []
    y_true = []
    y_pred = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask)

            # Predicted class = index of the largest logit
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            loss = criterion(outputs, labels)
            val_loss += loss.item()

            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())

    # Average validation loss and accuracy (in percent)
    val_loss /= len(val_loader)
    val_losses.append(val_loss)

    accuracy = 100 * correct / total
    val_accuracies.append(accuracy)

    print(f"Validation Loss: {val_loss}, Accuracy: {accuracy}%")

    report = classification_report(y_true, y_pred, output_dict=True)
    print("Classification Report:")
    print(report)

    # Store the report on this configuration's run, then close the run
    wandb.run.summary["report"] = report
    wandb.finish()

In [None]:
# Confusion matrix of the collected predictions, drawn as an annotated heatmap
cm = confusion_matrix(y_true, y_pred)
plt.figure(figsize=(8, 6))
heatmap_axes = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=True)
heatmap_axes.set_title('Confusion Matrix')
heatmap_axes.set_xlabel('Predicted Label')
heatmap_axes.set_ylabel('True Label')
plt.show()

# Text version of the per-class metrics
report = classification_report(y_true, y_pred)
print("Classification Report:", report, sep="\n")

In [None]:
#torch.save(model.state_dict(), "lstm_mps.pth")

In [None]:
#model.load_state_dict(torch.load("lstm_invoice_classifier.pth"))

In [None]:
# Tokenize one sample text into PyTorch tensors for a single prediction
new_invoice_text = "Example invoice text here."
inputs = tokenizer(
    new_invoice_text,
    truncation=True,
    padding=True,
    return_tensors='pt',
)

In [None]:
# The tokenizer returns CPU tensors while the model may have been moved to
# another device during training; feed the inputs from the model's own device
# so the forward pass does not fail with a device mismatch.
model_device = next(model.parameters()).device

# Inference only: eval mode and no gradient tracking
model.eval()
with torch.no_grad():
    outputs = model(inputs['input_ids'].to(model_device),
                    inputs['attention_mask'].to(model_device))
    predictions = torch.argmax(outputs, dim=1)

# Decode the predictions to get the class label
predicted_label = label_encoder.inverse_transform(predictions.cpu().numpy())[0]

print(f"Predicted Contract ID: {predicted_label}")

# LSTM NORMAL SPLIT

In [None]:
# Read the semicolon-separated CSV export
# NOTE(review): date_format="" is passed as-is; the later string concatenation
# on the two date columns suggests they are expected to stay text - confirm.
dataFrame = pd.read_csv(
    'digitalizações_registadas.csv',
    delimiter=";",
    date_format="",
    parse_dates=['Data Emissão', 'Data vencimento indicada'],
)

# Column names may carry stray leading/trailing whitespace
dataFrame.columns = dataFrame.columns.str.strip()

# Collect the 'Unnamed...' columns and discard them
unnamed_columns = dataFrame.columns[dataFrame.columns.str.startswith('Unnamed')].tolist()
dataFrame = dataFrame.drop(columns=unnamed_columns)

# Keep only the rows where both dates and the origin are present
dataFrame = dataFrame.dropna(subset=['Data vencimento indicada', 'Data Emissão', 'Origem'])

# Parse the entry date (day/month/year) into datetimes
dataFrame['Data entrada'] = pd.to_datetime(dataFrame['Data entrada'], format="%d/%m/%Y")


In [None]:
# Discard rows whose origin is "Requisição".
# .copy() turns the filtered result into an independent frame, so the column
# assignments below do not write into a slice of the original
# (avoids pandas' SettingWithCopyWarning / lost writes).
dataFrame = dataFrame[dataFrame['Origem'] != "Requisição"].copy()

# Labels column (a plain copy of Origem)
dataFrame['Labels'] = dataFrame['Origem']

# Build the text representation of each document, one field per line
dataFrame['FullText'] = (
"Fornecedor:"+dataFrame['Fornecedor'] 
+ '\n Data emissão:' + dataFrame['Data Emissão']  
+ '\n Data entrada:' + dataFrame['Data entrada'].dt.strftime('%d/%m/%Y')  
+ '\n Data vencimento:' + dataFrame['Data vencimento indicada']
+ "\n Valor com IVA:"+dataFrame["Valor com IVA"]
+ "\n"
)

In [None]:
# Encoder that maps the label strings to integers
label_encoder = LabelEncoder()

# Document texts and their labels as plain Python lists
texts = dataFrame['FullText'].tolist()
labels = dataFrame['Labels'].tolist()

# Fit the encoder on all labels and encode them in one go
encoded_labels = label_encoder.fit_transform(labels)

In [None]:
# PAD token to register only if the tokenizer comes without one
special_tokens_dict = {"pad_token": "<PAD>"}

# Tokenizer that turns the document text into token ids
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens(special_tokens_dict)

In [None]:
# Tokenize every document (truncation and padding on, max_length of 128)
encodings = tokenizer(
    texts,
    max_length=128,
    truncation=True,
    padding=True,
)

In [None]:
# Split token ids, attention masks and labels into training and validation
# sets in ONE call: all arrays passed to a single train_test_split call are
# split with the same indices, so ids, masks and labels are guaranteed to stay
# aligned (instead of relying on two separate calls sharing a random_state).
# Note: train_texts / val_texts hold token ids, not raw text.
(train_texts, val_texts,
 train_masks, val_masks,
 train_labels, val_labels) = train_test_split(
    encodings['input_ids'], encodings['attention_mask'], encoded_labels,
    test_size=0.2, random_state=42
)

train_dataset = EncodedDataset({'input_ids': train_texts, 'attention_mask': train_masks}, train_labels)
val_dataset = EncodedDataset({'input_ids': val_texts, 'attention_mask': val_masks}, val_labels)

In [None]:
# Batch both datasets (32 samples per batch); only the training data is shuffled
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
loader_options = {"batch_size": 32, "collate_fn": data_collator}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)

In [None]:
# Learning-rate sweep: the runs share every setting below and differ only in
# run name and learning rate
_shared_settings = {
    "epochs": 3,
    "weight_decay": 0.00,
    "layers": 3,
    "scheduler_type": "constant",
    "warmup_steps": 0,
}

configs = [
    {"runName": run_name, "lr": run_lr, **_shared_settings}
    for run_name, run_lr in (
        ("lstm-mps-v1-A", 1e-4),
        ("lstm-mps-v1-B", 5e-4),
        ("lstm-mps-v1-C", 1e-3),
    )
]


In [None]:
# Train and evaluate one model per configuration.

# Model dimensions for THIS section. The tokenizer and label encoder were
# re-created above, so the sizes are derived from them here instead of relying
# on variables left over from another section (which would be undefined when
# this section is run on its own, or stale otherwise).
vocab_size = len(tokenizer)
embed_dim = 128
hidden_dim = 256
num_labels = len(label_encoder.classes_)

# Device: Apple-silicon GPU (mps) when available, otherwise CPU
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# Loss used for both training and validation
criterion = nn.CrossEntropyLoss()

for conf in configs:
    # Fresh model for this configuration, moved to the chosen device
    model = LSTMModel(vocab_size, embed_dim, hidden_dim, num_labels, conf['layers'])
    model.to(device)

    num_epochs = conf['epochs']
    lr = conf['lr']
    # Total number of optimizer steps, needed by the lr scheduler
    total_train_steps = len(train_loader) * num_epochs

    optimizer, scheduler = getOptimizer(model, total_train_steps, scheduler_type=conf['scheduler_type'],
                                        lr=lr, weight_decay=conf['weight_decay'],
                                        warmup_steps=conf['warmup_steps'])

    # One wandb run per configuration
    run = wandb.init(
        project="cob-demo",
        name=f"experiment_{conf['runName']}_layers_{conf['layers']}", 
        config={
            "learning_rate": lr,
            "epochs": num_epochs,
            "layers":conf['layers'],
            "weight_decay":conf['weight_decay'],
            "scheduler_type":conf['scheduler_type'],
            "warmup_steps":conf['warmup_steps']
        },
    )

    # ---- Train ----
    model.train()
    train_losses = []
    steps = 0

    for epoch in range(num_epochs):
        epoch_train_loss = 0.0
        for batch in train_loader:
            steps += 1

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask)

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()
            # Reset gradients so they do not accumulate into the next batch
            optimizer.zero_grad()

            epoch_train_loss += loss.item()
            wandb.log({"loss": loss.item(), "step":steps, "learning_rate":scheduler.get_last_lr()[0]})

        # Average training loss for the epoch
        epoch_train_loss /= len(train_loader)
        train_losses.append(epoch_train_loss)

        print(f"Run: {conf['runName']}   |Epoch {epoch + 1}, Train Loss: {epoch_train_loss}")

    # ---- Evaluate ----
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    val_losses = []
    val_accuracies = []
    y_true = []
    y_pred = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask)

            # Predicted class = index of the largest logit
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            loss = criterion(outputs, labels)
            val_loss += loss.item()

            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())

    # Average validation loss and accuracy (in percent)
    val_loss /= len(val_loader)
    val_losses.append(val_loss)

    accuracy = 100 * correct / total
    val_accuracies.append(accuracy)

    print(f"Validation Loss: {val_loss}, Accuracy: {accuracy}%")

    report = classification_report(y_true, y_pred, output_dict=True)
    print("Classification Report:")
    print(report)

    # Store the report on this configuration's run, then close the run
    wandb.run.summary["report"] = report
    wandb.finish()

In [None]:
# Confusion matrix of the collected predictions, drawn as an annotated heatmap
cm = confusion_matrix(y_true, y_pred)
plt.figure(figsize=(8, 6))
heatmap_axes = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=True)
heatmap_axes.set_title('Confusion Matrix')
heatmap_axes.set_xlabel('Predicted Label')
heatmap_axes.set_ylabel('True Label')
plt.show()

# Text version of the per-class metrics
report = classification_report(y_true, y_pred)
print("Classification Report:", report, sep="\n")