In [47]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import DataCollatorWithPadding, GPT2Tokenizer, DistilBertForSequenceClassification, DistilBertModel, DistilBertTokenizer, TrainingArguments, Trainer, TrainingArguments
from datasets import load_metric
from torch.utils.data import DataLoader
from torch.utils.data import DataLoader, Dataset
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
from torch import nn
import matplotlib.pyplot as plt
import seaborn as sns
import random
import wandb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, classification_report, accuracy_score
from sklearn.svm import SVC


In [48]:
do_label_same_size_sample = True

seed = 42
torch.manual_seed(seed)
torch.cuda.manual_seed(seed) if torch.cuda.is_available() else None
np.random.seed(seed)
random.seed(seed)

In [49]:
# Load data 
dataFrame = pd.read_csv('digitalizações_registadas.csv', #'mdados_docs_cob.csv' mdados_docs_all_16k.csv
                       delimiter=";",
                       date_format="", 
                       parse_dates=['Data Emissão','Data vencimento indicada'])  #, parse_dates=['DataEmissao']

# Strip any leading or trailing whitespace from column names
dataFrame.columns = dataFrame.columns.str.strip()

# Get unnamed columns to remove
unnamed_columns = [col for col in dataFrame.columns if col.startswith('Unnamed')]

# Drop unnamed columns
dataFrame = dataFrame.drop(columns=unnamed_columns)

# Drop rows with any null values
dataFrame = dataFrame.dropna(subset=['Data vencimento indicada','Data Emissão','Origem']) #'Contrato'
# Drop specific row with date in year 221
dataFrame = dataFrame[dataFrame['Nº documento Fornecedor'] != "ZRF2 2/6001001951"]

dataFrame['Data Emissão'] = pd.to_datetime(dataFrame['Data Emissão'], format="%d/%m/%Y")
dataFrame['Data entrada'] = pd.to_datetime(dataFrame['Data entrada'], format="%d/%m/%Y")
dataFrame['Data vencimento indicada'] = pd.to_datetime(dataFrame['Data vencimento indicada'], format="%d/%m/%Y")

dataFrame['Valor com IVA'] = dataFrame['Valor com IVA'].str.replace(',', '').astype(float)
#dataFrame = pd.get_dummies(dataFrame, columns=['Categoria'])


In [50]:
dataFrame = dataFrame[dataFrame['Origem'] != "Requisição"]
dataFrame['Labels'] = dataFrame['Origem']

In [51]:
if do_label_same_size_sample:
    grouped = dataFrame.groupby('Labels')
    min_size = grouped.size().min()
    dataFrame = grouped.apply(lambda x: x.sample(min_size)).reset_index(drop=True)

In [52]:
len(dataFrame)

7580

In [53]:
# Feature Engineering
dataFrame['MonthEmissão'] = dataFrame['Data Emissão'].dt.month
dataFrame['DayOfWeekEmissão'] = dataFrame['Data Emissão'].dt.dayofweek

dataFrame['MonthEntrada'] = dataFrame['Data entrada'].dt.month
dataFrame['DayOfWeekEntrada'] = dataFrame['Data entrada'].dt.dayofweek

dataFrame['MonthVencimento'] = dataFrame['Data vencimento indicada'].dt.month
dataFrame['DayOfWeekVencimento'] = dataFrame['Data vencimento indicada'].dt.dayofweek

In [54]:
# encode data
label_encoder = LabelEncoder()

dataFrame_before_2024 = dataFrame[dataFrame['Data entrada'] < '2024-02-01']
dataFrame_after_2024 = dataFrame[dataFrame['Data entrada'] >= '2024-02-01']


train_labels = dataFrame_before_2024['Labels'].tolist()
test_labels = dataFrame_after_2024['Labels'].tolist()

classes = ""
encoded_labels_train = label_encoder.fit_transform(train_labels)
classes = label_encoder.classes_ #quick hack
encoded_labels_test = label_encoder.fit_transform(test_labels)


dataFrame_before_2024 = dataFrame_before_2024.drop(["Labels","Estado",
                   "Data Emissão","Data vencimento indicada",
                   "Data entrada","Origem","Nº documento Fornecedor",
                   "Nº Encomenda","Âmbito de Compra"], axis=1)

dataFrame_before_2024['Fornecedor'] = label_encoder.fit_transform(dataFrame_before_2024['Fornecedor'])


dataFrame_after_2024 = dataFrame_after_2024.drop(["Labels","Estado",
                   "Data Emissão","Data vencimento indicada",
                   "Data entrada","Origem","Nº documento Fornecedor",
                   "Nº Encomenda","Âmbito de Compra"], axis=1)

dataFrame_after_2024['Fornecedor'] = label_encoder.fit_transform(dataFrame_after_2024['Fornecedor'])

In [55]:
X_train = dataFrame_before_2024
X_test = dataFrame_after_2024

y_train = encoded_labels_train
y_test = encoded_labels_test

# Verify the dimensions of the splits
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (6667, 8)
X_test shape: (913, 8)
y_train shape: (6667,)
y_test shape: (913,)


In [56]:
# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [57]:
# Create and train the SVM model
svm = SVC(kernel='rbf', random_state=42, C=0.5)
svm.fit(X_train, y_train)

# Make predictions on the test set
y_pred = svm.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=classes)

print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(report)

Accuracy: 0.47
Classification Report:
              precision    recall  f1-score   support

    Contrato       0.43      0.98      0.60       375
      Manual       0.87      0.11      0.19       538

    accuracy                           0.47       913
   macro avg       0.65      0.54      0.40       913
weighted avg       0.69      0.47      0.36       913



In [58]:
# Create and train the SVM model
svm = SVC(kernel='poly', random_state=42, C=0.5)
svm.fit(X_train, y_train)

# Make predictions on the test set
y_pred = svm.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=classes)

print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(report)

Accuracy: 0.47
Classification Report:
              precision    recall  f1-score   support

    Contrato       0.43      0.98      0.60       375
      Manual       0.88      0.11      0.19       538

    accuracy                           0.47       913
   macro avg       0.66      0.54      0.40       913
weighted avg       0.70      0.47      0.36       913

