<a href="https://colab.research.google.com/github/ribesstefano/ml4justice-feature-predictor/blob/main/notebooks/ml4justice_nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
!python -m spacy download it_core_news_md -qqq
!python -m spacy download it_core_news_lg -qqq

2024-01-08 14:25:19.355011: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-08 14:25:19.355079: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-08 14:25:19.356542: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 MB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('it_core_news_md')
2024-01-08 14:25:36.638666: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factor

In [18]:
!pip install lightning -qqq

Collecting lightning
  Downloading lightning-2.1.3-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities<2.0,>=0.8.0 (from lightning)
  Downloading lightning_utilities-0.10.0-py3-none-any.whl (24 kB)
Collecting torchmetrics<3.0,>=0.7.0 (from lightning)
  Downloading torchmetrics-1.2.1-py3-none-any.whl (806 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m806.1/806.1 kB[0m [31m54.1 MB/s[0m eta [36m0:00:00[0m
Collecting pytorch-lightning (from lightning)
  Downloading pytorch_lightning-2.1.3-py3-none-any.whl (777 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m777.7/777.7 kB[0m [31m52.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: lightning-utilities, torchmetrics, pytorch-lightning, lightning
Successfully installed lightning-2.1.3 lightning-utilities-0.10.0 pytorch-lightning-2.1.3 torchmetrics-1.2.1


In [26]:
import pandas as pd

base_dir = "/content/drive/MyDrive/Colab Notebooks/ml_for_justice/data/processed/"
dataset_df = pd.read_csv(base_dir + 'dataset-sentenze-omicidio-stradale-with-summaries.csv', encoding='utf-8')

# Fix an empy 'facts' column...
dataset_df['facts'] = dataset_df.apply(lambda x: x['facts'] if pd.notna(x['facts']) else x['raw text'], axis=1)
dataset_df[dataset_df['facts'].isna()]

Unnamed: 0,corte,numero sentenza,localita commerciale o luogo di lavoro,vittima eta avanzata,vittima eta giovane,lavori stradali di manutenzione,guida senza patente valida imputato,guida senza patente valida vittima,piu imputati,strada a doppio senso di marcia,...,uso telefono alla guida,vittima perde controllo,imputato perde controllo,condanna,filename,text,raw text,facts,score,summary


## PyTorch Dataset

In [199]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import CountVectorizer
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

# Dataset class
class TextDataset(Dataset):
    def __init__(self, texts, labels, vectorizer=None):
        if vectorizer:
            self.vectorizer = vectorizer
            self.features = self.vectorizer.transform(texts)
        else:
            self.vectorizer = CountVectorizer(
                ngram_range=(1, 2),  # For example, unigrams and bigrams
                stop_words=['italian'],
                binary=True,
                decode_error='ignore',
            )
            self.features = self.vectorizer.fit_transform(texts)
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'X': torch.tensor(self.features[idx].toarray()).squeeze().float(),
            'y': torch.tensor(self.labels.iloc[idx]).float(),
        }

# Data Preparation
texts = dataset_df['raw text']
labels = dataset_df.iloc[:, 2:46]  # Adjust indices as needed
dataset = TextDataset(texts, labels)
train_loader = DataLoader(dataset, batch_size=8, shuffle=True)
# dataset[0]
batch = next(iter(train_loader))
print(batch['X'].size())
print(batch['y'].size())

torch.Size([8, 102139])
torch.Size([8, 44])


In [196]:
label_columns = dataset_df.iloc[:, 2:46].columns.tolist()
id2label = {i: l for i, l in enumerate(label_columns)}
label2id = {v: k for k, v in id2label.items()}
# id2label

In [179]:
def balance_train_df(train_df, label_columns=label_columns, oversampling_dumping=1.0, min_oversampling=2):
    """ Balance the train_df by oversampling the minority class for each label in label_columns """
    balanced_dfs = []

    for label in label_columns:
        # Get the number of samples with label set to 1 and 0
        label_1_count = train_df[train_df[label] == 1].shape[0]
        label_0_count = train_df[train_df[label] == 0].shape[0]

        # Skip oversampling if all labels are the same
        if label_1_count == 0 or label_0_count == 0:
            balanced_dfs.append(train_df)
            continue

        # Determine the minority and majority class
        minority_class = 1 if label_1_count < label_0_count else 0
        majority_class = 1 if minority_class == 0 else 0

        # Calculate the oversampling factor
        oversampling_factor = label_0_count // label_1_count if minority_class == 1 else label_1_count // label_0_count
        oversampling_factor = max(min_oversampling, int(oversampling_factor * oversampling_dumping))

        # Oversample the minority class
        minority_df = train_df[train_df[label] == minority_class]
        oversampled_minority_df = pd.concat([minority_df] * oversampling_factor)

        # Combine the oversampled minority class with the majority class
        balanced_df = pd.concat([train_df[train_df[label] == majority_class], oversampled_minority_df])

        # # Print the number of samples with label set to 1 and 0
        # print(f"{label}: {len(balanced_df)} (0/1: {balanced_df[balanced_df[label] == 0].shape[0]} / {balanced_df[balanced_df[label] == 1].shape[0]})")

        balanced_dfs.append(balanced_df)

    # Merge the balanced dataframes for each label
    balanced_train_df = pd.concat(balanced_dfs)

    return balanced_train_df

print(len(dataset_df))
print(len(balance_train_df(dataset_df, oversampling_dumping=0.3)))

85
4543


## PyTorch Lightning Model

In [229]:
from torch import nn
from torchmetrics import (
    MetricCollection,
    Accuracy,
    AUROC,
    Precision,
    Recall,
    F1Score,
    # MeanAbsoluteError,
    # MeanSquaredError,
)
from torchmetrics.functional.classification import (
    accuracy,
    auroc,
    precision,
    recall,
    f1_score,
)

In [231]:
class TextClassifier(pl.LightningModule):
    def __init__(self, num_features, num_labels):
        super().__init__()
        self.linear = nn.Linear(num_features, num_labels)
        self.bin_loss = nn.BCELoss()
        # Metrics, a separate metrics collection is defined for each stage
        # NOTE: According to the PyTorch Lightning docs, "similar" metrics,
        # i.e., requiring the same computation, should be optimized w/in a
        # metrics collection.
        stages = ['train_metrics', 'val_metrics', 'test_metrics']
        self.metrics = nn.ModuleDict({s: MetricCollection({
            'acc': Accuracy(task='multilabel', num_labels=num_labels), # Default: macro
            'roc_auc': AUROC(task='multilabel', num_labels=num_labels),
            'precision': Precision(task='multilabel', num_labels=num_labels),
            'recall': Recall(task='multilabel', num_labels=num_labels),
            'f1_score': F1Score(task='multilabel', num_labels=num_labels),
            'acc_weighted': Accuracy(task='multilabel', num_labels=num_labels, average='weighted'),
            'roc_auc_weighted': AUROC(task='multilabel', num_labels=num_labels, average='weighted'),
            'precision_weighted': Precision(task='multilabel', num_labels=num_labels, average='weighted'),
            'recall_weighted': Recall(task='multilabel', num_labels=num_labels, average='weighted'),
            'f1_score_weighted': F1Score(task='multilabel', num_labels=num_labels, average='weighted'),
        }, prefix=s.replace('metrics', '')) for s in stages})

    def forward(self, x):
        return torch.sigmoid(self.linear(x))  # Sigmoid for binary classification

    def step(self, batch, stage='train'):
        x, y = batch['X'], batch['y']
        preds = self.forward(x)
        loss = self.bin_loss(preds, y)
        self.metrics[f'{stage}_metrics'].update(preds, y.to(torch.int32))
        self.log(f'{stage}_loss', loss, on_epoch=True, prog_bar=True)
        self.log_dict(self.metrics[f'{stage}_metrics'], on_epoch=True)
        return loss

    def training_step(self, batch, batch_idx):
        return self.step(batch, stage='train')

    def validation_step(self, batch, batch_idx):
        return self.step(batch, stage='val')

    def test_step(self, batch, batch_idx):
        return self.step(batch, stage='test')

    def configure_optimizers(self):
        return optim.Adam(self.parameters(), lr=1e-3)

# Data Preparation
# balanced_df = balance_train_df(dataset_df, oversampling_dumping=0.3)
# texts = balanced_df['raw text']
# labels = balanced_df.iloc[:, 2:46]  # Adjust indices as needed

# Data Preparation
texts = dataset_df['raw text']
labels = dataset_df.iloc[:, 2:46]  # Adjust indices as needed

dataset = TextDataset(texts, labels)
train_loader = DataLoader(dataset, batch_size=64, shuffle=True)

# Model Initialization
num_features = len(dataset.vectorizer.get_feature_names_out())
num_labels = labels.shape[1]
model = TextClassifier(num_features, num_labels)

# Training
trainer = pl.Trainer(max_epochs=1, callbacks=[ModelCheckpoint(dirpath='model/', monitor='val_loss')])
trainer.fit(model, train_loader, train_loader)
trainer.test(model, train_loader)[0]

y_preds = trainer.predict(model, torch.vstack([x['X'] for x in train_loader]))[0].unsqueeze(dim=0)
y_preds = nn.Sigmoid()(y_preds)
print(y_preds)
y_targets = np.zeros(y_preds.shape)
y_targets[np.where(y_preds) == 1] = 1
y_targets = torch.Tensor(y_targets).to(torch.int32)

print(accuracy(y_preds, y_targets, task='multilabel', num_labels=num_labels, threshold=0.3))
print(precision(y_preds, y_targets, task='multilabel', num_labels=num_labels, threshold=0.3))
print(recall(y_preds, y_targets, task='multilabel', num_labels=num_labels, threshold=0.3))
print(f1_score(y_preds, y_targets, task='multilabel', num_labels=num_labels, threshold=0.3))
print(auroc(y_preds, y_targets, task='multilabel', num_labels=num_labels))

INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:639: Checkpoint directory /content/model exists and is not empty.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name     | Type       | Params
----------------------------------------
0 | linear   | Linear     | 4.5 M 
1 | bin_loss | BCELoss    | 0     
2 | metrics  | ModuleDict | 0     
----------------------------------------
4.5 M     Trainable params
0         Non-trainable params
4.5 M     Total params
17.977    Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:492: Your `val_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/fit_loop.py:293: The number of training batches (2) is smaller than the logging interval Trainer(log

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:492: Your `test_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

Predicting: |          | 0/? [00:00<?, ?it/s]

tensor([[0.5232, 0.5704, 0.5190, 0.5166, 0.5206, 0.5180, 0.5188, 0.5278, 0.5262,
         0.5185, 0.7142, 0.7126, 0.5557, 0.5166, 0.5176, 0.5174, 0.5159, 0.5204,
         0.5177, 0.5192, 0.5208, 0.5697, 0.5461, 0.5174, 0.5155, 0.5206, 0.7142,
         0.5590, 0.5188, 0.5405, 0.5199, 0.5854, 0.5162, 0.6166, 0.5171, 0.5789,
         0.5181, 0.5220, 0.5226, 0.5549, 0.5203, 0.5163, 0.5172, 0.5172]])
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)


## Leave-One-Out Traning

In [191]:
import pytorch_lightning as pl
from sklearn.model_selection import LeaveOneOut
from torch.utils.data import Subset

# Data Preparation
texts = dataset_df['raw text'] # dataset_df['facts']
labels = dataset_df.iloc[:, 2:46]  # Adjust indices as needed
full_dataset = TextDataset(texts, labels)
# Fit the vectorizer on the entire dataset
vectorizer = CountVectorizer(
    ngram_range=(1, 2),  # For example, unigrams and bigrams
    stop_words=['italian'],
    binary=True,
    decode_error='ignore',
)
vectorizer.fit(texts)

# Leave-One-Out Cross-Validation
loo = LeaveOneOut()
report_table = []

for k, (train_index, test_index) in enumerate(loo.split(full_dataset)):
    # Splitting the dataset
    # Train
    texts = balance_train_df(dataset_df.iloc[train_index, :], oversampling_dumping=0.3)['raw text']
    # texts = dataset_df.iloc[train_index, :]['raw text']
    labels = dataset_df.iloc[train_index, 2:46]  # Adjust indices as needed
    train_subset = TextDataset(texts, labels, vectorizer)
    # Test
    texts = dataset_df.iloc[test_index, :]['raw text']
    labels = dataset_df.iloc[test_index, 2:46]  # Adjust indices as needed
    test_subset = TextDataset(texts, labels, vectorizer)

    print(train_subset[0]['X'].size())
    print(test_subset[0]['X'].size())

    # train_subset = Subset(full_dataset, train_index)
    # test_subset = Subset(full_dataset, test_index)

    # Creating data loaders for training and testing
    train_loader = DataLoader(train_subset, batch_size=64, shuffle=True)
    test_loader = DataLoader(test_subset, batch_size=1, shuffle=False)

    # Model Initialization
    num_features = len(vectorizer.get_feature_names_out())
    num_labels = labels.shape[1]
    model = TextClassifier(num_features, num_labels)
    # Trainer setup
    trainer = pl.Trainer(
        max_epochs=5,
        enable_model_summary=False,
        precision="32-true",
    )
    # Train the model
    trainer.fit(model, train_loader, test_loader)
    # Evaluate the model on the test data
    results = trainer.test(model, test_loader, verbose=False)[0]

y_preds = trainer.predict(model, torch.vstack([x['X'] for x in train_loader]))[0].unsqueeze(dim=0)
y_preds = nn.Sigmoid()(y_preds)
print(y_preds)
y_targets = np.zeros(y_preds.shape)
y_targets[np.where(y_preds) == 1] = 1
y_targets = torch.Tensor(y_targets).to(torch.int32)

    for threshold in [0.1 * i for i in range(11)]:
        results.update({
            f'test_accuracy': accuracy(y_preds, y_targets, task='multilabel', num_labels=num_labels, threshold=threshold),
            f'test_precision': precision(y_preds, y_targets, task='multilabel', num_labels=num_labels, threshold=threshold),
            f'test_recall': recall(y_preds, y_targets, task='multilabel', num_labels=num_labels, threshold=threshold),
            f'test_f1_score': f1_score(y_preds, y_targets, task='multilabel', num_labels=num_labels, threshold=threshold),
        })


    X = torch.vstack([x['X'] for x in test_loader])
    y_logits = torch.vstack(trainer.predict(model, X))
    y_probs = nn.Sigmoid()(y_logits).numpy()
    print(y_probs.shape)

    predictions = get_predictions(y_probs, threshold=0.7)
    predicted_labels = get_predicted_labels(predictions, id2label=id2label)
    print(predictions)
    print(predicted_labels)

    results['k'] = k
    report_table.append(results)
    print(f"K = {k} done.")
    if k > 2:
        break

report_table = pd.DataFrame(report_table)
report_table
# # Calculate the average accuracy
# average_accuracy = sum(accuracies) / len(accuracies)
# print(f'Average Accuracy: {average_accuracy}')

torch.Size([102139])
torch.Size([102139])


INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/fit_loop.py:293: The number of training batches (2) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

Predicting: |          | 0/? [00:00<?, ?it/s]

(1, 44)
[[0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
  1. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1.]]
[['prossimita di un incrocio o curva', 'ostacolo prevedibile dall imputato', 'ostacolo evitabile dall imputato', 'eccesso verso imputato', 'notte', 'maltempo', 'violazione stop o semaforo o codice della strada da parte imputato', 'strada rovinata o bagnata', 'guida brusca o manovra brusca imputato', 'presenza rimorchio imputato', 'imputato perde controllo']]
K = 0 done.
torch.Size([102139])
torch.Size([102139])


INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/fit_loop.py:293: The number of training batches (2) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

Predicting: |          | 0/? [00:00<?, ?it/s]

(1, 44)
[[0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
  1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]]
[['prossimita di un incrocio o curva', 'ostacolo prevedibile dall imputato', 'ostacolo evitabile dall imputato', 'eccesso verso imputato', 'notte', 'maltempo', 'violazione stop o semaforo o codice della strada da parte imputato', 'strada rovinata o bagnata', 'presenza rimorchio imputato']]
K = 1 done.
torch.Size([102139])
torch.Size([102139])


INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/fit_loop.py:293: The number of training batches (2) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

Predicting: |          | 0/? [00:00<?, ?it/s]

(1, 44)
[[0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
  1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[['prossimita di un incrocio o curva', 'ostacolo prevedibile dall imputato', 'ostacolo evitabile dall imputato', 'eccesso verso imputato', 'notte', 'maltempo', 'violazione stop o semaforo o codice della strada da parte imputato']]
K = 2 done.
torch.Size([102139])
torch.Size([102139])


INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/fit_loop.py:293: The number of training batches (2) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

Predicting: |          | 0/? [00:00<?, ?it/s]

(1, 44)
[[0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
  1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[['prossimita di un incrocio o curva', 'ostacolo prevedibile dall imputato', 'ostacolo evitabile dall imputato', 'eccesso verso imputato', 'notte', 'maltempo', 'violazione stop o semaforo o codice della strada da parte imputato']]
K = 3 done.


Unnamed: 0,test_loss,test_acc,test_f1_score,test_hp_metric,test_opt_score,test_precision,test_recall,test_roc_auc,k
0,0.326519,0.886364,0.736842,0.886364,1.623206,0.636364,0.875,0.986111,0
1,0.202718,0.886364,0.736842,0.886364,1.623206,0.636364,0.875,0.986111,1
2,0.096055,0.954545,0.909091,0.954545,1.863636,0.909091,0.909091,0.988981,2
3,0.171738,0.909091,0.8,0.909091,1.709091,0.888889,0.727273,0.988981,3


In [160]:
def get_predictions(probs, threshold=0.5):
    predictions = np.zeros(probs.shape)
    predictions[np.where(probs >= threshold)] = 1
    return predictions

def get_predicted_labels(predictions, id2label=id2label):
    # Turn predicted ID's into actual label names
    labeled_predictions = []
    for pred in predictions:
        tmp = [id2label[idx] for idx, label in enumerate(pred) if label == 1.0]
        labeled_predictions.append(tmp)
    return labeled_predictions

# Data Preparation
texts = dataset_df['raw text']
labels = dataset_df.iloc[:, 2:46]  # Adjust indices as needed
dataset = TextDataset(texts, labels)
train_loader = DataLoader(dataset, batch_size=8, shuffle=False)

X = torch.vstack([x['X'] for x in train_loader])
y_logits = torch.vstack(trainer.predict(model, X))
y_probs = nn.Sigmoid()(y_logits).numpy()
print(y_probs.shape)

predictions = get_predictions(y_probs, threshold=0.7)
predicted_labels = get_predicted_labels(predictions, id2label=id2label)
for i, (t, l) in enumerate(zip(texts, predicted_labels)):
    print('-' * 80)
    print(f'Sentenza n. {i}:')
    print('-' * 80)
    print(l)
    print(t)
    if i > 4:
        break

Predicting: |          | 0/? [00:00<?, ?it/s]

(85, 44)
--------------------------------------------------------------------------------
Sentenza n. 0:
--------------------------------------------------------------------------------
['prossimita di un incrocio o curva', 'ostacolo prevedibile dall imputato', 'ostacolo evitabile dall imputato', 'eccesso verso imputato', 'notte', 'maltempo', 'violazione stop o semaforo o codice della strada da parte imputato']
SENTENZA 
Cassazione penale sez. IV - 03/05/2023, n. 20253 
Intestazione
 LA CORTE SUPREMA DI CASSAZIONE 
 SEZIONE QUARTA PENALE 
 Composta dagli Ill.mi Sigg.ri Magistrati: 
Dott. PICCIALLI Patrizia - Presidente - 
Dott. PEZZELLA Vincenzo - Consigliere - 
Dott. D'ANDREA Alessadro - Consigliere - 
Dott. MICCICHE'Loredana - rel. Consigliere - 
Dott. NOCERA Andrea - Consigliere - 
ha pronunciato la seguente: 
 SENTENZA 
sul ricorso proposto da: 
 H.M. nato il (Omissis); 
avverso la sentenza del 12/04/2022 della CORTE APPELLO di ANCONA 
visti gli atti, il provvedimento impugnato e i

In [85]:
report_table.to_csv(f"/content/drive/MyDrive/Colab Notebooks/ml_for_justice/data/scores-nlp-count_vectorizer-loo.csv", index=False)

## Plotting

In [81]:
tmp = report_table.drop(columns=['test_hp_metric', 'k', 'test_opt_score']).describe().round(3)
tmp.to_csv(f"/content/drive/MyDrive/Colab Notebooks/ml_for_justice/data/scores-nlp-count_vectorizer-loo-aggregated.csv")
tmp

Unnamed: 0,test_loss,test_acc,test_f1_score,test_precision,test_recall,test_roc_auc
count,85.0,85.0,85.0,85.0,85.0,85.0
mean,0.776,0.879,0.539,0.699,0.47,0.836
std,0.892,0.057,0.232,0.3,0.237,0.114
min,0.058,0.75,0.0,0.0,0.0,0.5
25%,0.28,0.841,0.462,0.6,0.308,0.776
50%,0.469,0.886,0.571,0.75,0.444,0.844
75%,0.969,0.909,0.714,1.0,0.625,0.914
max,6.291,0.977,0.9,1.0,1.0,1.0


## Stratified K-Fold

Cannot work on multi-labels, skipping.

In [97]:
# from sklearn.model_selection import StratifiedKFold
# from sklearn.preprocessing import MultiLabelBinarizer

# # Data Preparation
# texts = dataset_df['raw text'] # dataset_df['facts']
# labels = dataset_df.iloc[:, 2:46]  # Adjust indices as needed
# full_dataset = TextDataset(texts, labels)

# mlb = MultiLabelBinarizer()
# mlb.fit(labels_columns)
# print(len(mlb.transform(labels)))

# skf = StratifiedKFold(n_splits=5)

# for k, (train_index, test_index) in enumerate(skf.split(texts.to_numpy(), mlb.transform(labels).T)):
#     # Splitting the dataset
#     train_subset = Subset(full_dataset, train_index)
#     test_subset = Subset(full_dataset, test_index)
#     # Creating data loaders for training and testing
#     train_loader = DataLoader(train_subset, batch_size=8, shuffle=True)
#     test_loader = DataLoader(test_subset, batch_size=1, shuffle=False)
#     # Model Initialization
#     num_features = len(full_dataset.vectorizer.get_feature_names_out())
#     num_labels = labels.shape[1]
#     model = TextClassifier(num_features, num_labels)
#     # Trainer setup
#     trainer = pl.Trainer(
#         max_epochs=5,
#         enable_model_summary=False,
#         precision="32-true",
#     )
#     # Train the model
#     trainer.fit(model, train_loader)
#     # Evaluate the model on the test data
#     results = trainer.test(model, test_loader, verbose=False)[0]
#     results['k'] = k
#     report_table.append(results)
#     print(f"K = {k} done.")

# report_table = pd.DataFrame(report_table)
# report_table.to_csv(f"/content/drive/MyDrive/Colab Notebooks/ml_for_justice/data/scores-nlp-count_vectorizer-skf.csv", index=False)
# report_table

In [None]:
# tmp = report_table.drop(columns=['test_hp_metric', 'k', 'test_opt_score']).describe().round(3)
# tmp.to_csv(f"/content/drive/MyDrive/Colab Notebooks/ml_for_justice/data/scores-nlp-count_vectorizer-skf-aggregated.csv")
# tmp

## Using Text Embeddings from Spacy

In [12]:
import spacy

# nlp = spacy.load('it_core_news_md')
nlp = spacy.load('it_core_news_lg')

In [None]:
for doc in nlp.pipe(dataset_df['facts'].dropna().tolist(), batch_size=50):
    print(doc.vector.shape)
    break

In [8]:
doc = nlp('Il tuo testo qui')

for token in doc:
    print(token.text, token.vector)

Il [-7.8459e+00 -4.5490e+00  6.1788e+00  6.7682e-04  1.7720e+00 -6.0620e+00
 -1.6026e+01 -4.0896e-02  7.3851e+00 -4.2115e+00 -4.1298e+00 -5.9586e+00
 -1.0974e+01 -5.8506e+00  2.2846e+00 -9.4184e-01 -9.5699e+00  9.4961e+00
 -5.6640e+00  5.0692e+00 -1.1745e+00  3.2972e-01 -2.8499e+00 -2.3215e+00
 -5.7043e+00 -5.5089e+00 -4.1577e+00 -6.0236e+00  2.3485e-01 -1.2325e+01
 -4.4324e+00 -1.0489e+01 -1.1555e+01  2.4902e+00 -7.3307e+00  7.3845e+00
 -1.3803e+00  6.0386e+00 -4.2963e+00 -2.3835e+00  4.8599e+00  2.3706e+00
  1.0202e+01  9.2351e+00 -5.3355e+00 -5.4578e+00  4.4547e-02 -3.1373e+00
  1.0590e+01  3.2910e+00  1.2975e+00  5.4566e+00 -8.4582e-01  5.5630e+00
 -3.7731e+00  2.4545e+00  8.9220e-01 -1.1405e+00 -4.5700e+00 -5.5250e+00
 -1.4209e+01 -1.3374e+01  1.3272e+00 -6.6337e+00  7.8099e-01  1.1635e+01
  6.1561e+00  8.4455e+00 -2.2063e+00  2.8781e-01 -1.2951e+01 -9.9293e+00
 -2.3724e+00 -7.6580e+00  3.3341e+00 -7.9327e+00 -6.2643e+00 -4.8530e+00
  5.2297e-01 -5.3943e+00 -1.9698e+00 -1.6415e+00