<a href="https://colab.research.google.com/github/ribesstefano/ml4justice-feature-predictor/blob/main/notebooks/ml4justice_nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install lightning -qqq

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m806.1/806.1 kB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m777.7/777.7 kB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import pandas as pd

base_dir = "/content/drive/MyDrive/Colab Notebooks/ml_for_justice/data/processed/"
dataset_df = pd.read_csv(base_dir + 'dataset-sentenze-omicidio-stradale-with-summaries.csv', encoding='utf-8')

# Fix an empy 'facts' column...
dataset_df['facts'] = dataset_df.apply(lambda x: x['facts'] if pd.notna(x['facts']) else x['raw text'], axis=1)
dataset_df[dataset_df['facts'].isna()]

Unnamed: 0,corte,numero sentenza,localita commerciale o luogo di lavoro,vittima eta avanzata,vittima eta giovane,lavori stradali di manutenzione,guida senza patente valida imputato,guida senza patente valida vittima,piu imputati,strada a doppio senso di marcia,...,uso telefono alla guida,vittima perde controllo,imputato perde controllo,condanna,filename,text,raw text,facts,score,summary


In [3]:
label_columns = dataset_df.iloc[:, 2:46].columns.tolist()
id2label = {i: l for i, l in enumerate(label_columns)}
label2id = {v: k for k, v in id2label.items()}
# id2label

## PyTorch Dataset

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import CountVectorizer
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

# Dataset class
class TextDataset(Dataset):
    def __init__(self, texts, labels, vectorizer=None, precomputed_features=False):
        self.precomputed_features = precomputed_features
        if precomputed_features:
            self.features = texts
        else:
            if vectorizer:
                self.vectorizer = vectorizer
                self.features = self.vectorizer.transform(texts)
            else:
                self.vectorizer = CountVectorizer(
                    ngram_range=(1, 2),  # For example, unigrams and bigrams
                    stop_words=['italian'],
                    binary=True,
                    decode_error='ignore',
                )
                self.features = self.vectorizer.fit_transform(texts)
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        if self.precomputed_features:
            return {
                'X': torch.tensor(self.features[idx]).squeeze().float(),
                'y': torch.tensor(self.labels.iloc[idx]).float(),
            }
        else:
            return {
                'X': torch.tensor(self.features[idx].toarray()).squeeze().float(),
                'y': torch.tensor(self.labels.iloc[idx]).float(),
            }

# Data Preparation
texts = dataset_df['raw text']
labels = dataset_df.iloc[:, 2:46]  # Adjust indices as needed
dataset = TextDataset(texts, labels)
train_loader = DataLoader(dataset, batch_size=8, shuffle=True)
# dataset[0]
batch = next(iter(train_loader))
print(batch['X'].size())
print(batch['y'].size())

torch.Size([8, 102139])
torch.Size([8, 44])


In [5]:
def balance_train_df(train_df, label_columns=label_columns, oversampling_dumping=1.0, min_oversampling=2):
    """ Balance the train_df by oversampling the minority class for each label in label_columns """
    balanced_dfs = []

    for label in label_columns:
        # Get the number of samples with label set to 1 and 0
        label_1_count = train_df[train_df[label] == 1].shape[0]
        label_0_count = train_df[train_df[label] == 0].shape[0]

        # Skip oversampling if all labels are the same
        if label_1_count == 0 or label_0_count == 0:
            balanced_dfs.append(train_df)
            continue

        # Determine the minority and majority class
        minority_class = 1 if label_1_count < label_0_count else 0
        majority_class = 1 if minority_class == 0 else 0

        # Calculate the oversampling factor
        oversampling_factor = label_0_count // label_1_count if minority_class == 1 else label_1_count // label_0_count
        oversampling_factor = max(min_oversampling, int(oversampling_factor * oversampling_dumping))

        # Oversample the minority class
        minority_df = train_df[train_df[label] == minority_class]
        oversampled_minority_df = pd.concat([minority_df] * oversampling_factor)

        # Combine the oversampled minority class with the majority class
        balanced_df = pd.concat([train_df[train_df[label] == majority_class], oversampled_minority_df])

        # # Print the number of samples with label set to 1 and 0
        # print(f"{label}: {len(balanced_df)} (0/1: {balanced_df[balanced_df[label] == 0].shape[0]} / {balanced_df[balanced_df[label] == 1].shape[0]})")

        balanced_dfs.append(balanced_df)

    # Merge the balanced dataframes for each label
    balanced_train_df = pd.concat(balanced_dfs)

    return balanced_train_df

print(len(dataset_df))
print(len(balance_train_df(dataset_df, oversampling_dumping=0.3)))

85
4543


## PyTorch Lightning Model

In [6]:
from torch import nn
import numpy as np
from torchmetrics import (
    MetricCollection,
    Accuracy,
    AUROC,
    Precision,
    Recall,
    F1Score,
    # MeanAbsoluteError,
    # MeanSquaredError,
)
from torchmetrics.functional.classification import (
    accuracy,
    auroc,
    precision,
    recall,
    f1_score,
)

In [7]:
class TextClassifier(pl.LightningModule):
    def __init__(self, num_features, num_labels, hidden_size=0, dropout=0.6):
        super().__init__()
        if hidden_size:
            self.model = nn.Sequential(
                nn.Linear(num_features, hidden_size),
                nn.Dropout(dropout),
                nn.Linear(hidden_size, num_labels),
            )
        else:
            self.model = nn.Sequential(
                nn.Linear(num_features, num_labels),
            )
        self.bin_loss = nn.BCELoss()
        # Metrics, a separate metrics collection is defined for each stage
        # NOTE: According to the PyTorch Lightning docs, "similar" metrics,
        # i.e., requiring the same computation, should be optimized w/in a
        # metrics collection.
        stages = ['train_metrics', 'val_metrics', 'test_metrics']
        self.metrics = nn.ModuleDict({s: MetricCollection({
            'acc': Accuracy(task='multilabel', num_labels=num_labels), # Default: macro
            'roc_auc': AUROC(task='multilabel', num_labels=num_labels),
            'precision': Precision(task='multilabel', num_labels=num_labels),
            'recall': Recall(task='multilabel', num_labels=num_labels),
            'f1_score': F1Score(task='multilabel', num_labels=num_labels),
            'acc_weighted': Accuracy(task='multilabel', num_labels=num_labels, average='weighted'),
            'roc_auc_weighted': AUROC(task='multilabel', num_labels=num_labels, average='weighted'),
            'precision_weighted': Precision(task='multilabel', num_labels=num_labels, average='weighted'),
            'recall_weighted': Recall(task='multilabel', num_labels=num_labels, average='weighted'),
            'f1_score_weighted': F1Score(task='multilabel', num_labels=num_labels, average='weighted'),
        }, prefix=s.replace('metrics', '')) for s in stages})

    def forward(self, x):
        return torch.sigmoid(self.model(x))  # Sigmoid for binary classification

    def step(self, batch, stage='train'):
        x, y = batch['X'], batch['y']
        preds = self.forward(x)
        loss = self.bin_loss(preds, y)
        self.metrics[f'{stage}_metrics'].update(preds, y.to(torch.int32))
        self.log(f'{stage}_loss', loss, on_epoch=True, prog_bar=True)
        self.log_dict(self.metrics[f'{stage}_metrics'], on_epoch=True)
        return loss

    def training_step(self, batch, batch_idx):
        return self.step(batch, stage='train')

    def validation_step(self, batch, batch_idx):
        return self.step(batch, stage='val')

    def test_step(self, batch, batch_idx):
        return self.step(batch, stage='test')

    def configure_optimizers(self):
        return optim.Adam(self.parameters(), lr=1e-3)

# Data Preparation
# balanced_df = balance_train_df(dataset_df, oversampling_dumping=0.3)
# texts = balanced_df['raw text']
# labels = balanced_df.iloc[:, 2:46]  # Adjust indices as needed

# Data Preparation
texts = dataset_df['raw text']
labels = dataset_df.iloc[:, 2:46]  # Adjust indices as needed

dataset = TextDataset(texts, labels)
train_loader = DataLoader(dataset, batch_size=64, shuffle=True)

# Model Initialization
num_features = len(dataset.vectorizer.get_feature_names_out())
num_labels = labels.shape[1]
model = TextClassifier(num_features, num_labels, hidden_size=512)

# Training
trainer = pl.Trainer(max_epochs=2, callbacks=[ModelCheckpoint(dirpath='model/', monitor='val_loss')])
trainer.fit(model, train_loader, train_loader)
trainer.test(model, train_loader)[0]

y_preds = trainer.predict(model, torch.vstack([x['X'] for x in train_loader]))[0].unsqueeze(dim=0)
y_preds = nn.Sigmoid()(y_preds)
print(y_preds)
y_targets = np.zeros(y_preds.shape)
y_targets[np.where(y_preds) == 1] = 1
y_targets = torch.Tensor(y_targets).to(torch.int32)

print(accuracy(y_preds, y_targets, task='multilabel', num_labels=num_labels, threshold=0.3))
print(precision(y_preds, y_targets, task='multilabel', num_labels=num_labels, threshold=0.3))
print(recall(y_preds, y_targets, task='multilabel', num_labels=num_labels, threshold=0.3))
print(f1_score(y_preds, y_targets, task='multilabel', num_labels=num_labels, threshold=0.3))
print(auroc(y_preds, y_targets, task='multilabel', num_labels=num_labels))

INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name     | Type       | Params
----------------------------------------
0 | model    | Sequential | 52.3 M
1 | bin_loss | BCELoss    | 0     
2 | metrics  | ModuleDict | 0     
----------------------------------------
52.3 M    Trainable params
0         Non-trainable params
52.3 M    Total params
209.273   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:492: Your `val_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/fit_loop.py:293: The number of training batches (2) is smaller than the logging interval Trainer(log

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=2` reached.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:492: Your `test_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

Predicting: |          | 0/? [00:00<?, ?it/s]

tensor([[0.5078, 0.5067, 0.5024, 0.5002, 0.5009, 0.5004, 0.5002, 0.5244, 0.5438,
         0.5006, 0.6304, 0.6751, 0.5085, 0.5023, 0.5006, 0.5001, 0.5046, 0.5014,
         0.5002, 0.5029, 0.5037, 0.5028, 0.6624, 0.5610, 0.5040, 0.5240, 0.6373,
         0.5055, 0.5003, 0.6521, 0.5038, 0.5048, 0.5010, 0.5336, 0.5012, 0.5260,
         0.5003, 0.5088, 0.5236, 0.5006, 0.5006, 0.5001, 0.5007, 0.5026]])
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)


## Leave-One-Out Traning

### Precompute Embeddings

In [8]:
dataset_df['raw text'].count()

85

In [26]:
# Data Preparation
texts = dataset_df['raw text'] # dataset_df['facts']
labels = dataset_df.iloc[:, 2:46]  # Adjust indices as needed
full_dataset = TextDataset(texts, labels)
# Fit the vectorizer on the entire dataset
vectorizer = CountVectorizer(
    # ngram_range=(1, 2),  # For example, unigrams and bigrams
    ngram_range=(1, 2),  # For example, unigrams and bigrams
    stop_words=['italian'],
    binary=True,
    decode_error='ignore',
)
vectors = vectorizer.fit_transform(dataset_df['raw text']).toarray().tolist()
num_features_vec = len(vectorizer.get_feature_names_out())
dataset_df['count_vectors'] = vectors
print(f"{num_features_vec:,}")

87,831


In [11]:
!python -m spacy download it_core_news_lg -qqq

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m567.9/567.9 MB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('it_core_news_lg')


In [12]:
import spacy

nlp = spacy.load('it_core_news_lg')

# texts = dataset_df['raw text'].tolist()
texts = dataset_df['facts'].tolist()
vectors = [doc.vector for doc in nlp.pipe(texts, batch_size=16)]
num_features_emb = np.array(vectors).shape[1]
dataset_df['text_embedding'] = vectors

del nlp

len(dataset_df['text_embedding'])

85

### Start Training Loop

In [17]:
import pytorch_lightning as pl
from sklearn.model_selection import LeaveOneOut
from torch.utils.data import Subset

USE_SPACY = False
num_features = num_features_emb if USE_SPACY else num_features_vec
features_col = "text_embedding" if USE_SPACY else "count_vectors"

# Leave-One-Out Cross-Validation
loo = LeaveOneOut()
report_table = []
report_table_per_threshold = []
report_table_per_label = []
predictions = []
targets = []

for k, (train_index, test_index) in enumerate(loo.split(full_dataset)):
    # Splitting the dataset
    # train_subset = Subset(full_dataset, train_index)
    # test_subset = Subset(full_dataset, test_index)
    train_df = dataset_df.iloc[train_index, :].copy()
    test_df = dataset_df.iloc[test_index, :].copy()
    # Train
    # train_df = balance_train_df(train_df, oversampling_dumping=0.3)
    texts = np.array(train_df[features_col]) # ['raw text']
    labels = train_df.iloc[:, 2:46]  # Adjust indices as needed
    train_subset = TextDataset(texts, labels, vectorizer, precomputed_features=True)
    # Test
    texts = np.array(test_df[features_col]) # ['raw text']
    labels = test_df.iloc[:, 2:46]  # Adjust indices as needed
    test_subset = TextDataset(texts, labels, vectorizer, precomputed_features=True)

    # Creating data loaders for training and testing
    train_loader = DataLoader(train_subset, batch_size=16, shuffle=True)
    test_loader = DataLoader(test_subset, batch_size=1, shuffle=False)

    # Model Initialization
    num_labels = labels.shape[1]
    model = TextClassifier(num_features, num_labels, hidden_size=512)
    # Trainer setup
    trainer = pl.Trainer(
        max_epochs=2,
        # max_steps=1000,
        enable_model_summary=False,
        precision="bf16-true" if torch.cuda.is_available() else "32-true",
        accelerator="gpu" if torch.cuda.is_available() else "cpu",
        devices="auto",
    )
    # Train the model
    trainer.fit(model, train_loader, test_loader)
    # Evaluate the model on the test data
    results = trainer.test(model, test_loader, verbose=False)[0]

    # Store predictions and labels for later processing
    test_elem = next(iter(test_loader))
    y_preds = trainer.predict(model, test_elem['X'])[0].unsqueeze(dim=0)
    y_preds = nn.Sigmoid()(y_preds)
    y_targets = test_elem['y'].to(torch.int32)
    predictions.append(y_preds)
    targets.append(y_targets)

    results['k'] = k
    report_table.append(results)
    display(pd.DataFrame([results]))
    print(f"K = {k} done.")
    if k >= 1:
        break

predictions = torch.vstack(predictions)
targets = torch.vstack(targets)
report_table = pd.DataFrame(report_table)
report_table

INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/fit_loop.py:293: The number of training batches (6) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=2` reached.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

Predicting: |          | 0/? [00:00<?, ?it/s]

Unnamed: 0,test_loss,test_acc,test_acc_weighted,test_f1_score,test_f1_score_weighted,test_precision,test_precision_weighted,test_recall,test_recall_weighted,test_roc_auc,test_roc_auc_weighted,k
0,0.101842,0.977273,0.875,0.933333,0.875,1.0,0.875,0.875,0.875,0.0,0.0,0


K = 0 done.


INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/fit_loop.py:293: The number of training batches (6) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=2` reached.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

Predicting: |          | 0/? [00:00<?, ?it/s]

Unnamed: 0,test_loss,test_acc,test_acc_weighted,test_f1_score,test_f1_score_weighted,test_precision,test_precision_weighted,test_recall,test_recall_weighted,test_roc_auc,test_roc_auc_weighted,k
0,0.291198,0.886364,0.875,0.736842,0.875,0.636364,0.875,0.875,0.875,0.0,0.0,1


K = 1 done.


Unnamed: 0,test_loss,test_acc,test_acc_weighted,test_f1_score,test_f1_score_weighted,test_precision,test_precision_weighted,test_recall,test_recall_weighted,test_roc_auc,test_roc_auc_weighted,k
0,0.101842,0.977273,0.875,0.933333,0.875,1.0,0.875,0.875,0.875,0.0,0.0,0
1,0.291198,0.886364,0.875,0.736842,0.875,0.636364,0.875,0.875,0.875,0.0,0.0,1


In [12]:
report_table.to_csv(f"/content/drive/MyDrive/Colab Notebooks/ml_for_justice/data/scores-nlp-count_vectorizer-loo.csv", index=False)

In [13]:
basedir = f"/content/drive/MyDrive/Colab Notebooks/ml_for_justice/data/processed/"
torch.save(predictions, f'{basedir}/nlp-count_vectorizer-loo-predictions.pt')
torch.save(targets, f'{basedir}/nlp-count_vectorizer-loo-targets.pt')

In [160]:
def get_predictions(probs, threshold=0.5):
    predictions = np.zeros(probs.shape)
    predictions[np.where(probs >= threshold)] = 1
    return predictions

def get_predicted_labels(predictions, id2label=id2label):
    # Turn predicted ID's into actual label names
    labeled_predictions = []
    for pred in predictions:
        tmp = [id2label[idx] for idx, label in enumerate(pred) if label == 1.0]
        labeled_predictions.append(tmp)
    return labeled_predictions

# Data Preparation
texts = dataset_df['raw text']
labels = dataset_df.iloc[:, 2:46]  # Adjust indices as needed
dataset = TextDataset(texts, labels)
train_loader = DataLoader(dataset, batch_size=8, shuffle=False)

X = torch.vstack([x['X'] for x in train_loader])
y_logits = torch.vstack(trainer.predict(model, X))
y_probs = nn.Sigmoid()(y_logits).numpy()
print(y_probs.shape)

predictions = get_predictions(y_probs, threshold=0.7)
predicted_labels = get_predicted_labels(predictions, id2label=id2label)
for i, (t, l) in enumerate(zip(texts, predicted_labels)):
    print('-' * 80)
    print(f'Sentenza n. {i}:')
    print('-' * 80)
    print(l)
    print(t)
    if i > 1:
        break

Predicting: |          | 0/? [00:00<?, ?it/s]

(85, 44)
--------------------------------------------------------------------------------
Sentenza n. 0:
--------------------------------------------------------------------------------
['prossimita di un incrocio o curva', 'ostacolo prevedibile dall imputato', 'ostacolo evitabile dall imputato', 'eccesso verso imputato', 'notte', 'maltempo', 'violazione stop o semaforo o codice della strada da parte imputato']
SENTENZA 
Cassazione penale sez. IV - 03/05/2023, n. 20253 
Intestazione
 LA CORTE SUPREMA DI CASSAZIONE 
 SEZIONE QUARTA PENALE 
 Composta dagli Ill.mi Sigg.ri Magistrati: 
Dott. PICCIALLI Patrizia - Presidente - 
Dott. PEZZELLA Vincenzo - Consigliere - 
Dott. D'ANDREA Alessadro - Consigliere - 
Dott. MICCICHE'Loredana - rel. Consigliere - 
Dott. NOCERA Andrea - Consigliere - 
ha pronunciato la seguente: 
 SENTENZA 
sul ricorso proposto da: 
 H.M. nato il (Omissis); 
avverso la sentenza del 12/04/2022 della CORTE APPELLO di ANCONA 
visti gli atti, il provvedimento impugnato e i

## Threshold Analysis

In [3]:
import pandas as pd
import torch

In [2]:
basedir = f"/content/drive/MyDrive/Colab Notebooks/ml_for_justice/data/processed/"
predictions = torch.load(f'{basedir}/nlp-count_vectorizer-loo-predictions.pt')
targets = torch.load(f'{basedir}/nlp-count_vectorizer-loo-targets.pt')

In [19]:
predictions = torch.vstack(predictions)
targets = torch.vstack(targets)

In [20]:
num_labels = targets.size()[-1]

report_table_per_threshold = []
for k, (y_preds, y_targets) in enumerate(zip(predictions, targets)):
    y_preds = y_preds.unsqueeze(dim=0)
    y_targets = y_targets.unsqueeze(dim=0)
    for threshold in [0.1 * i for i in range(11)]:
        tmp = {
            'k': k,
            'threshold': threshold,
            f'test_accuracy': accuracy(y_preds, y_targets, task='multilabel', num_labels=num_labels, threshold=threshold).item(),
            f'test_precision': precision(y_preds, y_targets, task='multilabel', num_labels=num_labels, threshold=threshold).item(),
            f'test_recall': recall(y_preds, y_targets, task='multilabel', num_labels=num_labels, threshold=threshold).item(),
            f'test_f1_score': f1_score(y_preds, y_targets, task='multilabel', num_labels=num_labels, threshold=threshold).item(),
            f'test_accuracy_weighted': accuracy(y_preds, y_targets, task='multilabel', num_labels=num_labels, threshold=threshold, average='weighted').item(),
            f'test_precision_weighted': precision(y_preds, y_targets, task='multilabel', num_labels=num_labels, threshold=threshold, average='weighted').item(),
            f'test_recall_weighted': recall(y_preds, y_targets, task='multilabel', num_labels=num_labels, threshold=threshold, average='weighted').item(),
            f'test_f1_score_weighted': f1_score(y_preds, y_targets, task='multilabel', num_labels=num_labels, threshold=threshold, average='weighted').item(),
        }
        report_table_per_threshold.append(tmp)
pd.DataFrame(report_table_per_threshold)

Unnamed: 0,k,threshold,test_accuracy,test_precision,test_recall,test_f1_score,test_accuracy_weighted,test_precision_weighted,test_recall_weighted,test_f1_score_weighted
0,0,0.0,0.181818,0.181818,1.0,0.307692,1.0,1.0,1.0,1.0
1,0,0.1,0.181818,0.181818,1.0,0.307692,1.0,1.0,1.0,1.0
2,0,0.2,0.181818,0.181818,1.0,0.307692,1.0,1.0,1.0,1.0
3,0,0.3,0.181818,0.181818,1.0,0.307692,1.0,1.0,1.0,1.0
4,0,0.4,0.181818,0.181818,1.0,0.307692,1.0,1.0,1.0,1.0
5,0,0.5,0.181818,0.181818,1.0,0.307692,1.0,1.0,1.0,1.0
6,0,0.6,0.954545,0.875,0.875,0.875,0.875,0.875,0.875,0.875
7,0,0.7,0.977273,1.0,0.875,0.933333,0.875,0.875,0.875,0.875
8,0,0.8,0.818182,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0,0.9,0.818182,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
report_table_per_label = []

for label_id, label_name in id2label.items():
    y_preds = predictions[:, label_id]
    y_targets = targets[:, label_id]
    for threshold in [0.1 * i for i in range(11)]:
        tmp = {
            'label': label_name,
            'threshold': threshold,
            f'test_accuracy': accuracy(y_preds, y_targets, task='binary', threshold=threshold).item(),
            f'test_precision': precision(y_preds, y_targets, task='binary', threshold=threshold).item(),
            f'test_recall': recall(y_preds, y_targets, task='binary', threshold=threshold).item(),
            f'test_f1_score': f1_score(y_preds, y_targets, task='binary', threshold=threshold).item(),
        }
        report_table_per_label.append(tmp)
pd.DataFrame(report_table_per_label)

Unnamed: 0,label,threshold,test_accuracy,test_precision,test_recall,test_f1_score
0,localita commerciale o luogo di lavoro,0.0,0.0,0.0,0.0,0.0
1,localita commerciale o luogo di lavoro,0.1,0.0,0.0,0.0,0.0
2,localita commerciale o luogo di lavoro,0.2,0.0,0.0,0.0,0.0
3,localita commerciale o luogo di lavoro,0.3,0.0,0.0,0.0,0.0
4,localita commerciale o luogo di lavoro,0.4,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...
479,imputato perde controllo,0.6,0.0,0.0,0.0,0.0
480,imputato perde controllo,0.7,0.5,0.0,0.0,0.0
481,imputato perde controllo,0.8,1.0,0.0,0.0,0.0
482,imputato perde controllo,0.9,1.0,0.0,0.0,0.0


## Plotting

In [15]:
tmp = report_table.drop(columns=['k']).describe().round(3)
tmp.to_csv(f"/content/drive/MyDrive/Colab Notebooks/ml_for_justice/data/scores-nlp-count_vectorizer-loo-aggregated.csv")
tmp

Unnamed: 0,test_loss,test_acc,test_acc_weighted,test_f1_score,test_f1_score_weighted,test_precision,test_precision_weighted,test_recall,test_recall_weighted,test_roc_auc,test_roc_auc_weighted
count,85.0,85.0,85.0,85.0,85.0,85.0,85.0,85.0,85.0,85.0,85.0
mean,0.636,0.883,0.46,0.538,0.46,0.722,0.46,0.46,0.46,0.0,0.0
std,0.665,0.056,0.242,0.237,0.242,0.315,0.242,0.242,0.242,0.0,0.0
min,0.051,0.727,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.283,0.841,0.308,0.462,0.308,0.571,0.308,0.308,0.308,0.0,0.0
50%,0.476,0.886,0.444,0.571,0.444,0.8,0.444,0.444,0.444,0.0,0.0
75%,0.749,0.909,0.571,0.667,0.571,1.0,0.571,0.571,0.571,0.0,0.0
max,5.046,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0


In [81]:
tmp = report_table.drop(columns=['test_hp_metric', 'k', 'test_opt_score']).describe().round(3)
tmp.to_csv(f"/content/drive/MyDrive/Colab Notebooks/ml_for_justice/data/scores-nlp-count_vectorizer-loo-aggregated.csv")
tmp

Unnamed: 0,test_loss,test_acc,test_f1_score,test_precision,test_recall,test_roc_auc
count,85.0,85.0,85.0,85.0,85.0,85.0
mean,0.776,0.879,0.539,0.699,0.47,0.836
std,0.892,0.057,0.232,0.3,0.237,0.114
min,0.058,0.75,0.0,0.0,0.0,0.5
25%,0.28,0.841,0.462,0.6,0.308,0.776
50%,0.469,0.886,0.571,0.75,0.444,0.844
75%,0.969,0.909,0.714,1.0,0.625,0.914
max,6.291,0.977,0.9,1.0,1.0,1.0


## Stratified K-Fold

Cannot work on multi-labels, skipping.

In [97]:
# from sklearn.model_selection import StratifiedKFold
# from sklearn.preprocessing import MultiLabelBinarizer

# # Data Preparation
# texts = dataset_df['raw text'] # dataset_df['facts']
# labels = dataset_df.iloc[:, 2:46]  # Adjust indices as needed
# full_dataset = TextDataset(texts, labels)

# mlb = MultiLabelBinarizer()
# mlb.fit(labels_columns)
# print(len(mlb.transform(labels)))

# skf = StratifiedKFold(n_splits=5)

# for k, (train_index, test_index) in enumerate(skf.split(texts.to_numpy(), mlb.transform(labels).T)):
#     # Splitting the dataset
#     train_subset = Subset(full_dataset, train_index)
#     test_subset = Subset(full_dataset, test_index)
#     # Creating data loaders for training and testing
#     train_loader = DataLoader(train_subset, batch_size=8, shuffle=True)
#     test_loader = DataLoader(test_subset, batch_size=1, shuffle=False)
#     # Model Initialization
#     num_features = len(full_dataset.vectorizer.get_feature_names_out())
#     num_labels = labels.shape[1]
#     model = TextClassifier(num_features, num_labels)
#     # Trainer setup
#     trainer = pl.Trainer(
#         max_epochs=5,
#         enable_model_summary=False,
#         precision="32-true",
#     )
#     # Train the model
#     trainer.fit(model, train_loader)
#     # Evaluate the model on the test data
#     results = trainer.test(model, test_loader, verbose=False)[0]
#     results['k'] = k
#     report_table.append(results)
#     print(f"K = {k} done.")

# report_table = pd.DataFrame(report_table)
# report_table.to_csv(f"/content/drive/MyDrive/Colab Notebooks/ml_for_justice/data/scores-nlp-count_vectorizer-skf.csv", index=False)
# report_table

In [None]:
# tmp = report_table.drop(columns=['test_hp_metric', 'k', 'test_opt_score']).describe().round(3)
# tmp.to_csv(f"/content/drive/MyDrive/Colab Notebooks/ml_for_justice/data/scores-nlp-count_vectorizer-skf-aggregated.csv")
# tmp

## Using Text Embeddings from Spacy

In [16]:
# !python -m spacy download it_core_news_md -qqq
!python -m spacy download it_core_news_lg -qqq

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m567.9/567.9 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('it_core_news_lg')


In [12]:
import spacy

# nlp = spacy.load('it_core_news_md')
nlp = spacy.load('it_core_news_lg')

In [None]:
for doc in nlp.pipe(dataset_df['facts'].dropna().tolist(), batch_size=50):
    print(doc.vector.shape)
    break

In [8]:
doc = nlp('Il tuo testo qui')

for token in doc:
    print(token.text, token.vector)

Il [-7.8459e+00 -4.5490e+00  6.1788e+00  6.7682e-04  1.7720e+00 -6.0620e+00
 -1.6026e+01 -4.0896e-02  7.3851e+00 -4.2115e+00 -4.1298e+00 -5.9586e+00
 -1.0974e+01 -5.8506e+00  2.2846e+00 -9.4184e-01 -9.5699e+00  9.4961e+00
 -5.6640e+00  5.0692e+00 -1.1745e+00  3.2972e-01 -2.8499e+00 -2.3215e+00
 -5.7043e+00 -5.5089e+00 -4.1577e+00 -6.0236e+00  2.3485e-01 -1.2325e+01
 -4.4324e+00 -1.0489e+01 -1.1555e+01  2.4902e+00 -7.3307e+00  7.3845e+00
 -1.3803e+00  6.0386e+00 -4.2963e+00 -2.3835e+00  4.8599e+00  2.3706e+00
  1.0202e+01  9.2351e+00 -5.3355e+00 -5.4578e+00  4.4547e-02 -3.1373e+00
  1.0590e+01  3.2910e+00  1.2975e+00  5.4566e+00 -8.4582e-01  5.5630e+00
 -3.7731e+00  2.4545e+00  8.9220e-01 -1.1405e+00 -4.5700e+00 -5.5250e+00
 -1.4209e+01 -1.3374e+01  1.3272e+00 -6.6337e+00  7.8099e-01  1.1635e+01
  6.1561e+00  8.4455e+00 -2.2063e+00  2.8781e-01 -1.2951e+01 -9.9293e+00
 -2.3724e+00 -7.6580e+00  3.3341e+00 -7.9327e+00 -6.2643e+00 -4.8530e+00
  5.2297e-01 -5.3943e+00 -1.9698e+00 -1.6415e+00