## Introduction

This notebook is a simplified version of the one used for the "NLP applied to judicial decisions parsing" challenge of the student competition organized each year by the Data team of the Ecole Normale Supérieure of Paris in partnership with the Collège de France.

* Details: https://challengedata.ens.fr/challenges/24
* Leaderboard: https://challengedata.ens.fr/leaderboard/2020

## Context

When a trial is over, a summary of the trial is published with all the important information about the case that has just been judged.
This document is called jurisprudence in French.

In the case of a trial between a victim and an insurer, this document describes all the circumstances of the case, together with the medical and financial data, from the first injuries to the final compensation amounts.

## Challenge goals

We have “jurisprudence” data as text files and we want to build an algorithm to automate the extraction of the relevant information.
In this challenge, we want to extract from "jurisprudence" the date of the accident and the date of the stabilization of the injuries.

## Let's load our data in a nice Pandas DataFrame

In [None]:
# Shell escape: unpack the challenge archive next to the notebook.
# NOTE(review): presumably this creates the `train_folder/` directory read below - confirm.
!unzip -q train_folder_predilex.zip

In [None]:
import pandas as pd

# Document ID -> text file name, and the ground-truth labels of every document.
x_train_ids = pd.read_csv('train_folder/x_train_ids.csv', index_col='ID')
doc_labels = pd.read_csv('Y_train_predilex.csv', index_col='ID')
doc_labels.columns = ['gender', 'accident_date', 'stabilization_date']

# Read each decision and normalise its text, keyed by document ID.
text_files = {}
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for index, filename in x_train_ids['filename'].items():
  # NOTE(review): open() uses the platform default encoding; pass `encoding=`
  # explicitly once the encoding of the text files is confirmed.
  with open(f'train_folder/txt_files/{filename}') as file:
    raw_content = file.read()
  # Remove ellipses and turn " ;" into a full stop
  # (presumably to help the sentence tokenizer used later - confirm).
  cleaned_content = raw_content.replace('...','').replace(' ;','.')
  # Collapse every run of whitespace (including newlines) into a single space.
  text_files[index] = " ".join(cleaned_content.split())

df_text_files = pd.DataFrame.from_dict(text_files, orient='index', columns=['text'])

# One row per document: labels and cleaned text side by side (aligned on the ID index).
df_docs = pd.concat([doc_labels,df_text_files], axis='columns')

In [None]:
# Preview the combined frame: the three label columns plus the cleaned text.
df_docs

## Now we split our dataset

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# Hold out 20% of the documents for testing; the fixed seed keeps the split reproducible.
# Each part is copied so that columns added later live on independent frames.
df_docs_train, df_docs_test = (
    part.copy()
    for part in train_test_split(df_docs, test_size=0.2, random_state=42)
)

n_train_docs, n_test_docs = len(df_docs_train), len(df_docs_test)
print(f'Number of documents in the train set: {n_train_docs}')
print(f'Number of documents in the test set: {n_test_docs}')

## Let's train our own sentence tokenizer! (On the train set) 

In [None]:
from nltk.tokenize import PunktSentenceTokenizer

# Concatenate every training document into one corpus string.
# NOTE(review): despite its name, `train_tokenizer` holds the training text, not a tokenizer.
train_tokenizer = ' '.join(df_docs_train.text)

# Text passed to the constructor is used to train the Punkt model.
# Only the train split is used, so the test documents stay unseen.
sent_tokenizer = PunktSentenceTokenizer(train_tokenizer)

## Let's split our documents into sentences

In [None]:
# Same three steps for both splits; each frame is modified in place.
for docs in (df_docs_train, df_docs_test):
  # One list of sentences per document.
  docs['sentences'] = docs['text'].map(sent_tokenizer.tokenize)
  # The index holds the document ID; name it and order the rows by it.
  docs.index.name = 'doc_id'
  docs.sort_index(inplace=True)

## Now we extract the context around each date and we label it

In [None]:
import re
from preprocessing import Preprocessing

def split_context(context, left=True):
  """Return one half of `context`, measured in word tokens.

  The text is tokenised with `Preprocessing().word_tok` and the selected half
  is joined back with single spaces. When the number of tokens is odd, the
  middle token belongs to both halves.

  Args:
    context: Text to cut in two.
    left: If True return the first half, otherwise the second half.

  Returns:
    The selected half as a space-joined string.
  """
  tokens = Preprocessing().word_tok(context)
  count = len(tokens)
  if left:
    # ceil(count / 2) tokens, so an odd middle token is kept.
    return " ".join(tokens[:(count + 1) // 2])
  # Starts at floor(count / 2), so an odd middle token is kept here as well.
  return " ".join(tokens[count // 2:])

def get_date_context(date_no, date_index, n, chunks):
  """Build the text window around one date of a sentence holding several dates.

  Args:
    date_no: Zero-based rank of the date among the dates of the sentence.
    date_index: Position of that date inside `chunks`.
    n: Number of dates in the sentence.
    chunks: The sentence split on its dates: [phrase, date, phrase, date, ...].

  Returns:
    "<before> <date> <after>", where `before`/`after` are the chunks next to
    the date. The chunk before the first date and the chunk after the last
    date are kept whole; a chunk lying between two dates is reduced to one
    half by `split_context`.
  """
  before = chunks[date_index-1]
  after = chunks[date_index+1]

  # NOTE(review): split_context(left=True) keeps the FIRST half of the chunk
  # that precedes the date, and left=False the SECOND half of the chunk that
  # follows it, i.e. the words farthest from the date. Confirm this is the
  # intended window and not an inverted flag.

  # Every date but the first has another date before it: keep half of that chunk.
  if date_no != 0:
    before = split_context(before, left=True)
  # date_no is zero-based, so the last date is n-1. The previous test against
  # `n` never matched, which halved the tail of the sentence for the last date.
  if date_no != n - 1:
    after = split_context(after, left=False)

  return before + ' ' + chunks[date_index] + ' ' + after

def get_labeled_phrases(df_docs):
  """Turn every date found in the documents into one labelled row.

  Args:
    df_docs: Frame indexed by document ID with the columns `sentences`
      (list of sentences), `accident_date` and `stabilization_date`.

  Returns:
    DataFrame with one row per date occurrence and the columns
    `doc_id`, `phrase` (text around the date), `date` (normalised date) and
    `label`: 1 = accident date, 2 = stabilization date, 0 = any other date.
  """
  date_helper = Preprocessing()
  rows = []

  for doc in df_docs.itertuples():
    for sentence in doc.sentences:
      # Every date of the sentence; sentences without any date are skipped.
      found_dates = date_helper.regex_patterns.findall(sentence)
      if not found_dates:
        continue

      # The sentence cut on its dates: [phrase, date, phrase, date, phrase, ...]
      chunks = date_helper.regex_patterns.split(sentence)
      n = len(found_dates)

      for date_no, raw_date in enumerate(found_dates):
        normalized_date = date_helper.date_processing(raw_date.lower())

        # A lone date keeps the whole sentence as context; otherwise only the
        # text around this date is kept (the date sits at odd positions of `chunks`).
        date_context = (
            sentence if n == 1
            else get_date_context(date_no, 2*date_no+1, n, chunks)
        )

        # The accident date takes precedence when both reference dates are equal.
        if normalized_date == doc.accident_date:
          label = 1
        elif normalized_date == doc.stabilization_date:
          label = 2
        else:
          label = 0

        rows.append({
            'doc_id': doc.Index,
            # Extra whitespace is collapsed to single spaces.
            'phrase': " ".join(date_context.split()),
            'date': normalized_date,
            'label': label,
        })

  return pd.DataFrame(rows)

In [None]:
# One labelled row per date occurrence, built separately for each split.
df_train = get_labeled_phrases(df_docs_train)
df_test = get_labeled_phrases(df_docs_test)

## Let's take a look at our data distributions

In [None]:
from matplotlib import pyplot as plt

# Size the figure once, when it is created, instead of through each hist() call.
fig, axs = plt.subplots(1, 2, figsize=(10, 5))

# Title and label each panel so the two splits can be told apart at a glance.
for ax, (split_name, phrases) in zip(axs, (('Train', df_train), ('Test', df_test))):
  phrases.label.hist(ax=ax)
  ax.set(
      title=split_name,
      xlabel='label (0 = other, 1 = accident, 2 = stabilization)',
      ylabel='number of date contexts',
  )
plt.suptitle('Train & Test Dataset Distribution');

## Let's fine-tune CamemBERT (based on Facebook's RoBERTa model, trained on 138GB of French text) to classify our context

In [None]:
%%capture
!pip install transformers==4.6.1
!pip install sentencepiece==0.1.95

In [None]:
import torch
import pandas as pd
import torch.nn as nn
import numpy as np

from sklearn import metrics
from transformers import AdamW
from sklearn import model_selection
from sklearn.utils.class_weight import compute_class_weight

from transformers import get_linear_schedule_with_warmup

import engine
import config
from dataset import Dataset
from models import CAMEMBERTBase

# Give both phrase frames a fresh 0..n-1 index.
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

train_dataset = Dataset(
    texts=df_train.phrase.values, targets=df_train.label.values
)

# df_train is built document by document from a frame sorted on doc_id, so
# without shuffling every epoch would see the same document-ordered batches.
# NOTE(review): shuffle=True assumes `Dataset` is a map-style dataset - confirm.
train_data_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=config.TRAIN_BATCH_SIZE, shuffle=True, num_workers=2
)

test_dataset = Dataset(
    texts=df_test.phrase.values, targets=df_test.label.values
)

# Evaluation keeps the original order so outputs line up with df_test rows.
test_data_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=config.TEST_BATCH_SIZE, num_workers=1
)

device = torch.device(config.DEVICE)
model = CAMEMBERTBase()
model.to(device)

param_optimizer = list(model.named_parameters())
# Parameters whose name contains one of these markers get no weight decay.
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]


def _is_decay_exempt(param_name):
    """Return True when `param_name` contains one of the `no_decay` markers."""
    return any(marker in param_name for marker in no_decay)


# Two optimizer groups: decayed parameters first, exempt parameters second.
optimizer_parameters = [
    {
        "params": [
            param for name, param in param_optimizer if not _is_decay_exempt(name)
        ],
        "weight_decay": 0.001,
    },
    {
        "params": [
            param for name, param in param_optimizer if _is_decay_exempt(name)
        ],
        "weight_decay": 0.0,
    },
]

# Total number of optimisation steps over all epochs (truncated to an integer).
num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
optimizer = AdamW(optimizer_parameters, lr=3e-5)
# Linear decay of the learning rate over the whole run, without warm-up.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_train_steps,
)

# Balanced class weights computed from the training labels.
# Keyword arguments are required here: scikit-learn deprecated the positional
# form in 0.24 and rejects it from 1.0 on.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(df_train.label.values),
    y=df_train.label.values,
)
# Rescaled to sum to 1; computed once because it does not change between epochs.
normalized_class_weights = class_weights/sum(class_weights)

# Start below any reachable MCC (its range is [-1, 1]) so that the first epoch
# always writes a checkpoint, even when no epoch scores above 0.
best_mcc = float('-inf')
for epoch in range(config.EPOCHS):
    engine.train_fn(train_data_loader, model, optimizer, device, scheduler, normalized_class_weights)
    outputs_p, targets = engine.eval_fn(test_data_loader, model, device)
    # Predicted class = index of the highest score of each row.
    outputs = np.argmax(np.array(outputs_p),axis=1)
    macro_f1 = metrics.f1_score(targets, outputs, average='macro')
    mcc = metrics.matthews_corrcoef(y_true=targets, y_pred=outputs)
    print(f"Macro-F1 Score = {macro_f1}")
    print(f"mcc Score = {mcc}")
    # Keep only the weights of the epoch with the best MCC so far.
    if mcc > best_mcc:
        torch.save(model.state_dict(), config.MODEL_PATH)
        best_mcc = mcc

## Let's make predictions for the documents in our test set

In [None]:
import config
import engine
import torch
from dataset import Dataset
from models import CAMEMBERTBase

# Rebuild the architecture, then restore the weights stored at config.MODEL_PATH.
model = CAMEMBERTBase()
saved_weights = torch.load(
    config.MODEL_PATH, map_location=torch.device(config.DEVICE)
)
model.load_state_dict(saved_weights)
# Switch to evaluation mode; the trailing ';' hides the module repr.
model.eval();

In [None]:
import numpy as np
import torch

def sentence_prediction(model, sentence):
  """Return the raw class scores (logits) predicted by `model` for one text.

  Args:
    model: Classifier called as `model(ids=..., mask=..., token_type_ids=...)`.
    sentence: Text to classify; it is converted with `str()` and its
      whitespace is collapsed to single spaces.

  Returns:
    NumPy array holding the first element of the model output, i.e. the
    scores of the single sample in the batch.
  """
  # Tokenizer and maximum token length come from config.py.
  tokenizer = config.TOKENIZER
  sentence = " ".join(str(sentence).split())

  # Encode to ids with special tokens, truncated/padded to config.MAX_LEN.
  inputs = tokenizer.encode_plus(
            sentence,
            None,
            add_special_tokens=True,
            max_length=config.MAX_LEN,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            )

  # Run on the device that holds the model and not on a hard-coded 'cpu',
  # so the function also works after `model.to(<gpu>)`.
  try:
    device = next(model.parameters()).device
  except (AttributeError, StopIteration):
    device = torch.device('cpu')

  def as_batch(values):
    # unsqueeze(0) adds the batch dimension: a batch of exactly one sample.
    return torch.tensor(values, dtype=torch.long).unsqueeze(0).to(device)

  # Inference only: no gradient bookkeeping is needed.
  with torch.no_grad():
    outputs = model(
        ids=as_batch(inputs["input_ids"]),
        mask=as_batch(inputs["attention_mask"]),
        token_type_ids=as_batch(inputs["token_type_ids"]),
    )
  return outputs[0].cpu().detach().numpy()

In [None]:
# Reference dates of the test documents, used to score the predictions.
df_docs_test_labels = df_docs_test.loc[:, ['accident_date', 'stabilization_date']]

In [None]:
# Display the reference dates of the test documents.
df_docs_test_labels

In [None]:
# Remove the ground-truth label before predicting on the test phrases.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps the cell
# re-runnable: a second execution no longer raises KeyError.
df_test = df_test.drop(columns=['label'], errors='ignore')

In [None]:
def get_logits(row):
  """Add the model scores for `row['phrase']` to the row and return it.

  Uses the notebook-level `model`. Three fields are written on `row`:
  `logit_1` and `logit_2` (scores at positions 1 and 2 of the prediction)
  and `pred` (position of the highest score).
  """
  scores = sentence_prediction(model, row['phrase'])
  new_fields = (
      ('logit_1', scores[1]),
      ('logit_2', scores[2]),
      ('pred', np.argmax(scores)),
  )
  for field, value in new_fields:
    row[field] = value
  return row

In [None]:
# Inspect the test phrases before the model scores are added to them.
df_test

In [None]:
from tqdm.auto import tqdm
# Register `progress_apply` on pandas objects, then score the phrases row by
# row with a progress bar.
tqdm.pandas()
df_test = df_test.progress_apply(get_logits, axis=1)

In [None]:
# Phrases predicted as "accident date" (class 1). Only the columns needed for
# the vote are kept, so the sum below never touches the text column; its
# handling by groupby().sum() differs between pandas versions.
accident_candidates = df_test.loc[df_test.pred == 1, ['doc_id', 'date', 'logit_1']]

# Total class-1 score of every candidate date, per document.
accident_scores = accident_candidates.groupby(['doc_id', 'date'], as_index=False)['logit_1'].sum()

# For each document keep the date with the highest total score.
accident_best = accident_scores.loc[accident_scores.groupby('doc_id')['logit_1'].idxmax()]

# Result: indexed by doc_id, single column `accident_date`.
df_accident_date = accident_best.set_index('doc_id')[['date']].rename(columns={'date': 'accident_date'})

In [None]:
# Phrases predicted as "stabilization date" (class 2). Only the columns needed
# for the vote are kept, so the sum below never touches the text column.
stabilization_candidates = df_test.loc[df_test.pred == 2, ['doc_id', 'date', 'logit_2']]

# Total class-2 score of every candidate date, per document.
stabilization_scores = stabilization_candidates.groupby(['doc_id', 'date'], as_index=False)['logit_2'].sum()

# For each document keep the date with the highest total score.
# (The previous code indexed with `df_toto_2`, a name that is never defined.)
stabilization_best = stabilization_scores.loc[stabilization_scores.groupby('doc_id')['logit_2'].idxmax()]

# Result: indexed by doc_id, single column `stabilization_date`.
df_stabilization_date = stabilization_best.set_index('doc_id')[['date']].rename(columns={'date': 'stabilization_date'})

In [None]:
# Number of test documents; compute_overall_accuracy divides by this count.
len(df_docs_test_labels)

In [None]:
def compute_overall_accuracy(df_true, df_pred):
  """Print the mean of the accident-date and stabilization-date accuracies.

  Args:
    df_true: Reference frame with the columns `accident_date` and
      `stabilization_date`, indexed by document ID.
    df_pred: Predictions with the same two columns; its index must be a
      subset of the index of `df_true`.

  A prediction is a hit when it equals the reference value. Each accuracy is
  divided by the number of rows of `df_true`, so documents missing from
  `df_pred` count as misses. Nothing is returned; the score is printed.
  """
  n_docs = len(df_true)
  accuracies = []

  for column in ('accident_date', 'stabilization_date'):
    expected, predicted = df_true[column], df_pred[column]
    hits = sum(
        1 for doc_id in df_pred.index
        if expected.loc[doc_id] == predicted.loc[doc_id]
    )
    accuracies.append(hits / n_docs)

  print((accuracies[0]+accuracies[1])/2)

In [None]:
# Put both predicted dates side by side, aligned on doc_id. A document without
# a prediction for one of the dates gets the placeholder 'n.c.'.
# (The previous call referenced `df_toto_1` / `df_toto_2`, names that are never
# defined in this notebook.)
df_docs_test_pred = pd.concat([df_accident_date, df_stabilization_date], axis=1).fillna('n.c.')
compute_overall_accuracy(df_true=df_docs_test_labels, df_pred=df_docs_test_pred)