# NLP for law documents

The purpose of this exercise is to define a similarity measure and to compute the similarity distances between a given set of phrases representing topics and the sentences that make up the judgements.

### Reading input files

In [1]:
# Load library
import pandas as pd
import json
import numpy as np
import re
import os
import sys
import pickle
import os.path
import string
import time
from statistics import mean
from nltk.tokenize import word_tokenize
from gensim.similarities import WmdSimilarity
from gensim import utils
from gensim.models import Doc2Vec
from gensim.models import doc2vec
from gensim.models import Word2Vec
import nltk
# Fetch the NLTK 'stopwords' and 'punkt' packages used by this notebook.
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
# French stop words, kept in a set; used below to filter the topic phrases.
stops = set(stopwords.words("french"))
# NOTE(review): this rebinds the name `utils`, which `from gensim import utils`
# bound earlier in this cell; from here on `utils` refers to this import.
# Presumably this is the project-local helper module -- confirm.
import utils

# Path of the judgements file (JSON Lines: one record per line).
filename = "NLP Exercise/judgements.jsonl"

# Reference topics, keyed by consecutive integers starting at 0.
topics = {
    0: "Rupture abusive de la relation de travail",
    1: "Rupture abusive du contrat de travail",
    2: "Rupture brutale de relations commerciales établies",
    3: "Rupture brutale des contrats",
    4: "Indemnité compensatrice de rupture",
    5: "Indemnité compensatrice de congés payés",
    6: "Indemnité compensatrice de préavis",
}

# Pre-process every topic in key order: lower-case it, split on whitespace
# and drop the French stop words.
listOfTopics = [
    [word for word in topics[idx].lower().split() if word not in stops]
    for idx in range(len(topics))
]

# Load the judgements into a DataFrame.
df = pd.read_json(filename, lines=True, encoding='utf-8-sig')

[nltk_data] Downloading package stopwords to C:\Users\admin-
[nltk_data]     local\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\admin-
[nltk_data]     local\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\admin-
[nltk_data]     local\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\admin-
[nltk_data]     local\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Segmentation of judgements by \n\n

In [2]:
# Pre-process the loaded judgements with the project helper. It returns three
# values, unpacked in order; per the section heading the judgements are
# segmented on "\n\n" (helper internals are not visible here).
alljudgements, list_index, list_extractedpharses = utils.preprocessingInputs(df)

### Train the model

Read the decision files available at ftp://echanges.dila.gouv.fr/CAPP/ and use them to train the doc2vec model.

In [3]:
# Name handed to the reader below; presumably the local directory holding the
# downloaded CAPP decisions -- confirm.
repertoryName='CAPP'
# The return value is not used here.
utils.ReadTrainingDecisions(repertoryName)       

In [4]:
# Load the training sentences through the project helper (takes no arguments;
# presumably reads what utils.ReadTrainingDecisions produced -- confirm).
listOfSentences = utils.loadTrainingDecisions()

In [5]:
# Pre-process the training sentences into the form passed to
# utils.trainingModel in the next cell.
sentencesForTraining = utils.formatTrainingDecisions(listOfSentences)

In [8]:
# Train the model on the judgements, the topic token lists and the extra
# training sentences.
# /!\ This might take a while
# The return value is not used; later cells load the model from "doc2vec.model"
# (presumably written by this helper -- confirm).
utils.trainingModel(alljudgements, listOfTopics, sentencesForTraining)

Seconds since epoch = 1565881885.0499713
Epoch # 1 is complete.


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


### Compute similarities 

### Compute first similarity with Word Mover’s Distance (WMD)

In [5]:
# Compute the first similarity score (Word Mover's Distance, per the section heading).
# Load the trained Doc2Vec model from disk.
model = Doc2Vec.load("doc2vec.model")
# Score the judgements against the topics; the result is indexed later as
# [row, topic column].
scores = utils.computeScoreWmd(alljudgements,listOfTopics, model)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


18.367862224578857  s elapsed.  1001 / 80079
19.090914249420166  s elapsed.  2001 / 80079
24.221704483032227  s elapsed.  3001 / 80079
20.92440676689148  s elapsed.  4001 / 80079
19.643333196640015  s elapsed.  5001 / 80079
20.712408542633057  s elapsed.  6001 / 80079
19.040906190872192  s elapsed.  7001 / 80079
18.084809064865112  s elapsed.  8001 / 80079
24.152515411376953  s elapsed.  9001 / 80079
21.27393364906311  s elapsed.  10001 / 80079
18.630186319351196  s elapsed.  11001 / 80079
18.090758323669434  s elapsed.  12001 / 80079
19.180299520492554  s elapsed.  13001 / 80079
18.875241994857788  s elapsed.  14001 / 80079
21.69240713119507  s elapsed.  15001 / 80079
19.29487109184265  s elapsed.  16001 / 80079
20.70745301246643  s elapsed.  17001 / 80079
23.760550498962402  s elapsed.  18001 / 80079
19.348370790481567  s elapsed.  19001 / 80079
18.813805103302002  s elapsed.  20001 / 80079
20.284903287887573  s elapsed.  21001 / 80079
19.642324447631836  s elapsed.  22001 / 80079
19

### Building the table for visualization

In [6]:
# Normalize the raw WMD scores with the project helper.
score_norm = utils.normalize_scores(scores)

# Spot-check settings: topic column to inspect, and the position to read in
# the ascending sort of that column (negative values count from the best score).
j = 0
i = -100
ind_sort = np.argsort(score_norm[:, j])
picked_row = ind_sort[i]

# Build the result table (this rebinds `df`), then print the spot check.
df = utils.tableResult(list_index, list_extractedpharses, score_norm)
print(
    list_extractedpharses[picked_row],
    listOfTopics[j],
    " sim score = ",
    score_norm[picked_row, j],
    " max_score: ",
    score_norm[:, j].max(),
)

• 8 882,57 € à titre de dommages et intérêts pour rupture anticipée du contrat de travail à durée déterminée ['rupture', 'abusive', 'relation', 'travail']  sim score =  0.6352747172124967  max_score:  1.0


In [2]:
# Display the DataFrame as the cell's output (`df` is the table built by
# utils.tableResult when the cells are run top to bottom).
df

### Sorting the table 

Present the table sorted so that the most "problematic" cases appear at the top, that is, rows in
which the score of the best topic is not far from the score of the second-best topic.

In [3]:
# Sort the result table with the project helper; the sorted table is displayed
# in the next cell (this cell only assigns it).
showResult = utils.sortTable(df)

In [4]:
# Display the sorted table as the cell's output.
showResult

### Compute second similarity with cosine similarity

In [None]:
# Compute the second similarity score (cosine similarity, per the section heading).
# Load the trained Doc2Vec model from disk.
model = Doc2Vec.load("doc2vec.model")

sims=utils.computeScore(alljudgements, listOfTopics, model)
# Zero out negative similarities: the boolean mask (sims>=0) multiplies
# negative entries by 0 and leaves the others unchanged. No upper bound is
# enforced here; the [0,1] range assumes the scores never exceed 1 -- confirm.
sims=(sims>=0)*sims

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


### Building the table for visualization

In [5]:
# Build the result table for the second similarity score, using the same
# helper and the same index/phrase lists as for the first score.
dfSecond = utils.tableResult(list_index,list_extractedpharses,sims)

### Sorting the table

In [None]:
# Sort the second result table; it is displayed in the next cell (this cell
# only assigns it).
showSecondResult = utils.sortTable(dfSecond)

In [None]:
# Display the sorted table for the second similarity score.
showSecondResult 

## Text for Evaluation

### Put the new judgements in the evaluation.jsonl file and execute the code below

Find the evaluation.jsonl file in the exercise directory; first run the reading and segmentation cells above to import the libraries and initialize the data.

In [15]:
# Put the new judgements in the evaluation file named evaluation.jsonl.

# Read evaluation.jsonl (JSON Lines). Note: this rebinds `df`.
evalFilename="evaluation.jsonl"
df = pd.read_json (evalFilename, lines=True, encoding='utf-8-sig')

# Pre-process the evaluation judgements with the same helper used for the
# main input; the three returned values are unpacked in order.
judgements, number_index, visualizepharses = utils.preprocessingInputs(df)

### Compute first similarity with Word Mover’s Distance (WMD)

In [16]:
# Load the trained Doc2Vec model from disk.
model= Doc2Vec.load("doc2vec.model")

# Compute the WMD-based score of every evaluation judgement against all topics.
# Note: this rebinds `scores`.
scores = utils.computeScoreWmd(judgements, listOfTopics, model)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [17]:
# Normalize the distances.
# Reference maxima returned by the project helper (contents not visible here).
max_col=utils.loadRefMaxScores()
# Map distances to similarities: a distance of 0 gives 1, larger distances give
# smaller values.
score_norm = 1./(1.+scores)
for j in range(score_norm.shape[1]):
    # NOTE(review): the current column maximum is appended to the END of
    # max_col, while the division below reads max_col[j]. If
    # loadRefMaxScores() already returns one entry per topic, the appended
    # values are never read and the columns are scaled by the reference maxima
    # only -- confirm which behaviour is intended.
    max_col.append(score_norm[:,j].max())
    score_norm[:,j]/=max_col[j]

# Spot check: topic column j, and position i in the ascending argsort of that
# column (-1 selects the highest score).
j = 0
ind_sort = np.argsort(score_norm[:,j])
i = -1


# Build the result table and print the spot-checked row.
evalShowResult = utils.tableResult(number_index, visualizepharses, score_norm)
print(visualizepharses[ind_sort[i]],listOfTopics[j]," sim score = ", score_norm[ind_sort[i],j], " max_score: ",score_norm[:,j].max())

# Sort the table; it is displayed in the next cell (this cell only assigns it).
evalShowResult = utils.sortTable(evalShowResult)

- 10 000 € à titre de dommages et intérêts pour résistance abusive, ['rupture', 'abusive', 'relation', 'travail']  sim score =  0.6210935026767687  max_score:  0.6210935026767687


In [18]:
# Display the sorted evaluation table for the first similarity score.
evalShowResult

Unnamed: 0,line,text,sim_a,sim_b,sim_c,sim_d,sim_e,sim_f,sim_g
16,0,INTIMEES :,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
232,1,30620 BERNIS,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
29,0,DEBATS :,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
222,1,81100 CASTRES,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
302,2,INTIMEE :,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,0,CAL,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
311,2,DEBATS :,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
351,2,- 10 000 € à titre de dommages et intérêts pou...,0.621094,0.529140,0.205024,0.692444,0.334194,0.055145,0.053998
403,2,La demande de dommages et intérêts pour résist...,0.535329,0.430229,0.202727,0.660831,0.308939,0.051313,0.050145
209,1,CHAMBRE CIVILE,0.516884,0.394045,0.206076,0.666254,0.342923,0.058023,0.056927


### Compute second similarity with cosine similarity

In [19]:
# Compute the second similarity score for the evaluation judgements, reusing
# the `model` loaded in an earlier cell.
sims2 = utils.computeScore(judgements,listOfTopics,model)
# Zero out negative similarities: the boolean mask (sims2>=0) multiplies
# negative entries by 0 and leaves the others unchanged. No upper bound is
# enforced here; the [0,1] range assumes the scores never exceed 1 -- confirm.
sims2=(sims2>=0)*sims2

In [20]:
# Build the evaluation table for the second similarity score.
evalShowResult2 = utils.tableResult(number_index, visualizepharses, sims2 )
# Sort the table; it is displayed in the next cell (this cell only assigns it).
evalShowResult2 = utils.sortTable(evalShowResult2)

In [21]:
# Display the sorted evaluation table for the second similarity score.
evalShowResult2

Unnamed: 0,line,text,sim_a,sim_b,sim_c,sim_d,sim_e,sim_f,sim_g
16,0,INTIMEES :,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
29,0,DEBATS :,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
232,1,30620 BERNIS,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,0,CAL,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
302,2,INTIMEE :,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
311,2,DEBATS :,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
222,1,81100 CASTRES,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
399,2,Les demandes de dommages et intérêts :,0.105961,0.246388,0.128819,0.186824,0.198868,0.246265,0.221777
18,0,"Non comparant, n'ayant pas constitué avocat",0.152776,0.126987,0.152571,0.098065,0.198852,0.195117,0.198579
52,0,débouté Mme J... de sa demande en responsabil...,0.191868,0.192227,0.083051,0.088152,0.128751,0.145744,0.111601


### Methodology to identify topics at word level

### Improvements