Encoding categorical variables and Preparing sentences for encoding

In [1]:
from datasets import load_dataset
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print("Loading the Swiss Doc2Doc IR dataset...")
ds = load_dataset("rcds/swiss_doc2doc_ir")
print("Dataset loaded successfully.")

Loading the Swiss Doc2Doc IR dataset...
Dataset loaded successfully.


In [3]:
df=pd.DataFrame(ds['test'])

df=df.drop(['chamber', 'region', 'origin_court', 'origin_canton', 'origin_chamber', 'law_sub_area', 'year','cited_rulings','rulings'], axis=1)

import ast

df['laws'] = df['laws'].map(ast.literal_eval)
df = df.fillna('none')#law_area
df['facts'] = df['facts'].replace("", "none")
df['considerations'] = df['considerations'].replace("", "none")
df_no_empty=df[df['laws'].apply(len) != 0].reset_index(drop=True)#empty laws
df_no_empty

Unnamed: 0,decision_id,language,facts,considerations,law_area,laws
0,0002f37e-0cfa-41df-8ce0-ec9411ccca2f,fr,none,"Considérant :\nque le 1er avril 2019, A._ a an...",social_law,"[75488867-c001-4eb9-93b9-04264ea91f55, 6ab3892..."
1,00040af4-cd41-4d95-a60b-64b1bf5b5c3b,de,none,Der Präsident zieht in Erwägung:\n1.\nDie Part...,penal_law,"[75488867-c001-4eb9-93b9-04264ea91f55, 6ab3892..."
2,0004e346-7f68-402b-bd25-f9a560630959,de,"Sachverhalt:\nA.\nA.a. Bei A._, geboren 1988, ...",Erwägungen:\n1.\n1.1. Die Beschwerde in öffent...,social_law,"[75488867-c001-4eb9-93b9-04264ea91f55, fc8edf4..."
3,00062164-3d14-4956-a749-8658665ce503,fr,Vu :\nle recours en matière civile formé le 1e...,"considérant :\nque, vu ce qui précède, le prés...",civil_law,"[75488867-c001-4eb9-93b9-04264ea91f55, 6ab3892..."
4,00073fde-172e-4019-9163-98e30509ef22,fr,Considérant en fait et en droit :\n1.\nPar arr...,none,civil_law,"[75488867-c001-4eb9-93b9-04264ea91f55, 6ab3892..."
...,...,...,...,...,...,...
27256,fff27ed4-e8b6-41d3-81aa-131cbe98b81d,de,none,Erwägungen:\n1.\n1.1. Am 31. Oktober 2017 eröf...,civil_law,"[75488867-c001-4eb9-93b9-04264ea91f55, 6ab3892..."
27257,fff40230-89e1-4a9a-a2bb-c975e725afe5,fr,"Faits :\nA.\nA.a. Le 23 décembre 2015, l'Aérop...",Considérant en droit :\n1.\n1.1. L'arrêt attaq...,public_law,"[75488867-c001-4eb9-93b9-04264ea91f55, d130972..."
27258,fff79bd6-391b-4ba4-8089-bee384b7ee87,fr,Considérant en fait et en droit :\n1.\n1.1. Le...,none,public_law,"[75488867-c001-4eb9-93b9-04264ea91f55, 6ab3892..."
27259,fffbe655-d320-4619-bb0a-45eb18f67def,de,"Sachverhalt:\nA.\nA.a. Der 1976 geborene A._, ...",Erwägungen:\n1.\nMit der Beschwerde in öffentl...,social_law,"[75488867-c001-4eb9-93b9-04264ea91f55, fc8edf4..."


In [7]:
#looking of the number of occurances of laws in entire datasets

laws=dict()
for i in df_no_empty['laws']:
    for j in i:
        if j in laws.keys():
            laws[j]+=1
        else:
            laws[j]=1
laws = dict(sorted(laws.items(), key=lambda item: item[1], reverse=True))#sorted by the number of occurances, so while encoding, it is made sure that the order wont change
laws

{'75488867-c001-4eb9-93b9-04264ea91f55': 53749,
 'e6b06567-1236-4210-adb3-e11c26e497d5': 53749,
 '6ab38922-6309-4021-83cc-56d776d7a332': 53749,
 '1af9b596-92d7-4f80-a38b-876ed88ccfe5': 16383,
 '53be6a03-1fd8-4980-aa5c-bd81e9a54d5e': 16383,
 '4b5a2135-fee2-4e3b-811e-15ce1c71bddf': 16383,
 'fd3f1dd4-81dd-482c-940d-91c71e29fee8': 9632,
 'e471493b-7d96-4f78-8369-7b3fb8418e74': 9632,
 '89e314c7-7926-4df2-aa9e-12fdcca58e86': 9632,
 '561feeae-8bdb-4e24-90f7-2c23eb5099a5': 8603,
 'ddbf4d98-14f9-4ccd-ad84-42704f0249cb': 8603,
 'af2af8b1-3414-4898-810f-7b0a1cb95a78': 8603,
 '0671c3a0-7575-4b8a-acae-46315869092c': 8420,
 '83fa99cf-24f4-434e-8b62-bb3e53480836': 8420,
 '7148876a-f26f-4346-9d2c-4731c827da4d': 8420,
 '04bf6369-99cb-41fa-8aff-413679bc8c18': 8286,
 '70d82e7f-fc70-45d6-b607-cd4654badc02': 8286,
 'ea2f453b-fc14-40f4-81ea-83272acf5c89': 8286,
 'dfe4a079-8090-463c-84d3-b72b354fdc7b': 6873,
 '18eb66aa-f83a-4e6a-8422-39b4549f7f1a': 6873,
 '9f26d259-d6e6-4bfa-99b5-f8fded5667bc': 6873,
 'c0730

In [8]:
#selecting only the most occuring laws

selected_laws=[]

for i in laws.items():
    if i[1]>=50:
        selected_laws.append(i[0])
print(selected_laws)
print(len(selected_laws))

['75488867-c001-4eb9-93b9-04264ea91f55', 'e6b06567-1236-4210-adb3-e11c26e497d5', '6ab38922-6309-4021-83cc-56d776d7a332', '1af9b596-92d7-4f80-a38b-876ed88ccfe5', '53be6a03-1fd8-4980-aa5c-bd81e9a54d5e', '4b5a2135-fee2-4e3b-811e-15ce1c71bddf', 'fd3f1dd4-81dd-482c-940d-91c71e29fee8', 'e471493b-7d96-4f78-8369-7b3fb8418e74', '89e314c7-7926-4df2-aa9e-12fdcca58e86', '561feeae-8bdb-4e24-90f7-2c23eb5099a5', 'ddbf4d98-14f9-4ccd-ad84-42704f0249cb', 'af2af8b1-3414-4898-810f-7b0a1cb95a78', '0671c3a0-7575-4b8a-acae-46315869092c', '83fa99cf-24f4-434e-8b62-bb3e53480836', '7148876a-f26f-4346-9d2c-4731c827da4d', '04bf6369-99cb-41fa-8aff-413679bc8c18', '70d82e7f-fc70-45d6-b607-cd4654badc02', 'ea2f453b-fc14-40f4-81ea-83272acf5c89', 'dfe4a079-8090-463c-84d3-b72b354fdc7b', '18eb66aa-f83a-4e6a-8422-39b4549f7f1a', '9f26d259-d6e6-4bfa-99b5-f8fded5667bc', 'c0730338-7fbb-4486-9e57-201f62440bc0', '09200a94-c90a-48da-a4db-840a951e3cec', '629b13cd-cf95-4caf-b21d-8e8c9a7dcffa', '3f3369fd-4067-4b2e-a129-a9675f22092b',

#write the law into txt file for future use of encoding
with open("selected laws.txt", 'w') as f:
    for item in selected_laws:
      f.write(f"{item}\n")

In [4]:
#using this to encode val
with open("selected laws.txt", 'r') as f:
    selected_laws = f.readlines()
    selected_laws = [line.strip() for line in selected_laws]
selected_laws

['75488867-c001-4eb9-93b9-04264ea91f55',
 'e6b06567-1236-4210-adb3-e11c26e497d5',
 '6ab38922-6309-4021-83cc-56d776d7a332',
 '1af9b596-92d7-4f80-a38b-876ed88ccfe5',
 '53be6a03-1fd8-4980-aa5c-bd81e9a54d5e',
 '4b5a2135-fee2-4e3b-811e-15ce1c71bddf',
 'fd3f1dd4-81dd-482c-940d-91c71e29fee8',
 'e471493b-7d96-4f78-8369-7b3fb8418e74',
 '89e314c7-7926-4df2-aa9e-12fdcca58e86',
 '561feeae-8bdb-4e24-90f7-2c23eb5099a5',
 'ddbf4d98-14f9-4ccd-ad84-42704f0249cb',
 'af2af8b1-3414-4898-810f-7b0a1cb95a78',
 '0671c3a0-7575-4b8a-acae-46315869092c',
 '83fa99cf-24f4-434e-8b62-bb3e53480836',
 '7148876a-f26f-4346-9d2c-4731c827da4d',
 '04bf6369-99cb-41fa-8aff-413679bc8c18',
 '70d82e7f-fc70-45d6-b607-cd4654badc02',
 'ea2f453b-fc14-40f4-81ea-83272acf5c89',
 'dfe4a079-8090-463c-84d3-b72b354fdc7b',
 '18eb66aa-f83a-4e6a-8422-39b4549f7f1a',
 '9f26d259-d6e6-4bfa-99b5-f8fded5667bc',
 'c0730338-7fbb-4486-9e57-201f62440bc0',
 '09200a94-c90a-48da-a4db-840a951e3cec',
 '629b13cd-cf95-4caf-b21d-8e8c9a7dcffa',
 '3f3369fd-4067-

In [5]:
#creating a framework which is used to encode

framework=dict()
for i in selected_laws:
    framework[i]=0
framework

{'75488867-c001-4eb9-93b9-04264ea91f55': 0,
 'e6b06567-1236-4210-adb3-e11c26e497d5': 0,
 '6ab38922-6309-4021-83cc-56d776d7a332': 0,
 '1af9b596-92d7-4f80-a38b-876ed88ccfe5': 0,
 '53be6a03-1fd8-4980-aa5c-bd81e9a54d5e': 0,
 '4b5a2135-fee2-4e3b-811e-15ce1c71bddf': 0,
 'fd3f1dd4-81dd-482c-940d-91c71e29fee8': 0,
 'e471493b-7d96-4f78-8369-7b3fb8418e74': 0,
 '89e314c7-7926-4df2-aa9e-12fdcca58e86': 0,
 '561feeae-8bdb-4e24-90f7-2c23eb5099a5': 0,
 'ddbf4d98-14f9-4ccd-ad84-42704f0249cb': 0,
 'af2af8b1-3414-4898-810f-7b0a1cb95a78': 0,
 '0671c3a0-7575-4b8a-acae-46315869092c': 0,
 '83fa99cf-24f4-434e-8b62-bb3e53480836': 0,
 '7148876a-f26f-4346-9d2c-4731c827da4d': 0,
 '04bf6369-99cb-41fa-8aff-413679bc8c18': 0,
 '70d82e7f-fc70-45d6-b607-cd4654badc02': 0,
 'ea2f453b-fc14-40f4-81ea-83272acf5c89': 0,
 'dfe4a079-8090-463c-84d3-b72b354fdc7b': 0,
 '18eb66aa-f83a-4e6a-8422-39b4549f7f1a': 0,
 '9f26d259-d6e6-4bfa-99b5-f8fded5667bc': 0,
 'c0730338-7fbb-4486-9e57-201f62440bc0': 0,
 '09200a94-c90a-48da-a4db-840a95

In [6]:
def change(laws):#takes the list of laws and outputs the encoded version
    l=framework.copy()
    for i in laws:
        if i in l.keys():
            l[i]=1
    return list(l.values())

In [7]:
#encoding the data

encoded_laws=[]

for i in df_no_empty['laws']:
    encoded_laws.append(change(i))

In [8]:
df=df_no_empty.drop('laws', axis=1)
df['onehot_laws']=encoded_laws
df#concatenating the encoded version

Unnamed: 0,decision_id,language,facts,considerations,law_area,onehot_laws
0,0002f37e-0cfa-41df-8ce0-ec9411ccca2f,fr,none,"Considérant :\nque le 1er avril 2019, A._ a an...",social_law,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,00040af4-cd41-4d95-a60b-64b1bf5b5c3b,de,none,Der Präsident zieht in Erwägung:\n1.\nDie Part...,penal_law,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,0004e346-7f68-402b-bd25-f9a560630959,de,"Sachverhalt:\nA.\nA.a. Bei A._, geboren 1988, ...",Erwägungen:\n1.\n1.1. Die Beschwerde in öffent...,social_law,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,00062164-3d14-4956-a749-8658665ce503,fr,Vu :\nle recours en matière civile formé le 1e...,"considérant :\nque, vu ce qui précède, le prés...",civil_law,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,00073fde-172e-4019-9163-98e30509ef22,fr,Considérant en fait et en droit :\n1.\nPar arr...,none,civil_law,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...
27256,fff27ed4-e8b6-41d3-81aa-131cbe98b81d,de,none,Erwägungen:\n1.\n1.1. Am 31. Oktober 2017 eröf...,civil_law,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
27257,fff40230-89e1-4a9a-a2bb-c975e725afe5,fr,"Faits :\nA.\nA.a. Le 23 décembre 2015, l'Aérop...",Considérant en droit :\n1.\n1.1. L'arrêt attaq...,public_law,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
27258,fff79bd6-391b-4ba4-8089-bee384b7ee87,fr,Considérant en fait et en droit :\n1.\n1.1. Le...,none,public_law,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
27259,fffbe655-d320-4619-bb0a-45eb18f67def,de,"Sachverhalt:\nA.\nA.a. Der 1976 geborene A._, ...",Erwägungen:\n1.\nMit der Beschwerde in öffentl...,social_law,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [9]:
#returns false if a document has no selected laws(all 0s)

def check(l):
    for i in l:
        if i > 0:
            return True
    return False

In [10]:
#removes the documents with no selected laws
df_no_empty=df[df['onehot_laws'].apply(check) == True].reset_index(drop=True)
df_no_empty

Unnamed: 0,decision_id,language,facts,considerations,law_area,onehot_laws
0,0002f37e-0cfa-41df-8ce0-ec9411ccca2f,fr,none,"Considérant :\nque le 1er avril 2019, A._ a an...",social_law,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,00040af4-cd41-4d95-a60b-64b1bf5b5c3b,de,none,Der Präsident zieht in Erwägung:\n1.\nDie Part...,penal_law,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,0004e346-7f68-402b-bd25-f9a560630959,de,"Sachverhalt:\nA.\nA.a. Bei A._, geboren 1988, ...",Erwägungen:\n1.\n1.1. Die Beschwerde in öffent...,social_law,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,00062164-3d14-4956-a749-8658665ce503,fr,Vu :\nle recours en matière civile formé le 1e...,"considérant :\nque, vu ce qui précède, le prés...",civil_law,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,00073fde-172e-4019-9163-98e30509ef22,fr,Considérant en fait et en droit :\n1.\nPar arr...,none,civil_law,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...
27256,fff27ed4-e8b6-41d3-81aa-131cbe98b81d,de,none,Erwägungen:\n1.\n1.1. Am 31. Oktober 2017 eröf...,civil_law,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
27257,fff40230-89e1-4a9a-a2bb-c975e725afe5,fr,"Faits :\nA.\nA.a. Le 23 décembre 2015, l'Aérop...",Considérant en droit :\n1.\n1.1. L'arrêt attaq...,public_law,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
27258,fff79bd6-391b-4ba4-8089-bee384b7ee87,fr,Considérant en fait et en droit :\n1.\n1.1. Le...,none,public_law,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
27259,fffbe655-d320-4619-bb0a-45eb18f67def,de,"Sachverhalt:\nA.\nA.a. Der 1976 geborene A._, ...",Erwägungen:\n1.\nMit der Beschwerde in öffentl...,social_law,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [11]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()
encoded = encoder.fit_transform(df_no_empty[['law_area']])
encoded_df = pd.DataFrame(encoded.toarray(), columns=encoder.get_feature_names_out(['law_area']))
df_no_empty['onehot_law_area']=encoded_df.values.tolist()

encoder = OneHotEncoder()
encoded = encoder.fit_transform(df_no_empty[['language']])
encoded_df = pd.DataFrame(encoded.toarray(), columns=encoder.get_feature_names_out(['language']))
df_no_empty['onehot_language']=encoded_df.values.tolist()

df= df_no_empty.drop(['law_area', 'language'], axis=1)
df

Unnamed: 0,decision_id,facts,considerations,onehot_laws,onehot_law_area,onehot_language
0,0002f37e-0cfa-41df-8ce0-ec9411ccca2f,none,"Considérant :\nque le 1er avril 2019, A._ a an...","[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 1.0]","[0.0, 1.0, 0.0]"
1,00040af4-cd41-4d95-a60b-64b1bf5b5c3b,none,Der Präsident zieht in Erwägung:\n1.\nDie Part...,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 1.0, 0.0, 0.0]","[1.0, 0.0, 0.0]"
2,0004e346-7f68-402b-bd25-f9a560630959,"Sachverhalt:\nA.\nA.a. Bei A._, geboren 1988, ...",Erwägungen:\n1.\n1.1. Die Beschwerde in öffent...,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 1.0]","[1.0, 0.0, 0.0]"
3,00062164-3d14-4956-a749-8658665ce503,Vu :\nle recours en matière civile formé le 1e...,"considérant :\nque, vu ce qui précède, le prés...","[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1.0, 0.0, 0.0, 0.0, 0.0]","[0.0, 1.0, 0.0]"
4,00073fde-172e-4019-9163-98e30509ef22,Considérant en fait et en droit :\n1.\nPar arr...,none,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1.0, 0.0, 0.0, 0.0, 0.0]","[0.0, 1.0, 0.0]"
...,...,...,...,...,...,...
27256,fff27ed4-e8b6-41d3-81aa-131cbe98b81d,none,Erwägungen:\n1.\n1.1. Am 31. Oktober 2017 eröf...,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1.0, 0.0, 0.0, 0.0, 0.0]","[1.0, 0.0, 0.0]"
27257,fff40230-89e1-4a9a-a2bb-c975e725afe5,"Faits :\nA.\nA.a. Le 23 décembre 2015, l'Aérop...",Considérant en droit :\n1.\n1.1. L'arrêt attaq...,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 1.0, 0.0]","[0.0, 1.0, 0.0]"
27258,fff79bd6-391b-4ba4-8089-bee384b7ee87,Considérant en fait et en droit :\n1.\n1.1. Le...,none,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 1.0, 0.0]","[0.0, 1.0, 0.0]"
27259,fffbe655-d320-4619-bb0a-45eb18f67def,"Sachverhalt:\nA.\nA.a. Der 1976 geborene A._, ...",Erwägungen:\n1.\nMit der Beschwerde in öffentl...,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 1.0]","[1.0, 0.0, 0.0]"


In [12]:
df.to_csv('selected_laws_test.csv', index=False)

Encoding the sentences

In [None]:
df= pd.read_csv("selected_laws_test.csv")
df

Unnamed: 0,decision_id,facts,considerations,onehot_laws,onehot_law_area,onehot_language
0,000127ef-17d2-4ded-8621-c0c962c18fd5,"Sachverhalt:\nA.\nA._, geboren 1954, war ab No...",Erwägungen:\n1.\n1.1. Die Beschwerde kann wege...,"[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 1.0]","[1.0, 0.0, 0.0]"
1,00015fba-e922-4f05-ae7c-7cfcb823ff54,"Faits :\nA.\nA.a. En 1996, B.X._, domicilié à ...",Considérant en droit :\nI. Recevabilité et poi...,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 1.0, 0.0]","[0.0, 1.0, 0.0]"
2,0001f593-c8af-4b97-8811-99963dfac084,Considérant en fait et en droit:\n1.\nPar ordo...,none,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 1.0, 0.0]","[0.0, 1.0, 0.0]"
3,0003a8f8-ea59-41bb-b3d6-dd58de43ef44,Sachverhalt:\nA. Die Staatsanwaltschaft des Ka...,Erwägungen:\n1. Gemäss <ref-law> beurteilt das...,"[1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, ...","[0.0, 0.0, 0.0, 1.0, 0.0]","[1.0, 0.0, 0.0]"
4,00074c77-8f73-475c-a7fa-94fd080cf449,"Sachverhalt:\nA. A.A._ und B.A._ (Verpächter, ...",Erwägungen:\n1. 1.1 Die Beschwerde richtet sic...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1.0, 0.0, 0.0, 0.0, 0.0]","[1.0, 0.0, 0.0]"
...,...,...,...,...,...,...
77634,fff9abfd-110b-4ff1-aa33-1d4bb4432100,Sachverhalt:\nA. X._ veräusserte am 13. Juli 2...,Erwägungen:\n1. 1.1 Beim angefochtenen Entsche...,"[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, ...","[0.0, 0.0, 0.0, 1.0, 0.0]","[1.0, 0.0, 0.0]"
77635,fffa49c0-72d8-4ea3-a9e5-53aecc43476f,Sachverhalt:\nSachverhalt:\nA. Die 1960 gebore...,Das Bundesgericht zieht in Erwägung:\nDas Bund...,"[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 1.0]","[1.0, 0.0, 0.0]"
77636,fffa6d91-5035-4165-88c9-98dfefd4e32e,Sachverhalt:\nA.a Mit Eheschutzentscheid vom 2...,Das Bundesgericht zieht in Erwägung:\n1. 1.1 A...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...","[1.0, 0.0, 0.0, 0.0, 0.0]","[1.0, 0.0, 0.0]"
77637,fffbe741-1a9a-4b9f-bc50-dc20de82b507,Nach Einsicht\nin die Beschwerde vom 4. April ...,"in Erwägung,\ndass ein Rechtsmittel gemäss Art...","[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 1.0]","[1.0, 0.0, 0.0]"


In [None]:
from sentence_transformers import SentenceTransformer

print("Loading the Sentence Transformer model...")
model = SentenceTransformer('all-MiniLM-L6-v2')
print("Model loaded successfully.")

Loading the Sentence Transformer model...




Model loaded successfully.


In [None]:
fact_vectors=[]
for i in df['facts'].values:
    fact_vectors.append(model.encode(i))

considerations_vectors=[]
for i in df['considerations'].values:
    considerations_vectors.append(model.encode(i))


In [None]:
fact_vectors=[i.tolist() for i in fact_vectors]
considerations_vectors=[i.tolist() for i in considerations_vectors]

In [None]:
df['encoded_facts']=fact_vectors
df['encoded_considerations']=considerations_vectors
df= df.drop(['facts', 'considerations'], axis=1)
df.to_csv('selected_laws_final.csv', index=False)

In [2]:
import ast
df=pd.read_csv("selected_laws_test_final.csv")


In [3]:
language_val=np.array([ast.literal_eval(i) for i in df['onehot_language']])
law_area_val=np.array([ast.literal_eval(i) for i in df['onehot_law_area']])
facts_val=np.array([ast.literal_eval(i) for i in df['encoded_facts']])
considerations_val=np.array([ast.literal_eval(i) for i in df['encoded_considerations']])
laws_val=np.array([ast.literal_eval(i) for i in df['onehot_laws']])

In [5]:
df=pd.DataFrame()

for i in range(language_val.shape[1]):  
    df['language_'+str(i+1)]=language_val[:,i]
for i in range(law_area_val.shape[1]):
    df['law_area_'+str(i+1)]=law_area_val[:,i]
for i in range(facts_val.shape[1]):
    df['facts_'+str(i+1)]=facts_val[:,i]
for i in range(considerations_val.shape[1]):
    df['considerations_'+str(i+1)]=considerations_val[:,i]
for i in range(laws_val.shape[1]):
    df['laws_'+str(i+1)]=laws_val[:,i]


  df['facts_'+str(i+1)]=facts_val[:,i]
  df['facts_'+str(i+1)]=facts_val[:,i]
  df['facts_'+str(i+1)]=facts_val[:,i]
  df['facts_'+str(i+1)]=facts_val[:,i]
  df['facts_'+str(i+1)]=facts_val[:,i]
  df['facts_'+str(i+1)]=facts_val[:,i]
  df['facts_'+str(i+1)]=facts_val[:,i]
  df['facts_'+str(i+1)]=facts_val[:,i]
  df['facts_'+str(i+1)]=facts_val[:,i]
  df['facts_'+str(i+1)]=facts_val[:,i]
  df['facts_'+str(i+1)]=facts_val[:,i]
  df['facts_'+str(i+1)]=facts_val[:,i]
  df['facts_'+str(i+1)]=facts_val[:,i]
  df['facts_'+str(i+1)]=facts_val[:,i]
  df['facts_'+str(i+1)]=facts_val[:,i]
  df['facts_'+str(i+1)]=facts_val[:,i]
  df['facts_'+str(i+1)]=facts_val[:,i]
  df['facts_'+str(i+1)]=facts_val[:,i]
  df['facts_'+str(i+1)]=facts_val[:,i]
  df['facts_'+str(i+1)]=facts_val[:,i]
  df['facts_'+str(i+1)]=facts_val[:,i]
  df['facts_'+str(i+1)]=facts_val[:,i]
  df['facts_'+str(i+1)]=facts_val[:,i]
  df['facts_'+str(i+1)]=facts_val[:,i]
  df['facts_'+str(i+1)]=facts_val[:,i]
  df['facts_'+str(i+1)]=f

In [6]:
if input("You wanna overwrite this?")=='s':
    df.to_csv('sep_selected_laws_test_final.csv', index=False)