In [1]:
from datasets import load_dataset
import numpy as np
import pandas as pd
import faiss

In [None]:
# Load the Swiss Doc2Doc IR dataset through `load_dataset`; the two status
# messages bracket the (potentially slow) call.
dataset_name = "rcds/swiss_doc2doc_ir"
print("Loading the Swiss Doc2Doc IR dataset...")
ds = load_dataset(dataset_name)
print("Dataset loaded successfully.")

In [None]:
import ast

# Metadata columns that are not used further down and are removed up front.
unused_columns = ['chamber', 'region', 'origin_court', 'origin_canton', 'origin_chamber', 'law_sub_area', 'year','cited_rulings','rulings']

# Clean the training split in one pass:
#   1. drop the unused metadata columns,
#   2. parse the string-encoded `laws` values into Python objects,
#   3. replace missing values with the placeholder 'none'
#      (the original note singles out law_area here),
#   4. replace empty `facts` / `considerations` strings with the same placeholder.
df = (
    pd.DataFrame(ds['train'])
    .drop(columns=unused_columns)
    .assign(laws=lambda frame: frame['laws'].map(ast.literal_eval))
    .fillna('none')
    .assign(
        facts=lambda frame: frame['facts'].replace("", "none"),
        considerations=lambda frame: frame['considerations'].replace("", "none"),
    )
)

# Keep only the rows whose parsed `laws` value is non-empty, renumbering from 0.
has_laws = df['laws'].map(len) > 0
df_no_empty = df.loc[has_laws].reset_index(drop=True)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode each categorical column and store every row's indicator
# vector as a plain Python list in a new 'onehot_<column>' column.
# law_area is processed first, then language, so `encoder`, `encoded` and
# `encoded_df` end up describing the language encoding.
for categorical_col in ('law_area', 'language'):
    encoder = OneHotEncoder()
    encoded = encoder.fit_transform(df_no_empty[[categorical_col]])
    encoded_df = pd.DataFrame(
        encoded.toarray(),
        columns=encoder.get_feature_names_out([categorical_col]),
    )
    df_no_empty['onehot_' + categorical_col] = encoded_df.values.tolist()

# The raw categorical columns are dropped now that their encodings exist.
df = df_no_empty.drop(columns=['law_area', 'language'])
df

In [None]:
from sentence_transformers import SentenceTransformer

# Identifier of the pretrained checkpoint handed to SentenceTransformer.
model_name = 'all-MiniLM-L6-v2'

print("Loading the Sentence Transformer model...")
model = SentenceTransformer(model_name)
print("Model loaded successfully.")

In [None]:
# Embed the two free-text columns with the sentence-transformer `model`,
# attach the embeddings to the frame, and persist the result as CSV.
#
# `model.encode` is given the whole column at once so that it can batch the
# texts internally; calling it once per row (as before) pays the per-call
# overhead for every document. Each result is a 2-D array with one row per
# text, and `.tolist()` turns it into the same list-of-lists-of-floats that
# the per-row version produced (values may differ by floating-point noise
# because of batching).
fact_vectors = model.encode(
    df['facts'].tolist(), show_progress_bar=True
).tolist()
considerations_vectors = model.encode(
    df['considerations'].tolist(), show_progress_bar=True
).tolist()

df['encoded_facts'] = fact_vectors
df['encoded_considerations'] = considerations_vectors

# The raw text is not written to the CSV, only its embeddings.
# NOTE: this reassigns `df` without the text columns, so the cell cannot be
# re-run without first re-running the cells that build `df`.
df = df.drop(columns=['facts', 'considerations'])
df.to_csv('selected_laws_final.csv', index=False)

In [2]:
# Reload the persisted feature table from disk and display it.
csv_path = 'selected_laws_final.csv'
df = pd.read_csv(csv_path)
df

Unnamed: 0,decision_id,facts,considerations,language,law_area
0,000127ef-17d2-4ded-8621-c0c962c18fd5,"[-0.01580595038831234, 0.04269539937376976, 0....","[-0.13322238624095917, 0.013987185433506966, -...","[1.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 1.0]"
1,00015fba-e922-4f05-ae7c-7cfcb823ff54,"[-0.04270520433783531, -0.007236811798065901, ...","[-0.0926031619310379, 0.06950176507234573, 0.0...","[0.0, 1.0, 0.0]","[0.0, 0.0, 0.0, 1.0, 0.0]"
2,0001f593-c8af-4b97-8811-99963dfac084,"[-0.032951369881629944, 0.09559076279401779, -...","[0.0024044474121183157, -0.04070058465003967, ...","[0.0, 1.0, 0.0]","[0.0, 0.0, 0.0, 1.0, 0.0]"
3,0003a8f8-ea59-41bb-b3d6-dd58de43ef44,"[-0.098425954580307, -0.021627023816108704, -0...","[-0.06753509491682053, 0.09308979660272598, -0...","[1.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 1.0, 0.0]"
4,00053c67-af13-4dad-baf2-f54521bfed52,"[-0.08686883002519608, 0.009472505189478397, 0...","[-0.08257345110177994, 0.019399845972657204, -...","[0.0, 1.0, 0.0]","[1.0, 0.0, 0.0, 0.0, 0.0]"
...,...,...,...,...,...
95351,fffa6d91-5035-4165-88c9-98dfefd4e32e,"[-0.11424097418785095, 0.0221572145819664, -0....","[-0.07749581336975098, 0.08016350865364075, -0...","[1.0, 0.0, 0.0]","[1.0, 0.0, 0.0, 0.0, 0.0]"
95352,fffb6fda-cb9f-4d35-b517-be5a26af9988,"[-0.08653483539819717, 0.03897037357091904, 0....","[-0.15756599605083466, 0.06037058308720589, 0....","[0.0, 1.0, 0.0]","[0.0, 0.0, 0.0, 1.0, 0.0]"
95353,fffbe741-1a9a-4b9f-bc50-dc20de82b507,"[-0.03299336135387421, -0.01237813476473093, 0...","[-0.13550086319446564, 0.015467672608792782, -...","[1.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 1.0]"
95354,fffceb6f-dcfb-4a4e-bc3f-e7612bc88edb,"[-0.07127238065004349, 0.041802339255809784, -...","[-0.10940168052911758, 0.025935346260666847, -...","[1.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 1.0, 0.0]"


In [11]:
# Build one flat feature vector per row by concatenating, in this order:
# language, law_area, facts, considerations. The vectors are then stored in a
# flat L2 FAISS index that is written to disk.

import ast

# For each feature, the column names it may appear under, in order of
# preference. The short names are what this cell originally read; the
# prefixed names are what the embedding cell writes to
# 'selected_laws_final.csv'. Accepting both keeps this cell working whichever
# version of the CSV is on disk.
FEATURE_COLUMN_CANDIDATES = [
    ('language', 'onehot_language'),
    ('law_area', 'onehot_law_area'),
    ('facts', 'encoded_facts'),
    ('considerations', 'encoded_considerations'),
]


def _resolve_column(frame, candidates):
    """Return the first name in `candidates` that is a column of `frame`.

    Raises:
        KeyError: if none of the candidate names is present.
    """
    for name in candidates:
        if name in frame.columns:
            return name
    raise KeyError(
        f"None of the columns {list(candidates)} found; "
        f"available columns: {list(frame.columns)}"
    )


def _parse_vector(cell):
    """Return `cell` as a list of numbers.

    Values read back from CSV are strings such as "[1.0, 0.0]" and are parsed
    with `ast.literal_eval`; values that are already sequences are copied.
    """
    if isinstance(cell, str):
        return ast.literal_eval(cell)
    return list(cell)


feature_columns = [
    _resolve_column(df, candidates) for candidates in FEATURE_COLUMN_CANDIDATES
]

# One concatenated Python list per row, in FEATURE_COLUMN_CANDIDATES order.
vectors = [
    [value for cell in row for value in _parse_vector(cell)]
    for row in df[feature_columns].itertuples(index=False, name=None)
]
vectors = np.array(vectors, dtype='float32')

# An empty frame yields a 1-D array, for which `shape[1]` would raise an
# opaque IndexError; fail with a clear message instead.
if vectors.ndim != 2:
    raise ValueError(
        f"Expected a 2-D feature matrix but got shape {vectors.shape}; "
        "is the DataFrame empty?"
    )

dim = vectors.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(vectors)
faiss.write_index(index, 'language_languagearea_fact_consider.index')