In [3]:
import sys

sys.path.append("../src")

import pandas as pd
import getpass
import os

from rag_preprocess import Preprocessing
from date_normalization import DateNormalization
from retrieval import RetrievalAugmentedGenerator
from metrics import Metrics

# Import and preprocess our data

In [2]:
# Read the passages CSV and hand it straight to the project's Preprocessing
# helper; only the frame returned by `remove_stopwords()` is kept as `ner_df`.
preprocess = Preprocessing(pd.read_csv("../ner_dfs/passages_dates.csv"))
ner_df = preprocess.remove_stopwords()

# Preview the first rows of the preprocessed frame.
ner_df.head()

Unnamed: 0,filename,texte,date_accident,date_consolidation
0,Agen_100515.txt,Le 12 11 2019 Cour appel Agen chambre sociale ...,1991-04-09,n.c.
1,Agen_1100752.txt,Le 12 11 2019 Cour appel Agen chambre civile A...,2005-06-10,2010-01-19
2,Agen_1613.txt,Le 12 11 2019 Cour appel Agen Audience publiqu...,1997-09-26,n.c.
3,Agen_2118.txt,Le 12 11 2019 Cour appel Agen Audience publiqu...,1982-08-07,1982-11-07
4,Agen_21229.txt,Le 12 11 2019 Cour appel Agen Audience publiqu...,1996-11-26,n.c.


# Insert your OpenAI API key here

In [3]:
# Prompt for the key only when the environment does not already hold a
# non-empty OPENAI_API_KEY, so "Restart & Run All" can run unattended when the
# key is exported beforehand. getpass reads the key without echoing it, which
# keeps it out of the saved notebook output.
# To enter a different key in a live kernel, run
# `del os.environ["OPENAI_API_KEY"]` and execute this cell again.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI key:")

# Choose your OpenAI and Embedding models

In [6]:
# OpenAI model name passed to RetrievalAugmentedGenerator in the retrieval cells.
OPENAI_MODEL = "gpt-3.5-turbo"
# Embedding model name passed to the same constructor.
# NOTE(review): looks like a multilingual sentence-embedding checkpoint — confirm
# how src/retrieval.py resolves this name.
EMBEDDING_MODEL_NAME = "xlm-r-distilroberta-base-paraphrase-v1"

# We ask our RAG to predict the date of the accident

In [None]:
# Question sent to the RAG pipeline for the accident date.
query = "what is the date of accident?"

# Build the generator from the preprocessed frame, the question and the two
# model names chosen in the configuration cell.
my_RAG = RetrievalAugmentedGenerator(
    ner_df,
    query,
    OPENAI_MODEL,
    EMBEDDING_MODEL_NAME,
)

# Keep whatever `retriver()` returns; it is stored later as the
# `date_accident_pred` column.
# NOTE(review): `retriver` is the method name as exposed by the project class
# (presumably a misspelling of "retriever") — rename it in src/retrieval.py,
# not here.
res_date_accident = my_RAG.retriver()

# Same for the date of consolidation

In [10]:
# Question sent to the RAG pipeline for the consolidation date.
query = "what is the date of consolidation?"

# Fresh generator over the same frame and models; this rebinds `my_RAG`.
my_RAG = RetrievalAugmentedGenerator(
    ner_df,
    query,
    OPENAI_MODEL,
    EMBEDDING_MODEL_NAME,
)

# Keep whatever `retriver()` returns; it is stored later as the
# `date_consolidation_pred` column.
# NOTE(review): `retriver` is the method name as exposed by the project class
# (presumably a misspelling of "retriever") — rename it in src/retrieval.py,
# not here.
res_date_consolidation = my_RAG.retriver()

100%|██████████| 10/10 [00:19<00:00,  1.91s/it]


# We add our results to the dataframe

In [None]:
# Store the two RAG outputs on `ner_df` as prediction columns (the frame is
# modified in place).
# NOTE(review): assumes each result holds one entry per row of `ner_df` — TODO confirm.
ner_df["date_accident_pred"] = res_date_accident
ner_df["date_consolidation_pred"] = res_date_consolidation

# We clean the results to have the same format as the true labels

In [None]:
def _normalize_prediction(raw_prediction):
    """Return `DateNormalization(raw_prediction).extract_and_reformat_date()`.

    Named helper so both prediction columns go through the same routine.
    """
    return DateNormalization(raw_prediction).extract_and_reformat_date()


# Replace each prediction column with its normalized values, element by element.
# NOTE(review): the raw answers are overwritten, so re-running this cell feeds
# already-normalized values back through DateNormalization — confirm that is
# harmless, or re-run the cell that fills the prediction columns first.
ner_df["date_accident_pred"] = ner_df["date_accident_pred"].apply(_normalize_prediction)
ner_df["date_consolidation_pred"] = ner_df["date_consolidation_pred"].apply(_normalize_prediction)

# Now, we compute our metrics

In [4]:
# One Metrics helper per target: each receives the frame, the label column
# read from the CSV, and the matching prediction column.
for_accident = Metrics(
    ner_df,
    "date_accident",
    "date_accident_pred",
)
for_consolidation = Metrics(
    ner_df,
    "date_consolidation",
    "date_consolidation_pred",
)

In [5]:
# Accuracy for each target, as returned by Metrics.accuracy().
accuracy_accident = for_accident.accuracy()
accuracy_consolidation = for_consolidation.accuracy()

# Round to two decimals as before, then format the percentage with one fixed
# decimal. Printing `round(x, 2) * 100` directly exposes binary floating-point
# artifacts (e.g. round(0.57, 2) * 100 == 56.99999999999999); the `.1f` format
# spec prints 57.0 instead, and values such as 0.52 still print as 52.0.
print(f"Accuracy accident: {round(accuracy_accident, 2) * 100:.1f}%")
print(f"Accuracy consolidation: {round(accuracy_consolidation, 2) * 100:.1f}%")

Accuracy accident: 52.0%
Accuracy consolidation: 42.0%


In [6]:
# F1 score for each target, as returned by Metrics.f1().
f1_accident = for_accident.f1()
f1_consolidation = for_consolidation.f1()

# Round to two decimals as before, then format the percentage with one fixed
# decimal. Printing `round(x, 2) * 100` directly exposes binary floating-point
# artifacts (e.g. round(0.29, 2) * 100 == 28.999999999999996); the `.1f` format
# spec prints 29.0 instead, and values such as 0.68 still print as 68.0.
print(f"f1 accident: {round(f1_accident, 2) * 100:.1f}%")
print(f"f1 consolidation: {round(f1_consolidation, 2) * 100:.1f}%")

f1 accident: 68.0%
f1 consolidation: 59.0%
