In [2]:
import sys

# Extend the module search path with ../src before the project imports below
# (presumably where preprocess, date_normalization, retrieval and metrics live).
sys.path.append("../src")

import pandas as pd
import getpass
import os

# Project-local helpers; assumed to resolve through the sys.path entry added above.
from preprocess import Preprocessing
from date_normalization import DateNormalization
from retrieval import RetrievalAugmentedGenerator
from metrics import Metrics

# Import and preprocess our data

In [2]:
# Read the training CSV and hand it straight to the project's Preprocessing
# helper; `full_df` is bound to whatever remove_stopwords() returns.
# NOTE(review): the recorded preview shows an "Unnamed: 0" column, so the CSV
# seems to carry a saved index -- consider index_col=0 once it is confirmed
# that nothing downstream relies on that column.
preprocess = Preprocessing(pd.read_csv("../datas/train_data.csv"))
full_df = preprocess.remove_stopwords()

# Preview the first rows of the preprocessed frame.
full_df.head()

Unnamed: 0,ID,filename,texte,sexe,date_accident,date_consolidation
0,0,Agen_100515.txt,Le : 12/11/2019 Cour d’appel d’Agen chambre so...,homme,1991-04-09,n.c.
1,1,Agen_1100752.txt,Le : 12/11/2019 Cour d’appel d’Agen chambre ci...,homme,2005-06-10,2010-01-19
2,2,Agen_1613.txt,Le : 12/11/2019 Cour d’appel d’Agen Audience p...,femme,1997-09-26,n.c.
3,3,Agen_2118.txt,Le : 12/11/2019 Cour d’appel d’Agen Audience p...,femme,1982-08-07,1982-11-07
4,4,Agen_21229.txt,Le : 12/11/2019 Cour d’appel d’Agen Audience p...,homme,1996-11-26,n.c.


# Insert your OpenAI API key here

In [3]:
# Prompt for the key interactively so it is never written into the notebook,
# then expose it to this process through the OPENAI_API_KEY environment variable.
os.environ.update(OPENAI_API_KEY=getpass.getpass("Enter your OpenAI key:"))

# Choose your OpenAI and Embedding models

In [4]:
# Embedding model name passed to RetrievalAugmentedGenerator in the cells below
# (presumably a Hugging Face Hub identifier -- TODO confirm).
EMBEDDING_MODEL_NAME = "dangvantuan/sentence-camembert-large"

# OpenAI model name passed to RetrievalAugmentedGenerator in the cells below.
OPENAI_MODEL = "gpt-3.5-turbo"

# We ask our RAG to predict the date of the accident

In [9]:
# Question sent to the RAG pipeline; the wording is part of the prompt, so it
# is kept verbatim.
query = "what is the date of accident?"
my_RAG = RetrievalAugmentedGenerator(
    full_df,
    query,
    OPENAI_MODEL,
    EMBEDDING_MODEL_NAME,
)

# `retriver` is the method name as it is spelled in the project class; the
# result is stored as the raw accident-date predictions.
res_date_accident = my_RAG.retriver()


100%|██████████| 10/10 [00:26<00:00,  2.62s/it]


# Same for the date of consolidation

In [10]:
# Question sent to the RAG pipeline; the wording is part of the prompt, so it
# is kept verbatim. Note that `query` and `my_RAG` are rebound here.
query = "what is the date of consolidation?"
my_RAG = RetrievalAugmentedGenerator(
    full_df,
    query,
    OPENAI_MODEL,
    EMBEDDING_MODEL_NAME,
)

# `retriver` is the method name as it is spelled in the project class; the
# result is stored as the raw consolidation-date predictions.
res_date_consolidation = my_RAG.retriver()

100%|██████████| 10/10 [00:19<00:00,  1.91s/it]


# We add our results to the dataframe

In [None]:
# Store the raw RAG outputs on the frame, one "*_pred" column per target.
for pred_column, rag_output in (
    ("date_accident_pred", res_date_accident),
    ("date_consolidation_pred", res_date_consolidation),
):
    full_df[pred_column] = rag_output

# We clean the results to have the same format as the true labels

In [None]:
# Run every raw prediction through DateNormalization so both prediction
# columns are rewritten in place with the reformatted value.
for pred_column in ("date_accident_pred", "date_consolidation_pred"):
    full_df[pred_column] = full_df[pred_column].apply(
        lambda raw_answer: DateNormalization(raw_answer).extract_and_reformat_date()
    )

# Now, we compute our metrics

In [8]:
# Build one Metrics helper per target; each receives the frame, the target
# column name and the matching "<target>_pred" column name.
for_accident, for_consolidation = (
    Metrics(full_df, target, f"{target}_pred")
    for target in ("date_accident", "date_consolidation")
)

In [14]:
accuracy_accident = for_accident.accuracy()
accuracy_consolidation = for_consolidation.accuracy()

# Round the fraction to two decimals, then let the "%" format type do the
# scaling and rendering. Multiplying the rounded float by 100 by hand
# reintroduces binary floating-point error (e.g. 0.55 * 100 prints as
# 55.00000000000001), which is what leaked into the previously recorded output.
print(f"Accuracy accident: {round(accuracy_accident, 2):.1%}")
print(f"Accuracy consolidation: {round(accuracy_consolidation, 2):.1%}")

Accuracy accident: 71.0%
Accuracy consolidation: 55.00000000000001%


In [15]:
f1_accident = for_accident.f1()
f1_consolidation = for_consolidation.f1()

# Round the fraction to two decimals, then let the "%" format type do the
# scaling and rendering. Multiplying the rounded float by 100 by hand can
# reintroduce binary floating-point error (e.g. 0.55 * 100 prints as
# 55.00000000000001), so the printed percentage would depend on the score.
print(f"f1 accident: {round(f1_accident, 2):.1%}")
print(f"f1 consolidation: {round(f1_consolidation, 2):.1%}")

f1 accident: 83.0%
f1 consolidation: 71.0%
