In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import torch
import transformers
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
from tqdm import tqdm
import re

# Import the data and clean the text

In [1]:
# Load the training set from a path relative to the notebook's working directory.
train_df = pd.read_csv(filepath_or_buffer="../datas/train_data.csv")

NameError: name 'pd' is not defined

In [None]:
def remove_newlines(df):
    """
    Return a copy of ``df`` in which line breaks inside string cells are replaced by a space.

    Each run of ``\\r`` / ``\\n`` characters becomes a single space. Using a space
    rather than an empty string keeps the last word of a line separate from the
    first word of the next one, and keeps whitespace after sentence-ending
    punctuation that was followed by a line break. Non-string cells are left
    untouched and the input frame is not modified.
    """
    return df.replace(r"[\r\n]+", " ", regex=True)
# Clean line breaks out of every text cell, then preview the first rows.
train_df = train_df.pipe(remove_newlines)
train_df.head()

# Split the text into batches and apply the NER model

In [None]:
def divide_text_into_batches(text, n_batches):
    """
    Divide the text into at most ``n_batches`` consecutive chunks.

    All chunks have the same length except possibly the last one, which holds
    the remainder. Joining the chunks gives back ``text``. Fewer than
    ``n_batches`` chunks are returned when the text is too short to fill them,
    and an empty text gives an empty list.

    Raises ZeroDivisionError if ``n_batches`` is 0.
    """
    if not text:
        return []
    # Ceiling division: the chunk size is never 0 (which would make range() fail)
    # and a leftover tail never spills into an extra (n_batches + 1)-th chunk.
    batch_size = -(-len(text) // n_batches)
    return [text[i:i + batch_size] for i in range(0, len(text), batch_size)]

def _context_around_span(batch, start, end, context_window):
    """
    Return the words surrounding ``batch[start:end]``, limited to the enclosing sentence.

    At most ``context_window`` words before the span are kept, and at most
    ``context_window`` words counted from the start of the span (so the span's
    own words count toward the second half). Words are joined with single spaces.
    """
    # Character look-around on each side of the span. The factor 6 comes from the
    # original code; presumably a rough characters-per-word estimate (TODO confirm).
    char_window = context_window * 6
    pre_context = batch[max(0, start - char_window):start]
    post_context = batch[end:end + char_window]

    # Before the span: keep only what follows the LAST sentence terminator.
    # The search runs forward on the un-reversed text; searching the reversed
    # string with the same pattern would look for whitespace *before* the
    # punctuation and miss ordinary "word. Next" boundaries.
    pre_start = 0
    for boundary in re.finditer(r'[.!?]\s+', pre_context):
        pre_start = boundary.end()
    pre_sentence = pre_context[pre_start:]

    # After the span: keep everything up to and including the FIRST terminator.
    boundary = re.search(r'([.!?]\s+)|([.!?]$)', post_context)
    post_sentence = post_context[:boundary.end(0)] if boundary else post_context

    # Tokenise into words and cut a window centred on the first word of the span.
    words_around_date = re.findall(r'\w+', pre_sentence + batch[start:end] + post_sentence)
    date_index = len(re.findall(r'\w+', pre_sentence))
    start_context = max(0, date_index - context_window)
    end_context = min(len(words_around_date), date_index + context_window)
    return ' '.join(words_around_date[start_context:end_context])


def extract_sentences_around_dates(batch, model, tokenizer, context_window):
    """
    Extract the dates found in a batch together with a window of words around each one.

    A "ner" pipeline is built from ``model`` and ``tokenizer`` and run on
    ``batch``. Every result whose ``'entity'`` is ``'B-DATE'`` or ``'I-DATE'``
    produces one ``(date_text, context_sentence)`` tuple, where ``date_text`` is
    ``batch[start:end]`` and ``context_sentence`` holds up to ``context_window``
    words on each side, restricted to the sentence containing the date.

    NOTE(review): no aggregation strategy is passed to the pipeline, so a date
    that spans several tokens presumably yields one tuple per token - confirm
    against the pipeline output.

    Returns a list of tuples in the order the pipeline reports the entities;
    an empty batch gives an empty list.
    """
    if not batch:
        return []

    ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)
    dates_and_contexts = []

    for result in ner_pipeline(batch):
        if result['entity'] not in ('B-DATE', 'I-DATE'):
            continue
        start, end = result['start'], result['end']
        context_sentence = _context_around_span(batch, start, end, context_window)
        dates_and_contexts.append((batch[start:end], context_sentence))

    return dates_and_contexts

def extract_dates_with_context_from_long_text(text, model, tokenizer, n_batches, context_window):
    """
    Run the date/context extraction over a long text, one chunk at a time.

    The text is split with ``divide_text_into_batches`` and each chunk is passed
    to ``extract_sentences_around_dates``. The per-chunk results are concatenated
    in chunk order into a single flat list.
    """
    return [
        date_and_context
        for chunk in divide_text_into_batches(text, n_batches)
        for date_and_context in extract_sentences_around_dates(chunk, model, tokenizer, context_window)
    ]


# Apply the function to all the rows of the dataframe

In [None]:
# Tunable settings for the extraction run.
MODEL_NAME = "Jean-Baptiste/camembert-ner-with-dates"  # checkpoint identifier given to from_pretrained
N_BATCHES = 15       # number of chunks each document is split into before NER
CONTEXT_WINDOW = 50  # size, in words, of the window kept around each detected date

# Load the tokenizer and the token-classification model from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)

In [None]:
data_to_append = []

# Build one record per document: its filename and all of its date contexts in one string.
for index, row in tqdm(train_df.iterrows(), total=len(train_df)):
    filename, text = row["filename"], row["texte"]

    dates_and_contexts = extract_dates_with_context_from_long_text(
        text, model, tokenizer, n_batches=N_BATCHES, context_window=CONTEXT_WINDOW
    )

    # Keep only the context string (second element) of each (date, context) pair.
    contexts = " ".join(context for _, context in dates_and_contexts)

    data_to_append.append(dict(filename=filename, contexts=contexts))

# Assemble the DataFrame once from the collected records.
results_df = pd.DataFrame(data_to_append, columns=["filename", "contexts"])
