In [8]:
import re
from datetime import datetime
import pandas as pd
import traceback

from convert_xml_to_bio import parse_line

months = {
    "Jan": 1, "Feb": 2, "Mar": 3, "Apr": 4,
    "May": 5, "Jun": 6, "Jul": 7, "Aug": 8,
    "Sep": 9, "Oct": 10, "Nov": 11, "Dec": 12,
    "January": 1, "February": 2, "March": 3, "April": 4,
    "May": 5, "June": 6, "July": 7,
    "August": 8, "September": 9, "October": 10,
    "November": 11, "December": 12
}
months = {k.lower(): v for k, v in months.items()}

days = {
    "monday": 0, "tuesday": 1, "wednesday": 2,
    "thursday": 3, "friday": 4, "saturday": 5, "sunday": 6,
    "mon": 0, "tue": 1, "wed": 2,
    "thu": 3, "fri": 4, "sat": 5, "sun": 6
}
days = {k.lower(): v for k, v in days.items()}

def str2timeml_date(time_str, reference_date=datetime.now()):

    time_str = time_str.lower().strip()
    # detect June 1st, May 3rd, June 10, September 2010
    months_str = "|".join(months.keys())
    pattern = rf"({months_str})\s+\d+"
    if re.match(pattern, time_str):
        month = re.search(pattern, time_str).group(1)

        month_num = months[month]
        day = re.search(r"\d+", time_str).group(0)
        if len(day) == 4:
            # format: september 2010
            return f"{day}-{month_num:02d}"
        else:
            year = reference_date.year
            return f"{reference_date.year}-{month_num:02d}-{int(day):02d}"

    # detect monday, friday, ...
    if time_str in days.keys():
        day_num = days[time_str]
        # return the next occurrence of this day
        today = reference_date.weekday()
        days_ahead = (day_num - today + 7) % 7
        next_date = reference_date + pd.Timedelta(days=days_ahead)
        return next_date.strftime("%Y-%m-%d")
    return None


df = []
for line in open('data/experimental/eng_dataset.xml', 'r').readlines():
    try:
        bio_line, values = parse_line(line)
        if bio_line is None:
            continue

        for value in values:
            if value["annotation_type"] == "DATE":
                norm = str2timeml_date(values[0]["surface_value"])
                df.append([line, norm, value["true_value"], value["surface_value"], value["annotation_type"], norm == value["true_value"]])

    except Exception as e:
        print(f"Error processing line: {line.strip()}")
        traceback.print_exc()
        
df = pd.DataFrame(df, columns=['line', 'normalized_value', 'true_value', 'surface_value', 'annotation_type', 'correct'])

num_correct = df['correct'].sum()
print(f"Number of correct normalizations: {num_correct} out of {len(df)}")
df


Number of correct normalizations: 98 out of 320


Unnamed: 0,line,normalized_value,true_value,surface_value,annotation_type,correct
0,"We are leaving for vacation on <TIMEX3 tid=""t1...",2025-07-05,2025-07-05,July 5th,DATE,True
1,"The seminar was held on <TIMEX3 tid=""t2"" type=...",2025-06-15,2025-06-15,June 15th,DATE,True
2,"She will submit the report by <TIMEX3 tid=""t3""...",2025-06-30,2025-06-30,Monday,DATE,True
3,"I was born on <TIMEX3 tid=""t4"" type=""DATE"" val...",2025-02-14,1990-02-14,"February 14, 1990",DATE,False
4,"They got married in <TIMEX3 tid=""t5"" type=""DAT...",2010-09,2010-09,September 2010,DATE,True
...,...,...,...,...,...,...
315,The program in <LOCATION>Honiara</LOCATION> be...,,2025-06-26,two days ago,DATE,False
316,She came back from <LOCATION>Baku</LOCATION> <...,,2025-06-19,nine days ago,DATE,False
317,The package from <LOCATION>Tehran</LOCATION> a...,,2025-06-09,nineteen days ago,DATE,False
318,I traveled to <LOCATION>Gaborone</LOCATION> <T...,,2025-05-30,four weeks ago,DATE,False


In [18]:
norm

datetime.datetime(2025, 7, 5, 0, 0)