In [5]:
import re
import pandas as pd

# Read the labelled test cases into a DataFrame; the script below uses its
# 'Input' and 'Expected Output' columns.
df = pd.read_csv(filepath_or_buffer="date_parser_testcases.csv")

# Month names and common abbreviations (lower-case) -> two-digit month number.
month_map = {
    'january': '01', 'jan': '01',
    'february': '02', 'feb': '02',
    'march': '03', 'mar': '03',
    'april': '04', 'apr': '04',
    'may': '05',
    'june': '06', 'jun': '06',
    'july': '07', 'jul': '07',
    'august': '08', 'aug': '08',
    'september': '09', 'sep': '09', 'sept': '09',
    'october': '10', 'oct': '10',
    'november': '11', 'nov': '11',
    'december': '12', 'dec': '12'
}

# A dot is a date separator only when it sits between two digits ("5.6.2019").
# Dots after abbreviations ("Feb.") or at the end of a sentence are left alone.
_DOT_BETWEEN_DIGITS = re.compile(r'(?<=\d)\.(?=\d)')

# Numeric layouts. All patterns run against lower-cased text.
_DMY_LONG_YEAR = re.compile(r'\b(\d{1,2})[/-](\d{1,2})[/-](\d{4})\b')
_DMY_SHORT_YEAR = re.compile(r'\b(\d{1,2})[/-](\d{1,2})[/-](\d{2})\b')
_YMD = re.compile(r'\b(\d{4})[/-](\d{1,2})[/-](\d{1,2})\b')

# Textual layouts. The month word may carry a trailing period ("Sept.").
# "1st of January 2000", "3rd January 1999", "25 Dec 2024", "15th September, 2021"
_DAY_MONTH_YEAR = re.compile(
    r'\b(\d{1,2})(?:st|nd|rd|th)?\s+(?:of\s+)?([a-z]+)\.?,?\s+(\d{4})\b'
)
# "February 15th, 2022", "Feb 15, 2022", "Feb. 15, 2022"
_MONTH_DAY_YEAR = re.compile(
    r'\b([a-z]+)\.?\s+(\d{1,2})(?:st|nd|rd|th)?,?\s+(\d{4})\b'
)


def _month_number(word):
    """Return the two-digit month for a lower-case word, or None if it is not a month.

    The whole word is looked up first so every key of ``month_map`` is usable;
    the three-letter prefix is a deliberate lenient fallback that keeps
    misspelled month names (e.g. "febuary") working.
    """
    return month_map.get(word) or month_map.get(word[:3])


def parse_date(text):
    """Extract the first recognizable date from ``text`` as ``DD/MM/YYYY``.

    Layouts are tried in this order:

    1. ``D/M/YYYY`` (separators ``/``, ``-`` or ``.``), read as day first.
    2. ``D/M/YY``; two-digit years below 50 become 20YY, the rest 19YY.
    3. ``YYYY/M/D``.
    4. Day, month name, year ("1st of January 2000", "25 Dec 2024").
    5. Month name, day, year ("February 15th, 2022", "Feb. 15, 2022").

    Args:
        text: Free-form text that may contain a date.

    Returns:
        The date formatted as ``DD/MM/YYYY``, or the string ``"Invalid"``
        when ``text`` is not a string or no supported layout is found.
        Numeric day and month values are zero-padded but not range-checked.
    """
    # Missing CSV cells arrive as NaN (a float); report them instead of crashing.
    if not isinstance(text, str):
        return "Invalid"

    text = text.lower()
    text = _DOT_BETWEEN_DIGITS.sub('/', text)  # Normalize separators

    match = _DMY_LONG_YEAR.search(text)
    if match:
        day, month, year = match.groups()
        return f"{int(day):02d}/{int(month):02d}/{year}"

    match = _DMY_SHORT_YEAR.search(text)
    if match:
        day, month, year = match.groups()
        century = "20" if int(year) < 50 else "19"
        return f"{int(day):02d}/{int(month):02d}/{century}{year}"

    match = _YMD.search(text)
    if match:
        year, month, day = match.groups()
        return f"{int(day):02d}/{int(month):02d}/{year}"

    # For the textual layouts, walk every candidate: a "<number> <word> <year>"
    # sequence whose word is not a month is skipped rather than being read as
    # January, so a genuine date later in the sentence can still be found.
    for match in _DAY_MONTH_YEAR.finditer(text):
        day, month_word, year = match.groups()
        month = _month_number(month_word)
        if month is not None:
            return f"{int(day):02d}/{month}/{year}"

    for match in _MONTH_DAY_YEAR.finditer(text):
        month_word, day, year = match.groups()
        month = _month_number(month_word)
        if month is not None:
            return f"{int(day):02d}/{month}/{year}"

    return "Invalid"

# Run the parser over every input sentence
df['Parsed Output'] = df['Input'].apply(parse_date)

# Flag rows whose parsed date equals the expected one, then score them
df['Correct'] = df['Parsed Output'].eq(df['Expected Output'])
accuracy = 100 * df['Correct'].mean()

# Report the per-row results followed by the overall accuracy
report_columns = ['Input', 'Expected Output', 'Parsed Output', 'Correct']
print(df[report_columns])
print(f"\nAccuracy: {accuracy:.2f}% ({df['Correct'].sum()} out of {len(df)} correct)")

# Optional: persist the annotated table
df.to_csv("parsed_dates_final.csv", index=False)


                                                Input Expected Output  \
0         The event will take place on March 5, 2023.      05/03/2023   
1                      Her birthday is on 07/08/1990.      07/08/1990   
2                         The deadline is 2022-12-31.      31/12/2022   
3                      We met on 1st of January 2000.      01/01/2000   
4   The concert is scheduled for 15th September, 2...      15/09/2021   
..                                                ...             ...   
95  We celebrate Independence Day on 2023-07-04, a...      04/07/2023   
96  The final date for submission is 30th November...      30/11/2022   
97  The annual conference is on 15th October 2023,...      15/10/2023   
98  His birthdate, noted as 1990-05-20, is in the ...      20/05/1990   
99  The festival will be celebrated on 12th August...      12/08/2024   

   Parsed Output  Correct  
0     05/03/2023     True  
1     07/08/1990     True  
2     31/12/2022     True  
3     01/01