In [1]:
import re
import pandas as pd

In [2]:
df = pd.read_csv("date_parser_testcases.csv")

In [3]:
df

Unnamed: 0,Input,Expected Output
0,"The event will take place on March 5, 2023.",05/03/2023
1,Her birthday is on 07/08/1990.,07/08/1990
2,The deadline is 2022-12-31.,31/12/2022
3,We met on 1st of January 2000.,01/01/2000
4,"The concert is scheduled for 15th September, 2...",15/09/2021
...,...,...
95,"We celebrate Independence Day on 2023-07-04, a...",04/07/2023
96,The final date for submission is 30th November...,30/11/2022
97,"The annual conference is on 15th October 2023,...",15/10/2023
98,"His birthdate, noted as 1990-05-20, is in the ...",20/05/1990


In [4]:
import re
import pandas as pd
from datetime import datetime


df = pd.read_csv("date_parser_testcases.csv")


def month_name_to_number(month):
    months = {
        "jan": "01", "january": "01",
        "feb": "02", "february": "02",
        "mar": "03", "march": "03",
        "apr": "04", "april": "04",
        "may": "05",
        "jun": "06", "june": "06",
        "jul": "07", "july": "07",
        "aug": "08", "august": "08",
        "sep": "09", "september": "09",
        "oct": "10", "october": "10",
        "nov": "11", "november": "11",
        "dec": "12", "december": "12"
    }
    return months.get(month.lower(), None)

def ensure_full_year(year):
    if len(year) == 2:
        if int(year) <= int(str(pd.Timestamp.now().year)[-2:]):
            return "20" + year  
        else:
            return "19" + year 
    return year


In [5]:
def parse_date(text):
    
    patterns = [
        
        r"(\d{1,2})(st|nd|rd|th)?\s+of\s+([A-Za-z]+)\s*,?\s*(\d{4})",
        r"(\d{1,2})(st|nd|rd|th)?\s+of\s+([A-Za-z]+)\s+(\d{4})",
        r"(\d{1,2})(st|nd|rd|th)?\s+([A-Za-z]+)\s*,?\s*(\d{4})",  # e.g., 21st June 2024
        r"([A-Za-z]+)\s+(\d{1,2})(st|nd|rd|th)?\s*,?\s*(\d{4})",  # e.g., June 21st 2024

        
        r"(\d{1,2})/(\d{1,2})/(\d{4})",  # e.g., 25/12/2023
        r"(\d{1,2})-(\d{1,2})-(\d{4})",  # e.g., 25-12-2023
        r"(\d{1,2})\.(\d{1,2})\.(\d{4})",  # e.g., 25.12.2023
        r"(\d{4})/(\d{1,2})/(\d{1,2})",  # e.g., 2023/12/25 (Year first)
        r"(\d{4})-(\d{1,2})-(\d{1,2})",  # e.g., 2023-12-25 (Year first)
        r"(\d{1,2})/(\d{1,2})/(\d{2})",  # e.g., 25/12/19

        r"(\d{4})\.(\d{1,2})\.(\d{1,2})",
        r"(\d{4})/(\d{1,2})/(\d{1,2})",  # e.g., 2023/12/25
        r"(\d{4})-(\d{1,2})-(\d{1,2})",
        
        
        r"([A-Za-z]+)\s+(\d{1,2})(st|nd|rd|th)?,?\s+(\d{4})",  # e.g., June 21, 2024
        r"(\d{4})\s+([A-Za-z]+)\s+(\d{1,2})(st|nd|rd|th)?",  # e.g., 2024 June 21
        
       
        r"(\d{1,2})/(\d{1,2})/(\d{2})",  # e.g., 5/6/19
        r"(\d{1,2})/(\d{1,2})/(\d{4})",  # e.g., 08/31/2021
        r"(\d{1,2})(st|nd|rd|th)?\s+of\s+([A-Za-z]+)\s*,?\s*including\s+(\d{4})",  # e.g., 25th Dec, including 2024
        r"(\d{1,2})(st|nd|rd|th)?\s+of\s+([A-Za-z]+)\s+every\s+year(?:,\s+including\s+(\d{4}))?",  # e.g., 4th of July every year, including 2022
        r"(\d{2})/(\d{2})/(\d{4})",
        r"(\d{1,2})/(\d{1,2})/(\d{4})"

    ]
    
    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            try:
                
                if pattern == r"(\d{1,2})/(\d{1,2})/(\d{4})":
                    month, day, year = match.groups()
                    month, day = month.zfill(2), day.zfill(2)
                    return f"{day}/{month}/{year}"
                
                if pattern == r"(\d{1,2})(st|nd|rd|th)?\s+of\s+([A-Za-z]+)\s*,?\s*including\s+(\d{4})":
                    day, month, year = match.groups()[0], match.groups()[2], match.groups()[3]
                    day = day.zfill(2)  
                    month = month_name_to_number(month)  
                    return f"{day}/{month}/{year}"
                
                if pattern == r"(\d{1,2})(st|nd|rd|th)?\s+of\s+([A-Za-z]+)\s+every\s+year(?:,\s+including\s+(\d{4}))?":
                    day, month, year = match.groups()[0], match.groups()[2], match.groups()[3]
                    day = day.zfill(2)
                    month = month_name_to_number(month)
                    if not year:
                        year = datetime.now().year
                    else:
                        year = ensure_full_year(year)
                    return f"{day}/{month}/{year}"
                
                
                if len(match.groups()) == 4:  
                    day = match.group(1) if pattern.startswith(r"(\d{1,2})") else match.group(2)
                    day = day.zfill(2)
                    month = match.group(3) if pattern.startswith(r"(\d{1,2})") else match.group(1)
                    if month.isdigit():  
                        month = month.zfill(2)
                    else:  
                        month = month_name_to_number(month)
                    year = match.group(4) if pattern.startswith(r"(\d{1,2})") else match.group(4)
                else:  
                    if pattern.startswith(r"(\d{4})"):  
                        year, month, day = match.group(1), match.group(2).zfill(2), match.group(3).zfill(2)
                    else:  
                        day, month, year = match.group(1).zfill(2), match.group(2).zfill(2), match.group(3)

                
                if year and len(year) == 2:
                    year = ensure_full_year(year)
                if month and len(month) == 1:
                    month = f"0{month}"
                if day and len(day) == 1:
                    day = f"0{day}"
                
                return f"{day}/{month}/{year}"
            except Exception as e:
                print(f"Error parsing date '{text}': {e}")
    
    return None


df['parsed_date'] = df['Input'].apply(parse_date)


print(df)

                                                Input Expected Output  \
0         The event will take place on March 5, 2023.      05/03/2023   
1                      Her birthday is on 07/08/1990.      07/08/1990   
2                         The deadline is 2022-12-31.      31/12/2022   
3                      We met on 1st of January 2000.      01/01/2000   
4   The concert is scheduled for 15th September, 2...      15/09/2021   
..                                                ...             ...   
95  We celebrate Independence Day on 2023-07-04, a...      04/07/2023   
96  The final date for submission is 30th November...      30/11/2022   
97  The annual conference is on 15th October 2023,...      15/10/2023   
98  His birthdate, noted as 1990-05-20, is in the ...      20/05/1990   
99  The festival will be celebrated on 12th August...      12/08/2024   

   parsed_date  
0   05/03/2023  
1   08/07/1990  
2   31/12/2022  
3   01/01/2000  
4   15/09/2021  
..         ...  
95  

In [6]:
df.dropna(inplace=True)

In [7]:
df.isna().sum()

Input              0
Expected Output    0
parsed_date        0
dtype: int64

In [8]:
# Apply date parsing
df['parsed_dates'] = df['Input'].apply(parse_date)
df['matches'] = df['parsed_dates'] == df['Expected Output']

# Count matches and mismatches
num_matches = df['matches'].sum()
num_mismatches = len(df) - num_matches

# Print counts
print(f"Number of matches: {num_matches}")
print(f"Number of mismatches: {num_mismatches}")

Number of matches: 94
Number of mismatches: 4


In [9]:
# Print matches and mismatches
print("\nMatches:")
matches_df = df[df['matches'] == True]
print(matches_df[['Input', 'parsed_dates', 'Expected Output']])


Matches:
                                                Input parsed_dates  \
0         The event will take place on March 5, 2023.   05/03/2023   
2                         The deadline is 2022-12-31.   31/12/2022   
3                      We met on 1st of January 2000.   01/01/2000   
4   The concert is scheduled for 15th September, 2...   15/09/2021   
5                       Let's catch up on 02.04.2022.   02/04/2022   
..                                                ...          ...   
95  We celebrate Independence Day on 2023-07-04, a...   04/07/2023   
96  The final date for submission is 30th November...   30/11/2022   
97  The annual conference is on 15th October 2023,...   15/10/2023   
98  His birthdate, noted as 1990-05-20, is in the ...   20/05/1990   
99  The festival will be celebrated on 12th August...   12/08/2024   

   Expected Output  
0       05/03/2023  
2       31/12/2022  
3       01/01/2000  
4       15/09/2021  
5       02/04/2022  
..             ...  
95

In [10]:
print("\nMismatches:")
mismatches_df = df[df['matches'] == False]
print(mismatches_df[['Input', 'parsed_dates', 'Expected Output']])


Mismatches:
                                                Input parsed_dates  \
1                      Her birthday is on 07/08/1990.   08/07/1990   
31                      The interview is on 1/2/2022.   02/01/2022   
52  Her birthday, which she celebrates on 07/08/19...   08/07/1990   
81   The job interview is on 1/2/2022, don't be late.   02/01/2022   

   Expected Output  
1       07/08/1990  
31      01/02/2022  
52      07/08/1990  
81      01/02/2022  
