In [2]:
from nltk.tokenize import word_tokenize

## task 1: tokenize

In [11]:
# Load the raw text: every line of the input file becomes one lower-cased
# entry in `corpus` (line endings are kept as read).
# NOTE(review): no encoding is passed to open(), so the platform default is
# used - confirm that matches how source_text.txt was saved.
data_path = 'source_text.txt'

corpus = []
with open(data_path, 'r') as source_file:
    corpus.extend(raw_line.lower() for raw_line in source_file)

In [12]:
# One token list per corpus line, in the same order as `corpus`.
sentences = [word_tokenize(text_line) for text_line in corpus]

## task 2: replace years, decimals, date days, integers, and all other numbers

In [16]:
import re

In [465]:
def number_check(word):
    """Find every run of consecutive digits in ``word``.

    Args:
        word: Text to scan.

    Returns:
        A list of ``re.Match`` objects, one per maximal digit run, in
        left-to-right order. The list is empty when ``word`` has no digits.
    """
    return list(re.finditer(r"\d+", word))

def decimal_check(word):
    """Find decimal numbers written with a fractional part.

    A match is an optional run of digits, a period, and at least one digit,
    so both "0.001" and ".001" are found while a bare "4." is not.

    Args:
        word: Text to scan.

    Returns:
        A list of ``re.Match`` objects in left-to-right order.
    """
    decimal_pattern = r"\d*\.\d+"
    return [hit for hit in re.finditer(decimal_pattern, word)]


def integer_check(word):
    """Return a match for each maximal run of digits in ``word``.

    Uses the same digit-run pattern as ``number_check``.

    Args:
        word: Text to scan.

    Returns:
        A list of ``re.Match`` objects in left-to-right order.
    """
    digit_run = re.compile(r"\d+")
    return list(digit_run.finditer(word))

def ISBN_check(sentence):
    """Locate ISBN-like hyphenated digit groups in ``sentence``.

    The pattern accepts an optional lower-case "isbn" prefix, one optional
    whitespace character, an optional leading three-digit group, and then
    four hyphen-separated digit groups of which the last is a single digit.

    Args:
        sentence: Text to scan (matching is case-sensitive).

    Returns:
        A list of ``re.Match`` objects in left-to-right order.
    """
    isbn_pattern = r"(?:isbn)?\s?(?:[0-9]{3}-)?[0-9]{1,5}-[0-9]{1,7}-[0-9]{1,6}-[0-9]"
    return [hit for hit in re.finditer(isbn_pattern, sentence)]

def DOI_check(sentence):
    """Locate DOI-style identifiers in ``sentence``.

    A match starts with "10.", at least four digits, optional further
    dot-separated digit groups, a slash, and then a run of non-whitespace
    characters that excludes double quote, ampersand, apostrophe and angle
    brackets. Word boundaries are required on both sides.

    Args:
        sentence: Text to scan.

    Returns:
        A list of ``re.Match`` objects in left-to-right order.
    """
    doi_pattern = r'\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?!["&\'<>])\S)+)\b'
    return list(re.finditer(doi_pattern, sentence))

def month_year_check(sentence):
    """Find "<month> <year>" mentions such as "oct 2001", "oct. 2001" or "march, 1999".

    The month is a lower-case English month name or abbreviation, optionally
    followed by a comma and/or a literal period, then whitespace and a
    four-digit year starting with 16, 17, 18, 19 or 20.

    The abbreviation dot is matched literally. It used to be an unescaped
    ``.`` (any character), which made ordinary words such as "mark 2001" or
    "jane 1999" look like dates. "sept" stays accepted as an abbreviation.

    Args:
        sentence: Text to scan (matching is case-sensitive, so lower-case it first).

    Returns:
        A list of ``re.Match`` objects in left-to-right order; group 1 holds
        the first two digits of the year.
    """
    month_year_pattern = (
        r'\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?'
        r'|aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)'
        r',?\.?\s+(?:(16|17|18|19|20)\d{2})'
    )
    return list(re.finditer(month_year_pattern, sentence))

def month_date_check(sentence):
    """Find "<month> <day>" mentions such as "jul 2", "jul. 2" or "july, 2".

    The month is a lower-case English month name or abbreviation, optionally
    followed by a comma and/or a literal period, then whitespace and a one-
    or two-digit day.

    Two guards keep non-dates out:
      * the abbreviation dot is matched literally (it used to be an unescaped
        ``.``, so "jane 12" matched as "jan" + any character);
      * the day must not be followed by another digit, so "march 1500" is no
        longer reported as the date "march 15".

    Args:
        sentence: Text to scan (matching is case-sensitive, so lower-case it first).

    Returns:
        A list of ``re.Match`` objects in left-to-right order.
    """
    month_day_pattern = (
        r'\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?'
        r'|aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)'
        r',?\.?\s+?(?:\d{1,2})(?!\d)'
    )
    return list(re.finditer(month_day_pattern, sentence))

def date_sep_check(sentence):
    """Find day-first dates: dd/mm/yyyy, dd-mm-yyyy or dd.mm.yyyy.

    The month may be numeric or a lower-case three-letter abbreviation, and
    the same separator must be used on both sides of it. The pattern has
    three alternatives:
      1. day 31 (months that have 31 days) or day 29/30 (any month except
         February);
      2. 29 February, accepted only with a leap year;
      3. days 1-28 in any month, with a four-digit year starting 16-20.

    Fixes relative to the previous pattern:
      * alternatives 1 and 2 ended in ``$``, so those dates were only found
        at the very end of the text; they now only require that no further
        digit follows, which lets them match in the middle of a sentence;
      * the month class ``[1,3-9]`` contained a literal comma and therefore
        accepted "," as a month; it is now ``[13-9]``.

    Args:
        sentence: Text to scan (matching is case-sensitive, so lower-case it first).

    Returns:
        A list of ``re.Match`` objects in left-to-right order. Capture group
        numbering is the same as before (groups 1-4 are the separators,
        group 5 the first two year digits of alternative 3).
    """
    day_first_pattern = (
        # 31st of a 31-day month, or 29th/30th of any month but February
        r'(?:(?:31(\/|-|\.)(?:0?[13578]|1[02]|(?:jan|mar|may|jul|aug|oct|dec)))\1'
        r'|(?:(?:29|30)(\/|-|\.)(?:0?[13-9]|1[0-2]|(?:jan|mar|apr|may|jun|jul|aug|sep|oct|nov|dec))\2))'
        r'(?:(?:1[6-9]|[2-9]\d)?\d{2})(?!\d)'
        # 29 February, leap years only
        r'|(?:29(\/|-|\.)(?:0?2|(?:feb))\3'
        r'(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))(?!\d)'
        # days 1-28 of any month
        r'|(?:0?[1-9]|1\d|2[0-8])(\/|-|\.)'
        r'(?:(?:0?[1-9]|(?:jan|feb|mar|apr|may|jun|jul|aug|sep))|(?:1[0-2]|(?:oct|nov|dec)))\4'
        r'(?:(?:(16|17|18|19|20)\d{2}))'
    )
    return list(re.finditer(day_first_pattern, sentence))

def month_sep_check(sentence):
    """Find slash-separated month-first dates of the form mm/dd/yyyy.

    Month is 01-12, day is 01-31 (both two digits) and the year is four
    digits starting with 16, 17, 18, 19 or 20.

    Args:
        sentence: Text to scan.

    Returns:
        A list of ``re.Match`` objects in left-to-right order; groups 1-3
        hold the month, the day and the first two digits of the year.
    """
    month_first_pattern = r'(0[1-9]|1[0-2])\/(0[1-9]|1\d|2\d|3[01])\/(16|17|18|19|20)\d{2}'
    return [hit for hit in re.finditer(month_first_pattern, sentence)]

def month_date_year_check(sentence):
    """Find "<month> <day>, <year>" style dates.

    Accepted shapes (month names are lower-case, full or abbreviated):
      * month day, year     (october 22, 1992)
      * month. day, year    (oct. 22, 1992)
      * month. day year     (oct. 22 1992)
      * month day year      (oct 22 1992)

    The year (four digits starting 16-20) is optional in the pattern, so a
    month and day followed by whitespace, e.g. "july 2 ", also matches.

    Fixes relative to the previous pattern:
      * the abbreviation dot is matched literally; it used to be an
        unescaped ``.`` (any character), so "jane 12, 2001" was a date;
      * the month must start at a word boundary, like the sibling month
        checks, so "summary 12, 2001" no longer matches via "mar";
      * "sept" stays accepted as an abbreviation.

    Args:
        sentence: Text to scan (matching is case-sensitive, so lower-case it first).

    Returns:
        A list of ``re.Match`` objects in left-to-right order. Capture group
        numbering is the same as before (group 1 the month, group 13 the
        first two digits of the year when present).
    """
    month_day_year_pattern = (
        r'\b(jan(uary)?|feb(ruary)?|mar(ch)?|apr(il)?|may|jun(e)?|jul(y)?|aug(ust)?'
        r'|sep(?:(tember)|t)?|oct(ober)?|nov(ember)?|dec(ember)?)'
        r'\.?\s+\d{1,2},?\s+?(?:(16|17|18|19|20)\d{2})?'
    )
    return list(re.finditer(month_day_year_pattern, sentence))
def valid_years_in_text(sentence):
    """Find standalone four-digit years from 1800 to 2099.

    Example: "john smith was born on 1993." yields one match for "1993".

    A year only counts when it is not glued to other digits. Previously any
    four digits starting with 18/19/20 matched, even inside a longer number
    ("120001" produced "2000"). A trailing letter is still allowed, so
    "1990s" yields "1990".

    ISBNs must be replaced before this check runs, because their digit
    groups can look like years.

    Args:
        sentence: Text to scan.

    Returns:
        A list of ``re.Match`` objects in left-to-right order; group 1 holds
        the first two digits of the year.
    """
    standalone_year = r'(?<!\d)(18|19|20)\d{2}(?!\d)'
    return list(re.finditer(standalone_year, sentence))

In [456]:
# Hand-written sample text for the number-replacement passes. It contains
# month-day-year dates, mm/dd/yyyy dates, a month-day and a month-year
# mention, a decimal with no leading zero, bare years, an ordinal ("20th")
# and two ISBNs. It is mixed-case here and is lower-cased before processing.
test_example = '''
i am john smith. i was born on february 2, 2001. however, my friend was born on 01/20/2000. can you meet us
for coffee on july 2, 2001? (yes, jul. 2). oct. 2001. I ate .000002 apples.

1982 – F.O. Schmitt Medal and Award 1983
 
 A.I. Ostashev, Sergey Pavlovich Korolyov - The Genius of the 20th Century 
 — 2010 M. of Public Educational Institution of Higher Professional Training MGUL ISBN 978-5-8135-0510-2.
 
 written on 12/30/2021!
 
  A.I. Ostashev, Sergey Pavlovich Korolyov - The Genius of the 20th Century 
 — 2010 M. of Public Educational Institution of Higher Professional Training MGUL ISBN 978-5-8135-0510-2.
'''

In [469]:
# Pass 1 - ISBNs. Start from the lower-cased sample text and, for one ISBN
# match at a time, turn every digit group inside it into '<other>'.
# The text is rescanned after each replacement until no ISBN is left.
moving_result = test_example.lower()

isbn_hits = ISBN_check(moving_result)
while isbn_hits:
    isbn_start, isbn_end = isbn_hits[0].span()
    isbn_text = moving_result[isbn_start:isbn_end]

    # Overwrite each digit run with same-length '#' filler, one run per
    # iteration, until the ISBN text contains no digits.
    digit_runs = number_check(isbn_text)
    while True:
        run_lo, run_hi = digit_runs[0].span()
        filler = '#' * (run_hi - run_lo)
        isbn_text = isbn_text[:run_lo] + filler + isbn_text[run_hi:]

        digit_runs = number_check(isbn_text)
        if not digit_runs:
            break

    # Every filler run becomes a single tag.
    isbn_text = re.sub(r'#+', '<other>', isbn_text)

    moving_result = moving_result[:isbn_start] + isbn_text + moving_result[isbn_end:]
    isbn_hits = ISBN_check(moving_result)

# Pass 2 - "<month> <day>[,] [<year>]" dates. The day number becomes '<days>'
# and, when the match also contains a year, that number becomes '<year>'.
# The text is rescanned after each replacement until nothing matches.
month_date_year_hits = month_date_year_check(moving_result)

while month_date_year_hits:
    hit_start, hit_end = month_date_year_hits[0].span()
    date_text = moving_result[hit_start:hit_end]

    digit_runs = number_check(date_text)
    if digit_runs:
        # The year is optional in month_date_year_check's pattern, so a match
        # such as "july 2 " holds a single number. Only tag a year when a
        # second number is really there; indexing digit_runs[1]
        # unconditionally raised IndexError on such input.
        # Replace right-to-left so the day's offsets are still valid after
        # the year has been swapped for a tag of a different length.
        if len(digit_runs) > 1:
            year_lo, year_hi = digit_runs[1].span()
            date_text = date_text[:year_lo] + '<year>' + date_text[year_hi:]

        day_lo, day_hi = digit_runs[0].span()
        date_text = date_text[:day_lo] + '<days>' + date_text[day_hi:]

    moving_result = moving_result[:hit_start] + date_text + moving_result[hit_end:]
    month_date_year_hits = month_date_year_check(moving_result)

        
# Pass 3 - mm/dd/yyyy dates. Each match consists of three digit groups
# (month, day, year) separated by slashes; the day becomes '<days>' and the
# year '<year>', while the month is left as it is.
month_sep_hits = month_sep_check(moving_result)

while month_sep_hits:
    hit_start, hit_end = month_sep_hits[0].span()
    date_text = moving_result[hit_start:hit_end]

    digit_runs = number_check(date_text)
    if digit_runs:
        day_lo, day_hi = digit_runs[1].span()
        year_lo, year_hi = digit_runs[2].span()
        # Rebuild the text around the two numbers that are being tagged.
        date_text = (
            date_text[:day_lo]
            + '<days>'
            + date_text[day_hi:year_lo]
            + '<year>'
            + date_text[year_hi:]
        )

    moving_result = moving_result[:hit_start] + date_text + moving_result[hit_end:]
    month_sep_hits = month_sep_check(moving_result)

        
# Pass 4 - day-first dates (dd/mm/yyyy, dd-mm-yyyy, dd.mm.yyyy). The day
# becomes '<days>' and the year '<year>'; the month is left as it is.
date_sep_hits = date_sep_check(moving_result)

while date_sep_hits:
    hit_start, hit_end = date_sep_hits[0].span()
    date_text = moving_result[hit_start:hit_end]

    digit_runs = number_check(date_text)
    if digit_runs:
        # date_sep_check also accepts month abbreviations ("12-jan-2001"),
        # in which case there are only two digit runs. The day is always the
        # first run and the year always the last, so index from both ends;
        # a fixed index of 2 for the year raised IndexError on named months.
        # Replace right-to-left so the day's offsets are still valid after
        # the year has been swapped for a tag of a different length.
        year_lo, year_hi = digit_runs[-1].span()
        date_text = date_text[:year_lo] + '<year>' + date_text[year_hi:]

        day_lo, day_hi = digit_runs[0].span()
        date_text = date_text[:day_lo] + '<days>' + date_text[day_hi:]

    moving_result = moving_result[:hit_start] + date_text + moving_result[hit_end:]
    date_sep_hits = date_sep_check(moving_result)
        
# Pass 5 - "<month> <year>" mentions. The first number in each match
# becomes '<year>'. The text is rescanned until nothing matches.
month_year_hits = month_year_check(moving_result)

while month_year_hits:
    hit_start, hit_end = month_year_hits[0].span()
    date_text = moving_result[hit_start:hit_end]

    digit_runs = number_check(date_text)
    if digit_runs:
        # Overwrite the first digit run with same-length '@' filler, then
        # turn every '@' run in the matched text into the tag.
        run_lo, run_hi = digit_runs[0].span()
        masked_text = date_text[:run_lo] + '@' * (run_hi - run_lo) + date_text[run_hi:]
        date_text = re.sub(r'@+', '<year>', masked_text)

    moving_result = moving_result[:hit_start] + date_text + moving_result[hit_end:]
    month_year_hits = month_year_check(moving_result)
        

# Pass 6 - "<month> <day>" mentions with no year (e.g. "jul. 2"). The first
# number in each match becomes '<day>'.
# NOTE(review): the other date passes in this cell emit '<days>' for the day
# of the month - confirm which spelling of the tag is intended.
month_date_hits = month_date_check(moving_result)

while month_date_hits:
    hit_start, hit_end = month_date_hits[0].span()
    date_text = moving_result[hit_start:hit_end]

    digit_runs = number_check(date_text)
    if digit_runs:
        # Overwrite the first digit run with same-length '@' filler, then
        # turn every '@' run in the matched text into the tag.
        run_lo, run_hi = digit_runs[0].span()
        masked_text = date_text[:run_lo] + '@' * (run_hi - run_lo) + date_text[run_hi:]
        date_text = re.sub(r'@+', '<day>', masked_text)

    moving_result = moving_result[:hit_start] + date_text + moving_result[hit_end:]
    month_date_hits = month_date_check(moving_result)

        
# Pass 7 - standalone years. Every remaining year reported by
# valid_years_in_text has its digits replaced by '<year>'.
#
# This pass used to appear twice, back to back and character for character
# identical. The first copy only exits once valid_years_in_text finds
# nothing, so the second copy's loop could never execute; one copy is kept.
year_hits = valid_years_in_text(moving_result)

while year_hits:
    hit_start, hit_end = year_hits[0].span()
    year_text = moving_result[hit_start:hit_end]

    # Tag every digit run of the matched text.
    year_text = re.sub(r'\d+', '<year>', year_text)

    moving_result = moving_result[:hit_start] + year_text + moving_result[hit_end:]
    year_hits = valid_years_in_text(moving_result)


# Pass 8 - decimals. Every match reported by decimal_check is replaced by
# '<decimal>'. The text is rescanned until nothing matches.
#
# The tag is spliced in directly at the match position. The previous version
# wrote '@' filler into the text and then ran re.sub(r'@+', '<decimal>') over
# the WHOLE text, which also rewrote any '@' that was already there (for
# example in an e-mail address) as soon as a single decimal was present.
decimal_hits = decimal_check(moving_result)

while decimal_hits:
    hit_start, hit_end = decimal_hits[0].span()
    moving_result = moving_result[:hit_start] + '<decimal>' + moving_result[hit_end:]
    decimal_hits = decimal_check(moving_result)

In [471]:
# Show the sample text after the replacement passes above have been applied.
print(moving_result)


i am john smith. i was born on february <days>, <year>. however, my friend was born on 01/<days>/<year>. can you meet us
for coffee on july <days>, <year>? (yes, jul. <day>). oct. <year>. i ate <decimal> apples.

<year> – f.o. schmitt medal and award <year>
 
 a.i. ostashev, sergey pavlovich korolyov - the genius of the 20th century 
 — <year> m. of public educational institution of higher professional training mgul isbn <other>-<other>-<other>-<other>-<other>.
 
 written on 12/<days>/<year>!
 
  a.i. ostashev, sergey pavlovich korolyov - the genius of the 20th century 
 — <year> m. of public educational institution of higher professional training mgul isbn <other>-<other>-<other>-<other>-<other>.



# questions to ask:

## What is the difference between "integers" and "all other numbers" in the task description?
## How should numbers inside DOIs be handled: left untouched as text, or replaced like other numbers?