In [12]:
from nltk.tokenize import word_tokenize, MWETokenizer

## task 1: tokenize

In [157]:
# read data
data_path = 'source_text.txt'

# Pass the encoding explicitly instead of relying on the platform default: the
# sample passage used in this notebook contains Thai and IPA characters.
# NOTE(review): assumes source_text.txt is UTF-8 encoded - confirm.
with open(data_path, 'r', encoding='utf-8') as f:
    # One lower-cased entry per line of the file (line endings are kept).
    corpus = [line.lower() for line in f]

In [12]:
# Tokenize every corpus line separately; one token list per line.
sentences = [word_tokenize(line) for line in corpus]

## task 2: replace years, decimals, date days, integers, and all other numbers

In [3]:
import re

In [58]:
# returns number matches
def number_check(word):
    """Return a list with one match object per maximal run of digits in ``word``."""
    return list(re.finditer(r"\d+", word))

# allows 0.001 or .001 but not 4.
def decimal_check(word):
    """Return the match objects for decimals in ``word``.

    A decimal is optional leading digits, a dot, and at least one digit after
    the dot, so a number with nothing after the dot is not reported.
    """
    decimal_pattern = r"\d*\.\d+"
    return [hit for hit in re.finditer(decimal_pattern, word)]


def integer_check(word):
    """Return the match objects for every run of digits found in ``word``."""
    digit_runs = re.finditer(r"\d+", word)
    return list(digit_runs)

# checks for isbn numbers
def ISBN_check(sentence):
    """Return the match objects for hyphenated ISBN-like numbers in ``sentence``.

    A match may include a leading lower-case "isbn" and one whitespace
    character in front of the hyphen-separated digit groups.
    """
    isbn_pattern = r"(?:isbn)?\s?(?:[0-9]{3}-)?[0-9]{1,5}-[0-9]{1,7}-[0-9]{1,6}-[0-9]"
    return list(re.finditer(isbn_pattern, sentence))

# checks for doi numbers
def DOI_check(sentence):
    """Return the match objects for DOI-shaped strings ("10.<4+ digits>/<suffix>") in ``sentence``."""
    doi_pattern = r'\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?!["&\'<>])\S)+)\b'
    return [hit for hit in re.finditer(doi_pattern, sentence)]

# checks for month year or
# month, year
# checks from 1600 and onwards
def month_year_check(sentence):
    """Return the match objects for "<month> <year>" mentions in ``sentence``.

    The month is a lower-case English month name or abbreviation, optionally
    followed by a period and/or a comma ("jul. 2005", "may, 1999"). The year
    must be a four-digit number from 1600 to 2099.

    The optional punctuation is a literal period: an unescaped ``.`` would
    accept any character and turn words such as "mark 1999" into dates.
    "sept" is listed explicitly because it no longer slips through that way.
    """
    months = (
        r'(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|june?|july?'
        r'|aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)'
    )
    # (?!\d) keeps the year from being cut out of a longer digit run.
    pattern = r'\b' + months + r'\.?,?\s+(?:16|17|18|19|20)\d{2}(?!\d)'
    return list(re.finditer(pattern, sentence))

# checks for month date or
# month, date
# i.e. jul. 2
def month_date_check(sentence):
    """Return the match objects for "<month> <day>" mentions in ``sentence``.

    The month is a lower-case English month name or abbreviation, optionally
    followed by a period and/or a comma; the day is a one- or two-digit number.

    The optional punctuation is a literal period (an unescaped ``.`` would let
    "mark 5" pass as a date), and the day must not be the start of a longer
    number, so "may 300" is not read as the 30th of May.
    """
    months = (
        r'(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|june?|july?'
        r'|aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)'
    )
    pattern = r'\b' + months + r'\.?,?\s+?\d{1,2}(?!\d)'
    return list(re.finditer(pattern, sentence))

# checks for date month
# i.e. 2 jul
def date_month_check(sentence):
    """Return the match objects for "<day> <month>" mentions in ``sentence``.

    The day is a one- or two-digit number that is not the tail of a longer
    number (so "1920 may" is not read as "20 may"). The month must end at a
    word boundary, so ordinary words that merely start with a month
    abbreviation ("5 marbles", "12 decades") are not matched.
    """
    months = (
        r'(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|june?|july?'
        r'|aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)'
    )
    pattern = r'(?<!\d)\d{1,2}\s+?\b' + months + r'\b'
    return list(re.finditer(pattern, sentence))

# dd/mm/yyyy, dd-mm-yyyy, or dd.mm.yyyy
# allows from 1600 onwards
def date_sep_check(sentence):
    """Return the match objects for day-first dates written with separators.

    Accepted shapes are dd/mm/yyyy, dd-mm-yyyy and dd.mm.yyyy, where the same
    separator must be used twice, the month is a number or a lower-case
    three-letter abbreviation, and the year lies between 1600 and 2099.
    Day/month combinations are validated: 31 only in 31-day months, 29 and 30
    not in February, and 29 February only in leap years.

    Dates are found anywhere in the text (no end-of-input anchor) and are
    never cut out of a longer digit run, so "31/12/1999" is not reported as
    "1/12/1999".
    """
    year = r'(?:16|17|18|19|20)\d{2}'
    # Leap years between 1600 and 2099: divisible by 4, except 1700/1800/1900.
    leap_year = r'(?:(?:16|17|18|19|20)(?:0[48]|[2468][048]|[13579][26])|(?:16|20)00)'
    pattern = (
        r'(?<!\d)(?:'
        # 31st: only months with 31 days
        r'31(?P<sep31>[/.-])(?:0?[13578]|1[02]|jan|mar|may|jul|aug|oct|dec)(?P=sep31)' + year +
        # 29th / 30th: every month except February
        r'|(?:29|30)(?P<sep30>[/.-])(?:0?[13-9]|1[0-2]|jan|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)(?P=sep30)' + year +
        # 29 February: leap years only
        r'|29(?P<sepleap>[/.-])(?:0?2|feb)(?P=sepleap)' + leap_year +
        # 1st to 28th: any month
        r'|(?:0?[1-9]|1\d|2[0-8])(?P<sep>[/.-])'
        r'(?:0?[1-9]|1[0-2]|jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)(?P=sep)' + year +
        r')(?!\d)'
    )
    return list(re.finditer(pattern, sentence))

# mm/dd/yyyy
# allows from 1600 onwards
def month_sep_check(sentence):
    """Return the match objects for month-first dates written as mm/dd/yyyy.

    Month and day must both be two digits (01-12 and 01-31) and the year must
    lie between 1600 and 2099. The digit guards on both ends stop the pattern
    from matching inside a longer digit run such as "112/25/1999".
    """
    pattern = r'(?<!\d)(0[1-9]|1[0-2])\/(0[1-9]|1\d|2\d|3[01])\/(16|17|18|19|20)\d{2}(?!\d)'
    return list(re.finditer(pattern, sentence))

# comma separated form: month date, year
# or month. date, year (oct. 22, 1992)
# or month. date year (oct. 22 1992)
# or month date year (oct 22 1992)
# allows from 1600 and onwards
def month_date_year_check(sentence):
    """Return the match objects for "<month> <day>, <year>" dates in ``sentence``.

    The month is a lower-case English month name or abbreviation, optionally
    followed by a period; the day is one or two digits, optionally followed by
    a comma; the year lies between 1600 and 2099.

    The month must start at a word boundary and may only be followed by a
    literal period, so words that merely contain a month abbreviation
    ("remark 5, 1999") are not reported as dates.
    """
    months = (
        r'(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|june?|july?'
        r'|aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)'
    )
    pattern = r'\b' + months + r'\.?\s+\d{1,2},?\s+?(?:16|17|18|19|20)\d{2}(?!\d)'
    return list(re.finditer(pattern, sentence))
# checks for standalone year mentions in text
# allows from 1800 to 2099
# e.g. John Smith was born on 1993.
# we have to check for ISBN number FIRST before this function as it will pick up isbn numbers as well
def valid_years_in_text(sentence):
    """Return the match objects for standalone years (1800-2099) in ``sentence``.

    A year is exactly four digits that are not part of a longer digit run, so
    "120045" does not yield "2004". Letters may follow directly ("1990s").
    Any four-digit group in that range still matches, which is why ISBNs must
    be handled before this function is applied.
    """
    pattern = r'(?<!\d)(?:18|19|20)\d{2}(?!\d)'
    return list(re.finditer(pattern, sentence))

In [105]:
# Sample passage used to develop the tagging passes below. It contains
# "day month year" dates, bare years, a year range and an ISBN.
test_example = '''
<start_of_passage>

Mom Rajawongse Kukrit Pramoj (Thai: คึกฤทธิ์ ปราโมช, RTGS: Khuek-rit Pramot, pronounced [kʰɯ́k.rít prāː.môːt]; 20 April 1911 – 9 October 1995) was a Thai politician, scholar and professor. He was Speaker of the House of Representatives of Thailand 1973–1974. He was the thirteenth Prime Minister of Thailand, serving in office from 1975 to 1976 between Seni Pramoj, his brother's, terms. Being the great-grandson of King Rama II, he was a member of the Thai royal family.
 He also portrayed the Prime Minister of the fictional country of "Sarkhan" in the 1963 motion picture The Ugly American with Marlon Brando.
 
 
 == Biography ==
 He was born on 20 April 1911 at Sing Buri Province into an cadet branch of Chakri Dynasty with Chinese ancestry.  The son of Brigadier General Prince Khamrob and his wife Daeng (Bunnag), his older brother was M.R. Seni Pramoj while his great-grandmother, Ampha, was of Chinese descent and was a consort of Rama II. He served as a corporal during the Franco-Thai War in 1940. Like many upper class Thais of his generation, his parents sent him and his siblings to boarding schools in England including Trent College. He finished his bachelor's degree in Philosophy, Politics, and Economics (PPE) from Queen's College, University of Oxford.
 Upon returning to Thailand, his first job was in the field of banking; but his true vocation was his mastery of many forms of arts, including politics and journalism. Put off by Hollywood's portrayal of revered nineteenth-century king, Mongkut, in the 1946 film Anna and the King of Siam, based on the semi-fictional biographical novel of the same name, Kukrit and his brother, Seni Pramoj, wrote The King of Siam speaks in 1948. They sent their manuscript to the American politician and diplomat Abbot Low Moffat who drew on it for his biography entitled Mongkut the King of Siam (ISBN 974-8298-12-4), and in 1961, donated the Pramoj manuscript to the Southeast Asian Collection, Asian Division, Library of Congress. He wrote for Siam Rath, the newspaper that he founded.
 <end_of_passage>
'''

In [106]:
# Lower-case first: the month and ISBN patterns only contain lower-case literals.
moving_result = test_example.lower()

# Tagging passes, in the order they have to run:
#   * ISBNs go first, because the bare-year pass would otherwise claim digit
#     groups inside them;
#   * the most specific date shapes run before the looser ones
#     (month_date_check would otherwise claim "month year" candidates).
# Each entry is (finder, tags). A plain string tags every digit run of a match.
# A dict maps the position of a digit run inside the match to its tag; negative
# positions count from the last run, and runs without an entry are left alone.
tagging_passes = [
    (ISBN_check, '<other>'),
    (month_date_year_check, {0: '<days>', 1: '<year>'}),
    (month_sep_check, {1: '<days>', 2: '<year>'}),      # run 0 is the month number
    # The month may be spelled out, leaving only two digit runs, so the year is
    # addressed as "last run" rather than by a fixed index.
    (date_sep_check, {0: '<days>', -1: '<year>'}),
    (month_year_check, {0: '<year>'}),
    (month_date_check, {0: '<days>'}),
    (valid_years_in_text, {0: '<year>'}),
]

for finder, tags in tagging_passes:
    while True:
        hits = finder(moving_result)
        if not hits:
            break

        # Rebuild the text once per scan, retagging every (non-overlapping) hit.
        pieces = []
        cursor = 0
        for hit in hits:
            fragment = hit.group()
            runs = number_check(fragment)
            pieces.append(moving_result[cursor:hit.start()])

            fragment_cursor = 0
            for position, run in enumerate(runs):
                if isinstance(tags, str):
                    tag = tags
                else:
                    tag = tags.get(position, tags.get(position - len(runs)))
                pieces.append(fragment[fragment_cursor:run.start()])
                pieces.append(run.group() if tag is None else tag)
                fragment_cursor = run.end()
            pieces.append(fragment[fragment_cursor:])

            cursor = hit.end()
        pieces.append(moving_result[cursor:])

        retagged = ''.join(pieces)
        if retagged == moving_result:
            # Nothing was replaced, so another scan would find the same hits forever.
            break
        moving_result = retagged

# Decimals are replaced as a whole, directly at their span. Going through a
# placeholder character and a text-wide substitution would also rewrite any
# real '@' that happens to be in the text.
pieces = []
cursor = 0
for hit in decimal_check(moving_result):
    pieces.append(moving_result[cursor:hit.start()])
    pieces.append('<decimal>')
    cursor = hit.end()
pieces.append(moving_result[cursor:])
moving_result = ''.join(pieces)

In [90]:
# Show the passage after the regex tagging passes.
print(moving_result)


<start_of_passage>

mom rajawongse kukrit pramoj (thai: คึกฤทธิ์ ปราโมช, rtgs: khuek-rit pramot, pronounced [kʰɯ́k.rít prāː.môːt]; 20 april <year> – 9 october <year>) was a thai politician, scholar and professor. he was speaker of the house of representatives of thailand <year>–<year>. he was the thirteenth prime minister of thailand, serving in office from <year> to <year> between seni pramoj, his brother's, terms. being the great-grandson of king rama ii, he was a member of the thai royal family.
 he also portrayed the prime minister of the fictional country of "sarkhan" in the <year> motion picture the ugly american with marlon brando.
 
 
 == biography ==
 he was born on 20 april <year> at sing buri province into an cadet branch of chakri dynasty with chinese ancestry.  the son of brigadier general prince khamrob and his wife daeng (bunnag), his older brother was m.r. seni pramoj while his great-grandmother, ampha, was of chinese descent and was a consort of rama ii. he served as 

We still have to pick up integers and the remaining numbers. We can pick those up AFTER tokenization. Note that the regex passes, in contrast, had to be run BEFORE tokenization.

In [107]:
# Tag what the regex passes left behind: a token made only of numeric
# characters becomes '<integer>'; digits embedded in any other token become
# '<other>'. The pattern is a raw string: '\d' in a normal string literal is
# an invalid escape sequence.
tokenized_words = [
    '<integer>' if token.isnumeric() else re.sub(r'\d+', '<other>', token)
    for token in word_tokenize(moving_result)
]

## Issue: when we tokenize after inserting the "<...>" tags, the tokenizer splits each tag into separate '<', name and '>' tokens. Is this expected, or do we need to fix it?

In [113]:
# just go through and recombine manually

new_results = []
i = 0
# Visit every token, including the last one.
while i < len(tokenized_words):
    # A tag is only rebuilt when all three pieces are present: '<', a known
    # tag name and '>'. The bounds check keeps a '<' near the end of the list
    # from indexing past it.
    is_tag = (
        tokenized_words[i] == '<'
        and i + 2 < len(tokenized_words)
        and tokenized_words[i + 2] == '>'
        and tokenized_words[i + 1] in ['other', 'integer', 'days', 'year', 'decimal', 'end_of_passage', 'start_of_passage']
    )
    if is_tag:
        new_results.append('<' + tokenized_words[i + 1] + '>')
        i += 3
    else:
        # Anything else (including '<', X, '>' with an unknown X) is copied
        # through one token at a time, so the index always advances.
        new_results.append(tokenized_words[i])
        i += 1

In [114]:
# Show the last 20 recombined tokens.
new_results[-20:]

['asian',
 'division',
 ',',
 'library',
 'of',
 'congress',
 '.',
 'he',
 'wrote',
 'for',
 'siam',
 'rath',
 ',',
 'the',
 'newspaper',
 'that',
 'he',
 'founded',
 '.',
 '<end_of_passage>']

In [173]:
## writing in python script for publishing

# Names of the placeholder tags. word_tokenize splits "<name>" into the three
# tokens '<', 'name', '>', which _rejoin_tags stitches back together.
_KNOWN_TAG_NAMES = ('other', 'integer', 'days', 'year', 'decimal', 'end_of_passage', 'start_of_passage')


def _retag_digit_runs(fragment, tags):
    """Replace digit runs of ``fragment`` with placeholder tags.

    ``tags`` is either a single tag applied to every digit run, or a dict that
    maps the position of a digit run inside ``fragment`` to its tag. Negative
    positions count from the last run; runs without an entry are kept as is.
    """
    runs = number_check(fragment)
    pieces = []
    cursor = 0
    for position, run in enumerate(runs):
        if isinstance(tags, str):
            tag = tags
        else:
            tag = tags.get(position, tags.get(position - len(runs)))
        pieces.append(fragment[cursor:run.start()])
        pieces.append(run.group() if tag is None else tag)
        cursor = run.end()
    pieces.append(fragment[cursor:])
    return ''.join(pieces)


def _apply_tagging_pass(text, finder, tags, whole_match=False):
    """Retag every span reported by ``finder`` until it reports none.

    With ``whole_match`` the complete span is replaced by ``tags`` (a single
    tag); otherwise only the digit runs inside the span are replaced, as
    described in ``_retag_digit_runs``.
    """
    while True:
        hits = finder(text)
        if not hits:
            return text

        pieces = []
        cursor = 0
        for hit in hits:
            pieces.append(text[cursor:hit.start()])
            pieces.append(tags if whole_match else _retag_digit_runs(hit.group(), tags))
            cursor = hit.end()
        pieces.append(text[cursor:])

        retagged = ''.join(pieces)
        if retagged == text:
            # Nothing was replaced, so another scan would find the same hits forever.
            return text
        text = retagged


def _rejoin_tags(tokens):
    """Merge the token triples '<', known tag name, '>' back into single tag tokens."""
    merged = []
    i = 0
    while i < len(tokens):
        is_tag = (
            tokens[i] == '<'
            and i + 2 < len(tokens)      # a '<' near the end has no room for a full tag
            and tokens[i + 2] == '>'
            and tokens[i + 1] in _KNOWN_TAG_NAMES
        )
        if is_tag:
            merged.append('<' + tokens[i + 1] + '>')
            i += 3
        else:
            # Copied through one token at a time, so the index always advances,
            # even for '<', X, '>' with an unknown X.
            merged.append(tokens[i])
            i += 1
    return merged


def tokenize_and_tag(corpus):
    '''
    Lower-case ``corpus``, replace its numbers with placeholder tags and tokenize it.

    corpus: a string that contains the text to process (the whole text or a
    single line of it).

    Returns a list of tokens in which dates, years, decimals, integers and all
    other digit groups appear as '<days>', '<year>', '<decimal>', '<integer>'
    and '<other>'.
    '''
    moving_result = corpus.lower()

    # Regex passes run before tokenization, in this order:
    #   * ISBNs first, because the bare-year pass would otherwise claim digit
    #     groups inside them;
    #   * specific date shapes before looser ones: date_month_check must see
    #     "6 november 1986" before month_year_check turns the year into a tag,
    #     and month_date_check runs after month_year_check because it would
    #     otherwise claim "month year" candidates.
    digit_run_passes = (
        (ISBN_check, '<other>'),
        (month_date_year_check, {0: '<days>', 1: '<year>'}),
        (month_sep_check, {1: '<days>', 2: '<year>'}),   # run 0 is the month number
        # The month may be spelled out, leaving two digit runs instead of
        # three, so the year is addressed as "last run".
        (date_sep_check, {0: '<days>', -1: '<year>'}),
        (date_month_check, {0: '<days>'}),
        (month_year_check, {0: '<year>'}),
        (month_date_check, {0: '<days>'}),
        (valid_years_in_text, {0: '<year>'}),
    )
    for finder, tags in digit_run_passes:
        moving_result = _apply_tagging_pass(moving_result, finder, tags)

    # Decimals are replaced as a whole, directly at their span, so characters
    # elsewhere in the text (e.g. a real '@') are never touched.
    moving_result = _apply_tagging_pass(moving_result, decimal_check, '<decimal>', whole_match=True)

    tokenized_words = []
    for token in word_tokenize(moving_result):
        # Pure numbers and comma-grouped numbers ("71,000") are integers;
        # digits embedded in any other token are tagged in place.
        if token.isnumeric() or re.search(r"\d,\d", token):
            tokenized_words.append('<integer>')
        else:
            tokenized_words.append(re.sub(r'\d+', '<other>', token))

    return _rejoin_tags(tokenized_words)
    
    

In [174]:
# Tag and tokenize every non-blank corpus line; one token list per line.
tokenized = [
    tokenize_and_tag(line)
    for line in corpus
    if len(line.strip()) != 0
]

In [175]:
# Inspect the tagged token lists at indices 120-139.
tokenized[120:140]

[['clark',
  'was',
  'born',
  'in',
  'colchester',
  ',',
  'england',
  ',',
  'and',
  'educated',
  'at',
  'boarding',
  'school',
  'and',
  'later',
  'at',
  'the',
  'thames',
  'nautical',
  'training',
  'college',
  ',',
  'then',
  'known',
  'as',
  'hms',
  'worcester',
  '.',
  'in',
  '<year>',
  ',',
  'unable',
  'to',
  'join',
  'the',
  'royal',
  'navy',
  'because',
  'of',
  'a',
  'visual',
  'defect',
  ',',
  'he',
  'joined',
  'the',
  'british',
  'merchant',
  'navy',
  ',',
  'serving',
  'with',
  'the',
  'union-castle',
  'line',
  'on',
  'the',
  'liberty',
  'ship',
  'samflora',
  ',',
  'and',
  'completing',
  'his',
  'cadet',
  'training',
  'during',
  'a',
  'two-year',
  'cruise',
  'without',
  'home',
  'leave',
  '.',
  'upon',
  'discharge',
  'from',
  'the',
  'samflora',
  'he',
  'joined',
  'the',
  'straits',
  'steamship',
  'company',
  ',',
  'based',
  'in',
  'singapore',
  ',',
  'as',
  'a',
  'junior',
  'officer',
  'o

- Make sure to remove the `<start_of_passage>` and `<end_of_passage>` tokens.
- Make sure to remove non-English words.
- Make sure to remove the '==' and '===' heading markers from the vocabulary.

In [160]:
# Spot-check: a "day month year" date, small integers and a comma-grouped number ("71,000").
tokenize_and_tag('on 6 November 1986, 3 years, 8 months and 16 days later, having travelled some 71,000 km eastwards, around and about the ')

['on',
 '<days>',
 'november',
 '<year>',
 ',',
 '<integer>',
 'years',
 ',',
 '<integer>',
 'months',
 'and',
 '<integer>',
 'days',
 'later',
 ',',
 'having',
 'travelled',
 'some',
 '<other><integer><other>',
 'km',
 'eastwards',
 ',',
 'around',
 'and',
 'about',
 'the']

In [148]:
# Plain tokenization of the same sentence, for comparison: '71,000' stays a single token.
word_tokenize('on 6 November 1986, 3 years, 8 months and 16 days later, having travelled some 71,000 km eastwards, around and about the ')

['on',
 '6',
 'November',
 '1986',
 ',',
 '3',
 'years',
 ',',
 '8',
 'months',
 'and',
 '16',
 'days',
 'later',
 ',',
 'having',
 'travelled',
 'some',
 '71,000',
 'km',
 'eastwards',
 ',',
 'around',
 'and',
 'about',
 'the']

In [151]:
# A trailing period is enough to make isnumeric() return False.
'81000.'.isnumeric()

False

In [153]:
# Drop a comma that sits between two digits (thousands separator), keeping the digits.
re.sub(r"(\d),(\d)", r"\1\2", '81,000')

'81000'

In [179]:
# Spot-check: an ordinal ("21st") and a 13-digit ISBN.
tokenize_and_tag(' Axell, Herbert; Hosking, Eric (photographer) (21st). Minsmere: Portrait of a Bird Reserve. Hutchinson. ISBN 978-0-09-128840-2.')

['axell',
 ',',
 'herbert',
 ';',
 'hosking',
 ',',
 'eric',
 '(',
 'photographer',
 ')',
 '(',
 '<other>st',
 ')',
 '.',
 'minsmere',
 ':',
 'portrait',
 'of',
 'a',
 'bird',
 'reserve',
 '.',
 'hutchinson',
 '.',
 'isbn',
 '<other>',
 '-',
 '<other>',
 '-',
 '<other>',
 '-',
 '<other>',
 '-',
 '<other>',
 '.']