Name: Sathish Kumar Rajendiran
Chapter :  Regular Expressions
Date: 10/18/2020
Week: 3

In [14]:
#import libraries

# standard library
import os
import sys
from datetime import datetime
import time

# csv, xls, pandas & json
import pandas as pd
import json
import csv
import xlrd

# Language Processing
import nltk
from nltk import FreqDist

# web requests
from urllib import request


##  Regular Expression to match non-alphabetic characters
import re

os.getcwd()

'/Users/sathishrajendiran/ist664-nlp'

In [47]:
#  sample text
text= '''That U.S.A. poster-print from NG costs $12.40, and there are 1,259,000 copies.'''

In [59]:
#Phone Numbers
if re.search('[A-Z][.]|[A-Z]\w\b',text):
    text_phone = re.findall(r'\b(?:[a-zA-Z]\.){2,}|[A-Z]\w\b',text)
    print('Found a match:\n',text_phone )
else:
    print('No match.')

Found a match:
 ['U.S.A.', 'NG']


In [60]:
# get the book Emma from the Gutenberg collection and keep as raw text
file0 = nltk.corpus.gutenberg.fileids( ) [0]
emmatext = nltk.corpus.gutenberg.raw(file0)
print(type(emmatext))
print(len(emmatext))
# display the first 150 characters of the str emmatext
emmatext[:150]


<class 'str'>
887071


'[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse, handsome, clever, and rich, with a comfortable home\nand happy disposition, seemed to'

In [61]:
# print the first 150 characters in the str emmatext as one string
print(emmatext[:150])

[Emma by Jane Austen 1816]

VOLUME I

CHAPTER I


Emma Woodhouse, handsome, clever, and rich, with a comfortable home
and happy disposition, seemed to


In [63]:
# print the first 20 characters in emmatext by iterating over the characters
for c in emmatext[:20]:
    print(c)

[
E
m
m
a
 
b
y
 
J
a
n
e
 
A
u
s
t
e
n


In [64]:
## Review of strings and string operations
# replace end-of-line character with a space
newemmatext = emmatext.replace('\n', ' ')
newemmatext[:150]

'[Emma by Jane Austen 1816]  VOLUME I  CHAPTER I   Emma Woodhouse, handsome, clever, and rich, with a comfortable home and happy disposition, seemed to'

In [65]:
### Development of regular expressions for tokenizing text
import re
# pattern to match words, i.e. anything with a sequence of word characters, ignores special chars
shorttext = 'That book is interesting.'
pword = re.compile('\w+')
print(re.findall(pword, shorttext))

['That', 'book', 'is', 'interesting']


In [66]:
specialtext = 'That U.S.A. poster-print costs $12.40, but with 10% off.'
print(re.findall(pword, specialtext))

['That', 'U', 'S', 'A', 'poster', 'print', 'costs', '12', '40', 'but', 'with', '10', 'off']


In [67]:
# pattern to match words with internal hyphens
ptoken = re.compile('(\w+(-\w+)*)')
print(re.findall(ptoken, specialtext))
print(re.findall(ptoken, 'end-of-line character'))

[('That', ''), ('U', ''), ('S', ''), ('A', ''), ('poster-print', '-print'), ('costs', ''), ('12', ''), ('40', ''), ('but', ''), ('with', ''), ('10', ''), ('off', '')]
[('end-of-line', '-line'), ('character', '')]


In [68]:
# ignore the group of the inner parentheses 
ptoken = re.compile('(\w+(?:-\w+)*)')
print(re.findall(ptoken, specialtext))
print(re.findall(ptoken, 'end-of-line character'))

['That', 'U', 'S', 'A', 'poster-print', 'costs', '12', '40', 'but', 'with', '10', 'off']
['end-of-line', 'character']


In [69]:

# abbreviations like U.S.A.
pabbrev = re.compile('((?:[A-Z]\.)+)')
print(re.findall(pabbrev, specialtext))

['U.S.A.']


In [70]:
# combine this pattern with the words to make more general tokens
ptoken = re.compile('(\w+(?:-\w+)*|(?:[A-Z]\.)+)')
print(re.findall(ptoken, specialtext))

['That', 'U', 'S', 'A', 'poster-print', 'costs', '12', '40', 'but', 'with', '10', 'off']


In [71]:
# switch the order of the patterns to first match abbreviations and then other words
ptoken = re.compile('((?:[A-Z]\.)+|\w+(?:-\w+)*)')
print(re.findall(ptoken, specialtext))



['That', 'U.S.A.', 'poster-print', 'costs', '12', '40', 'but', 'with', '10', 'off']


In [72]:
# add expression for currency
ptoken = re.compile('((?:[A-Z]\.)+|\w+(?:-\w+)*|\$?\d+(?:\.\d+)?)')
print(re.findall(ptoken, specialtext))

['That', 'U.S.A.', 'poster-print', 'costs', '$12.40', 'but', 'with', '10', 'off']


In [73]:
# this is an equivalent regular expression except that it has extra parentheses
ptoken = re.compile(r'''((?:[A-Z]\.)+) # abbreviations, e.g. U.S.A.
   | (\w+(?:-\w+)*) # words with internal hyphens
   | (\$?\d+(?:\.\d+)?) # currency, like $12.40
   ''', re.X) # verbose flag

print(re.findall(ptoken, specialtext))

[('', 'That', ''), ('U.S.A.', '', ''), ('', 'poster-print', ''), ('', 'costs', ''), ('', '', '$12.40'), ('', 'but', ''), ('', 'with', ''), ('', '10', ''), ('', 'off', '')]


In [74]:
## More about findall()
# using the findall() function to find 2 parts of each match
email_text = "For more information, send a request to info@ischool.syr.edu. Or you can directly contact our information staff at HelpfulHenry@syr.edu and SageSue@syr.edu."


In [75]:
# re with two parentheses to match username and domain in every email address
pemail = re.compile('([a-zA-Z]+)@([a-z.]+)')
matches = re.findall(pemail, email_text)
for m in matches:
    # format function puts each argument into the output string where the {} is
    email = 'User: {}, Domain:{}'.format(m[0],m[1])
    print(email)

User: info, Domain:ischool.syr.edu.
User: HelpfulHenry, Domain:syr.edu
User: SageSue, Domain:syr.edu.


In [79]:
### using NLTK's regular expression tokenizer
# first define a multi-line string that is a regular expression
pattern = r''' (?x) 	# set flag to allow verbose regexps
        (?:[A-Z]\.)+    # abbreviations, e.g. U.S.A.
        | \$?\d+(?:\.\d+)?%?    # currency and percentages, $12.40, 50%
        | \w+(?:-\w+)*  # words with internal hyphens
        | \.\.\.        # ellipsis
        | [][.,;”’?():-_%#’]    # separate tokens
        '''

# the nltk regular expression tokenizer compiles the re pattern, applies it to the text
#  and uses the matching groups to return a list of only the matched tokens
print(nltk.regexp_tokenize(shorttext, pattern))
print(nltk.regexp_tokenize(specialtext, pattern))

# compare with built-in word tokenizer
print(nltk.word_tokenize(specialtext))


['That', 'book', 'is', 'interesting', '.']
['That', 'U.S.A.', 'poster-print', 'costs', '$12.40', ',', 'but', 'with', '10%', 'off', '.']
['That', 'U.S.A.', 'poster-print', 'costs', '$', '12.40', ',', 'but', 'with', '10', '%', 'off', '.']


In [80]:
# Tokenizer for Twitter derived tweetmotif from the ARK, developed at CMU
tweetPattern = r''' (?x)	# set flag to allow verbose regexps
      (?:https?://|www)\S+      # simple URLs
      | (?::-\)|;-\))		# small list of emoticons
      | &(?:amp|lt|gt|quot);    # XML or HTML entity
      | \#\w+                 # hashtags
      | @\w+                  # mentions   
      | \d+:\d+               # timelike pattern
      | \d+\.\d+              # number with a decimal
      | (?:\d+,)+?\d{3}(?=(?:[^,]|$))   # number with a comma
      | (?:[A-Z]\.)+                    # simple abbreviations
      | (?:--+)               # multiple dashes
      | \w+(?:-\w+)*          # words with internal hyphens or apostrophes
      | ['\".?!,:;/]+         # special characters
      '''


In [81]:
tweet1 = "@natalieohayre I agree #hc09 needs reform- but not by crooked politicians who r clueless about healthcare! #tcot #fishy NO GOV'T TAKEOVER!"

tweet2 = "To Sen. Roland Burris: Affordable, quality health insurance can't wait http://bit.ly/j63je #hc09 #IL #60660"

tweet3 = "RT @karoli: RT @Seriou: .@whitehouse I will stand w/ Obama on #healthcare,  I trust him. #p2 #tlot"

In [82]:
print(nltk.regexp_tokenize(tweet1,tweetPattern))
print(nltk.regexp_tokenize(tweet2,tweetPattern))
print(nltk.regexp_tokenize(tweet3,tweetPattern))

['@natalieohayre', 'I', 'agree', '#hc09', 'needs', 'reform', 'but', 'not', 'by', 'crooked', 'politicians', 'who', 'r', 'clueless', 'about', 'healthcare', '!', '#tcot', '#fishy', 'NO', 'GOV', "'", 'T', 'TAKEOVER', '!']
['To', 'Sen', '.', 'Roland', 'Burris', ':', 'Affordable', ',', 'quality', 'health', 'insurance', 'can', "'", 't', 'wait', 'http://bit.ly/j63je', '#hc09', '#IL', '#60660']
['RT', '@karoli', ':', 'RT', '@Seriou', ':', '.', '@whitehouse', 'I', 'will', 'stand', 'w', '/', 'Obama', 'on', '#healthcare', ',', 'I', 'trust', 'him', '.', '#p2', '#tlot']


In [83]:
# NLTK built-in tokenizer (more detailed version from TweetMotif)
from nltk.tokenize import TweetTokenizer
ttokenizer = TweetTokenizer()
print(ttokenizer.tokenize(tweet1))


['@natalieohayre', 'I', 'agree', '#hc09', 'needs', 'reform', '-', 'but', 'not', 'by', 'crooked', 'politicians', 'who', 'r', 'clueless', 'about', 'healthcare', '!', '#tcot', '#fishy', 'NO', "GOV'T", 'TAKEOVER', '!']


In [84]:
# sentence example for the question

sent = "Mr. Black and Mrs. Brown attended the lecture by Dr. Gray, but Gov. White wasn't there."
print(nltk.regexp_tokenize(sent, pattern))

['Mr', '.', 'Black', 'and', 'Mrs', '.', 'Brown', 'attended', 'the', 'lecture', 'by', 'Dr', '.', 'Gray', ',', 'but', 'Gov', '.', 'White', 'wasn', 't', 'there', '.']
