# Regular expressions, text normalization, and edit distance

## Part 0: Initialization & Setup

In [None]:
# importing required libraries
import re
import nltk
from nltk.corpus import movie_reviews
import string
import pandas as pd
from nltk.corpus import stopwords

nltk.download('movie_reviews')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

## Part 1: Regular Expressions

### Extracting license plate numbers, IDs, emails and mailing addresses from a document


#### Document creation

In [None]:
sentence = 'I am 20 years old. My previous license plate number was 4XUI302 and my new one is 3A-278. My ID is J987492 and my address is 123 Main street, San Jose, CA. Please email me at myemail123+spam@google.cg or jane.doe@sjsu.edu'
sentence

'I am 20 years old. My previous license plate number was 4XUI302 and my new one is 3A-278. My ID is J987492 and my address is 123 Main street, San Jose, CA. Please email me at myemail123+spam@google.cg or jane.doe@sjsu.edu'

Extracting license plate numbers

In [None]:
# The format of license plate number is a digit then 2 or 3 letters (one of which can be a "-"), and then 3 digits

regex = re.compile(r'(\d{1}[A-Za-z-]{2,3}\d{3})')
lincense_plate_numbers = regex.findall(sentence)
lincense_plate_numbers

['4XUI302', '3A-278']

### Exercise 1-1: Extract the ID numbers from the document.

In [None]:
# The format of the IDs is one character/letter and then 6 digits
regex = re.compile(r'\b[A-Za-z]\d{6}\b')
ids = regex.findall(sentence)
ids

['J987492']

### Exercise 1-2: Extract the email IDs from the document

In [None]:
regex = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
emails = regex.findall(sentence)
emails

['myemail123+spam@google.cg', 'jane.doe@sjsu.edu']

### Exercise 1-3: Extract the mailing address from the document

In [None]:
regex = re.compile(r'\b\d+\s+[A-Za-z\s]*,\s*[A-Za-z\s]*,\s*[A-Z]{2}\.\b')
mailing_address = regex.findall(sentence)
mailing_address

[]

### Exercise 1-4: Anonymize the license plate numbers by replacing them with the text "LP_NUM"

The re.sub function is described here: https://docs.python.org/3/library/re.html

In [None]:
# Now replacing license plate numbers with the string "LP_NUM"
regex = re.compile(r'(\d{1}[A-Za-z-]{2,3}\d{3})')
sentence_modified = re.sub(regex, 'LP_NUM', sentence)
sentence_modified

'I am 20 years old. My previous license plate number was LP_NUM and my new one is LP_NUM. My ID is J987492 and my address is 123 Main street, San Jose, CA. Please email me at myemail123+spam@google.cg or jane.doe@sjsu.edu'

### Exercise 1-5: Replace the ID numbers with the text "ID_NUM"

In [None]:
regex = re.compile(r'\b[A-Za-z]\d{6}\b')
sentence_modified = re.sub(regex, 'LP_NUM', sentence)
sentence_modified

'I am 20 years old. My previous license plate number was 4XUI302 and my new one is 3A-278. My ID is LP_NUM and my address is 123 Main street, San Jose, CA. Please email me at myemail123+spam@google.cg or jane.doe@sjsu.edu'

## Part 2: Text Processing

Count the number of words in the movie_reviews dataset (dataset uploaded in the beginning of this notebook under "Part 0: Initialization and Setup")

In [None]:
# print number of words in the movie review dataset
len(movie_reviews.words())

1583820

Load the standard list of punctuation marks

In [None]:
punctuations = string.punctuation
punctuations

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

Remove punctation from movie reviews


In [None]:
words_wo_puncts = [x for x in movie_reviews.words() if x not in punctuations]
len(words_wo_puncts)

1338788

Count the number of unique words

In [None]:
unique_words = set(words_wo_puncts)
len(unique_words)

39737

Find the 20 most frequent words in the dataset

In [None]:
# top 20 highest freq words
pd.Series(words_wo_puncts).value_counts()[:20]

the     76529
a       38106
and     35576
of      34123
to      31937
is      25195
in      21822
s       18513
it      16107
that    15924
as      11378
with    10792
for      9961
his      9587
this     9578
film     9517
i        8889
he       8864
but      8634
on       7385
dtype: int64

Load the standard list of stopwords

In [None]:
# getting english stopwords
eng_stopwords = stopwords.words('english')
eng_stopwords

Count the number of stopwords

In [None]:
len(eng_stopwords)

179

### Exercise 2-1: Remove the stopwords from the dataset (similarly to how we removed punctuation above)

In [None]:
words_wo_puncts_stopwords = [x for x in movie_reviews.words() if x not in eng_stopwords]
len(words_wo_puncts_stopwords)

955610

### Exercise 2-2: Find the number of uniques words in the dataset now that the stop words have been removed

In [None]:
# unique words without stopwords
unique_words = set(words_wo_puncts_stopwords)
len(unique_words)

39617

### Exercise 2-3: Find the top 20 highest frequency words now that we have removed the stopwords

In [None]:
# top 20 highest freq words after removing stopwords

words_wo_puncts = [x for x in words_wo_puncts_stopwords.words() if x not in punctuations]
len(words_wo_puncts)

pd.Series(words_wo_puncts).value_counts()[:20]


AttributeError: ignored

Find the words that are used only once in the corpus (and print the first few).  

In [None]:
# 20 words that are used only once in corpus using hapaxes() function
nltk.FreqDist(words_wo_puncts_stopwords).hapaxes()[:20]

['looooot',
 'schnazzy',
 'timex',
 'indiglo',
 'jessalyn',
 'gilsig',
 'ruber',
 'jaleel',
 'balki',
 'wavers',
 'statistics',
 'snapshot',
 'guesswork',
 'maryam',
 'daylights',
 'terraformed',
 'stagnated',
 'napolean',
 'millimeter',
 'enmeshed']

### Exercise 2-4: Use the PorterStemmer to stem the words in the dataset.

Display the first few words.

In [None]:
from nltk.stem import PorterStemmer

words = nltk.word_tokenize(movie_reviews.words)

# Create a Porter Stemmer instance
stemmer = PorterStemmer()

# Stem the words in the dataset
stemmed_words = [stemmer.stem(word) for word in words]

# Display the first few stemmed words
print("Stemmed words:", stemmed_words[:5])


LookupError: ignored

### Exercise 2-5: Use the WordNetLemmatizer to lemmatize the words in the dataset.

Display the first few words.

In [None]:
from nltk import WordNetLemmatizer


### Exercise 2-6:
a) How many unique words are there once stemming is applied? (show the that performs the computation and outputs the result)

b) How many unique words are there once lemmatization is applied? (show the code that performs the computation and outputs the result)

## Part 3. Tokenization

### Exercise 3-1: Use the Penn Tree Bank tokenizer to tokenize the sentence below

Print the tokens that the tokenizer produces.

In [None]:
from nltk.tokenize import TreebankWordTokenizer
s = 'Please pay $100.55 to settle your bill.  Send confirmation to confirm@gmail.com.'

# Creating a Penn Treebank tokenizer instance
tokenizer = TreebankWordTokenizer()

# Tokenize the sentence
tokens = tokenizer.tokenize(s)
tokens


['Please',
 'pay',
 '$',
 '100.55',
 'to',
 'settle',
 'your',
 'bill.',
 'Send',
 'confirmation',
 'to',
 'confirm',
 '@',
 'gmail.com',
 '.']

## Part 4: Levenshtein Distance & Alignment

Relevant nltk documentation: https://www.nltk.org/api/nltk.metrics.distance.html

### Exercise 4-1: Use the nltk functions edit_distance to compute the Levenshtein edit-distance between the strings "intention" and "execution"

In [None]:
from nltk.metrics.distance import edit_distance

from nltk.metrics import edit_distance

# Define the two strings
string1 = "intention"
string2 = "execution"

# Compute the Levenshtein edit distance
distance = edit_distance(string1, string2)

# Print the result
print("Levenshtein edit distance between '{}' and '{}' is: {}".format(string1, string2, distance))


Levenshtein edit distance between 'intention' and 'execution' is: 5


### Exercise 4-2: Use the nltk function edit_distance_align to compute the minimum Levenshtein edit-distance based alignment mapping between the two strings "intention" and "execution"

In [None]:
from nltk.metrics.distance import edit_distance_align

# Define the two strings
string1 = "intention"
string2 = "execution"

# Compute the alignment mapping
alignment_mapping = edit_distance_align(string1, string2)

alignment_mapping

[(0, 0),
 (1, 1),
 (2, 2),
 (3, 3),
 (4, 4),
 (5, 5),
 (6, 6),
 (7, 7),
 (8, 8),
 (9, 9)]