# NLP - Regex

In [1]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stopwords = stopwords.words('english')[0:30]
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself']


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shubham\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
### Read in Semi-Structured Text data
raw_data = open("SMSSpamCollection.tsv").read()

# print raw data
raw_data[0:500]

"ham\tI've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.\nspam\tFree entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's\nham\tNah I don't think he goes to usf, he lives around here though\nham\tEven my brother is not like to speak with me. They treat me like aid"

In [15]:
# replace tabs with new lines
parsed_data = raw_data.replace("\t", "\n")

# create list of sentences
parsed_data = parsed_data.split("\n")

parsed_data[0:5]

['ham',
 "I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.",
 'spam',
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
 'ham']

In [27]:
# separating the text and labels from the data
# every alternate element is label

label_list = parsed_data[0::2]
text_list = parsed_data[1::2]

print("Labels in data\n\n")
print(label_list[0:20])
print("\n\nText in data\n\n")
print(text_list[0:20])

Labels in data


['ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'spam', 'spam', 'ham', 'spam', 'spam', 'spam', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham']


Text in data


["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.", "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", "Nah I don't think he goes to usf, he lives around here though", 'Even my brother is not like to speak with me. They treat me like aids patent.', 'I HAVE A DATE ON SUNDAY WITH WILL!!', "As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune", 'WINNER!! As a valued network customer you have been selected to receivea Â£900 prize reward! To claim call 0

In [24]:
# printing the length of lists to create a data frame from it
print(len(label_list))
print(len(text_list))

5571
5570


In [28]:
# printing the last element to see if we can do something with different sizes of list
print(label_list[-5:])

# deleting the last element as it is blank
label_list = label_list[-2::-1]

print(len(label_list))
print(len(text_list))

['ham', 'ham', 'ham', 'ham', '']
5570
5570


In [29]:
# creating the pandas dataframe
import pandas as pd

full_corpus = pd.DataFrame({'label':label_list, 'text':text_list})
full_corpus.head()

Unnamed: 0,label,text
0,ham,I've been searching for the right words to tha...
1,ham,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,spam,I HAVE A DATE ON SUNDAY WITH WILL!!


## We can read the dataset in similar way if there is "\t" separator

In [38]:
full_corpus = pd.read_csv("SMSSpamCollection.tsv", header=None, sep='\t')
full_corpus.columns = ['label', 'body_text']
full_corpus.head()

Unnamed: 0,label,body_text
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


# Exporing the Data 

In [39]:
# What is shape of data
print("input data has {} rows and {} columns".format(len(full_corpus), len(full_corpus.columns)))

input data has 5568 rows and 2 columns


In [40]:
# How many spams/ hams are there?
print("out of {} rows, {} are spam, {} are hams".format(len(full_corpus),
                                                        len(full_corpus[full_corpus['label']=='spam']),
                                                        len(full_corpus[full_corpus['label']=='ham'])))

out of 5568 rows, 746 are spam, 4822 are hams


In [41]:
# How much missing data is there?
print("number of null in label : {}".format(full_corpus['label'].isnull().sum()))
print("number of null in Text : {}".format(full_corpus['body_text'].isnull().sum()))

number of null in label : 0
number of null in Text : 0


# Regular Expressions
Text string for describing a search pattern

##### Use cases
1. Confirming passwords meet criteria
2. Searching URL for some substring
3. Searching for files on your computer
4. Document scrapping

### Usefull methods for tokenizing 
1. findall()
2. split()

### Usefull regexes for tokenizing
1. '\W' & '\w' - words
2. '\S' & '\s' - whitespaces

In [42]:
import re

re_test = "This is a made up string to test 2 different regex methods"
re_test_messy = "This      is a made up      string to test 2 different regex methods"
re_test_messy1 = "This-is-a-made/up.string*to>>>>>test-----2*******different-regex-methods"

### Splitting a sentence into a list words

### 1. split()

In [53]:
print(re.split('\s', re_test))

['This', 'is', 'a', 'made', 'up', 'string', 'to', 'test', '2', 'different', 'regex', 'methods']


In [54]:
print(re.split('\s', re_test_messy))

['This', '', '', '', '', '', 'is', 'a', 'made', 'up', '', '', '', '', '', 'string', 'to', 'test', '2', 'different', 'regex', 'methods']


In [56]:
print(re.split('\s+', re_test_messy))

['This', 'is', 'a', 'made', 'up', 'string', 'to', 'test', '2', 'different', 'regex', 'methods']


In [55]:
print(re.split('\s', re_test_messy1))

['This-is-a-made/up.string*to>>>>>test-----2*******different-regex-methods']


In [57]:
print(re.split('\W+', re_test_messy))

['This', 'is', 'a', 'made', 'up', 'string', 'to', 'test', '2', 'different', 'regex', 'methods']


### 2. findall()

In [58]:
print(re.findall('\s+', re_test_messy))

['      ', ' ', ' ', ' ', '      ', ' ', ' ', ' ', ' ', ' ', ' ']


In [59]:
print(re.findall('\S+', re_test_messy))

['This', 'is', 'a', 'made', 'up', 'string', 'to', 'test', '2', 'different', 'regex', 'methods']


In [60]:
print(re.findall('\w+', re_test_messy1))

['This', 'is', 'a', 'made', 'up', 'string', 'to', 'test', '2', 'different', 'regex', 'methods']


In [61]:
print(re.findall('\W+', re_test_messy1))

['-', '-', '-', '/', '.', '*', '>>>>>', '-----', '*******', '-', '-']


## Finding the Specific String

In [75]:
re_test = "I try to followUP PEP1 and PE guidelines"

# find characters of lowr case words
print(re.findall('[a-z]', re_test))

# find lowercase words
print(re.findall('[a-z]+', re_test))

# find uppercase words
print(re.findall('[A-Z]+', re_test))


# find numeric words
print(re.findall('[0-9]+', re_test))

# find lowercase and uppercase words
print(re.findall('[a-z]+[A-Z]+', re_test))

# find numerical and uppercase words
print(re.findall('[A-Z]+[1-9]+', re_test))

['t', 'r', 'y', 't', 'o', 'f', 'o', 'l', 'l', 'o', 'w', 'a', 'n', 'd', 'g', 'u', 'i', 'd', 'e', 'l', 'i', 'n', 'e', 's']
['try', 'to', 'follow', 'and', 'guidelines']
['I', 'UP', 'PEP', 'PE']
['1']
['followUP']
['PEP1']


## Replacing the Specific String

In [77]:
# replace the string that has uppercase and numerical string
print(re.sub('[A-Z]+[0-9]+', "XXXXX String is replaced XXXXX", re_test))

I try to followUP XXXXX String is replaced XXXXX and PE guidelines


# Other methods
search()
match()
fullmatch()
finditer()
escape()