# NLP Basics

#### Download NLTK data

In [3]:
import nltk
#nltk.download()

In [4]:
#dir(nltk)

#### What can you do with NLTK?

In [6]:
from nltk.corpus import stopwords

# First 10 stopwords in the English language
stopwords.words('english')[0:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [7]:
# Start at element zero, stop before index 500, in steps of 25
stopwords.words('english')[0:500:25]

['i', 'herself', 'been', 'with', 'here', 'very', 'doesn', 'won']

#### Read in semi-structured text data

In [9]:
# Read in the raw text. The `with` block closes the file handle as soon as the
# contents have been read, instead of leaving it open for the rest of the session.
with open("data/SMSSpamCollection.tsv") as raw_file:
    rawData = raw_file.read()

# Print the raw data (0 to 499 position)
rawData[0:500]

"ham\tI've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.\nspam\tFree entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's\nham\tNah I don't think he goes to usf, he lives around here though\nham\tEven my brother is not like to speak with me. They treat me like aid"

In [10]:
mem = rawData[0:500]

len(mem)

500

In [11]:
# Replace all tab characters ('\t') in the raw data with newline characters ('\n')
# Split the modified string into a list of strings using newline characters ('\n') as the delimiter
parsedData = rawData.replace('\t', '\n').split('\n')

# Show the first 10 components (10 lines)
parsedData[0:10]

['ham',
 "I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.",
 'spam',
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
 'ham',
 "Nah I don't think he goes to usf, he lives around here though",
 'ham',
 'Even my brother is not like to speak with me. They treat me like aids patent.',
 'ham',
 'I HAVE A DATE ON SUNDAY WITH WILL!!']

In [12]:
# Extract every other element from `parsedData` starting from index 0, until the end, increments of 2
labelList = parsedData[0::2]

# Extract every other element from `parsedData` starting from index 1, until the end, increments of 2
textList = parsedData[1::2]

# Print the first 5 elements of the `labelList` to check the labels
print(labelList[0:5])

# Print the first 5 elements of the `textList` to check the corresponding text values
print(textList[0:5])

['ham', 'spam', 'ham', 'ham', 'ham']
["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.", "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", "Nah I don't think he goes to usf, he lives around here though", 'Even my brother is not like to speak with me. They treat me like aids patent.', 'I HAVE A DATE ON SUNDAY WITH WILL!!']


In [13]:
# Let's see the length of both lists; it will be important soon

print(len(labelList))
print(len(textList))

5571
5570


To create a pandas DataFrame from a dictionary of lists, all of the lists must be the same length!

In [15]:
# Go to very end, count five backwards, and print out those last five
print(labelList[-5:])

['ham', 'ham', 'ham', 'ham', '']


The last element in `labelList` is an empty string.

In [17]:
import pandas as pd

# Build a pandas DataFrame from a dictionary of equal-length lists:

fullCorpus = pd.DataFrame({
    'label': labelList[:-1],   #### <= Do not grab the last one, so both will be 5570
    'body_list': textList
})

fullCorpus.head()

Unnamed: 0,label,body_list
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


#### Direct approach

In [19]:
dataset = pd.read_csv("data/SMSSpamCollection.tsv", sep="\t", header=None)

dataset.head()

Unnamed: 0,0,1
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [20]:
# Load the tab-separated file directly; header=None because the first line of the file is data
fullCorpus = pd.read_csv('data/SMSSpamCollection.tsv', sep='\t', header=None)

fullCorpus.columns = ['label', 'body_text']   # Creates a header

# NOTE(review): the manual parse earlier in this notebook produced 5570 label/text pairs,
# while the shape check on this frame reports 5568 rows. Possibly read_csv's default
# double-quote handling merges lines when a message contains '"' -- TODO confirm
# (e.g. compare the row count against quoting=csv.QUOTE_NONE).
fullCorpus.head()

Unnamed: 0,label,body_text
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


#### Explore the dataset

In [22]:
# Report the dimensions (rows, columns) of the dataset

print("Input data has {} rows and {} columns".format(*fullCorpus.shape))

Input data has 5568 rows and 2 columns


In [23]:
# Count the messages in each class; summing a boolean mask counts its True entries

print("Out of {} rows, {} are spam, {} are ham".format(
    fullCorpus.shape[0],
    (fullCorpus['label'] == 'spam').sum(),
    (fullCorpus['label'] == 'ham').sum(),
))

Out of 5568 rows, 746 are spam, 4822 are ham


In [24]:
# How much missing data is there?

print("Number of null in label: {}".format(fullCorpus['label'].isnull().sum()))      # '.sum()' will sum all the trues
print("Number of null in text: {}".format(fullCorpus['body_text'].isnull().sum()))   # '.sum()' will sum all the trues

Number of null in label: 0
Number of null in text: 0


## Using regular expressions in Python

Python's `re` package is the most commonly used regex resource. More details can be found [here](https://docs.python.org/3/library/re.html).

In [26]:
import re

re_test = 'This is a made up string to test 2 different regex methods'

re_test_messy = 'This      is a made up     string to test 2    different regex methods'

re_test_messy1 = 'This-is-a-made/up.string*to>>>>test----2""""""different~regex-methods'

#### Splitting a sentence into a list of words

In [28]:
# re_test = 'This is a made up string to test 2 different regex methods'

# Raw string (r'...') so the backslash reaches the regex engine untouched;
# '\s' in a normal string literal is an invalid escape sequence and triggers a warning.
re.split(r'\s', re_test)   # Split on every single whitespace character

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

In [29]:
# re_test_messy = 'This      is a made up     string to test 2    different regex methods'

# Raw string avoids the invalid-escape warning that '\s' raises in a normal literal.
# Splitting on a single whitespace character leaves empty strings between repeated spaces.
re.split(r'\s', re_test_messy)

['This',
 '',
 '',
 '',
 '',
 '',
 'is',
 'a',
 'made',
 'up',
 '',
 '',
 '',
 '',
 'string',
 'to',
 'test',
 '2',
 '',
 '',
 '',
 'different',
 'regex',
 'methods']

In [30]:
# re_test_messy = 'This      is a made up     string to test 2    different regex methods'

# Raw string avoids the invalid-escape warning that '\s+' raises in a normal literal.
re.split(r'\s+', re_test_messy)  # '+' treats each run of one or more whitespace characters as one delimiter

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

In [31]:
# re_test_messy1 = 'This-is-a-made/up.string*to>>>>test----2""""""different~regex-methods'

# Raw string avoids the invalid-escape warning that '\s+' raises in a normal literal.
# The string contains no whitespace, so nothing is split and it comes back as one element.
re.split(r'\s+', re_test_messy1)

['This-is-a-made/up.string*to>>>>test----2""""""different~regex-methods']

In [32]:
# re_test_messy1 = 'This-is-a-made/up.string*to>>>>test----2""""""different~regex-methods'

# Raw string avoids the invalid-escape warning that '\W+' raises in a normal literal.
re.split(r'\W+', re_test_messy1)   # Split on everything except word characters (letters, digits, underscore)
                                   # Capital \W followed by + matches one or more consecutive non-word characters

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

The other option is, instead of searching for what splits the words, just **search for the actual words themselves** and ignore the characters that split the words. This can be done using the `findall()` method. 

In [34]:
# re_test = 'This is a made up string to test 2 different regex methods'

# Raw string avoids the invalid-escape warning that '\S+' raises in a normal literal.
re.findall(r'\S+', re_test)  # Instead of looking for whitespace, look for runs of one or more non-whitespace characters

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

In [35]:
# re_test_messy = 'This      is a made up     string to test 2    different regex methods'

# Raw string avoids the invalid-escape warning that '\S+' raises in a normal literal.
re.findall(r'\S+', re_test_messy)  # Runs of non-whitespace characters; repeated spaces produce no empty entries

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

In [36]:
# re_test_messy1 = 'This-is-a-made/up.string*to>>>>test----2""""""different~regex-methods'

# Raw string avoids the invalid-escape warning that '\S+' raises in a normal literal.
# The string has no whitespace at all, so the whole string is a single non-whitespace run.
re.findall(r'\S+', re_test_messy1)

['This-is-a-made/up.string*to>>>>test----2""""""different~regex-methods']

In [37]:
# re_test_messy1 = 'This-is-a-made/up.string*to>>>>test----2""""""different~regex-methods'

# Raw string avoids the invalid-escape warning that '\w+' raises in a normal literal.
re.findall(r'\w+', re_test_messy1)

# Lowercase '\w+' matches one or more word characters (letters, digits and underscore),
# so it picks out the word-like tokens themselves, including numbers such as '2'.

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

#### Replacing a specific string

In [39]:
pep8_test = 'I try to follow PEP8 guidelines'

pep7_test = 'I try to follow PEP7 guidelines'

peep8_test = 'I try to follow PEEP8 guidelines'

In [40]:
# pep8_test = 'I try to follow PEP8 guidelines'

re.findall('[a-z]+', pep8_test)    # Find all occurrences of lowercase alphabetic sequences in `pep8_test`

['try', 'to', 'follow', 'guidelines']

In [41]:
# pep8_test = 'I try to follow PEP8 guidelines'

re.findall('[A-Z]+', pep8_test)    # Find all occurrences of upper case alphabetic sequences in `pep8_test`

['I', 'PEP']

In [42]:
# peep8_test = 'I try to follow PEEP8 guidelines'

re.findall('[A-Z]+[0-9]+', peep8_test)  # Find runs of one or more capital letters (A-Z) immediately followed by one or more digits (0-9)

['PEEP8']

In [43]:
# peep8_test = 'I try to follow PEEP8 guidelines'

re.sub('[A-Z]+[0-9]+', 'Python Styleguide', peep8_test)    # Replace all occurrences of the pattern '[A-Z]+[0-9]+'
                                                           # in the `peep8_test` string with the string 'Python Styleguide'

'I try to follow Python Styleguide guidelines'

#### Other examples of regex methods

- re.search()
- re.match()
- re.fullmatch()
- re.finditer()
- re.escape()

## Pre-processing text data

Cleaning up the text data is necessary to highlight attributes that you're going to want your machine learning system to pick up on. Cleaning (or pre-processing) the data typically consists of a number of steps:
1. **Remove punctuation**
2. **Tokenization**
3. **Remove stopwords**
4. Lemmatize/Stem

The first three steps are covered in this chapter as they're implemented in pretty much any text cleaning pipeline. Lemmatizing and stemming are covered in the next chapter as they're helpful but not critical.

In [46]:
# Show up to 100 characters per column when pandas displays a DataFrame
pd.set_option('display.max_colwidth', 100)   

data = pd.read_csv("data/SMSSpamCollection.tsv", sep='\t', header=None)
data.columns = ['label', 'body_text']

data.head()

Unnamed: 0,label,body_text
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
2,ham,"Nah I don't think he goes to usf, he lives around here though"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [47]:
# What does the cleaned version look like?
data_cleaned = pd.read_csv("data/SMSSpamCollection_cleaned.tsv", sep='\t')
data_cleaned.head()

Unnamed: 0,label,body_text,body_text_nostop
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,"['ive', 'searching', 'right', 'words', 'thank', 'breather', 'promise', 'wont', 'take', 'help', '..."
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"['free', 'entry', '2', 'wkly', 'comp', 'win', 'fa', 'cup', 'final', 'tkts', '21st', 'may', '2005..."
2,ham,"Nah I don't think he goes to usf, he lives around here though","['nah', 'dont', 'think', 'goes', 'usf', 'lives', 'around', 'though']"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,"['even', 'brother', 'like', 'speak', 'treat', 'like', 'aids', 'patent']"
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,"['date', 'sunday']"


#### Remove punctuation

In [49]:
import string

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [50]:
"I like NLP." == "I like NLP"

False

In [51]:
# Define a function to remove punctuation from a given text
def remove_punct(text):
    """Return the non-punctuation characters of `text` as a list.

    Every character of `text` that does not appear in `string.punctuation`
    is kept, in its original order. The result is a list of single
    characters, not a joined string.
    """
    kept_chars = []
    for ch in text:
        # Skip anything listed in string.punctuation
        if ch in string.punctuation:
            continue
        kept_chars.append(ch)
    return kept_chars

# Run `remove_punct` over every message in the 'body_text' column of `data`
# and keep the results in a new 'body_text_clean' column
data['body_text_clean'] = data['body_text'].apply(remove_punct)

data.head()

Unnamed: 0,label,body_text,body_text_clean
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,"[I, v, e, , b, e, e, n, , s, e, a, r, c, h, i, n, g, , f, o, r, , t, h, e, , r, i, g, h, t,..."
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[F, r, e, e, , e, n, t, r, y, , i, n, , 2, , a, , w, k, l, y, , c, o, m, p, , t, o, , w,..."
2,ham,"Nah I don't think he goes to usf, he lives around here though","[N, a, h, , I, , d, o, n, t, , t, h, i, n, k, , h, e, , g, o, e, s, , t, o, , u, s, f, ,..."
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,"[E, v, e, n, , m, y, , b, r, o, t, h, e, r, , i, s, , n, o, t, , l, i, k, e, , t, o, , s,..."
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,"[I, , H, A, V, E, , A, , D, A, T, E, , O, N, , S, U, N, D, A, Y, , W, I, T, H, , W, I, L, L]"


In [52]:
# Define a function to remove punctuation from a given text
def remove_punct(text):
    """Return `text` as one string with all punctuation characters removed.

    A character is dropped when it appears in `string.punctuation`; all other
    characters (including whitespace) are kept in their original order.
    """
    return "".join(filter(lambda ch: ch not in string.punctuation, text))

# Strip punctuation from every message in the 'body_text' column of `data`
# and save the cleaned strings in a new 'body_text_clean' column
data['body_text_clean'] = data['body_text'].apply(remove_punct)

data.head()

Unnamed: 0,label,body_text,body_text_clean
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,Ive been searching for the right words to thank you for this breather I promise i wont take your...
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...
2,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,Even my brother is not like to speak with me They treat me like aids patent
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL


#### Tokenization

In [54]:
import re

def tokenize(text):
    """Split `text` into tokens on runs of non-word characters.

    Args:
        text: The string to split.

    Returns:
        list of str: The pieces of `text` between delimiter runs. If `text`
        starts or ends with a non-word character (for example a trailing
        space), the list contains an empty string at that end, which is
        standard `re.split` behaviour.
    """
    # Raw string so the backslash reaches the regex engine untouched; '\W' in a
    # normal string literal is an invalid escape sequence and triggers a warning.
    tokens = re.split(r'\W+', text)
    return tokens

# Lowercase each cleaned message, then tokenize it
data['body_text_tokenized'] = data['body_text_clean'].map(lambda message: tokenize(message.lower()))

data.head()

Unnamed: 0,label,body_text,body_text_clean,body_text_tokenized
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,Ive been searching for the right words to thank you for this breather I promise i wont take your...,"[ive, been, searching, for, the, right, words, to, thank, you, for, this, breather, i, promise, ..."
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to..."
2,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,Even my brother is not like to speak with me They treat me like aids patent,"[even, my, brother, is, not, like, to, speak, with, me, they, treat, me, like, aids, patent]"
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL,"[i, have, a, date, on, sunday, with, will]"


In [55]:
'NLP' == 'nlp'

False

#### Remove stopwords

In [57]:
import nltk

stopword = nltk.corpus.stopwords.words('english')

In [58]:
def remove_stopwords(tokenized_list):
    """Return the tokens of `tokenized_list` that are not in `stopword`.

    `stopword` is the module-level collection of stopwords; membership is
    tested with an exact (case-sensitive) match, and token order is preserved.
    """
    kept_tokens = []
    for token in tokenized_list:
        if token not in stopword:
            kept_tokens.append(token)
    return kept_tokens

# Drop the stopwords from every tokenized message
data['body_text_nostop'] = data['body_text_tokenized'].apply(remove_stopwords)

data.head()

Unnamed: 0,label,body_text,body_text_clean,body_text_tokenized,body_text_nostop
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,Ive been searching for the right words to thank you for this breather I promise i wont take your...,"[ive, been, searching, for, the, right, words, to, thank, you, for, this, breather, i, promise, ...","[ive, searching, right, words, thank, breather, promise, wont, take, help, granted, fulfil, prom..."
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
2,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]","[nah, dont, think, goes, usf, lives, around, though]"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,Even my brother is not like to speak with me They treat me like aids patent,"[even, my, brother, is, not, like, to, speak, with, me, they, treat, me, like, aids, patent]","[even, brother, like, speak, treat, like, aids, patent]"
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL,"[i, have, a, date, on, sunday, with, will]","[date, sunday]"
