# Supplemental Data Cleaning: Using a Lemmatizer

### Test out WordNet lemmatizer (read more about WordNet [here](https://wordnet.princeton.edu/))

In [1]:
import nltk

wn = nltk.WordNetLemmatizer()
ps = nltk.PorterStemmer()

In [6]:
# Example 1 using Stemming
print(ps.stem('meanness'))
print(ps.stem('meaning')) #not associated with mean
print(ps.stem('mean'))

mean
mean
mean


In [9]:
# Example 1 using Lemmatizing
print(wn.lemmatize('meanness'))
print(wn.lemmatize('meaning')) #not associated with mean
print(wn.lemmatize('mean'))

meanness
meaning
mean


In [10]:
# Example 2 using Stemming
print(ps.stem('goose'))
print(ps.stem('geese'))

goos
gees


In [11]:
# Example 2 using Lemmatizing
print(wn.lemmatize('goose'))
print(wn.lemmatize('geese'))

goose
goose


### Read in raw text

In [13]:
import pandas as pd
import re
import string
pd.set_option('display.max_colwidth', 100)

stopwords = nltk.corpus.stopwords.words('english')

data = pd.read_csv("data/SMSSpamCollection.tsv", sep='\t')
data.columns = ['label', 'body_text']

### Clean up text

In [14]:
def clean_text(text):
    '''Function to clean the given text'''
    # Remove punctuation
    text = "".join([word for word in text if word not in string.punctuation])
    # Tokenize text
    tokens = re.split('\W+', text)
    # Remove stopwords
    text = [word for word in tokens if word not in stopwords]
    return text


# Clean the text
data['body_text_nostop'] = data['body_text'].apply(lambda x: clean_text(x.lower()))

# Check if cleaned
data.head()

Unnamed: 0,label,body_text,body_text_nostop
0,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
1,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goes, usf, lives, around, though]"
2,ham,Even my brother is not like to speak with me. They treat me like aids patent.,"[even, brother, like, speak, treat, like, aids, patent]"
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,"[date, sunday]"
4,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...,"[per, request, melle, melle, oru, minnaminunginte, nurungu, vettam, set, callertune, callers, pr..."


### Lemmatize text

In [15]:
def lemmatizing(tokenized_text):
    '''Function that performs Lemmatizing'''
    text = [wn.lemmatize(word) for word in tokenized_text]
    return text

# Lemmatize text
data['body_text_lemmatized'] = data['body_text_nostop'].apply(lambda x: lemmatizing(x))

# Check if lemmatized
data.head()

Unnamed: 0,label,body_text,body_text_nostop,body_text_lemmatized
0,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
1,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goes, usf, lives, around, though]","[nah, dont, think, go, usf, life, around, though]"
2,ham,Even my brother is not like to speak with me. They treat me like aids patent.,"[even, brother, like, speak, treat, like, aids, patent]","[even, brother, like, speak, treat, like, aid, patent]"
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,"[date, sunday]","[date, sunday]"
4,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...,"[per, request, melle, melle, oru, minnaminunginte, nurungu, vettam, set, callertune, callers, pr...","[per, request, melle, melle, oru, minnaminunginte, nurungu, vettam, set, callertune, caller, pre..."
