# Supplemental Data Cleaning: Using Stemming
* **Stemming** is the process of reducing inflected words to their root form, so that a group of related words maps to the same stem (for example, *played* and *playing* both map to *play*).

### Test out porter stemmer

In [60]:
import nltk
# Create the Porter stemmer instance used by the cells that call ps.stem(...).
ps = nltk.PorterStemmer()
# List the stemmer's attributes and methods; `stem` is the one used in this notebook.
dir(ps)

['MARTIN_EXTENSIONS',
 'NLTK_EXTENSIONS',
 'ORIGINAL_ALGORITHM',
 '__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '__weakref__',
 '_abc_impl',
 '_apply_rule_list',
 '_contains_vowel',
 '_ends_cvc',
 '_ends_double_consonant',
 '_has_positive_measure',
 '_is_consonant',
 '_measure',
 '_replace_suffix',
 '_step1a',
 '_step1b',
 '_step1c',
 '_step2',
 '_step3',
 '_step4',
 '_step5a',
 '_step5b',
 'mode',
 'pool',
 'stem',
 'unicode_repr',
 'vowels']

In [61]:
# Stem several inflections of "play", printing one result per line.
for word in ('play', 'played', 'playing', 'player'):
    print(ps.stem(word))

play
play
play
player


# Read in raw text

In [62]:
import re
import string
import pandas as pd

# Let pandas display up to 100 characters per column.
pd.set_option('display.max_colwidth', 100)

# English stop-word list provided by NLTK.
stopwords = nltk.corpus.stopwords.words('english')

# Load the CSV, discard the three unnamed columns, then name the remaining ones.
data = (
    pd.read_csv('spam.csv', encoding='latin-1')
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
)
data.columns = ['lable', 'body_text']
data.head()

Unnamed: 0,lable,body_text
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


# Clean up text

In [63]:
def clean_text(text):
    """Strip punctuation from *text*, tokenize it and drop stop words.

    Parameters:
        text: the message to clean. Stop words are matched exactly against
            the module-level ``stopwords`` collection, so the caller is
            expected to pass lower-cased text.

    Returns:
        A list of the remaining tokens, in their original order. Empty
        tokens are never included.
    """
    # Delete every character listed in string.punctuation in a single pass.
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize on runs of non-word characters (raw string: '\W' is a regex
    # escape, not a Python string escape).
    tokens = re.split(r'\W+', text)
    # re.split yields '' when the text is empty or starts/ends with a
    # separator; skip those along with the stop words.
    return [word for word in tokens if word and word not in stopwords]


data['body_text_nostopwords'] = data['body_text'].apply(lambda x: clean_text(x.lower()))
data.head()

Unnamed: 0,lable,body_text,body_text_nostopwords
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]"
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goes, usf, lives, around, though]"


# Stem text

In [64]:
def stemming(tokenized_text):
    """Return a list holding the Porter stem of each token in *tokenized_text*."""
    return list(map(ps.stem, tokenized_text))


# Stem every token list produced by the stop-word removal step.
data['body_text_stremmed'] = data['body_text_nostopwords'].apply(stemming)
data.head()

Unnamed: 0,lable,body_text,body_text_nostopwords,body_text_stremmed
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]","[go, jurong, point, crazi, avail, bugi, n, great, world, la, e, buffet, cine, got, amor, wat]"
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv...","[free, entri, 2, wkli, comp, win, fa, cup, final, tkt, 21st, may, 2005, text, fa, 87121, receiv,..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, earli, hor, u, c, alreadi, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goes, usf, lives, around, though]","[nah, dont, think, goe, usf, live, around, though]"


In [65]:
sentence = 'coursera is the best platform for learning new technoligies and keeping in trend'
# Passing a whole sentence to stem() does not stem its individual words: the
# recorded output of this cell is the sentence unchanged. The text has to be
# tokenized into words first.
ps.stem(sentence)

'coursera is the best platform for learning new technoligies and keeping in trend'

# stemming On Sentences

In [66]:
from nltk.tokenize import sent_tokenize, word_tokenize


def stemSentence(sentence):
    """Tokenize *sentence* into words and return the stems as one string.

    Every stem is followed by a single space, so a non-empty result ends
    with a trailing space.
    """
    return "".join(ps.stem(token) + " " for token in word_tokenize(sentence))


x = stemSentence(sentence)
print(x)

coursera is the best platform for learn new technoligi and keep in trend 


# stemming On Document

In [67]:
# Read the whole document as a single string. The context manager closes the
# file handle as soon as the read finishes instead of leaving it open.
with open('DeepLearningStem.txt') as file:
    document_text = file.read()
document_text

"Hello everyone and welcome to the complete iOS App Development bootcamp.\nMy name is Angela and I head up the team at the London App Brewery as part of my job.\nI develop iOS and WatchOS applications for our corporate clients and also I teach on our in-person app development\nSo I'm here to tell you about all the awesome things that you can look forward to as a part of this course.\nFirstly, you're going to be learning how to build 20 mobile apps, after which you'll be fluently programming\nin swift and we're going to be taking you from complete beginner to app development professional because\nyou'll have understood how to build your very own cloud based database,\nHow to develop real time chat applications as well as using APIs and REST to grab data from the internet\nand most excitingly of all we're going to teach you how to build intelligent apps using machine learning\nand also how to build augmented reality app using ARKit and CoreML - Apple's latest frameworks in iOS\n11. These

In [68]:
# Read the document as a list of lines (each line keeps its trailing newline).
# The context manager closes the file handle once the lines are loaded.
with open('DeepLearningStem.txt') as file:
    my_file_list = file.readlines()
my_file_list

['Hello everyone and welcome to the complete iOS App Development bootcamp.\n',
 'My name is Angela and I head up the team at the London App Brewery as part of my job.\n',
 'I develop iOS and WatchOS applications for our corporate clients and also I teach on our in-person app development\n',
 "So I'm here to tell you about all the awesome things that you can look forward to as a part of this course.\n",
 "Firstly, you're going to be learning how to build 20 mobile apps, after which you'll be fluently programming\n",
 "in swift and we're going to be taking you from complete beginner to app development professional because\n",
 "you'll have understood how to build your very own cloud based database,\n",
 'How to develop real time chat applications as well as using APIs and REST to grab data from the internet\n',
 "and most excitingly of all we're going to teach you how to build intelligent apps using machine learning\n",
 "and also how to build augmented reality app using ARKit and CoreML

In [69]:
from nltk.tokenize import sent_tokenize, word_tokenize


def stemSentence(sentence):
    """Return the stems of the words in *sentence*, each followed by a space."""
    pieces = [f"{ps.stem(token)} " for token in word_tokenize(sentence)]
    return "".join(pieces)


# Show one line of the document, then the same line after stemming.
print(my_file_list[5])
print('Stem Sentences\n')
x = stemSentence(my_file_list[5])
print(x)

in swift and we're going to be taking you from complete beginner to app development professional because

Stem Sentences

in swift and we 're go to be take you from complet beginn to app develop profession becaus 


# Supplemental Data Cleaning :Using Lemmatizer

In [74]:
import nltk

# WordNet lemmatizer instance used by the cells that call wn.lemmatize(...).
wn = nltk.WordNetLemmatizer()
# Porter stemmer instance, used alongside the lemmatizer in the comparison cells.
ps = nltk.PorterStemmer()
# List the lemmatizer's attributes and methods; `lemmatize` is the one used in this notebook.
dir(wn)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '__weakref__',
 'lemmatize',
 'unicode_repr']

In [75]:
# Stem two related words, printing one result per line.
for word in ('meanness', 'meaning'):
    print(ps.stem(word))

mean
mean


In [76]:
# Lemmatize the same two words for comparison with the stemmer.
for word in ('meanness', 'meaning'):
    print(wn.lemmatize(word))

meanness
meaning


In [77]:
# Stem a singular/plural pair, printing one result per line.
for word in ('goose', 'geese'):
    print(ps.stem(word))

goos
gees


In [79]:
# Lemmatize the same singular/plural pair for comparison with the stemmer.
for word in ('goose', 'geese'):
    print(wn.lemmatize(word))

goose
goose


# Read in raw text

In [80]:
import re
import string
import pandas as pd

# Let pandas display up to 100 characters per column.
pd.set_option('display.max_colwidth', 100)

# English stop-word list provided by NLTK.
stopwords = nltk.corpus.stopwords.words('english')

# Load the CSV, discard the three unnamed columns, then name the remaining ones.
data = (
    pd.read_csv('spam.csv', encoding='latin-1')
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
)
data.columns = ['lable', 'body_text']
data.head()

Unnamed: 0,lable,body_text
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


# Clean up text

In [82]:
def clean_text(text):
    """Strip punctuation from *text*, tokenize it and drop stop words.

    Parameters:
        text: the message to clean. Stop words are matched exactly against
            the module-level ``stopwords`` collection, so the caller is
            expected to pass lower-cased text.

    Returns:
        A list of the remaining tokens, in their original order. Empty
        tokens are never included.
    """
    # Delete every character listed in string.punctuation in a single pass.
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize on runs of non-word characters (raw string: '\W' is a regex
    # escape, not a Python string escape).
    tokens = re.split(r'\W+', text)
    # re.split yields '' when the text is empty or starts/ends with a
    # separator; skip those along with the stop words.
    return [word for word in tokens if word and word not in stopwords]


data['body_text_nostopwords'] = data['body_text'].apply(lambda x: clean_text(x.lower()))
data.head()

Unnamed: 0,lable,body_text,body_text_nostopwords
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]"
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goes, usf, lives, around, though]"


# Lemmatize Text

In [85]:
def lemmatizing(tokenized_text):
    """Return a list holding the WordNet lemma of each token in *tokenized_text*."""
    return list(map(wn.lemmatize, tokenized_text))


# Lemmatize every token list produced by the stop-word removal step.
data['body_text_lemmatized'] = data['body_text_nostopwords'].apply(lemmatizing)
data.head()

Unnamed: 0,lable,body_text,body_text_nostopwords,body_text_lemmatized
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]"
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goes, usf, lives, around, though]","[nah, dont, think, go, usf, life, around, though]"
