# Text preprocessing using NLTK

## I. Text tokenization

https://www.nltk.org/api/nltk.tokenize.html


### I.1. Sentence tokenization

In [1]:
from nltk.tokenize import sent_tokenize

# English sample with abbreviations ("Mr.", "U.S.") whose periods are not
# sentence boundaries.
text = 'This is a text written by Mr. Aries. It uses U.S. english to illustrate sentence tokenization.'
# 'english' is the default language; it is spelled out to mirror the French example.
sents = sent_tokenize(text, language='english')
sents

['This is a text written by Mr. Aries.',
 'It uses U.S. english to illustrate sentence tokenization.']

In [2]:
# French sample; the abbreviation "M." must not be treated as a sentence end.
fr_text = "Ce texte est écrit par M. Aries. Il a comme but d'illustrer la segmentation d'un texte en français."
# Sentence segmentation with the language set to French.
fr_sents = sent_tokenize(
    fr_text,
    language='french',
)
fr_sents

['Ce texte est écrit par M. Aries.',
 "Il a comme but d'illustrer la segmentation d'un texte en français."]

### I.2. Word tokenization

In [3]:
from nltk.tokenize import word_tokenize

# Double quotes avoid having to escape the apostrophe in "word's".
text = "This is a text written by Mr. Aries. It uses U.S. english to illustrate word's tokenization."
# 'english' is the default language; it is spelled out to mirror the French example.
words = word_tokenize(text, language='english')
words

['This',
 'is',
 'a',
 'text',
 'written',
 'by',
 'Mr.',
 'Aries',
 '.',
 'It',
 'uses',
 'U.S.',
 'english',
 'to',
 'illustrate',
 'word',
 "'s",
 'tokenization',
 '.']

In [4]:
# Same French sample as in the sentence tokenization example.
fr_text = "Ce texte est écrit par M. Aries. Il a comme but d'illustrer la segmentation d'un texte en français."
# Word tokenization with the language set to French.
fr_words = word_tokenize(
    fr_text,
    language='french',
)
fr_words

['Ce',
 'texte',
 'est',
 'écrit',
 'par',
 'M.',
 'Aries',
 '.',
 'Il',
 'a',
 'comme',
 'but',
 "d'illustrer",
 'la',
 'segmentation',
 "d'un",
 'texte',
 'en',
 'français',
 '.']

In [5]:
from nltk.tokenize import TreebankWordTokenizer

# The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.
# The sample spans several lines and sentences; it is tokenized in one call.
s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks."
treebank = TreebankWordTokenizer()
tokens = treebank.tokenize(s)
tokens

['Good',
 'muffins',
 'cost',
 '$',
 '3.88',
 'in',
 'New',
 'York.',
 'Please',
 'buy',
 'me',
 'two',
 'of',
 'them.',
 'Thanks',
 '.']

### I.3. Other forms of tokenization

In [6]:
from nltk.tokenize import RegexpTokenizer

s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
# Token pattern, alternatives tried left to right:
#   \w+         a run of word characters
#   \$[\d\.]+   a dollar sign followed by digits and dots (e.g. "$3.88")
#   \S+         any other run of non-whitespace characters
# Written as a raw string: "\w", "\$", "\d", "\." and "\S" are not valid
# Python string escapes, and newer Python versions warn about them in a
# normal string literal.
tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
tokens = tokenizer.tokenize(s)
tokens

['Good',
 'muffins',
 'cost',
 '$3.88',
 'in',
 'New',
 'York',
 '.',
 'Please',
 'buy',
 'me',
 'two',
 'of',
 'them',
 '.',
 'Thanks',
 '.']

In [7]:
from nltk.tokenize import regexp_tokenize

s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
# Function form of regex tokenization: the pattern is passed with each call.
# Written as a raw string: "\w", "\$", "\d", "\." and "\S" are not valid
# Python string escapes, and newer Python versions warn about them in a
# normal string literal.
tokens = regexp_tokenize(s, pattern=r'\w+|\$[\d\.]+|\S+')
tokens

['Good',
 'muffins',
 'cost',
 '$3.88',
 'in',
 'New',
 'York',
 '.',
 'Please',
 'buy',
 'me',
 'two',
 'of',
 'them',
 '.',
 'Thanks',
 '.']

In [8]:
from nltk.tokenize import SyllableTokenizer
# Split a single word into its syllables.
# NOTE(review): "SSP" presumably stands for Sonority Sequencing Principle — confirm against the NLTK docs.
SSP = SyllableTokenizer()
syllables = SSP.tokenize('justification')
syllables

['jus', 'ti', 'fi', 'ca', 'tion']

In [9]:
from nltk.tokenize import TweetTokenizer

# Default settings written out: keep case, keep repeated letters ("cooool"),
# keep @handles.
tknzr = TweetTokenizer(
    preserve_case=True,
    reduce_len=False,
    strip_handles=False,
)
# Sample with a hashtag, emoticons and ASCII arrows.
s = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
tokens = tknzr.tokenize(s)
tokens

['This',
 'is',
 'a',
 'cooool',
 '#dummysmiley',
 ':',
 ':-)',
 ':-P',
 '<3',
 'and',
 'some',
 'arrows',
 '<',
 '>',
 '->',
 '<--']

In [10]:
from nltk.tokenize import MWETokenizer

# Multi-word expressions to merge; the tokens of a match are joined with '-'.
tokenizer = MWETokenizer(
    [
        ('a', 'little'),
        ('a', 'little', 'bit'),
        ('a', 'lot'),
    ],
    separator='-',
)
# Expressions can also be registered after construction.
tokenizer.add_mwe(('in', 'spite', 'of'))
# The tokenizer receives a list of tokens, hence the str.split().
tokens = tokenizer.tokenize(
    'In a little or a little bit or a lot in spite of'.split()
)
tokens

['In', 'a-little', 'or', 'a-little-bit', 'or', 'a-lot', 'in-spite-of']

## II. Stop-word filtering

In [11]:
from nltk.corpus import stopwords

# List the file ids of the stopwords corpus (one per available language).
stopwords.fileids()

['danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'hungarian',
 'italian',
 'norwegian',
 'portuguese',
 'russian',
 'spanish',
 'swedish',
 'turkish']

In [12]:
# Load the English stop-word list.
esw = stopwords.words('english')

# Preview the first ten entries.
esw[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your']

In [13]:
# Tokens to filter (the regex-tokenized muffin sample).
words = [
    'Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.',
    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.',
]
# Build the lookup set once: membership tests on a set are O(1), whereas
# testing against the list scans it for every token.
_esw_set = set(esw)
# Tokens are lower-cased before the lookup because the stop-word entries
# are lower-case; the original casing is kept in the result.
filtered = [w for w in words if w.lower() not in _esw_set]

filtered

['Good',
 'muffins',
 'cost',
 '$3.88',
 'New',
 'York',
 '.',
 'Please',
 'buy',
 'two',
 '.',
 'Thanks',
 '.']

## III. Stemming

https://www.nltk.org/api/nltk.stem.html


In [14]:
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer

# One instance of each stemmer, so both can be run on the same word.
lstemmer, pstemmer = LancasterStemmer(), PorterStemmer()

word = 'electricity'

# Stem the word with both algorithms and show the results side by side.
lstem, pstem = lstemmer.stem(word), pstemmer.stem(word)

lstem, pstem

('elect', 'electr')

In [15]:
from nltk.stem.isri import ISRIStemmer
# Stem an Arabic word with the ISRI stemmer.
stemmer = ISRIStemmer()
word = 'أتكلمونني'
stem = stemmer.stem(word)
stem

'كلمو'

In [16]:
from nltk.stem import snowball
# Inspect the module's names; the per-language stemmers show up as "<Language>Stemmer".
dir(snowball)

['ArabicStemmer',
 'DanishStemmer',
 'DutchStemmer',
 'EnglishStemmer',
 'FinnishStemmer',
 'FrenchStemmer',
 'GermanStemmer',
 'HungarianStemmer',
 'ItalianStemmer',
 'NorwegianStemmer',
 'PorterStemmer',
 'PortugueseStemmer',
 'RomanianStemmer',
 'RussianStemmer',
 'SnowballStemmer',
 'SpanishStemmer',
 'StemmerI',
 'SwedishStemmer',
 '_LanguageSpecificStemmer',
 '_ScandinavianStemmer',
 '_StandardStemmer',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'demo',
 'porter',
 'prefix_replace',
 're',
 'stopwords',
 'suffix_replace']

In [17]:
from nltk.stem.snowball import ArabicStemmer
# Same Arabic word as in the ISRI example, stemmed with the Snowball Arabic stemmer for comparison.
stemmer = ArabicStemmer()
word = 'أتكلمونني'
stem = stemmer.stem(word)
stem

'اتكلم'

In [18]:
from nltk.stem import RegexpStemmer
# Stemmer driven by a regex that matches the suffixes "ing", "s", "e" and "able" at the end of a word.
# NOTE(review): `min=4` is presumably the minimum word length to stem — confirm against the NLTK docs.
stemmer = RegexpStemmer('ing$|s$|e$|able$', min=4)
stem = stemmer.stem('cars')
stem

'car'

## IV. Lemmatization

In [19]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

# A stemmer and a lemmatizer, to contrast their output on the same word.
stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()

word = 'wolves'
# Stem and lemma of the same irregular plural, shown side by side.
stem, lemma = stemmer.stem(word), lemmatizer.lemmatize(word)

stem, lemma

('wolv', 'wolf')

## V. Distance

In [20]:
# Generalized Hamming distance (GHD)
from nltk.metrics.segmentation import ghd

# Unit insertion and deletion costs, intended to mimic the traditional Hamming distance.
# NOTE(review): with shift_cost_coeff=1.0 a boundary shifted by one position may cost
# less than the two mismatches Hamming would count — confirm before relying on this
# equivalence for other inputs.
d = ghd(
    '010010100110',
    '010110110100',
    ins_cost=1.0,
    del_cost=1.0,
    shift_cost_coeff=1.0,
)

d

3.0

In [21]:
# Levenshtein edit distance
from nltk.metrics.distance import edit_distance

# Both variants charge 2 per substitution; the second one also allows
# transpositions of characters.
d1, d2 = (
    edit_distance('intention', 'execution', substitution_cost=2),
    edit_distance('intention', 'execution', substitution_cost=2, transpositions=True),  # Damerau–Levenshtein
)

d1, d2

(8, 8)

In [22]:
from nltk.metrics.distance import jaro_similarity

# Jaro similarity between two words.
# NOTE(review): despite the variable name `d`, this is a similarity score rather than a distance.
d = jaro_similarity('amibe', 'immature')

d

0.6833333333333332