## Stop Words:

In [1]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
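# NLTK provides stop-word lists for 22 languages (the exact number depends on
# the installed corpus version; see stopwords.fileids()). Load the English list.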
stopw = stopwords.words('english')

In [3]:
text = 'epic adventure in time, space and life.'

In [4]:
# Split on whitespace only and drop the tokens found in the stop-word list.
# The comparison is case-sensitive and punctuation stays attached to its
# word, so 'time,' and 'life.' are kept exactly as they appear in the text.
cleanwords = [
    term
    for term in text.split()
    if term not in stopw
]
cleanwords

['epic', 'adventure', 'time,', 'space', 'life.']

In [5]:
text1 = 'this is just a programm'

In [6]:
# Whitespace-split text1 and keep only the tokens that are not in stopw.
cleanwords1 = list(filter(lambda term: term not in stopw, text1.split()))
cleanwords1

['programm']

In [7]:
def remove_stopwords(text: str, lang: str = 'english') -> str:
    """Return *text* with the stop words of language *lang* removed.

    The text is tokenized with ``word_tokenize`` and every token is compared
    case-insensitively (through ``str.lower``) against the stop-word list that
    ``stopwords.words(lang)`` returns. Tokens that are not in that list keep
    their original casing and are joined with single spaces, so the spacing
    of the input is not preserved.

    Args:
        text: The string to filter.
        lang: Name of the stop-word list handed to ``stopwords.words``.

    Returns:
        The surviving tokens joined by a single space.
    """
    words = word_tokenize(text)
    # Build the set once: each membership test is then O(1) instead of a
    # linear scan over the whole stop-word list for every token.
    stopw = set(stopwords.words(lang))
    return " ".join(word for word in words if word.lower() not in stopw)

In [8]:
print(remove_stopwords('This a great course of NLP with Python and NLTK'))

great course NLP Python NLTK


## Removing rare words

In [9]:
msg = 'One of the basic rules of the universe is that nothing is perfect. Perfection simply doesnt exist, without imperfection, neither you nor I would exist. Sthepen Hawking.'

In [10]:
msg = msg.lower()
msg

'one of the basic rules of the universe is that nothing is perfect. perfection simply doesnt exist, without imperfection, neither you nor i would exist. sthepen hawking.'

In [11]:
token = word_tokenize(msg)
token

['one',
 'of',
 'the',
 'basic',
 'rules',
 'of',
 'the',
 'universe',
 'is',
 'that',
 'nothing',
 'is',
 'perfect',
 '.',
 'perfection',
 'simply',
 'doesnt',
 'exist',
 ',',
 'without',
 'imperfection',
 ',',
 'neither',
 'you',
 'nor',
 'i',
 'would',
 'exist',
 '.',
 'sthepen',
 'hawking',
 '.']

In [12]:
from nltk import FreqDist

In [13]:
freqd = FreqDist(token)

In [14]:
freqd.items()

dict_items([('one', 1), ('of', 2), ('the', 2), ('basic', 1), ('rules', 1), ('universe', 1), ('is', 2), ('that', 1), ('nothing', 1), ('perfect', 1), ('.', 3), ('perfection', 1), ('simply', 1), ('doesnt', 1), ('exist', 2), (',', 2), ('without', 1), ('imperfection', 1), ('neither', 1), ('you', 1), ('nor', 1), ('i', 1), ('would', 1), ('sthepen', 1), ('hawking', 1)])

In [15]:
import operator

# Order the (word, count) pairs by ascending count. The sort is stable, so
# pairs with equal counts keep the order in which freqd.items() yields them.
sorted_freq = list(freqd.items())
sorted_freq.sort(key=operator.itemgetter(1))
sorted_freq

[('one', 1),
 ('basic', 1),
 ('rules', 1),
 ('universe', 1),
 ('that', 1),
 ('nothing', 1),
 ('perfect', 1),
 ('perfection', 1),
 ('simply', 1),
 ('doesnt', 1),
 ('without', 1),
 ('imperfection', 1),
 ('neither', 1),
 ('you', 1),
 ('nor', 1),
 ('i', 1),
 ('would', 1),
 ('sthepen', 1),
 ('hawking', 1),
 ('of', 2),
 ('the', 2),
 ('is', 2),
 ('exist', 2),
 (',', 2),
 ('.', 3)]

In [16]:
# Take the first 10 (word, count) pairs of the ascending list as the "rare" words.
# NOTE: more than 10 words share the lowest count in this example, so which of
# the tied words make the cut is decided purely by their position in sorted_freq.
rarewords = sorted_freq[:10]
rarewords

[('one', 1),
 ('basic', 1),
 ('rules', 1),
 ('universe', 1),
 ('that', 1),
 ('nothing', 1),
 ('perfect', 1),
 ('perfection', 1),
 ('simply', 1),
 ('doesnt', 1)]

In [18]:
# Keep only the word from each (word, count) pair selected as rare.
rare = [word for word, _count in rarewords]
print(rare)

['one', 'basic', 'rules', 'universe', 'that', 'nothing', 'perfect', 'perfection', 'simply', 'doesnt']


In [19]:
# Drop every token that appears in the rare list; all remaining tokens
# (punctuation and repeated words included) keep their original order.
finalwords = [
    tok
    for tok in token
    if tok not in rare
]
finalwords

['of',
 'the',
 'of',
 'the',
 'is',
 'is',
 '.',
 'exist',
 ',',
 'without',
 'imperfection',
 ',',
 'neither',
 'you',
 'nor',
 'i',
 'would',
 'exist',
 '.',
 'sthepen',
 'hawking',
 '.']

## Spell correction

In [20]:
from nltk.metrics import edit_distance

In [21]:
#Calculate the Levenshtein edit-distance between two strings.
#The edit distance is the number of characters that need to be
# substituted, inserted, or deleted, to transform s1 into s2
edit_distance("rain", "shine")

3

In [22]:
# transforming "rain" to "shine" requires three steps,
# consisting of two substitutions and one insertion:
# "rain" -> "sain" -> "shin" -> "shine".  These operations could have
# been done in other orders, but at least three steps are needed.

## Reference:

http://norvig.com/spell-correct.html