# Natural Language Processing

## Removing Stopwords

In [None]:
#Import stopwords from nltk module
from nltk.corpus import stopwords

In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
#Get the list of stopwords from the english language
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [None]:
sample_text = "the great aim of education is not knowledge but action"

In [None]:
#Split the given sample text using split function
sample_words = sample_text.split()
print(sample_words)

['the', 'great', 'aim', 'of', 'education', 'is', 'not', 'knowledge', 'but', 'action']


In [None]:
#Remove the stopwords
# Build the stopword set once: calling stopwords.words('english') inside the
# comprehension would re-read the whole list for every word being tested.
english_stopwords = set(stopwords.words('english'))
sample_words = [word for word in sample_words if word not in english_stopwords]
print(sample_words)

['great', 'aim', 'education', 'knowledge', 'action']


In [None]:
#Again join back the words using space
sample_text = " ".join(sample_words)
print(sample_text)

great aim education knowledge action


## Tokenisation

The notebook contains four types of tokenisation techniques:
1. Word tokenisation
2. Sentence tokenisation
3. Tweet tokenisation
4. Custom tokenisation using regular expressions

### 1. Word tokenisation

In [None]:
document = "At nine o'clock I visited him myself. It looks like religious mania, and he'll soon think that he himself is God."
print(document)

At nine o'clock I visited him myself. It looks like religious mania, and he'll soon think that he himself is God.


Tokenising on spaces using python

In [None]:
print(document.split())

['At', 'nine', "o'clock", 'I', 'visited', 'him', 'myself.', 'It', 'looks', 'like', 'religious', 'mania,', 'and', "he'll", 'soon', 'think', 'that', 'he', 'himself', 'is', 'God.']


Tokenising using nltk word tokeniser

In [None]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from nltk.tokenize import word_tokenize
words = word_tokenize(document)

In [None]:
print(words)

['At', 'nine', "o'clock", 'I', 'visited', 'him', 'myself', '.', 'It', 'looks', 'like', 'religious', 'mania', ',', 'and', 'he', "'ll", 'soon', 'think', 'that', 'he', 'himself', 'is', 'God', '.']


NLTK's word tokeniser not only breaks on whitespace but also splits contractions such as he'll into "he" and "'ll". On the other hand, it doesn't break "o'clock" and treats it as a single token.

### 2. Sentence tokeniser

Tokenising based on sentence requires you to split on the period ('.'). Let's use nltk sentence tokeniser.

In [None]:
from nltk.tokenize import sent_tokenize
sentences = sent_tokenize(document)

In [None]:
print(sentences)

["At nine o'clock I visited him myself.", "It looks like religious mania, and he'll soon think that he himself is God."]


### 3. Tweet tokeniser

A problem with the word tokeniser is that it fails to tokenise emojis and other complex special characters, such as words with hashtags. Emojis are common these days and people use them all the time.

In [None]:
message = "i recently watched this show called mindhunters:). i totally loved it 😍. it was gr8 <3. #bingewatching #nothingtodo 😎"

In [None]:
print(word_tokenize(message))

['i', 'recently', 'watched', 'this', 'show', 'called', 'mindhunters', ':', ')', '.', 'i', 'totally', 'loved', 'it', '😍', '.', 'it', 'was', 'gr8', '<', '3', '.', '#', 'bingewatching', '#', 'nothingtodo', '😎']


The word tokeniser breaks the emoticon '<3' into '<' and '3', which is something that we don't want. Emojis have their own significance in areas like sentiment analysis, where a happy face and a sad face can alone prove to be a really good predictor of the sentiment. Similarly, the hashtags are broken into two tokens. A hashtag is used for searching specific topics or photos in social media apps such as Instagram and Facebook. So there, you want to use the hashtag as is.

Let's use the tweet tokeniser of nltk to tokenise this message.

In [None]:
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()

In [None]:
tknzr.tokenize(message)

['i',
 'recently',
 'watched',
 'this',
 'show',
 'called',
 'mindhunters',
 ':)',
 '.',
 'i',
 'totally',
 'loved',
 'it',
 '😍',
 '.',
 'it',
 'was',
 'gr8',
 '<3',
 '.',
 '#bingewatching',
 '#nothingtodo',
 '😎']

As you can see, it handles all the emojis and the hashtags pretty well.

Now, there is a tokeniser that takes a regular expression and tokenises and returns result based on the pattern of regular expression.

Let's look at how you can use regular expression tokeniser.

In [None]:
from nltk.tokenize import regexp_tokenize
message = "i recently watched this show called mindhunters:). i totally loved it 😍. it was gr8 <3. #bingewatching #nothingtodo 😎"
# Raw string: "\w" is not a valid Python string escape, so a plain literal triggers an
# invalid-escape warning on recent Python versions. The pattern value itself is unchanged:
# a '#' followed by one or more word characters, i.e. a hashtag.
pattern = r"#[\w]+"

In [None]:
regexp_tokenize(message, pattern)

['#bingewatching', '#nothingtodo']

## Bag of words Model

In [None]:
# load all necessary libraries
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

pd.set_option('max_colwidth', 100)

Build a basic bag of words model on three sample documents

In [None]:
documents = ["Gangs of Wasseypur is a great movie.", "The success of a movie depends on the performance of the actors.", "There are no new movies releasing this week."]
print(documents)

['Gangs of Wasseypur is a great movie.', 'The success of a movie depends on the performance of the actors.', 'There are no new movies releasing this week.']


In [None]:
def preprocess(document):
    """Lower-case a document, tokenise it, drop English stopwords and re-join the tokens.

    Parameters
    ----------
    document : str
        Raw text of a single document.

    Returns
    -------
    str
        The remaining tokens joined by single spaces. Punctuation tokens produced by
        the tokeniser are not stopwords, so they are kept (e.g. a trailing ' .').
    """
    # Build the stopword set once per call; looking the list up inside the comprehension
    # would re-read it for every token.
    stop_words = set(stopwords.words("english"))

    # change sentence to lower case and tokenize into words
    words = word_tokenize(document.lower())

    # remove stop words
    kept_words = [word for word in words if word not in stop_words]

    # join words to make sentence
    return " ".join(kept_words)

documents = [preprocess(document) for document in documents]
print(documents)


['gangs wasseypur great movie .', 'success movie depends performance actors .', 'new movies releasing week .']


Creating bag of words model using count vectorizer function

In [None]:
# fit the count vectorizer on the preprocessed documents
vectorizer = CountVectorizer()
bow_model = vectorizer.fit_transform(documents)
print(bow_model)  # prints the (row, column) position of every non-zero cell together with its count

  (0, 2)	1
  (0, 10)	1
  (0, 3)	1
  (0, 4)	1
  (1, 4)	1
  (1, 9)	1
  (1, 1)	1
  (1, 7)	1
  (1, 0)	1
  (2, 6)	1
  (2, 5)	1
  (2, 8)	1
  (2, 11)	1


In [None]:
# print the full sparse matrix
print(bow_model.toarray())

[[0 0 1 1 1 0 0 0 0 0 1 0]
 [1 1 0 0 1 0 0 1 0 1 0 0]
 [0 0 0 0 0 1 1 0 1 0 0 1]]


In [None]:
print(bow_model.shape)
print(vectorizer.get_feature_names_out())

(3, 12)
['actors' 'depends' 'gangs' 'great' 'movie' 'movies' 'new' 'performance'
 'releasing' 'success' 'wasseypur' 'week']


### Let's create a bag of words model on the spam dataset.

In [None]:
# load data
spam = pd.read_csv("SMSSpamCollection.txt", sep = "\t", names=["label", "message"])
spam.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


##### Let's take a subset of the data (first 50 rows only) and create a bag of words model on that.

In [None]:
spam = spam.iloc[0:50,:]
print(spam)

   label  \
0    ham   
1    ham   
2   spam   
3    ham   
4    ham   
5   spam   
6    ham   
7    ham   
8   spam   
9   spam   
10   ham   
11  spam   
12  spam   
13   ham   
14   ham   
15  spam   
16   ham   
17   ham   
18   ham   
19  spam   
20   ham   
21   ham   
22   ham   
23   ham   
24   ham   
25   ham   
26   ham   
27   ham   
28   ham   
29   ham   
30   ham   
31   ham   
32   ham   
33   ham   
34  spam   
35   ham   
36   ham   
37   ham   
38   ham   
39   ham   
40   ham   
41   ham   
42  spam   
43   ham   
44   ham   
45   ham   
46   ham   
47   ham   
48   ham   
49   ham   

                                                                                                message  
0   Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...  
1                                                                         Ok lar... Joking wif u oni...  
2   Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 200

In [None]:
# extract the messages from the dataframe
messages = spam.message
print(messages)

0     Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...
1                                                                           Ok lar... Joking wif u oni...
2     Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3                                                       U dun say so early hor... U c already then say...
4                                           Nah I don't think he goes to usf, he lives around here though
5     FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for ...
6                           Even my brother is not like to speak with me. They treat me like aids patent.
7     As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...
8     WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To c...
9     Had your mobile 11 months or more? U R e

In [None]:
# convert messages into list
messages = [message for message in messages]
print(messages)

['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', 'Ok lar... Joking wif u oni...', "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", 'U dun say so early hor... U c already then say...', "Nah I don't think he goes to usf, he lives around here though", "FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv", 'Even my brother is not like to speak with me. They treat me like aids patent.', "As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune", 'WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.', 'Had your mobile 

In [None]:
# preprocess messages using the preprocess function
messages = [preprocess(message) for message in messages]
print(messages)

['go jurong point , crazy .. available bugis n great world la e buffet ... cine got amore wat ...', 'ok lar ... joking wif u oni ...', "free entry 2 wkly comp win fa cup final tkts 21st may 2005. text fa 87121 receive entry question ( std txt rate ) & c 's apply 08452810075over18 's", 'u dun say early hor ... u c already say ...', "nah n't think goes usf , lives around though", "freemsg hey darling 's 3 week 's word back ! 'd like fun still ? tb ok ! xxx std chgs send , £1.50 rcv", 'even brother like speak . treat like aids patent .', "per request 'melle melle ( oru minnaminunginte nurungu vettam ) ' set callertune callers . press * 9 copy friends callertune", 'winner ! ! valued network customer selected receivea £900 prize reward ! claim call 09061701461. claim code kl341 . valid 12 hours .', 'mobile 11 months ? u r entitled update latest colour mobiles camera free ! call mobile update co free 08002986030', "'m gon na home soon n't want talk stuff anymore tonight , k ? 've cried enoug

In [None]:
# bag of words model
vectorizer = CountVectorizer()
bow_model = vectorizer.fit_transform(messages)

In [None]:
# look at the dataframe
pd.DataFrame(bow_model.toarray(), columns = vectorizer.get_feature_names_out())

Unnamed: 0,000,07732584351,08000930705,08002986030,08452810075over18,09061701461,100,11,12,150p,...,worried,www,xuhui,xxx,xxxmobilemovieclub,yeah,yes,yummy,yup,ú1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
print(vectorizer.get_feature_names_out())

['000' '07732584351' '08000930705' '08002986030' '08452810075over18'
 '09061701461' '100' '11' '12' '150p' '16' '20' '2005' '21st' '2nd'
 '4403ldnw1a7rw18' '4txt' '50' '6days' '81010' '87077' '87121' '87575'
 '8am' '900' 'abiola' 'actin' 'aft' 'ahead' 'ahhh' 'aids' 'already'
 'alright' 'always' 'amore' 'amp' 'anymore' 'anything' 'apologetic'
 'apply' 'arabian' 'ard' 'around' 'ask' 'available' 'back' 'badly' 'bit'
 'blessing' 'breather' 'brother' 'buffet' 'bugis' 'burns' 'bus' 'ca'
 'call' 'callers' 'callertune' 'calls' 'camcorder' 'camera' 'car' 'cash'
 'catch' 'caught' 'chances' 'charged' 'cheers' 'chgs' 'child' 'cine'
 'claim' 'clear' 'click' 'co' 'code' 'colour' 'com' 'comin' 'comp'
 'confirm' 'convincing' 'copy' 'cost' 'could' 'crave' 'crazy' 'credit'
 'cried' 'csh11' 'cup' 'cuppa' 'customer' 'da' 'darling' 'date' 'day'
 'dbuk' 'decide' 'decided' 'delivery' 'dinner' 'done' 'dont' 'dun' 'early'
 'eat' 'eating' 'eg' 'egg' 'eh' 'endowed' 'england' 'enough' 'entitled'
 'entry' 'even' '

* A lot of near-duplicate tokens (different forms of the same word) such as 'win' and 'winner'; 'reply' and 'replying'; 'want' and 'wanted' etc.

## Stemming

In [None]:
# import libraries
import pandas as pd
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

In [None]:
text = "Very orderly and methodical he looked, with a hand on each knee, and a loud watch ticking a sonorous sermon under his flapped newly bought waist-coat, as though it pitted its gravity and longevity against the levity and evanescence of the brisk fire."
print(text)

Very orderly and methodical he looked, with a hand on each knee, and a loud watch ticking a sonorous sermon under his flapped newly bought waist-coat, as though it pitted its gravity and longevity against the levity and evanescence of the brisk fire.


In [None]:
tokens = text.lower()
print(tokens)

very orderly and methodical he looked, with a hand on each knee, and a loud watch ticking a sonorous sermon under his flapped newly bought waist-coat, as though it pitted its gravity and longevity against the levity and evanescence of the brisk fire.


In [None]:
tokens = word_tokenize(tokens)
print(tokens)

['very', 'orderly', 'and', 'methodical', 'he', 'looked', ',', 'with', 'a', 'hand', 'on', 'each', 'knee', ',', 'and', 'a', 'loud', 'watch', 'ticking', 'a', 'sonorous', 'sermon', 'under', 'his', 'flapped', 'newly', 'bought', 'waist-coat', ',', 'as', 'though', 'it', 'pitted', 'its', 'gravity', 'and', 'longevity', 'against', 'the', 'levity', 'and', 'evanescence', 'of', 'the', 'brisk', 'fire', '.']


In [None]:
stemmer = PorterStemmer()
porter_stemmed = [stemmer.stem(token) for token in tokens]
print(porter_stemmed)
len(porter_stemmed)

['veri', 'orderli', 'and', 'method', 'he', 'look', ',', 'with', 'a', 'hand', 'on', 'each', 'knee', ',', 'and', 'a', 'loud', 'watch', 'tick', 'a', 'sonor', 'sermon', 'under', 'hi', 'flap', 'newli', 'bought', 'waist-coat', ',', 'as', 'though', 'it', 'pit', 'it', 'graviti', 'and', 'longev', 'against', 'the', 'leviti', 'and', 'evanesc', 'of', 'the', 'brisk', 'fire', '.']


47

In [None]:
# snowball stemmer
stemmer = SnowballStemmer("english")
snowball_stemmed = [stemmer.stem(token) for token in tokens]
print(snowball_stemmed)
len(snowball_stemmed)

['veri', 'order', 'and', 'method', 'he', 'look', ',', 'with', 'a', 'hand', 'on', 'each', 'knee', ',', 'and', 'a', 'loud', 'watch', 'tick', 'a', 'sonor', 'sermon', 'under', 'his', 'flap', 'newli', 'bought', 'waist-coat', ',', 'as', 'though', 'it', 'pit', 'it', 'graviti', 'and', 'longev', 'against', 'the', 'leviti', 'and', 'evanesc', 'of', 'the', 'brisk', 'fire', '.']


47

In [None]:
df = pd.DataFrame({'token': tokens, 'porter_stemmed': porter_stemmed, 'snowball_stemmed': snowball_stemmed})
df = df[['token', 'porter_stemmed', 'snowball_stemmed']]

In [None]:
df[(df.token != df.porter_stemmed) | (df.token != df.snowball_stemmed)]

Unnamed: 0,token,porter_stemmed,snowball_stemmed
0,very,veri,veri
1,orderly,orderli,order
3,methodical,method,method
5,looked,look,look
18,ticking,tick,tick
20,sonorous,sonor,sonor
23,his,hi,his
24,flapped,flap,flap
25,newly,newli,newli
32,pitted,pit,pit


In [None]:
len(df[(df.token != df.porter_stemmed) | (df.token != df.snowball_stemmed)])

15

In [None]:
df[(df.token == df.porter_stemmed) | (df.token == df.snowball_stemmed)]

Unnamed: 0,token,porter_stemmed,snowball_stemmed
2,and,and,and
4,he,he,he
6,",",",",","
7,with,with,with
8,a,a,a
9,hand,hand,hand
10,on,on,on
11,each,each,each
12,knee,knee,knee
13,",",",",","


In [None]:
len(df[(df.token == df.porter_stemmed) | (df.token == df.snowball_stemmed)])

33

## Lemmatization

In [None]:
### import necessary libraries
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [None]:
text = "Very orderly and methodical he looked, with a hand on each knee, and a loud watch ticking a sonorous sermon under his flapped newly bought waist-coat, as though it pitted its gravity and longevity against the levity and evanescence of the brisk fire."
print(text)

Very orderly and methodical he looked, with a hand on each knee, and a loud watch ticking a sonorous sermon under his flapped newly bought waist-coat, as though it pitted its gravity and longevity against the levity and evanescence of the brisk fire.


In [None]:
# tokenise text
tokens = word_tokenize(text)

In [None]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
wordnet_lemmatizer = WordNetLemmatizer()
lemmatized = [wordnet_lemmatizer.lemmatize(token) for token in tokens]
print(lemmatized)

['Very', 'orderly', 'and', 'methodical', 'he', 'looked', ',', 'with', 'a', 'hand', 'on', 'each', 'knee', ',', 'and', 'a', 'loud', 'watch', 'ticking', 'a', 'sonorous', 'sermon', 'under', 'his', 'flapped', 'newly', 'bought', 'waist-coat', ',', 'a', 'though', 'it', 'pitted', 'it', 'gravity', 'and', 'longevity', 'against', 'the', 'levity', 'and', 'evanescence', 'of', 'the', 'brisk', 'fire', '.']


### Let's compare stemming and lemmatization

In [None]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
stemmed = [stemmer.stem(token) for token in tokens]
print(stemmed)

['veri', 'orderli', 'and', 'method', 'he', 'look', ',', 'with', 'a', 'hand', 'on', 'each', 'knee', ',', 'and', 'a', 'loud', 'watch', 'tick', 'a', 'sonor', 'sermon', 'under', 'hi', 'flap', 'newli', 'bought', 'waist-coat', ',', 'as', 'though', 'it', 'pit', 'it', 'graviti', 'and', 'longev', 'against', 'the', 'leviti', 'and', 'evanesc', 'of', 'the', 'brisk', 'fire', '.']


In [None]:
import pandas as pd
df = pd.DataFrame(data={'token': tokens, 'stemmed': stemmed, 'lemmatized': lemmatized})
df = df[['token', 'stemmed', 'lemmatized']]
df[(df.token != df.stemmed) | (df.token != df.lemmatized)]

Unnamed: 0,token,stemmed,lemmatized
0,Very,veri,Very
1,orderly,orderli,orderly
3,methodical,method,methodical
5,looked,look,looked
18,ticking,tick,ticking
20,sonorous,sonor,sonorous
23,his,hi,his
24,flapped,flap,flapped
25,newly,newli,newly
29,as,as,a


* The lemmatiser leaves almost every token unchanged in this case because the nltk lemmatiser also takes another argument, the part-of-speech (POS) tag of the input word, and no tag was passed here.
* The default part-of-speech tag is 'noun'.
* You will learn more about part-of-speech tagging later in this course.
* Right now, the stemmer will have more accuracy than the lemmatiser because each word is lemmatised assuming it's a noun. To lemmatise effectively, you need to pass its POS tag manually.

## TF-IDF model

In [None]:
# load all necessary libraries
from sklearn.feature_extraction.text import TfidfVectorizer

pd.set_option('max_colwidth', 100)

Let's build a basic tf-idf model on three sample documents

In [None]:
documents = ["Gangs of Wasseypur is a great movie. Wasseypur is a town in Bihar.", "The success of a song depends on the music.", "There is a new movie releasing this week. The movie is fun to watch."]
print(documents)

['Gangs of Wasseypur is a great movie. Wasseypur is a town in Bihar.', 'The success of a song depends on the music.', 'There is a new movie releasing this week. The movie is fun to watch.']


In [None]:
documents = ["Vapour, Bangalore has a really great terrace seating and an awesome view of the Bangalore skyline",
             "The beer at Vapour, Bangalore was amazing. My favorites are the wheat beer and the ale beer.",
             "Vapour, Bangalore has the best view in Bangalore."]
print(documents)

['Vapour, Bangalore has a really great terrace seating and an awesome view of the Bangalore skyline', 'The beer at Vapour, Bangalore was amazing. My favorites are the wheat beer and the ale beer.', 'Vapour, Bangalore has the best view in Bangalore.']


In [None]:
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

# preprocess with an optional stemming step (no lemmatisation is applied here)
def preprocess(document, stem=False):
    """Lower-case a document, tokenise it, drop English stopwords and re-join the tokens.

    Parameters
    ----------
    document : str
        Raw text of a single document.
    stem : bool, default False
        When True, every remaining token is passed through the notebook-level
        ``stemmer`` before joining. The default keeps the tokens unstemmed, which is
        what this function did while its stemming line was commented out.

    Returns
    -------
    str
        The remaining tokens joined by single spaces. Punctuation tokens produced by
        the tokeniser are not stopwords, so they are kept.
    """
    # Build the stopword set once per call; looking the list up inside the comprehension
    # would re-read it for every token.
    stop_words = set(stopwords.words("english"))

    # change sentence to lower case and tokenize into words
    words = word_tokenize(document.lower())

    # remove stop words
    words = [word for word in words if word not in stop_words]

    # stem (opt-in)
    if stem:
        words = [stemmer.stem(word) for word in words]

    # join words to make sentence
    return " ".join(words)

In [None]:
documents = [preprocess(document) for document in documents]
print(documents)

['vapour , bangalore really great terrace seating awesome view bangalore skyline', 'beer vapour , bangalore amazing . favorites wheat beer ale beer .', 'vapour , bangalore best view bangalore .']


Creating tf-idf model using the TfidfVectorizer class

In [None]:
# fit the tf-idf vectorizer on the preprocessed documents
vectorizer = TfidfVectorizer()
tfidf_model = vectorizer.fit_transform(documents)
print(tfidf_model)  # prints the (row, column) position of every non-zero cell together with its tf-idf weight

  (0, 10)	0.34663478992044555
  (0, 13)	0.2636246924033099
  (0, 2)	0.34663478992044555
  (0, 9)	0.34663478992044555
  (0, 11)	0.34663478992044555
  (0, 7)	0.34663478992044555
  (0, 8)	0.34663478992044555
  (0, 3)	0.40945618183743365
  (0, 12)	0.20472809091871683
  (1, 0)	0.2701947410011521
  (1, 14)	0.2701947410011521
  (1, 6)	0.2701947410011521
  (1, 1)	0.2701947410011521
  (1, 4)	0.8105842230034562
  (1, 3)	0.15958136664279549
  (1, 12)	0.15958136664279549
  (2, 5)	0.5486117771118656
  (2, 13)	0.4172333972107692
  (2, 3)	0.6480379064629606
  (2, 12)	0.3240189532314803


In [None]:
# print the full sparse matrix
print(tfidf_model.toarray())

[[0.         0.         0.34663479 0.40945618 0.         0.
  0.         0.34663479 0.34663479 0.34663479 0.34663479 0.34663479
  0.20472809 0.26362469 0.        ]
 [0.27019474 0.27019474 0.         0.15958137 0.81058422 0.
  0.27019474 0.         0.         0.         0.         0.
  0.15958137 0.         0.27019474]
 [0.         0.         0.         0.64803791 0.         0.54861178
  0.         0.         0.         0.         0.         0.
  0.32401895 0.4172334  0.        ]]


In [None]:
pd.DataFrame(tfidf_model.toarray(), columns = vectorizer.get_feature_names_out())

Unnamed: 0,ale,amazing,awesome,bangalore,beer,best,favorites,great,really,seating,skyline,terrace,vapour,view,wheat
0,0.0,0.0,0.346635,0.409456,0.0,0.0,0.0,0.346635,0.346635,0.346635,0.346635,0.346635,0.204728,0.263625,0.0
1,0.270195,0.270195,0.0,0.159581,0.810584,0.0,0.270195,0.0,0.0,0.0,0.0,0.0,0.159581,0.0,0.270195
2,0.0,0.0,0.0,0.648038,0.0,0.548612,0.0,0.0,0.0,0.0,0.0,0.0,0.324019,0.417233,0.0


Let's create a tf-idf model on the spam dataset.

In [None]:
# load data
spam = pd.read_csv("SMSSpamCollection.txt", sep = "\t", names=["label", "message"])
spam.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


**Let's take a subset of the data (first 50 rows only) and create a tf-idf model on that.**

In [None]:
spam = spam.iloc[0:50,:]
print(spam)

   label  \
0    ham   
1    ham   
2   spam   
3    ham   
4    ham   
5   spam   
6    ham   
7    ham   
8   spam   
9   spam   
10   ham   
11  spam   
12  spam   
13   ham   
14   ham   
15  spam   
16   ham   
17   ham   
18   ham   
19  spam   
20   ham   
21   ham   
22   ham   
23   ham   
24   ham   
25   ham   
26   ham   
27   ham   
28   ham   
29   ham   
30   ham   
31   ham   
32   ham   
33   ham   
34  spam   
35   ham   
36   ham   
37   ham   
38   ham   
39   ham   
40   ham   
41   ham   
42  spam   
43   ham   
44   ham   
45   ham   
46   ham   
47   ham   
48   ham   
49   ham   

                                                                                                message  
0   Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...  
1                                                                         Ok lar... Joking wif u oni...  
2   Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 200

In [None]:
# extract the messages from the dataframe
messages = [message for message in spam.message]
print(messages)

['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', 'Ok lar... Joking wif u oni...', "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", 'U dun say so early hor... U c already then say...', "Nah I don't think he goes to usf, he lives around here though", "FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv", 'Even my brother is not like to speak with me. They treat me like aids patent.', "As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune", 'WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.', 'Had your mobile 

In [None]:
# preprocess messages using the preprocess function
messages = [preprocess(message) for message in messages]
print(messages)

['go jurong point , crazy .. available bugis n great world la e buffet ... cine got amore wat ...', 'ok lar ... joking wif u oni ...', "free entry 2 wkly comp win fa cup final tkts 21st may 2005. text fa 87121 receive entry question ( std txt rate ) & c 's apply 08452810075over18 's", 'u dun say early hor ... u c already say ...', "nah n't think goes usf , lives around though", "freemsg hey darling 's 3 week 's word back ! 'd like fun still ? tb ok ! xxx std chgs send , £1.50 rcv", 'even brother like speak . treat like aids patent .', "per request 'melle melle ( oru minnaminunginte nurungu vettam ) ' set callertune callers . press * 9 copy friends callertune", 'winner ! ! valued network customer selected receivea £900 prize reward ! claim call 09061701461. claim code kl341 . valid 12 hours .', 'mobile 11 months ? u r entitled update latest colour mobiles camera free ! call mobile update co free 08002986030', "'m gon na home soon n't want talk stuff anymore tonight , k ? 've cried enoug

In [None]:
# tf-idf model
vectorizer = TfidfVectorizer()
tfidf_model = vectorizer.fit_transform(messages)

In [None]:
# Let's look at the dataframe
tfidf = pd.DataFrame(tfidf_model.toarray(), columns = vectorizer.get_feature_names_out())
tfidf

Unnamed: 0,000,07732584351,08000930705,08002986030,08452810075over18,09061701461,100,11,12,150p,...,worried,www,xuhui,xxx,xxxmobilemovieclub,yeah,yes,yummy,yup,ú1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.198284,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.256871,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.230701,0.0,0.0,0.230701,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.230794,0.0,0.0,0.0,0.230794,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# token names
print(vectorizer.get_feature_names_out())

['000' '07732584351' '08000930705' '08002986030' '08452810075over18'
 '09061701461' '100' '11' '12' '150p' '16' '20' '2005' '21st' '2nd'
 '4403ldnw1a7rw18' '4txt' '50' '6days' '81010' '87077' '87121' '87575'
 '8am' '900' 'abiola' 'actin' 'aft' 'ahead' 'ahhh' 'aids' 'already'
 'alright' 'always' 'amore' 'amp' 'anymore' 'anything' 'apologetic'
 'apply' 'arabian' 'ard' 'around' 'ask' 'available' 'back' 'badly' 'bit'
 'blessing' 'breather' 'brother' 'buffet' 'bugis' 'burns' 'bus' 'ca'
 'call' 'callers' 'callertune' 'calls' 'camcorder' 'camera' 'car' 'cash'
 'catch' 'caught' 'chances' 'charged' 'cheers' 'chgs' 'child' 'cine'
 'claim' 'clear' 'click' 'co' 'code' 'colour' 'com' 'comin' 'comp'
 'confirm' 'convincing' 'copy' 'cost' 'could' 'crave' 'crazy' 'credit'
 'cried' 'csh11' 'cup' 'cuppa' 'customer' 'da' 'darling' 'date' 'day'
 'dbuk' 'decide' 'decided' 'delivery' 'dinner' 'done' 'dont' 'dun' 'early'
 'eat' 'eating' 'eg' 'egg' 'eh' 'endowed' 'england' 'enough' 'entitled'
 'entry' 'even' '

## Soundex

Let's create a function which calculates the soundex of any given string

In [None]:
def get_soundex(token):
    """Get the (American) soundex code for the string.

    The code is the first character of `token` followed by up to three digits
    that encode the consonants after it, padded with '0' to four characters.

    Rules applied:
      * letters of the same group that are adjacent are coded once;
      * the first letter's own group counts, so a second letter from the same
        group is not coded again ('Pfister' -> 'P236');
      * 'H' and 'W' do not separate letters of the same group
        ('Ashcraft' -> 'A261'), whereas vowels and 'Y' do ('Tymczak' -> 'T522');
      * characters that belong to no group (digits, punctuation) are ignored.

    Args:
        token: the string to encode (case-insensitive).

    Returns:
        A 4-character soundex code, or an empty string if `token` is empty.
    """
    token = token.upper()

    # an empty string has no first letter to anchor the code on
    if not token:
        return ""

    # letter groups mapped to soundex codes. Vowels and 'H', 'W' and 'Y' are represented by '.'
    dictionary = {"BFPV": "1", "CGJKQSXZ":"2", "DT":"3", "L":"4", "MN":"5", "R":"6", "AEIOUHWY":"."}

    # flatten the groups into a per-letter lookup so each character costs one dict access
    codes = {letter: code for letters, code in dictionary.items() for letter in letters}

    # first letter of input is always the first letter of soundex
    soundex = token[0]

    # remember the code of the last significant letter, starting with the first
    # letter itself, so that repeats of the same group are collapsed
    previous = codes.get(token[0], "")

    for char in token[1:]:
        # 'H' and 'W' are transparent: they neither add a code nor break a run
        if char in "HW":
            continue

        code = codes.get(char)
        if code is None:
            # not a letter we know how to encode
            continue

        if code == ".":
            # a vowel ends the current run, so the same consonant group may be coded again
            previous = code
        elif code != previous:
            soundex += code
            previous = code

    # trim or pad to make soundex a 4-character code
    return soundex[:4].ljust(4, "0")

Let's look at the soundex codes of 'Bombay' and 'Bambai'.

In [None]:
print(get_soundex("Bombay"))
print(get_soundex("Bambai"))

B510
B510


Let's see soundex of 'Aggrawal', 'Agrawal', 'Aggarwal' and 'Agarwal'

In [None]:
print(get_soundex("Aggrawal"))
print(get_soundex("Agrawal"))
print(get_soundex("Aggarwal"))
print(get_soundex("Agarwal"))

A264
A264
A264
A264


# Edit Distance

## Levenshtein Edit Distance
The Levenshtein distance is the minimum number of single-character edits (insertions, deletions or substitutions) required to transform the source string into the target string.

In [None]:
def lev_distance(source='', target=''):
    """Return the Levenshtein distance between `source` and `target`.

    A (len(target) + 1) x (len(source) + 1) grid is filled in, where
    grid[r][c] holds the number of insertions, deletions or substitutions
    needed to turn the first c characters of `source` into the first r
    characters of `target`. The answer is the bottom-right cell.
    """
    width = len(source) + 1

    # row 0: reaching a prefix of `source` from the empty string costs its length
    grid = [list(range(width))]

    # every further row starts with its own index (cost against the empty source prefix)
    for row_index in range(1, len(target) + 1):
        grid.append([row_index] + [0] * (width - 1))

    for r, target_char in enumerate(target, start=1):
        for c, source_char in enumerate(source, start=1):
            diagonal = grid[r - 1][c - 1]

            if source_char == target_char:
                # matching letters cost nothing on top of the diagonal cell
                grid[r][c] = diagonal
            else:
                # one more edit than the cheapest neighbour
                grid[r][c] = 1 + min(grid[r - 1][c],   # cell above
                                     grid[r][c - 1],   # cell to the left
                                     diagonal)         # cell above-left

    return grid[-1][-1]

In [None]:
lev_distance('cat', 'cta')

2

## Levenshtein distance in nltk library

In [None]:
# import library
from nltk.metrics.distance import edit_distance

In [None]:
edit_distance("perspective","prospective")

2

## Damerau-Levenshtein Distance
The Damerau-Levenshtein distance additionally counts a transposition (a swap of two adjacent letters) as a single edit.

In [None]:
# transpositions=True makes nltk count a swap of two adjacent letters as one edit
# (Damerau-Levenshtein); with False this would just be the plain Levenshtein distance
edit_distance("perspective", "prospective", transpositions=True)

2

## Spell Corrector

In [None]:
import re
from collections import Counter

In [None]:
# function to tokenise words
def words(document):
    """Lower-case `document` and return its word tokens (runs of `\\w` characters) as a list."""
    lowered = document.lower()
    return [match.group() for match in re.finditer(r'\w+', lowered)]

In [None]:
# create a frequency table of all the words of the document
# (the context manager closes the file handle as soon as the text has been read)
with open('big.txt') as corpus_file:
    all_words = Counter(words(corpus_file.read()))

In [None]:
# check frequency of a random word, say, 'chair'
all_words['chair']

135

In [None]:
# look at top 10 frequent words
all_words.most_common(10)

[('the', 79809),
 ('of', 40024),
 ('and', 38312),
 ('to', 28765),
 ('in', 22023),
 ('a', 21124),
 ('that', 12512),
 ('he', 12401),
 ('was', 11410),
 ('it', 10681)]

In [None]:
def edits_one(word):
    """Return the set of strings that are exactly one simple edit away from `word`.

    A simple edit is a deletion, an insertion of a lowercase letter, a
    replacement by a lowercase letter, or a swap of two adjacent characters.
    Replacing a letter by itself is included, so `word` can be in the result.
    """
    letters = 'abcdefghijklmnopqrstuvwxyz'
    candidates = set()

    # walk over every position at which the word can be cut in two
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]

        # insertion: put a letter at the cut
        for letter in letters:
            candidates.add(head + letter + tail)

        if not tail:
            # nothing after the cut to delete, replace or swap
            continue

        # deletion: drop the character right after the cut
        candidates.add(head + tail[1:])

        # replacement: overwrite the character right after the cut
        for letter in letters:
            candidates.add(head + letter + tail[1:])

        # transposition: swap the two characters right after the cut
        if len(tail) > 1:
            candidates.add(head + tail[1] + tail[0] + tail[2:])

    return candidates

In [None]:
def edits_two(word):
    """Lazily generate every string reachable from `word` by two successive single edits.

    The result is a generator and may yield the same string more than once.
    """
    single_edits = edits_one(word)
    return (double for single in single_edits for double in edits_one(single))

In [None]:
def known(words):
    """Return, as a set, those entries of `words` that occur in the `all_words` frequency table."""
    recognised = set()
    for candidate in words:
        if candidate in all_words:
            recognised.add(candidate)
    return recognised

In [None]:
def possible_corrections(word):
    """Return candidate spellings for `word`, preferring the closest known ones.

    Tried in order: the word itself if known, known words one edit away,
    known words two edits away; if all are empty, `[word]` is returned.
    """
    exact = known([word])
    if exact:
        return exact

    one_away = known(edits_one(word))
    if one_away:
        return one_away

    two_away = known(edits_two(word))
    if two_away:
        return two_away

    # nothing known nearby: fall back to the input itself
    return [word]

In [None]:
def prob(word, N=sum(all_words.values())):
    """Relative frequency of `word`: its count in `all_words` divided by `N`.

    The default `N` (total number of tokens) is computed once, when this
    definition is executed, not on every call.
    """
    occurrences = all_words[word]
    return occurrences / N

In [None]:
print(len(set(edits_one("emfasize"))))
print(edits_one("emfasize"))

442
{'emfaspize', 'emcasize', 'emfaxize', 'emfasizea', 'emfasiza', 'emfasizen', 'emfasizet', 'emfasizqe', 'emfasjize', 'emfasmize', 'eomfasize', 'eymfasize', 'emfasbize', 'emfasieze', 'emfaswze', 'emiasize', 'emfaswize', 'emfasxize', 'emeasize', 'emfasizr', 'emfastze', 'eyfasize', 'emfasizn', 'emfasijze', 'emfasiize', 'emfazsize', 'emfzsize', 'cemfasize', 'mefasize', 'emefasize', 'femfasize', 'emfaskze', 'emfasyze', 'eqmfasize', 'emfasvze', 'emfiasize', 'emfasize', 'emfacsize', 'emfasiqe', 'ermfasize', 'efmasize', 'emfasoize', 'edfasize', 'ejmfasize', 'uemfasize', 'emfansize', 'emfasiue', 'emfasizc', 'emfasizze', 'emfasoze', 'jemfasize', 'pmfasize', 'emfasihe', 'emfasaize', 'emfasine', 'emfasire', 'emfagize', 'qemfasize', 'emyasize', 'ekmfasize', 'emfasizye', 'emfrasize', 'emfasfze', 'emkfasize', 'emfasizeo', 'amfasize', 'emffasize', 'emfksize', 'emfasidze', 'emfoasize', 'eemfasize', 'hemfasize', 'emfasizde', 'emnfasize', 'demfasize', 'pemfasize', 'emfafize', 'emfasizz', 'eufasize', 'e

In [None]:
print(known(edits_one("monney")))

{'monkey', 'money'}


In [None]:
# Count the distinct strings that are two edits away from "emfasize",
# then check whether any known word is only ONE edit away from it
print(len(set(edits_two("emfasize"))))
print(known(edits_one("emfasize")))

90902
set()


In [None]:
# Let's look at possible corrections of a word
print(possible_corrections("emfasize"))

{'emphasize'}


In [None]:
# Let's look at probability of a word
print(prob("money"))
print(prob("monkey"))

0.0002922233626303688
5.378344097491451e-06


In [None]:
def spell_check(word):
    """Return a message with the most probable correction for `word`.

    The best candidate is the entry of `possible_corrections(word)` with the
    highest `prob`. If that candidate is `word` itself the message says the
    spelling is correct; otherwise it suggests the candidate.
    """
    candidates = possible_corrections(word)
    best_guess = max(candidates, key=prob)

    if best_guess == word:
        return "Correct spelling."
    return "Did you mean " + best_guess + "?"

In [None]:
# test spell check
print(spell_check("monney"))

Did you mean money?


In [None]:
!pip install spell_corrector

[31mERROR: Could not find a version that satisfies the requirement spell_corrector (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for spell_corrector[0m[31m
[0m

In [None]:
# NOTE(review): pip finds no distribution named `spell_corrector`, so this import
# raises ModuleNotFoundError on a fresh kernel. Presumably `rectify` is meant to come
# from a local spell_corrector.py placed next to the notebook — TODO confirm and document it.
from spell_corrector import rectify
correct = rectify("laern")
print(correct)

ModuleNotFoundError: No module named 'spell_corrector'