In [4]:
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [5]:
text = open('text_doc.txt').read()
text

'Hello everyone! This is a text analysis assignment. We will be using preprocessing methods like Tokenization, POS tagging , stop words removal, stemming and lemmatization.'

## Tokenization
### Tokenization is the process of separating words form sentence

In [9]:
token_words = nltk.word_tokenize(text)
print(token_words)

['Hello', 'everyone', '!', 'This', 'is', 'a', 'text', 'analysis', 'assignment', '.', 'We', 'will', 'be', 'using', 'preprocessing', 'methods', 'like', 'Tokenization', ',', 'POS', 'tagging', ',', 'stop', 'words', 'removal', ',', 'stemming', 'and', 'lemmatization', '.']


### Pos tagging
### Labeling of the words according to their words types

In [13]:
tagged = nltk.pos_tag(token_words)
print(tagged)

[('Hello', 'NNP'), ('everyone', 'NN'), ('!', '.'), ('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('text', 'JJ'), ('analysis', 'NN'), ('assignment', 'NN'), ('.', '.'), ('We', 'PRP'), ('will', 'MD'), ('be', 'VB'), ('using', 'VBG'), ('preprocessing', 'VBG'), ('methods', 'NNS'), ('like', 'IN'), ('Tokenization', 'NNP'), (',', ','), ('POS', 'NNP'), ('tagging', 'NN'), (',', ','), ('stop', 'VB'), ('words', 'NNS'), ('removal', 'JJ'), (',', ','), ('stemming', 'VBG'), ('and', 'CC'), ('lemmatization', 'NN'), ('.', '.')]


###  Stop words removal
#### It is a process of removing  words which don't have strong meaning like articles , 'and', "it's"

In [14]:
from nltk.corpus import stopwords

In [18]:
stop_words = stopwords.words('english')
print(stop_words[:10])
len(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


179

In [24]:
tokens = []
for word in token_words:
    tokens.append(word.lower())
print(tokens)

['hello', 'everyone', '!', 'this', 'is', 'a', 'text', 'analysis', 'assignment', '.', 'we', 'will', 'be', 'using', 'preprocessing', 'methods', 'like', 'tokenization', ',', 'pos', 'tagging', ',', 'stop', 'words', 'removal', ',', 'stemming', 'and', 'lemmatization', '.']


In [25]:
clean_tokens = []
for i in tokens:
    if i not in stop_words:
        clean_tokens.append(i)
print(clean_tokens)

['hello', 'everyone', '!', 'text', 'analysis', 'assignment', '.', 'using', 'preprocessing', 'methods', 'like', 'tokenization', ',', 'pos', 'tagging', ',', 'stop', 'words', 'removal', ',', 'stemming', 'lemmatization', '.']


### Stemming
#### Stemmingn is the process of finding the root form of the word

In [26]:
from nltk.stem import PorterStemmer

In [27]:
stemmer = PorterStemmer()
stem_token = []
for word in clean_tokens:
    stem_token.append(stemmer.stem(word))
print(stem_token)

['hello', 'everyon', '!', 'text', 'analysi', 'assign', '.', 'use', 'preprocess', 'method', 'like', 'token', ',', 'po', 'tag', ',', 'stop', 'word', 'remov', ',', 'stem', 'lemmat', '.']


### Lemmatization
#### Lemmatization is find the the root form of the word which has meaning 

In [32]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [33]:
lemmatized_token = []
for word in clean_tokens:
    lemmatized_token.append(lemmatizer.lemmatize(word))
print(lemmatized_token)

['hello', 'everyone', '!', 'text', 'analysis', 'assignment', '.', 'using', 'preprocessing', 'method', 'like', 'tokenization', ',', 'po', 'tagging', ',', 'stop', 'word', 'removal', ',', 'stemming', 'lemmatization', '.']


## Term Frequency and Inveerse Document Frequency

In [38]:
corpus = token_words
set_words = set(corpus)
print(len(set_words))
print(set_words)

27
{'stemming', 'This', 'Hello', 'a', 'like', 'be', ',', '.', 'preprocessing', 'words', 'removal', 'will', 'and', 'Tokenization', 'text', '!', 'POS', 'is', 'We', 'lemmatization', 'analysis', 'everyone', 'stop', 'tagging', 'using', 'assignment', 'methods'}


### Term frequency = count of term in document / number of words in document

In [43]:
token_words.append('a')
numwords = dict.fromkeys(set_words, 0)
for word in token_words:
    numwords[word] += 1

In [44]:
print(numwords)

{'stemming': 1, 'This': 1, 'Hello': 1, 'a': 3, 'like': 1, 'be': 1, ',': 3, '.': 2, 'preprocessing': 1, 'words': 1, 'removal': 1, 'will': 1, 'and': 1, 'Tokenization': 1, 'text': 1, '!': 1, 'POS': 1, 'is': 1, 'We': 1, 'lemmatization': 1, 'analysis': 1, 'everyone': 1, 'stop': 1, 'tagging': 1, 'using': 1, 'assignment': 1, 'methods': 1}


In [59]:
tfDict = {}
cnt = len(token_words)
for word, count in numwords.items():
    tfDict[word] = count/cnt
tfDict

{'stemming': 0.03125,
 'This': 0.03125,
 'Hello': 0.03125,
 'a': 0.09375,
 'like': 0.03125,
 'be': 0.03125,
 ',': 0.09375,
 '.': 0.0625,
 'preprocessing': 0.03125,
 'words': 0.03125,
 'removal': 0.03125,
 'will': 0.03125,
 'and': 0.03125,
 'Tokenization': 0.03125,
 'text': 0.03125,
 '!': 0.03125,
 'POS': 0.03125,
 'is': 0.03125,
 'We': 0.03125,
 'lemmatization': 0.03125,
 'analysis': 0.03125,
 'everyone': 0.03125,
 'stop': 0.03125,
 'tagging': 0.03125,
 'using': 0.03125,
 'assignment': 0.03125,
 'methods': 0.03125}

In [58]:
import math
n = len(token_words)

idfDict = dict.fromkeys(set_words ,0)

for word in token_words:
    idfDict[word]+=1
    
for word, val in idfDict.items():
    idfDict[word] = math.log(n/val)
idfDict

{'stemming': 3.4657359027997265,
 'This': 3.4657359027997265,
 'Hello': 3.4657359027997265,
 'a': 2.367123614131617,
 'like': 3.4657359027997265,
 'be': 3.4657359027997265,
 ',': 2.367123614131617,
 '.': 2.772588722239781,
 'preprocessing': 3.4657359027997265,
 'words': 3.4657359027997265,
 'removal': 3.4657359027997265,
 'will': 3.4657359027997265,
 'and': 3.4657359027997265,
 'Tokenization': 3.4657359027997265,
 'text': 3.4657359027997265,
 '!': 3.4657359027997265,
 'POS': 3.4657359027997265,
 'is': 3.4657359027997265,
 'We': 3.4657359027997265,
 'lemmatization': 3.4657359027997265,
 'analysis': 3.4657359027997265,
 'everyone': 3.4657359027997265,
 'stop': 3.4657359027997265,
 'tagging': 3.4657359027997265,
 'using': 3.4657359027997265,
 'assignment': 3.4657359027997265,
 'methods': 3.4657359027997265}