**Text Analytics**
1. Extract a sample document and apply the following document preprocessing methods: tokenization, POS tagging, stop-word removal, stemming, and lemmatization.
2. Create a representation of the document by calculating Term Frequency and Inverse Document Frequency.

# Setup

In [2]:
!pip install nltk

Defaulting to user installation because normal site-packages is not writeable
Collecting nltk
  Downloading nltk-3.7-py3-none-any.whl (1.5 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m0m
Collecting tqdm
  Downloading tqdm-4.63.1-py2.py3-none-any.whl (76 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 KB[0m [31m735.8 kB/s[0m eta [36m0:00:00[0mMB/s[0m eta [36m0:00:01[0m
[?25hCollecting regex>=2021.8.3
  Downloading regex-2022.3.15-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (764 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m764.9/764.9 KB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m:01[0m
[?25hInstalling collected packages: tqdm, regex, nltk
Successfully installed nltk-3.7 regex-2022.3.15 tqdm-4.63.1


In [3]:
# Fetch the NLTK resources used further down in this notebook.
import nltk
nltk.download('punkt')                       # models for sent_tokenize / word_tokenize
nltk.download('stopwords')                   # stop-word lists (stopwords.words)
nltk.download('averaged_perceptron_tagger')  # model behind nltk.pos_tag
nltk.download('wordnet')                     # corpus used by WordNetLemmatizer
nltk.download('omw-1.4')                     # Open Multilingual WordNet data

[nltk_data] Downloading package punkt to /home/pict/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /home/pict/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/pict/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /home/pict/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package omw-1.4 to /home/pict/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


True

In [4]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import euclidean_distances
from scipy.spatial import distance
import pandas as pd
import numpy as np

# Reading the data from the text file

In [6]:
# Load the sample document. The encoding is given explicitly so the result
# does not depend on the platform's default locale.
with open('./paragraph.txt', encoding='utf-8') as f:
    paragraph = f.read()

# Normalise case after the file has been closed; later steps (stop-word
# lookup in particular) compare against lower-case words.
paragraph = paragraph.lower()

In [7]:
# Show the lower-cased document text that every later cell works on.
paragraph

'r jbh pb uywe0pi] uegurv0w \nfoigb8eytfp u0etu iryue vifhoaiq0yr uifbc78p of\najd ro o fioj feofh ,hfo  \n uh ou o; u0 ewi{kpo\n hvisu biay o oiu8tfijig\n fuih fojpofj;kd pnkhsdl f;josdhljdoho;\n dhuicab vfhusrfrihfisbjkzjdghihsp\\\n fd fk jdogihgdjfiuisa fiej \n fhdsg fi gh l idoufekf \n aidsi hfyb i iuhihfi i \n hihih i ifh o klfid fy\n bfsiv pgpork;gjwiyi'

# Tokenization
Tokenization is the first step in most language-processing tasks: it simplifies the input data by splitting it into sentences or words, as required.

In [8]:
# Sentence tokenization: split the document into a list of sentence strings.
sentence_tokens = sent_tokenize(paragraph)

In [9]:
# Report how many sentences were found, then list them.
print('Number of sentence tokens :', len(sentence_tokens))
print('Sentence tokens :', sentence_tokens)

Number of sentence tokens : 1
Sentence tokens : ['r jbh pb uywe0pi] uegurv0w \nfoigb8eytfp u0etu iryue vifhoaiq0yr uifbc78p of\najd ro o fioj feofh ,hfo  \n uh ou o; u0 ewi{kpo\n hvisu biay o oiu8tfijig\n fuih fojpofj;kd pnkhsdl f;josdhljdoho;\n dhuicab vfhusrfrihfisbjkzjdghihsp\\\n fd fk jdogihgdjfiuisa fiej \n fhdsg fi gh l idoufekf \n aidsi hfyb i iuhihfi i \n hihih i ifh o klfid fy\n bfsiv pgpork;gjwiyi']


In [10]:
# Word tokenization: split the document into word and punctuation tokens.
word_tokens = word_tokenize(paragraph)

In [11]:
# Report how many tokens were produced, then list them.
print('Number of word tokens :', len(word_tokens))
print('Word tokens :', word_tokens)

Number of word tokens : 66
Word tokens : ['r', 'jbh', 'pb', 'uywe0pi', ']', 'uegurv0w', 'foigb8eytfp', 'u0etu', 'iryue', 'vifhoaiq0yr', 'uifbc78p', 'of', 'ajd', 'ro', 'o', 'fioj', 'feofh', ',', 'hfo', 'uh', 'ou', 'o', ';', 'u0', 'ewi', '{', 'kpo', 'hvisu', 'biay', 'o', 'oiu8tfijig', 'fuih', 'fojpofj', ';', 'kd', 'pnkhsdl', 'f', ';', 'josdhljdoho', ';', 'dhuicab', 'vfhusrfrihfisbjkzjdghihsp\\', 'fd', 'fk', 'jdogihgdjfiuisa', 'fiej', 'fhdsg', 'fi', 'gh', 'l', 'idoufekf', 'aidsi', 'hfyb', 'i', 'iuhihfi', 'i', 'hihih', 'i', 'ifh', 'o', 'klfid', 'fy', 'bfsiv', 'pgpork', ';', 'gjwiyi']


# POS Tagging and Stop words removal

In [12]:
# Keep the English stop words in a set so membership tests are cheap.
stop_words = set(stopwords.words('english'))
print('Stop words :', stop_words)

Stop words : {'am', "should've", 'were', 'same', 'or', 'that', 'ain', 'for', 't', 'all', 'not', 'mightn', "she's", 'about', 'when', 'mustn', 'again', 'she', 'o', 'hadn', 'yours', 'them', 'and', 'wasn', 'ma', "doesn't", 'further', 've', "mightn't", 'doing', "don't", 'nor', 'had', 'after', 'he', 'as', 'at', 'theirs', 'her', 'himself', 'then', 'here', 'herself', 'against', 'their', 'where', 'isn', "it's", 'once', 'shouldn', "mustn't", 'an', 'more', 'aren', 'yourselves', 'on', 'was', 'it', 'who', 'out', "haven't", 'are', 'hasn', 'can', 'if', 'needn', "isn't", 'did', 'myself', 'd', 'some', 'itself', "didn't", 'most', 'whom', 'wouldn', 'has', 'no', 'own', 'these', 'is', 'a', 'over', 'won', "wasn't", 'which', 'me', 'below', 'under', 'couldn', "weren't", "you'd", 'up', 'the', 'from', 's', 'your', 'to', 'only', 'between', 'have', 'our', 'such', 'each', 'i', "that'll", 'during', 'his', 'we', 'having', "wouldn't", 'you', 'didn', 'other', "hasn't", 'with', 'ourselves', 'those', 'before', 'down', '

In [13]:
# Keep only the tokens that are not English stop words. Punctuation tokens
# are not in the stop-word set, so they remain in the list.
word_tokens = [
    token
    for token in word_tokens
    if token not in stop_words
]

In [14]:
# Show the tokens that survived stop-word removal.
print('Filtered word tokens :', word_tokens)

Filtered word tokens : ['r', 'jbh', 'pb', 'uywe0pi', ']', 'uegurv0w', 'foigb8eytfp', 'u0etu', 'iryue', 'vifhoaiq0yr', 'uifbc78p', 'ajd', 'ro', 'fioj', 'feofh', ',', 'hfo', 'uh', 'ou', ';', 'u0', 'ewi', '{', 'kpo', 'hvisu', 'biay', 'oiu8tfijig', 'fuih', 'fojpofj', ';', 'kd', 'pnkhsdl', 'f', ';', 'josdhljdoho', ';', 'dhuicab', 'vfhusrfrihfisbjkzjdghihsp\\', 'fd', 'fk', 'jdogihgdjfiuisa', 'fiej', 'fhdsg', 'fi', 'gh', 'l', 'idoufekf', 'aidsi', 'hfyb', 'iuhihfi', 'hihih', 'ifh', 'klfid', 'fy', 'bfsiv', 'pgpork', ';', 'gjwiyi']


In [15]:
# Legend for the part-of-speech tags that appear in the tagger output:
#
#   CC    coordinating conjunction
#   CD    cardinal digit
#   DT    determiner
#   EX    existential there (like "there is"; think of it as "there exists")
#   FW    foreign word
#   IN    preposition / subordinating conjunction
#   JJ    adjective - 'big'
#   JJR   adjective, comparative - 'bigger'
#   JJS   adjective, superlative - 'biggest'
#   LS    list marker - 1)
#   MD    modal - could, will
#   NN    noun, singular - 'desk'
#   NNS   noun, plural - 'desks'
#   NNP   proper noun, singular - 'Harrison'
#   NNPS  proper noun, plural - 'Americans'
#   PDT   predeterminer - 'all the kids'
#   POS   possessive ending - parent's
#   PRP   personal pronoun - I, he, she
#   PRP$  possessive pronoun - my, his, hers
#   RB    adverb - very, silently
#   RBR   adverb, comparative - better
#   RBS   adverb, superlative - best
#   RP    particle - give up
#   TO    to - go 'to' the store
#   UH    interjection - errrrrrrrm
#   VB    verb, base form - take
#   VBD   verb, past tense - took
#   VBG   verb, gerund / present participle - taking
#   VBN   verb, past participle - taken
#   VBP   verb, singular present, non-3rd person - take
#   VBZ   verb, 3rd person singular present - takes
#   WDT   wh-determiner - which
#   WP    wh-pronoun - who, what
#   WP$   possessive wh-pronoun - whose
#   WRB   wh-adverb - where, when

# Tag every remaining token; the result is a list of (token, tag) tuples.
tagged = nltk.pos_tag(word_tokens)

In [16]:
# Print one (token, tag) tuple per line.
print('POS Tagged form of filtered word tokens :')
for tag in tagged:
    print(tag)

POS Tagged form of filtered word tokens :
('r', 'NN')
('jbh', 'NN')
('pb', 'NN')
('uywe0pi', 'JJ')
(']', 'NNP')
('uegurv0w', 'NN')
('foigb8eytfp', 'NN')
('u0etu', 'JJ')
('iryue', 'NN')
('vifhoaiq0yr', 'NN')
('uifbc78p', 'JJ')
('ajd', 'NN')
('ro', 'NN')
('fioj', 'NN')
('feofh', 'NN')
(',', ',')
('hfo', 'NN')
('uh', 'JJ')
('ou', 'NN')
(';', ':')
('u0', 'JJ')
('ewi', 'FW')
('{', '(')
('kpo', 'VB')
('hvisu', 'NN')
('biay', 'NN')
('oiu8tfijig', 'NN')
('fuih', 'NN')
('fojpofj', 'NN')
(';', ':')
('kd', 'CC')
('pnkhsdl', 'VB')
('f', 'NN')
(';', ':')
('josdhljdoho', 'NN')
(';', ':')
('dhuicab', 'CC')
('vfhusrfrihfisbjkzjdghihsp\\', 'FW')
('fd', 'JJ')
('fk', 'NN')
('jdogihgdjfiuisa', 'NN')
('fiej', 'NN')
('fhdsg', 'NN')
('fi', 'NN')
('gh', 'NN')
('l', 'NN')
('idoufekf', 'NN')
('aidsi', 'NN')
('hfyb', 'NN')
('iuhihfi', 'NN')
('hihih', 'NN')
('ifh', 'NN')
('klfid', 'NN')
('fy', 'NN')
('bfsiv', 'NN')
('pgpork', 'NN')
(';', ':')
('gjwiyi', 'NN')


# Stemming

In [17]:
# Stemmer instance reused for every token in the next cell.
ps = PorterStemmer()

In [18]:
print('Results of Stemming')

# Map each distinct token to its stem. Repeated tokens collapse onto a single
# dictionary key, so each token is printed once.
stemmed = {}
for token in word_tokens:
    stemmed[token] = ps.stem(token)

for source, stem in stemmed.items():
    print('{0} --> {1}'.format(source, stem))

Results of Stemming
r --> r
jbh --> jbh
pb --> pb
uywe0pi --> uywe0pi
] --> ]
uegurv0w --> uegurv0w
foigb8eytfp --> foigb8eytfp
u0etu --> u0etu
iryue --> iryu
vifhoaiq0yr --> vifhoaiq0yr
uifbc78p --> uifbc78p
ajd --> ajd
ro --> ro
fioj --> fioj
feofh --> feofh
, --> ,
hfo --> hfo
uh --> uh
ou --> ou
; --> ;
u0 --> u0
ewi --> ewi
{ --> {
kpo --> kpo
hvisu --> hvisu
biay --> biay
oiu8tfijig --> oiu8tfijig
fuih --> fuih
fojpofj --> fojpofj
kd --> kd
pnkhsdl --> pnkhsdl
f --> f
josdhljdoho --> josdhljdoho
dhuicab --> dhuicab
vfhusrfrihfisbjkzjdghihsp\ --> vfhusrfrihfisbjkzjdghihsp\
fd --> fd
fk --> fk
jdogihgdjfiuisa --> jdogihgdjfiuisa
fiej --> fiej
fhdsg --> fhdsg
fi --> fi
gh --> gh
l --> l
idoufekf --> idoufekf
aidsi --> aidsi
hfyb --> hfyb
iuhihfi --> iuhihfi
hihih --> hihih
ifh --> ifh
klfid --> klfid
fy --> fy
bfsiv --> bfsiv
pgpork --> pgpork
gjwiyi --> gjwiyi


# Lemmatization

In [19]:
# Lemmatizer instance reused for every token in the next cell.
lemmatizer = WordNetLemmatizer()

In [20]:
print('Results of Lemmatization')

# Map each distinct token to its lemma. Repeated tokens collapse onto a single
# dictionary key, so each token is printed once.
# NOTE(review): lemmatize() is called without a part-of-speech argument, so
# the tags computed in `tagged` are not used here - confirm this is intended.
lemmatized = {}
for token in word_tokens:
    lemmatized[token] = lemmatizer.lemmatize(token)

for source, lemma in lemmatized.items():
    print('{0} --> {1}'.format(source, lemma))

Results of Lemmatization
r --> r
jbh --> jbh
pb --> pb
uywe0pi --> uywe0pi
] --> ]
uegurv0w --> uegurv0w
foigb8eytfp --> foigb8eytfp
u0etu --> u0etu
iryue --> iryue
vifhoaiq0yr --> vifhoaiq0yr
uifbc78p --> uifbc78p
ajd --> ajd
ro --> ro
fioj --> fioj
feofh --> feofh
, --> ,
hfo --> hfo
uh --> uh
ou --> ou
; --> ;
u0 --> u0
ewi --> ewi
{ --> {
kpo --> kpo
hvisu --> hvisu
biay --> biay
oiu8tfijig --> oiu8tfijig
fuih --> fuih
fojpofj --> fojpofj
kd --> kd
pnkhsdl --> pnkhsdl
f --> f
josdhljdoho --> josdhljdoho
dhuicab --> dhuicab
vfhusrfrihfisbjkzjdghihsp\ --> vfhusrfrihfisbjkzjdghihsp\
fd --> fd
fk --> fk
jdogihgdjfiuisa --> jdogihgdjfiuisa
fiej --> fiej
fhdsg --> fhdsg
fi --> fi
gh --> gh
l --> l
idoufekf --> idoufekf
aidsi --> aidsi
hfyb --> hfyb
iuhihfi --> iuhihfi
hihih --> hihih
ifh --> ifh
klfid --> klfid
fy --> fy
bfsiv --> bfsiv
pgpork --> pgpork
gjwiyi --> gjwiyi


# Term-Frequency and Inverse Document Frequency

In [21]:
def arr_convert_1d(arr):
    """Flatten a (possibly nested) sequence of arrays into one 1-D array.

    Parameters
    ----------
    arr : array-like
        Anything ``np.array`` accepts, e.g. a list of equally shaped
        matrices such as the per-call results collected by this notebook.

    Returns
    -------
    numpy.ndarray
        A 1-D array holding every element of ``arr`` in row-major order.

    Notes
    -----
    The earlier implementation applied ``np.concatenate`` twice, which only
    produced a 1-D result for exactly three-dimensional input and raised on
    empty or two-dimensional input. Reshaping handles any rank and gives the
    same result for the three-dimensional case.
    """
    return np.array(arr).reshape(-1)

In [22]:
# Module-level store: cosine() adds one entry per call.
cos = []


def cosine(trans):
    """Append the cosine similarity of the first two rows of ``trans`` to ``cos``.

    ``trans`` must support ``trans[0]`` and ``trans[1]``; nothing is returned.
    """
    first_row, second_row = trans[0], trans[1]
    similarity = cosine_similarity(first_row, second_row)
    cos.append(similarity)

In [23]:
# Module-level store: manhatten_distance() adds one entry per call.
manhatten = []


def manhatten_distance(trans):
    """Append the Manhattan distance between the first two rows of ``trans``.

    The value comes from ``pairwise_distances`` with ``metric='manhattan'``
    and is stored in the module-level ``manhatten`` list; nothing is returned.
    """
    first_row, second_row = trans[0], trans[1]
    distances = pairwise_distances(first_row, second_row, metric='manhattan')
    manhatten.append(distances)

In [24]:
# Module-level store: euclidean_function() adds one entry per call.
euclidean = []


def euclidean_function(vectors):
    """Append the Euclidean distance between the first two rows of ``vectors``.

    The result of ``euclidean_distances`` is stored in the module-level
    ``euclidean`` list; nothing is returned.
    """
    euclidean.append(euclidean_distances(vectors[0], vectors[1]))

In [25]:
def tfidf(str1, str2, reset=False):
    """Compare two strings in TF-IDF space and return the collected metrics.

    Parameters
    ----------
    str1, str2 : str
        The two texts to vectorise and compare.
    reset : bool, default False
        When True, discard the results stored by earlier calls before
        computing new ones, so the returned table has only this comparison.
        The default keeps the previous behaviour: every call adds one more
        row to the accumulated results.

    Returns
    -------
    pandas.DataFrame
        The table built by ``convert()`` from the module-level ``manhatten``,
        ``cos`` and ``euclidean`` lists.

    Notes
    -----
    The vectoriser is fitted on the notebook-level ``word_tokens`` list, not
    on ``str1``/``str2``. Words of the two strings that are absent from that
    fitted vocabulary therefore contribute nothing to their vectors.
    NOTE(review): confirm that fitting on ``word_tokens`` (each token passed
    as a separate document) is the intended corpus.
    """
    if reset:
        # Without this, results from earlier calls (or earlier runs of the
        # calling cell) stay in the shared lists and show up as extra rows.
        for stored in (euclidean, cos, manhatten):
            stored.clear()

    vect = TfidfVectorizer()
    vect.fit(word_tokens)

    # Row 0 of `trans` is str1 and row 1 is str2.
    trans = vect.transform([str1, str2])

    euclidean_function(trans)
    cosine(trans)
    manhatten_distance(trans)
    return convert()

In [26]:
def convert():
    """Build a DataFrame from the accumulated distance and similarity results.

    Each of the module-level lists ``manhatten``, ``cos`` and ``euclidean`` is
    flattened with ``arr_convert_1d`` and becomes one column, in that order,
    named ``manhatten``, ``cos_sim`` and ``euclidean``.
    """
    columns = {
        'manhatten': arr_convert_1d(manhatten),
        'cos_sim': arr_convert_1d(cos),
        'euclidean': arr_convert_1d(euclidean),
    }
    return pd.DataFrame(columns)

In [28]:
# Compare two sample strings and print the resulting metrics table.
# The table holds one row per tfidf() call made since the result lists were
# (re)created, so re-running this cell adds another row.
str1, str2 = 'rsfhcui', 'ukjiorgd'
newData = tfidf(str1, str2)
print(newData)

   manhatten  cos_sim  euclidean
0        0.0      0.0        0.0
1        0.0      0.0        0.0
