

### Text Analytics
1. Extract a sample document and apply the following document preprocessing methods: Tokenization, POS Tagging, stop words removal, Stemming and Lemmatization.
2. Create a representation of the document by calculating Term Frequency and Inverse Document Frequency.

# Setup

In [None]:
import nltk
# Fetch the NLTK data packages used by the later cells. The captured log shows
# that packages already present locally are reported as up-to-date.
# Tokenizer data for sent_tokenize / word_tokenize.
nltk.download('punkt')
# Stop-word lists read through stopwords.words('english').
nltk.download('stopwords')
# Tagger model for nltk.pos_tag.
nltk.download('averaged_perceptron_tagger')
# WordNet data for WordNetLemmatizer.
nltk.download('wordnet')
# NOTE(review): presumably required alongside 'wordnet' by the lemmatizer — confirm for the installed NLTK version.
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /home/pict/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/pict/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/pict/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/pict/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/pict/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
# `stopwords` is required by the stop-word removal cell
# (stopwords.words('english')); it replaces a stray `xy` line that raised
# NameError and left `stopwords` undefined.
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import euclidean_distances
from scipy.spatial import distance
import pandas as pd
import numpy as np

# Reading the data from the text file

In [None]:
# Load the sample document and lower-case it in one step, so that every later
# stage (tokenization, stop-word matching, stemming) sees lower-case text.
with open('./paragraph.txt') as source_file:
    paragraph = source_file.read().lower()

In [None]:
# Display the lower-cased document text as this cell's output.
paragraph



# Tokenization
Tokenization is the first step when working with language tasks: it simplifies the input data by splitting it into sentences or words, as required.

In [None]:
# Sentence tokenization: split the lower-cased document into a list of
# sentence strings.
sentence_tokens = sent_tokenize(paragraph)

In [None]:
# Report how many sentences were found, then list them.
print('Number of sentence tokens :', len(sentence_tokens))
print('Sentence tokens :', sentence_tokens)

Number of sentence tokens : 7


In [None]:
# Word tokenization: split the lower-cased document into word-level tokens.
# Punctuation is kept as separate tokens (see the '(' and '.' entries in the
# tagging output further down).
word_tokens = word_tokenize(paragraph)

In [None]:
# Report how many word-level tokens were produced, then list them.
print('Number of word tokens :', len(word_tokens))
print('Word tokens :', word_tokens)

Number of word tokens : 236


# POS Tagging and Stop words removal

In [None]:
# Build the English stop-word collection as a set so the membership test in
# the filtering step is a fast lookup. Sets are unordered, so the printed
# order carries no meaning.
stop_words = set(stopwords.words('english'))
print('Stop words :', stop_words)

Stop words : {'won', 'in', 'through', 'so', 'yourself', 'hadn', 'but', 'nor', "didn't", 'only', 'o', 'herself', 'up', 'doing', 'any', 'most', 'with', 'by', 'were', 'me', 'very', 'an', 'to', "you'd", "she's", 'again', 'don', 'itself', 'while', 'whom', 'll', "needn't", 'ourselves', 'himself', 'it', 'themselves', 'was', "isn't", 'or', 'from', 'which', 'why', 's', 'each', 'wouldn', 'between', 'wasn', 'm', 'been', 'a', 'have', 'where', "weren't", "mightn't", 'do', 'into', 'under', 'too', 'my', 'you', 'he', "aren't", 'ma', 'what', 'your', 'our', 'can', 'i', "wasn't", 'some', 'as', 'out', 'until', 'that', "haven't", 'because', 'below', 'she', "you've", "hasn't", 'having', "mustn't", 'not', 'should', 'haven', 'had', 'who', 'will', 'has', 'them', "wouldn't", 're', 'both', 'be', 'him', 'weren', 'ain', 'its', 'at', "that'll", 'shouldn', 've', "hadn't", 'against', "don't", 'down', 'how', 'other', "doesn't", 'same', 'didn', 'about', 'yourselves', "you're", 'those', 'shan', 'couldn', 'above', "shoul

In [None]:
# Drop every token that is an English stop word. `word_tokens` is rebound to
# the filtered list, so all later cells operate on the filtered tokens;
# punctuation tokens are not stop words and therefore remain.
word_tokens = [word_token for word_token in word_tokens if word_token not in stop_words]

In [None]:
# Show the tokens that remain after stop-word removal.
print('Filtered word tokens :', word_tokens)



In [None]:
# The bare string below is a reference legend for the POS tag abbreviations
# that appear in the tagger output; it is not assigned to any name.
'''
CC coordinating conjunction 
CD cardinal digit 
DT determiner 
EX existential there (like: “there is” … think of it like “there exists”) 
FW foreign word 
IN preposition/subordinating conjunction 
JJ adjective – ‘big’ 
JJR adjective, comparative – ‘bigger’ 
JJS adjective, superlative – ‘biggest’ 
LS list marker 1) 
MD modal – could, will 
NN noun, singular ‘- desk’ 
NNS noun plural – ‘desks’ 
NNP proper noun, singular – ‘Harrison’ 
NNPS proper noun, plural – ‘Americans’ 
PDT predeterminer – ‘all the kids’ 
POS possessive ending parent’s 
PRP personal pronoun –  I, he, she 
PRP$ possessive pronoun – my, his, hers 
RB adverb – very, silently, 
RBR adverb, comparative – better 
RBS adverb, superlative – best 
RP particle – give up 
TO – to go ‘to’ the store. 
UH interjection – errrrrrrrm 
VB verb, base form – take 
VBD verb, past tense – took 
VBG verb, gerund/present participle – taking 
VBN verb, past participle – taken 
VBP verb, sing. present, non-3d – take 
VBZ verb, 3rd person sing. present – takes 
WDT wh-determiner – which 
WP wh-pronoun – who, what 
WP$ possessive wh-pronoun, eg- whose 
WRB wh-abverb, eg- where, when
'''
# Tag each remaining token; the result is a list of (token, tag) pairs.
# NOTE(review): tagging runs on lower-cased, stop-word-filtered tokens, which
# presumably removes context the tagger relies on (e.g. 'russia' is tagged JJ
# in the output) — consider tagging the unfiltered tokens; confirm intent.
tagged = nltk.pos_tag(word_tokens)

In [None]:
# Print one (token, tag) pair per line.
print('POS Tagged form of filtered word tokens :')
for tag in tagged:
    # `tag` is the whole (token, tag) tuple, not just the tag string.
    print(tag)

POS Tagged form of filtered word tokens :
('european', 'JJ')
('union', 'NN')
('said', 'VBD')
('joined', 'JJ')
('members', 'NNS')
('council', 'VBP')
('baltic', 'JJ')
('sea', 'NN')
('states', 'NNS')
('(', '(')
('cbss', 'NN')
(')', ')')
('suspending', 'VBG')
('russia', 'JJ')
('belarus', 'NN')
('council', 'NN')
("'s", 'POS')
('activities', 'NNS')
('.', '.')
('``', '``')
('decision', 'NN')
('part', 'NN')
('european', 'VBP')
('union', 'NN')
("'s", 'POS')
('like-minded', 'JJ')
('partners', 'NNS')
('response', 'NN')
('russia', 'NN')
("'s", 'POS')
('invasion', 'NN')
('ukraine', 'JJ')
('involvement', 'NN')
('belarus', 'NN')
('unprovoked', 'VBD')
('unjustified', 'JJ')
('aggression', 'NN')
(',', ',')
("''", "''")
('said', 'VBD')
('saturday', 'NN')
('.', '.')
('russia', 'NN')
('declared', 'VBD')
('partial', 'JJ')
('ceasefire', 'NN')
('saturday', 'NN')
('allow', 'VB')
('humanitarian', 'JJ')
('corridors', 'NNS')
('ukrainian', 'JJ')
('cities', 'NNS')
('mariupol', 'VBP')
('volnovakha', 'NN')
(',', ',')

# Stemming

In [None]:
# Porter stemmer instance reused for every token in the stemming cell.
ps = PorterStemmer()

In [None]:
print('Results of Stemming')
# Map each distinct filtered token to its Porter stem. Repeated tokens collapse
# into a single dictionary entry, so every word is printed once.
stemmed = dict((word, ps.stem(word)) for word in word_tokens)
for original, stem in stemmed.items():
    print('{0} --> {1}'.format(original, stem))

Results of Stemming
european --> european
union --> union
said --> said
joined --> join
members --> member
council --> council
baltic --> baltic
sea --> sea
states --> state
( --> (
cbss --> cbss
) --> )
suspending --> suspend
russia --> russia
belarus --> belaru
's --> 's
activities --> activ
. --> .
`` --> ``
decision --> decis
part --> part
like-minded --> like-mind
partners --> partner
response --> respons
invasion --> invas
ukraine --> ukrain
involvement --> involv
unprovoked --> unprovok
unjustified --> unjustifi
aggression --> aggress
, --> ,
'' --> ''
saturday --> saturday
declared --> declar
partial --> partial
ceasefire --> ceasefir
allow --> allow
humanitarian --> humanitarian
corridors --> corridor
ukrainian --> ukrainian
cities --> citi
mariupol --> mariupol
volnovakha --> volnovakha
defence --> defenc
ministry --> ministri
civilians --> civilian
leave --> leav
city --> citi
five-hour --> five-hour
period --> period
morning --> morn
authorities --> author
allowed --> allow

# Lemmatization

In [None]:
# WordNet-based lemmatizer instance reused for every token in the
# lemmatization cell.
lemmatizer = WordNetLemmatizer()

In [None]:
print('Results of Lemmatization')
# WordNetLemmatizer.lemmatize() assumes the word is a noun unless a POS is
# passed, which leaves verb forms such as 'joined' or 'suspending' unchanged.
# Reuse the POS tags computed for the same filtered tokens (`tagged`) and map
# the first letter of each tag to the POS codes the lemmatizer accepts.
# Anything else (punctuation, unknown tags) falls back to noun, which is the
# lemmatizer's own default.
_WORDNET_POS = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}
# Keyed by token: a token that occurs with several tags keeps the lemma of its
# last occurrence, and repeated tokens are printed once.
lemmatized = {
    word: lemmatizer.lemmatize(word, pos=_WORDNET_POS.get(tag[:1], 'n'))
    for word, tag in tagged
}
for pair in lemmatized.items():
    print('{0} --> {1}'.format(pair[0], pair[1]))

Results of Lemmatization
european --> european
union --> union
said --> said
joined --> joined
members --> member
council --> council
baltic --> baltic
sea --> sea
states --> state
( --> (
cbss --> cbss
) --> )
suspending --> suspending
russia --> russia
belarus --> belarus
's --> 's
activities --> activity
. --> .
`` --> ``
decision --> decision
part --> part
like-minded --> like-minded
partners --> partner
response --> response
invasion --> invasion
ukraine --> ukraine
involvement --> involvement
unprovoked --> unprovoked
unjustified --> unjustified
aggression --> aggression
, --> ,
'' --> ''
saturday --> saturday
declared --> declared
partial --> partial
ceasefire --> ceasefire
allow --> allow
humanitarian --> humanitarian
corridors --> corridor
ukrainian --> ukrainian
cities --> city
mariupol --> mariupol
volnovakha --> volnovakha
defence --> defence
ministry --> ministry
civilians --> civilian
leave --> leave
city --> city
five-hour --> five-hour
period --> period
morning --> morn

# Term-Frequency and Inverse Document Frequency

In [None]:
def arr_convert_1d(arr):
    """Flatten a (possibly nested) collection of numeric arrays into a 1-D array.

    The notebook passes lists of ``(1, 1)`` result matrices, one per
    comparison; these flatten to one value per comparison, in order.

    Unlike two chained ``np.concatenate`` calls, this works for any nesting
    depth and for empty input (an empty 1-D array is returned) instead of
    only for exactly three levels of nesting.

    Parameters
    ----------
    arr : array-like
        Nested sequence / array of numbers with a regular (non-ragged) shape.

    Returns
    -------
    numpy.ndarray
        A new 1-D array holding the elements of ``arr`` in row-major order.
    """
    # np.array copies its input, so the flattened result never aliases `arr`.
    return np.array(arr).ravel()

In [None]:
# Module-level accumulator: one cosine-similarity result per cosine() call.
# Re-running this cell resets it to an empty list.
cos = []
def cosine(trans):
    """Append the cosine similarity of ``trans[0]`` and ``trans[1]`` to ``cos``.

    ``trans`` must support indexing with 0 and 1; the value returned by
    ``cosine_similarity`` is stored as-is. Nothing is returned.
    """
    first_vec, second_vec = trans[0], trans[1]
    similarity = cosine_similarity(first_vec, second_vec)
    cos.append(similarity)

In [None]:
# Module-level accumulator: one Manhattan-distance result per
# manhatten_distance() call. Re-running this cell resets it to an empty list.
manhatten = []
def manhatten_distance(trans):
    """Append the Manhattan distance of ``trans[0]`` and ``trans[1]`` to ``manhatten``.

    The distance is computed with ``pairwise_distances(..., metric='manhattan')``
    and stored as returned. Nothing is returned.
    """
    first_vec, second_vec = trans[0], trans[1]
    l1_distance = pairwise_distances(first_vec, second_vec, metric='manhattan')
    manhatten.append(l1_distance)

In [None]:
# Module-level accumulator: one Euclidean-distance result per
# euclidean_function() call. Re-running this cell resets it to an empty list.
euclidean = []
def euclidean_function(vectors):
    """Append the Euclidean distance of ``vectors[0]`` and ``vectors[1]`` to ``euclidean``.

    The value returned by ``euclidean_distances`` is stored as-is. Nothing is
    returned.
    """
    first_vec, second_vec = vectors[0], vectors[1]
    euclidean.append(euclidean_distances(first_vec, second_vec))

In [None]:
def tfidf(str1, str2):
    """Compare two strings in TF-IDF space and return the accumulated metrics.

    A fresh ``TfidfVectorizer`` is fitted on the global ``word_tokens``, both
    strings are transformed with it, and the Euclidean distance, cosine
    similarity and Manhattan distance between the two vectors are appended to
    their module-level result lists.

    Returns whatever ``convert()`` builds from those lists, so the result
    includes the rows from earlier calls as well as this one.
    """
    # NOTE(review): the vectorizer is fitted on individual tokens, so each
    # token is presumably treated as its own document — confirm this is
    # intended rather than fitting on the sentences.
    vectorizer = TfidfVectorizer()
    vectorizer.fit(word_tokens)
    vectors = vectorizer.transform([str1, str2])
    # Record the three metrics in the same order as before.
    for record_metric in (euclidean_function, cosine, manhatten_distance):
        record_metric(vectors)
    return convert()

In [None]:
def convert():
    """Build a DataFrame from the accumulated distance / similarity results.

    Each module-level result list (``manhatten``, ``cos``, ``euclidean``) is
    flattened with ``arr_convert_1d`` and becomes one column, in that order.
    """
    columns = {
        'manhatten': arr_convert_1d(manhatten),
        'cos_sim': arr_convert_1d(cos),
        'euclidean': arr_convert_1d(euclidean),
    }
    return pd.DataFrame(columns)

In [None]:
# Compare two single-word "documents" and print the resulting metrics table.
str1 = 'russia'
str2 = 'ukraine'
newData = tfidf(str1, str2)
print(newData)

   manhatten  cos_sim  euclidean
0        2.0      0.0   1.414214
