## Text Analytics
1. Extract Sample document and apply following document preprocessing methods:
Tokenization, POS Tagging, stop words removal, Stemming and Lemmatization.
2. Create representation of document by calculating Term Frequency and Inverse Document
Frequency.

In [1]:
# py -m install nltk
import nltk

In [2]:
# nltk.download('punkt')
# nltk.download('all')

In [3]:
# sample text
text="""Hello Mr. Smith, how are you doing today? The weather is great, and city is awesome.
The sky is pinkish-blue. You shouldn't eat cardboard"""

In [4]:
# tokenizing the sentence
from nltk.tokenize import sent_tokenize

In [5]:
tokenized = sent_tokenize(text)
print(tokenized)

['Hello Mr. Smith, how are you doing today?', 'The weather is great, and city is awesome.', 'The sky is pinkish-blue.', "You shouldn't eat cardboard"]


In [6]:
# tokenizing the words
from nltk.tokenize import word_tokenize

In [7]:
tokenized_words = word_tokenize(text)
print(tokenized_words)

['Hello', 'Mr.', 'Smith', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'great', ',', 'and', 'city', 'is', 'awesome', '.', 'The', 'sky', 'is', 'pinkish-blue', '.', 'You', 'should', "n't", 'eat', 'cardboard']


In [8]:
# nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))

In [9]:
print(stopwords)

{'won', 'ourselves', 'at', 'didn', 'have', 'with', 'after', 're', "shan't", 'any', "she's", 'their', 'isn', 'too', 'of', 'here', 'into', 'on', 'can', 'now', 'o', 'no', 'her', 'had', 'below', "didn't", "doesn't", 'itself', 'wouldn', 'his', 'don', 'aren', 'between', 'them', 'hers', 'she', 'or', 'all', 'my', 'while', 'whom', 'but', 'when', 'mustn', 'doesn', "hasn't", 'off', 'are', 'other', 'y', 'having', 'been', 'nor', 'yourselves', 'll', 'not', 'shouldn', 'himself', 'under', 'ours', 'me', 'very', 'during', 'about', 'such', 'and', "it's", 'few', 'same', 'was', 'd', "couldn't", 'some', 'doing', "haven't", 'out', "isn't", "mightn't", 'you', 'yours', 'each', 'up', 'only', "wouldn't", 'yourself', 'what', 'will', 'if', 'to', 'a', 'by', 'herself', 'haven', 'it', 'these', 'your', 'down', 'weren', 'did', "don't", 'he', 'should', 'over', 'themselves', 've', 'more', 'were', 'again', 's', 'its', "shouldn't", 'there', 'hasn', 'how', 'then', "aren't", 'where', 'mightn', 'ain', "you've", "you'd", 'thei

In [10]:
# removing stopwords 

filtered_words = []
for w in tokenized_words:
    if w not in stopwords:
        filtered_words.append(w)

print(filtered_words)

['Hello', 'Mr.', 'Smith', ',', 'today', '?', 'The', 'weather', 'great', ',', 'city', 'awesome', '.', 'The', 'sky', 'pinkish-blue', '.', 'You', "n't", 'eat', 'cardboard']


In [11]:
# stemming 

from nltk.stem import PorterStemmer
ps = PorterStemmer()

stemmed_words=[]
for w in filtered_words:
    stemmed_words.append(ps.stem(w))

print("Stemmed Words:",stemmed_words)

Stemmed Words: ['hello', 'mr.', 'smith', ',', 'today', '?', 'the', 'weather', 'great', ',', 'citi', 'awesom', '.', 'the', 'sky', 'pinkish-blu', '.', 'you', "n't", 'eat', 'cardboard']


In [12]:
# nltk.download('wordnet')

In [13]:
# Lemmatization

from nltk.stem.wordnet import WordNetLemmatizer
lem = WordNetLemmatizer()

lemmatized_words = []
for w in filtered_words:
    lemmatized_words.append(lem.lemmatize(w))

print("lemmatized Sentence: ", lemmatized_words)

lemmatized Sentence:  ['Hello', 'Mr.', 'Smith', ',', 'today', '?', 'The', 'weather', 'great', ',', 'city', 'awesome', '.', 'The', 'sky', 'pinkish-blue', '.', 'You', "n't", 'eat', 'cardboard']


In [14]:
# Pos tagging (part of speech)

# !py -m pip install spacy
import spacy

In [15]:
# ! py -m spacy download en_core_web_sm

In [16]:
nlp = spacy.load('en_core_web_sm')
doc = nlp(text)

In [17]:
for token in doc:
    print(token, "|", token.pos_,"|", spacy.explain(token.pos_),"|",token.tag_, spacy.explain(token.tag_))
    print("")

Hello | INTJ | interjection | UH interjection

Mr. | PROPN | proper noun | NNP noun, proper singular

Smith | PROPN | proper noun | NNP noun, proper singular

, | PUNCT | punctuation | , punctuation mark, comma

how | SCONJ | subordinating conjunction | WRB wh-adverb

are | AUX | auxiliary | VBP verb, non-3rd person singular present

you | PRON | pronoun | PRP pronoun, personal

doing | VERB | verb | VBG verb, gerund or present participle

today | NOUN | noun | NN noun, singular or mass

? | PUNCT | punctuation | . punctuation mark, sentence closer

The | DET | determiner | DT determiner

weather | NOUN | noun | NN noun, singular or mass

is | AUX | auxiliary | VBZ verb, 3rd person singular present

great | ADJ | adjective | JJ adjective (English), other noun-modifier (Chinese)

, | PUNCT | punctuation | , punctuation mark, comma

and | CCONJ | coordinating conjunction | CC conjunction, coordinating

city | NOUN | noun | NN noun, singular or mass

is | AUX | auxiliary | VBZ verb, 3rd p

In [18]:
# Term Frequency and Inverse Document Frequency

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

result = tfidf.fit_transform(tokenized_words)
print("TF-IDF Values : \n",result)

# in matrix form
print('\ntf-idf values in matrix form:')
print(result.toarray())

print('\nWord indexes:')
print(tfidf.vocabulary_,"\n")

TF-IDF Values : 
   (0, 9)	1.0
  (1, 12)	1.0
  (2, 16)	1.0
  (4, 10)	1.0
  (5, 1)	1.0
  (6, 20)	1.0
  (7, 6)	1.0
  (8, 18)	1.0
  (10, 17)	1.0
  (11, 19)	1.0
  (12, 11)	1.0
  (13, 8)	1.0
  (15, 0)	1.0
  (16, 5)	1.0
  (17, 11)	1.0
  (18, 2)	1.0
  (20, 17)	1.0
  (21, 15)	1.0
  (22, 11)	1.0
  (23, 3)	0.7071067811865475
  (23, 13)	0.7071067811865475
  (25, 20)	1.0
  (26, 14)	1.0
  (28, 7)	1.0
  (29, 4)	1.0

tf-idf values in matrix form:
[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         1.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  1.         0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.        