In [None]:

import nltk   # Natural Language Tool Kit


In [None]:
# Fetch the Punkt models that the tokenization cell relies on.
nltk.download("punkt")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Tokenization: split the same text into word-level and sentence-level tokens.
from nltk import sent_tokenize, word_tokenize

sent = "Hello everyone!, Welcome to my blog post on Medium. We are studying Natural Language Processing."

# One print call; the blank line between the two lists comes from the separator.
print(word_tokenize(sent), sent_tokenize(sent), sep="\n\n")


['Hello', 'everyone', '!', ',', 'Welcome', 'to', 'my', 'blog', 'post', 'on', 'Medium', '.', 'We', 'are', 'studying', 'Natural', 'Language', 'Processing', '.']

['Hello everyone!, Welcome to my blog post on Medium.', 'We are studying Natural Language Processing.']


In [None]:
# Stop words: fetch the corpus that nltk.corpus.stopwords reads from.
nltk.download("stopwords")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.corpus import stopwords        # corpus readers, including the stop-word lists

# Full English stop-word list shipped with NLTK (kept as a list, as before).
stop_words = stopwords.words('english')

# Reuses `sent` and `word_tokenize` from the tokenization cell above.
token = word_tokenize(sent)

# The stop-word list is all lowercase, so compare the lowercased token;
# otherwise capitalised stop words such as 'We' slip through the filter.
# Only stop words are dropped here: punctuation tokens ('!', ',', '.') are kept.
cleaned_token = [word for word in token if word.lower() not in stop_words]

print("This is the unclean version:", token)
print("This is the cleaned version:", cleaned_token)


This is the unclean version: ['Hello', 'everyone', '!', ',', 'Welcome', 'to', 'my', 'blog', 'post', 'on', 'Medium', '.', 'We', 'are', 'studying', 'Natural', 'Language', 'Processing', '.']
This is the cleaned version: ['Hello', 'everyone', '!', ',', 'Welcome', 'blog', 'post', 'Medium', '.', 'We', 'studying', 'Natural', 'Language', 'Processing', '.']


In [None]:
# Stemming: reduce every token of the sample text to its Snowball (English) stem.
from nltk.stem import SnowballStemmer

snowball_stemmer = SnowballStemmer('english')
text = "This is a Demo Text for NLP using NLTK. Full form of NLTK is Natural Language Toolkit"
word_tokens = nltk.word_tokenize(text)

# Apply the stemmer to each token in order.
stemmed_word = list(map(snowball_stemmer.stem, word_tokens))
print(stemmed_word)


['this', 'is', 'a', 'demo', 'text', 'for', 'nlp', 'use', 'nltk', '.', 'full', 'form', 'of', 'nltk', 'is', 'natur', 'languag', 'toolkit']


In [None]:
# Fetch the WordNet corpus ahead of the WordNetLemmatizer cell.
nltk.download("wordnet")


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
# Lemmatization: WordNetLemmatizer looks words up in WordNet to find their
# dictionary form; it is a lemmatizer, not a Porter-style stemmer.
from nltk.stem import WordNetLemmatizer

stopword = stopwords.words('english')  # not used in this cell
wordnet_lemmatizer = WordNetLemmatizer()
text = "the dogs are barking outside. Are the cats in the garden?"
word_tokens = nltk.word_tokenize(text)

# lemmatize() is called without a `pos` argument, so NLTK's default part of
# speech (noun) applies: 'dogs' -> 'dog', while 'are' and 'barking' are unchanged.
lemmatized_word = list(map(wordnet_lemmatizer.lemmatize, word_tokens))
print(lemmatized_word)


['the', 'dog', 'are', 'barking', 'outside', '.', 'Are', 'the', 'cat', 'in', 'the', 'garden', '?']


In [None]:
# Fetch the averaged-perceptron tagger model ahead of the nltk.pos_tag cell.
nltk.download("averaged_perceptron_tagger")


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# Part-of-speech tagging: tokenize the sentence, then pair each token with its tag.
text = "the dogs are barking outside."
word = nltk.word_tokenize(text)

# pos_tag holds a list of (token, tag) tuples.
pos_tag = nltk.pos_tag(word)
print(pos_tag)


[('the', 'DT'), ('dogs', 'NNS'), ('are', 'VBP'), ('barking', 'VBG'), ('outside', 'IN'), ('.', '.')]


In [None]:
# 2. Create representation of document by calculating Term Frequency and Inverse Document Frequency.
# import required module
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:

# Three tiny "documents" that make up the TF-IDF corpus.
# NOTE(review): 'Angles' is probably a typo for 'Angeles'; left unchanged because
# the recorded vocabulary output of this notebook depends on the exact string.
d0, d1, d2 = 'New York Times', 'New York Post', 'Los Angles Times'

# Corpus in document order: row i of the TF-IDF matrix corresponds to series[i].
series = [d0, d1, d2]


In [None]:
# Vectorizer with default settings.
tfidf = TfidfVectorizer()

# Learn the vocabulary from the corpus and compute the tf-idf matrix
# (one row per document in `series`).
result = tfidf.fit_transform(series)


In [None]:
# Each report is a heading followed by its value on the next line.

# Mapping from each vocabulary term to its column index.
print('\nWord indexes:', tfidf.vocabulary_, sep='\n')

# Sparse view: (document row, term column) followed by the tf-idf weight.
print('\ntf-idf value:', result, sep='\n')

# Dense view of the same matrix.
print('\ntf-idf values in matrix form:', result.toarray(), sep='\n')



Word indexes:
{'new': 2, 'york': 5, 'times': 4, 'post': 3, 'los': 1, 'angles': 0}

tf-idf value:
  (0, 4)	0.5773502691896257
  (0, 5)	0.5773502691896257
  (0, 2)	0.5773502691896257
  (1, 3)	0.680918560398684
  (1, 5)	0.5178561161676974
  (1, 2)	0.5178561161676974
  (2, 0)	0.6227660078332259
  (2, 1)	0.6227660078332259
  (2, 4)	0.4736296010332684

tf-idf values in matrix form:
[[0.         0.         0.57735027 0.         0.57735027 0.57735027]
 [0.         0.         0.51785612 0.68091856 0.         0.51785612]
 [0.62276601 0.62276601 0.         0.         0.4736296  0.        ]]


# Text Analytics - Theory with Viva Q&A

## 1. What is Text Analytics?

**Answer:**  
Text Analytics is the process of extracting meaningful information from unstructured text data. It involves techniques that convert text into structured data, making it easier to analyze patterns, trends, and insights. Text analytics is used in applications such as sentiment analysis, document classification, keyword extraction, and topic modeling.

---

## 2. Why is Text Analytics important?

**Answer:**  
Text data is one of the most abundant forms of data (emails, reviews, social media posts, etc.). Text Analytics allows organizations to:  
- Understand customer sentiment  
- Automate document classification  
- Extract actionable insights from large volumes of text  
- Improve decision-making based on text data

---

## 3. What are common operations performed in Text Analytics?

**Answer:**  
- Tokenization (splitting text into words or sentences)  
- Removing stopwords (common words like "the", "is", etc.)  
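- Stemming and Lemmatization (reducing words to a base form: stemming strips suffixes heuristically, e.g. "Natural" → "natur"; lemmatization maps a word to its dictionary form, e.g. "dogs" → "dog")  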
- Named Entity Recognition (identifying names, organizations, locations, etc.)  
- Part-of-Speech Tagging (identifying nouns, verbs, adjectives, etc.)

---

## 4. What is Natural Language Toolkit (NLTK)?

**Answer:**  
NLTK is a popular Python library used for performing natural language processing (NLP) and text analysis tasks. It provides easy-to-use interfaces for text preprocessing, linguistic data, and text mining operations.

---

## 5. What text analysis operations can be done using NLTK?

**Answer:**  
- Tokenization  
- Stopword removal  
- Stemming and Lemmatization  
- Part-of-Speech tagging  
- Named Entity Recognition  
- Concordance and word frequency analysis

---

## 6. What is TF-IDF?

**Answer:**  
TF-IDF stands for **Term Frequency - Inverse Document Frequency**. It is a statistical measure used to evaluate how important a word is in a document relative to a collection of documents (corpus).

---

## 7. How does TF-IDF work?

**Answer:**  
- **Term Frequency (TF)** measures how frequently a word occurs in a document.  
- **Inverse Document Frequency (IDF)** measures how rare a word is across the corpus: words that appear in many documents receive a lower weight, while words that appear in only a few receive a higher one.  
TF-IDF gives a high score to words that are frequent in a document but rare in the corpus, making them important for that document.

---

## 8. Why is TF-IDF used in text analysis?

**Answer:**  
TF-IDF helps in:  
- Extracting important keywords from documents  
- Building document similarity models  
- Improving the performance of text classification models  
- Reducing the effect of common but uninformative words

---

## 9. What is Bag of Words (BoW)?

**Answer:**  
Bag of Words is a text representation technique where a document is represented as a collection of its words, disregarding grammar and word order but keeping track of word frequency.

---

## 10. What are the key characteristics of Bag of Words?

**Answer:**  
- It represents text as a vector of word counts or word frequencies.  
- The vocabulary is built from all unique words in the corpus.  
- The position and context of words are ignored.  
- It is simple to implement and works well for many tasks, but it can lead to sparse and high-dimensional feature spaces.

---

## 11. How does Bag of Words differ from TF-IDF?

**Answer:**  
- **BoW** counts how many times a word appears in a document without considering its importance.  
- **TF-IDF** adjusts word frequencies by reducing the weight of common words across documents, giving importance to rare and informative words.

---

## 12. What are the limitations of Bag of Words?

**Answer:**  
- It ignores the context and semantics of words.  
- It does not capture the order of words (syntax).  
- It can create very large and sparse matrices when applied to large corpora.

