# Installations and Imports

In [124]:
import nltk as nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import math

In [125]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [126]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [127]:
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [128]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

1.  text
2. tokenize
3. stop words removal
4. POS tagging
4. stemming
5. lemmatization
6. tf-idf


# General

In [129]:
text_file_path = "doc_01.txt"

In [130]:
f = open(text_file_path)
text = f.read()
text

'Between 2016 and 2019, the state forest department under the\xa0BJP\xa0government had launched ‘Green Maharashtra’ drive with an aim to plant 50 crore trees across the state in the four-year period. In October 2019, the government had claimed it had surpassed the target by planting 33 crore trees in July-September 2019.\xa0The Indian Express\xa0had found that non-forest agencies — such as gram panchayats — which were tasked with planting trees had not uploaded the mandatory audio-visual proof of the tree plantation drives on the specially created portal.\nIn Pune Revenue Division, it was claimed the gram panchayats planted 1.7 crore saplings; however, no evidence was uploaded for 87 per cent (1.49 crore) saplings. Also, out of the 59 government agencies involved in the drive as many as 38 had not submitted survival reports about the saplings.\nThis year, the targets set by the forest department were comparatively modest. For example, Pune Circle — which comprises three divisions in Pu

In [131]:
# text = '''Natural language processing (NLP) is a field of artificial intelligence concerned with the interaction between computers and humans in natural language. It aims to enable computers to understand, interpret, and generate human language in a way that is both meaningful and useful. NLP techniques are used in a wide range of applications, including machine translation, sentiment analysis, information extraction, and text summarization. One of the key challenges in NLP is dealing with the ambiguity and variability of natural language, which can make it difficult for computers to accurately process and understand text. However, recent advances in machine learning and deep learning have led to significant improvements in NLP performance, making it an increasingly important area of research and development.
# Machine learning (ML) is a subset of artificial intelligence that focuses on the development of algorithms that can learn from and make predictions or decisions based on data. ML algorithms can be categorized into supervised learning, unsupervised learning, and reinforcement learning, depending on the type of training data and the learning task. Supervised learning involves training a model on labeled data, while unsupervised learning involves training on unlabeled data. Reinforcement learning involves training a model to interact with an environment and learn from feedback. ML techniques have applications in various domains, including image recognition, speech recognition, medical diagnosis, and autonomous vehicles.
# Data science is an interdisciplinary field that combines techniques from statistics, computer science, and domain-specific knowledge to extract insights and knowledge from data. It involves various stages of the data lifecycle, including data collection, data cleaning, data analysis, and data visualization. Data scientists use a variety of tools and techniques, such as machine learning, statistical modeling, and data mining, to uncover patterns and trends in data and make data-driven decisions. Data science has applications in numerous industries, including healthcare, finance, marketing, and e-commerce.'''

In [132]:
tokens = word_tokenize(text)

In [133]:
tokens[:5]

['Between', '2016', 'and', '2019', ',']

In [134]:
stopwords_corpus = stopwords.words("english")

In [135]:
stopwords_corpus[:5]

['i', 'me', 'my', 'myself', 'we']

In [136]:
def remove_stopwords(tokens,stopwords):
  filtered_tokens = []
  for token in tokens:
    if(token not in stopwords):
      filtered_tokens.append(token)
  return filtered_tokens

In [137]:
tokens_without_stopwords = remove_stopwords(tokens,stopwords_corpus)

In [138]:
assert len(tokens_without_stopwords)<=len(tokens)

In [139]:
pos_tagged_tokens = nltk.pos_tag(tokens_without_stopwords)

In [140]:
pos_tagged_tokens[:10]

[('Between', 'IN'),
 ('2016', 'CD'),
 ('2019', 'CD'),
 (',', ','),
 ('state', 'NN'),
 ('forest', 'JJS'),
 ('department', 'NN'),
 ('BJP', 'NNP'),
 ('government', 'NN'),
 ('launched', 'VBD')]

In [141]:
def stem_tokens(tokens):
  stemmer = PorterStemmer()
  stemmed_tokens = []
  for token in tokens:
    stemmed_token = stemmer.stem(token)
    stemmed_tokens.append(stemmed_token)
  return stemmed_tokens

In [142]:
stemmed_tokens = stem_tokens(tokens_without_stopwords)

In [143]:
stemmed_tokens[:10]

['between',
 '2016',
 '2019',
 ',',
 'state',
 'forest',
 'depart',
 'bjp',
 'govern',
 'launch']

In [144]:
def lemmatize_tokens(tokens):
  lemmatizer = WordNetLemmatizer()
  lemmatized_tokens = []
  for token in tokens:
    lemmatized_token = lemmatizer.lemmatize(token)
    lemmatized_tokens.append(lemmatized_token)
  return lemmatized_tokens

In [145]:
lemmatized_tokens = lemmatize_tokens(tokens_without_stopwords)

In [146]:
lemmatized_tokens[:5]

['Between', '2016', '2019', ',', 'state']

# TF-IDF

In [147]:
documents = text.split("\n")

In [148]:
documents

['Between 2016 and 2019, the state forest department under the\xa0BJP\xa0government had launched ‘Green Maharashtra’ drive with an aim to plant 50 crore trees across the state in the four-year period. In October 2019, the government had claimed it had surpassed the target by planting 33 crore trees in July-September 2019.\xa0The Indian Express\xa0had found that non-forest agencies — such as gram panchayats — which were tasked with planting trees had not uploaded the mandatory audio-visual proof of the tree plantation drives on the specially created portal.',
 'In Pune Revenue Division, it was claimed the gram panchayats planted 1.7 crore saplings; however, no evidence was uploaded for 87 per cent (1.49 crore) saplings. Also, out of the 59 government agencies involved in the drive as many as 38 had not submitted survival reports about the saplings.',
 'This year, the targets set by the forest department were comparatively modest. For example, Pune Circle — which comprises three division

In [149]:
def get_document_tokens(documents):
  document_tokens = []
  for document in documents:
    tokens = word_tokenize(document)
    document_tokens.append(tokens)
  return document_tokens

In [150]:
document_tokens = get_document_tokens(documents)

In [151]:
# document_tokens

In [152]:
len(document_tokens[0])

96

In [153]:
def remove_stopwords_from_document_tokens(document_tokens,stopwords_corpus):
  document_tokens_without_stopwords = []
  for document in document_tokens:
    document_without_stopwords = remove_stopwords(document,stopwords_corpus)
    document_tokens_without_stopwords.append(document_without_stopwords)
  return document_tokens_without_stopwords

In [154]:
document_tokens_without_stopwords = remove_stopwords_from_document_tokens(document_tokens,stopwords_corpus)

In [155]:
def get_term_frequency(document_tokens):
  term_frequencies = []
  for document in document_tokens:
    term_frequency={}
    for token in document :
      try :
        term_frequency[token]+=1
      except:
        term_frequency[token] = 1
    n = len(document)
    unique_tokens = set(document)
    for token in unique_tokens:
      term_frequency[token] /= n
    term_frequencies.append(term_frequency)
  return term_frequencies

In [156]:
term_frequency = get_term_frequency(document_tokens_without_stopwords)

In [157]:
term_frequency

[{'Between': 0.015625,
  '2016': 0.015625,
  '2019': 0.046875,
  ',': 0.03125,
  'state': 0.03125,
  'forest': 0.015625,
  'department': 0.015625,
  'BJP': 0.015625,
  'government': 0.03125,
  'launched': 0.015625,
  '‘': 0.015625,
  'Green': 0.015625,
  'Maharashtra': 0.015625,
  '’': 0.015625,
  'drive': 0.015625,
  'aim': 0.015625,
  'plant': 0.015625,
  '50': 0.015625,
  'crore': 0.03125,
  'trees': 0.046875,
  'across': 0.015625,
  'four-year': 0.015625,
  'period': 0.015625,
  '.': 0.046875,
  'In': 0.015625,
  'October': 0.015625,
  'claimed': 0.015625,
  'surpassed': 0.015625,
  'target': 0.015625,
  'planting': 0.03125,
  '33': 0.015625,
  'July-September': 0.015625,
  'The': 0.015625,
  'Indian': 0.015625,
  'Express': 0.015625,
  'found': 0.015625,
  'non-forest': 0.015625,
  'agencies': 0.015625,
  '—': 0.03125,
  'gram': 0.015625,
  'panchayats': 0.015625,
  'tasked': 0.015625,
  'uploaded': 0.015625,
  'mandatory': 0.015625,
  'audio-visual': 0.015625,
  'proof': 0.015625

In [158]:
def get_inverse_document_frequency(term_frequency):
  document_cnt = len(term_frequency)
  inverse_document_frequencies = []
  for document in term_frequency:
    tokens = list(document.keys())
    inverse_document_frequency = {}
    for token in tokens:
      document_freq=1
      for doc in term_frequency:
        if(token in doc.keys()):
          document_freq+=1
      inverse_document_frequency[token] = math.log(document_cnt/document_freq)
    inverse_document_frequencies.append(inverse_document_frequency)
  return inverse_document_frequencies

In [159]:
inverse_document_frequency = get_inverse_document_frequency(term_frequency)

In [160]:
inverse_document_frequency

[{'Between': 2.1400661634962708,
  '2016': 1.7346010553881064,
  '2019': 1.7346010553881064,
  ',': 0.4353180712578455,
  'state': 2.1400661634962708,
  'forest': 1.2237754316221157,
  'department': 1.7346010553881064,
  'BJP': 2.1400661634962708,
  'government': 1.4469189829363254,
  'launched': 2.1400661634962708,
  '‘': 1.2237754316221157,
  'Green': 1.7346010553881064,
  'Maharashtra': 1.4469189829363254,
  '’': 1.2237754316221157,
  'drive': 1.2237754316221157,
  'aim': 1.7346010553881064,
  'plant': 1.4469189829363254,
  '50': 1.4469189829363254,
  'crore': 1.0414538748281612,
  'trees': 1.4469189829363254,
  'across': 2.1400661634962708,
  'four-year': 2.1400661634962708,
  'period': 2.1400661634962708,
  '.': 0.4353180712578455,
  'In': 0.8873031950009027,
  'October': 2.1400661634962708,
  'claimed': 1.7346010553881064,
  'surpassed': 2.1400661634962708,
  'target': 1.4469189829363254,
  'planting': 1.7346010553881064,
  '33': 1.2237754316221157,
  'July-September': 2.14006616

In [161]:
def get_TF_IDF(term_frequency,inverse_document_frequency):
  n = len(term_frequency)
  tf_idfs = []
  for i in range (n):
    tokens = list(term_frequency[i].keys())
    tf_idf={}
    for token in tokens:
      tf_idf[token] = term_frequency[i][token] * inverse_document_frequency[i][token]
    tf_idfs.append(tf_idf)
  return tf_idfs

In [162]:
tf_idf = get_TF_IDF(term_frequency,inverse_document_frequency)

In [163]:
tf_idf

[{'Between': 0.03343853380462923,
  '2016': 0.027103141490439162,
  '2019': 0.08130942447131749,
  ',': 0.013603689726807672,
  'state': 0.06687706760925846,
  'forest': 0.019121491119095557,
  'department': 0.027103141490439162,
  'BJP': 0.03343853380462923,
  'government': 0.04521621821676017,
  'launched': 0.03343853380462923,
  '‘': 0.019121491119095557,
  'Green': 0.027103141490439162,
  'Maharashtra': 0.022608109108380084,
  '’': 0.019121491119095557,
  'drive': 0.019121491119095557,
  'aim': 0.027103141490439162,
  'plant': 0.022608109108380084,
  '50': 0.022608109108380084,
  'crore': 0.03254543358838004,
  'trees': 0.06782432732514025,
  'across': 0.03343853380462923,
  'four-year': 0.03343853380462923,
  'period': 0.03343853380462923,
  '.': 0.020405534590211508,
  'In': 0.013864112421889105,
  'October': 0.03343853380462923,
  'claimed': 0.027103141490439162,
  'surpassed': 0.03343853380462923,
  'target': 0.022608109108380084,
  'planting': 0.054206282980878324,
  '33': 0.0