In [1]:
# !pip3 install --upgrade nltk

In [2]:
import nltk
import string
import math
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.probability import FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Download required NLTK resources
nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sarveshmhadgut/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/sarveshmhadgut/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sarveshmhadgut/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sarveshmhadgut/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Load document
with open("./datasets/doc_01.txt", "r", encoding="utf-8") as file:
    text = file.read()

In [5]:
# Clean the text
def clean_data(text):
    return "".join([char.lower() for char in text if char in string.printable])


cleaned_text = clean_data(text)
print(cleaned_text)

between 2016 and 2019, the state forest department under thebjpgovernment had launched green maharashtra drive with an aim to plant 50 crore trees across the state in the four-year period. in october 2019, the government had claimed it had surpassed the target by planting 33 crore trees in july-september 2019.the indian expresshad found that non-forest agencies  such as gram panchayats  which were tasked with planting trees had not uploaded the mandatory audio-visual proof of the tree plantation drives on the specially created portal.
in pune revenue division, it was claimed the gram panchayats planted 1.7 crore saplings; however, no evidence was uploaded for 87 per cent (1.49 crore) saplings. also, out of the 59 government agencies involved in the drive as many as 38 had not submitted survival reports about the saplings.
this year, the targets set by the forest department were comparatively modest. for example, pune circle  which comprises three divisions in pune and solapur district 

In [6]:
# Tokenization
tokens = word_tokenize(cleaned_text)
print("Tokens:\n", tokens)

Tokens:
 ['between', '2016', 'and', '2019', ',', 'the', 'state', 'forest', 'department', 'under', 'thebjpgovernment', 'had', 'launched', 'green', 'maharashtra', 'drive', 'with', 'an', 'aim', 'to', 'plant', '50', 'crore', 'trees', 'across', 'the', 'state', 'in', 'the', 'four-year', 'period', '.', 'in', 'october', '2019', ',', 'the', 'government', 'had', 'claimed', 'it', 'had', 'surpassed', 'the', 'target', 'by', 'planting', '33', 'crore', 'trees', 'in', 'july-september', '2019.the', 'indian', 'expresshad', 'found', 'that', 'non-forest', 'agencies', 'such', 'as', 'gram', 'panchayats', 'which', 'were', 'tasked', 'with', 'planting', 'trees', 'had', 'not', 'uploaded', 'the', 'mandatory', 'audio-visual', 'proof', 'of', 'the', 'tree', 'plantation', 'drives', 'on', 'the', 'specially', 'created', 'portal', '.', 'in', 'pune', 'revenue', 'division', ',', 'it', 'was', 'claimed', 'the', 'gram', 'panchayats', 'planted', '1.7', 'crore', 'saplings', ';', 'however', ',', 'no', 'evidence', 'was', 'uploa

In [7]:
tokens_df = pd.DataFrame(tokens, columns=["Token"])
print("Tokens DataFrame:")
tokens_df

Tokens DataFrame:


Unnamed: 0,Token
0,between
1,2016
2,and
3,2019
4,","
...,...
626,","
627,especially
628,the
629,public


In [8]:
# POS Tagging
pos_tags = nltk.pos_tag(tokens)
print("POS Tags:\n", pos_tags)

POS Tags:
 [('between', 'IN'), ('2016', 'CD'), ('and', 'CC'), ('2019', 'CD'), (',', ','), ('the', 'DT'), ('state', 'NN'), ('forest', 'JJS'), ('department', 'NN'), ('under', 'IN'), ('thebjpgovernment', 'NN'), ('had', 'VBD'), ('launched', 'VBN'), ('green', 'JJ'), ('maharashtra', 'NNS'), ('drive', 'NN'), ('with', 'IN'), ('an', 'DT'), ('aim', 'NN'), ('to', 'TO'), ('plant', 'NN'), ('50', 'CD'), ('crore', 'NN'), ('trees', 'NNS'), ('across', 'IN'), ('the', 'DT'), ('state', 'NN'), ('in', 'IN'), ('the', 'DT'), ('four-year', 'JJ'), ('period', 'NN'), ('.', '.'), ('in', 'IN'), ('october', 'JJ'), ('2019', 'CD'), (',', ','), ('the', 'DT'), ('government', 'NN'), ('had', 'VBD'), ('claimed', 'VBN'), ('it', 'PRP'), ('had', 'VBD'), ('surpassed', 'VBN'), ('the', 'DT'), ('target', 'NN'), ('by', 'IN'), ('planting', 'VBG'), ('33', 'CD'), ('crore', 'NN'), ('trees', 'NNS'), ('in', 'IN'), ('july-september', 'JJ'), ('2019.the', 'CD'), ('indian', 'JJ'), ('expresshad', 'NN'), ('found', 'VBD'), ('that', 'IN'), ('no

In [9]:
pos_df = pd.DataFrame(list(pos_tags), columns=["Word", "Tag"])
pos_df

Unnamed: 0,Word,Tag
0,between,IN
1,2016,CD
2,and,CC
3,2019,CD
4,",",","
...,...,...
626,",",","
627,especially,RB
628,the,DT
629,public,NN


In [10]:
# Stopword Removal
stop_words = set(stopwords.words("english"))
filtered_tokens = [
    word for word in tokens if word.lower() not in stop_words and word.isalpha()
]
print("Tokens after Stopword Removal:\n", filtered_tokens)

Tokens after Stopword Removal:
 ['state', 'forest', 'department', 'thebjpgovernment', 'launched', 'green', 'maharashtra', 'drive', 'aim', 'plant', 'crore', 'trees', 'across', 'state', 'period', 'october', 'government', 'claimed', 'surpassed', 'target', 'planting', 'crore', 'trees', 'indian', 'expresshad', 'found', 'agencies', 'gram', 'panchayats', 'tasked', 'planting', 'trees', 'uploaded', 'mandatory', 'proof', 'tree', 'plantation', 'drives', 'specially', 'created', 'portal', 'pune', 'revenue', 'division', 'claimed', 'gram', 'panchayats', 'planted', 'crore', 'saplings', 'however', 'evidence', 'uploaded', 'per', 'cent', 'crore', 'saplings', 'also', 'government', 'agencies', 'involved', 'drive', 'many', 'submitted', 'survival', 'reports', 'saplings', 'year', 'targets', 'set', 'forest', 'department', 'comparatively', 'modest', 'example', 'pune', 'circle', 'comprises', 'three', 'divisions', 'pune', 'solapur', 'district', 'planned', 'plant', 'lakh', 'saplings', 'forest', 'land', 'however', 

In [11]:
filtered_df = pd.DataFrame(filtered_tokens, columns=["Filtered Tokens"])
filtered_df

Unnamed: 0,Filtered Tokens
0,state
1,forest
2,department
3,thebjpgovernment
4,launched
...,...
303,participation
304,stakeholders
305,society
306,especially


In [12]:
# Stemming
porter = PorterStemmer()
stemmed_words = [porter.stem(word) for word in filtered_tokens]
print("Stemmed Words:\n", stemmed_words)

Stemmed Words:
 ['state', 'forest', 'depart', 'thebjpgovern', 'launch', 'green', 'maharashtra', 'drive', 'aim', 'plant', 'crore', 'tree', 'across', 'state', 'period', 'octob', 'govern', 'claim', 'surpass', 'target', 'plant', 'crore', 'tree', 'indian', 'expresshad', 'found', 'agenc', 'gram', 'panchayat', 'task', 'plant', 'tree', 'upload', 'mandatori', 'proof', 'tree', 'plantat', 'drive', 'special', 'creat', 'portal', 'pune', 'revenu', 'divis', 'claim', 'gram', 'panchayat', 'plant', 'crore', 'sapl', 'howev', 'evid', 'upload', 'per', 'cent', 'crore', 'sapl', 'also', 'govern', 'agenc', 'involv', 'drive', 'mani', 'submit', 'surviv', 'report', 'sapl', 'year', 'target', 'set', 'forest', 'depart', 'compar', 'modest', 'exampl', 'pune', 'circl', 'compris', 'three', 'divis', 'pune', 'solapur', 'district', 'plan', 'plant', 'lakh', 'sapl', 'forest', 'land', 'howev', 'may', 'meet', 'target', 'due', 'unavail', 'fund', 'last', 'year', 'pune', 'circl', 'plant', 'lakh', 'sapl', 'forest', 'land', 'pune',

In [13]:
stemming_df = pd.DataFrame(stemmed_words, columns=["Stemmed Words"])
stemming_df

Unnamed: 0,Stemmed Words
0,state
1,forest
2,depart
3,thebjpgovern
4,launch
...,...
303,particip
304,stakehold
305,societi
306,especi


In [14]:
# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_tokens]
print("Lemmatized Words:\n", lemmatized_words)

Lemmatized Words:
 ['state', 'forest', 'department', 'thebjpgovernment', 'launched', 'green', 'maharashtra', 'drive', 'aim', 'plant', 'crore', 'tree', 'across', 'state', 'period', 'october', 'government', 'claimed', 'surpassed', 'target', 'planting', 'crore', 'tree', 'indian', 'expresshad', 'found', 'agency', 'gram', 'panchayat', 'tasked', 'planting', 'tree', 'uploaded', 'mandatory', 'proof', 'tree', 'plantation', 'drive', 'specially', 'created', 'portal', 'pune', 'revenue', 'division', 'claimed', 'gram', 'panchayat', 'planted', 'crore', 'sapling', 'however', 'evidence', 'uploaded', 'per', 'cent', 'crore', 'sapling', 'also', 'government', 'agency', 'involved', 'drive', 'many', 'submitted', 'survival', 'report', 'sapling', 'year', 'target', 'set', 'forest', 'department', 'comparatively', 'modest', 'example', 'pune', 'circle', 'comprises', 'three', 'division', 'pune', 'solapur', 'district', 'planned', 'plant', 'lakh', 'sapling', 'forest', 'land', 'however', 'may', 'meet', 'target', 'due'

In [15]:
lemmatized_df = pd.DataFrame(lemmatized_words)
lemmatized_df

Unnamed: 0,0
0,state
1,forest
2,department
3,thebjpgovernment
4,launched
...,...
303,participation
304,stakeholder
305,society
306,especially


# Part 2


In [16]:
# Term Frequency (TF)
tf = FreqDist(stemmed_words)
print("Term Frequency:\n", tf)

Term Frequency:
 <FreqDist with 180 samples and 308 outcomes>


In [17]:
tf_df = pd.DataFrame(list(tf.items()), columns=["Word", "TF Score"])
tf_df

Unnamed: 0,Word,TF Score
0,state,3
1,forest,13
2,depart,8
3,thebjpgovern,1
4,launch,1
...,...,...
175,integr,1
176,place,1
177,ensur,1
178,seamless,1


In [18]:
# TF-IDF Calculation
corpus = [cleaned_text]
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
idf_values = dict(zip(tfidf_vectorizer.get_feature_names_out(), tfidf_vectorizer.idf_))

In [19]:
tfidf_scores = {word: tf[word] * idf_values.get(word, 0) for word in tf.keys()}
print("Term Frequency - Inverse Document Frequency:\n", tfidf_scores)

Term Frequency - Inverse Document Frequency:
 {'state': np.float64(3.0), 'forest': np.float64(13.0), 'depart': 0, 'thebjpgovern': 0, 'launch': 0, 'green': np.float64(4.0), 'maharashtra': np.float64(5.0), 'drive': np.float64(4.0), 'aim': np.float64(4.0), 'plant': np.float64(8.0), 'crore': np.float64(12.0), 'tree': np.float64(7.0), 'across': np.float64(1.0), 'period': np.float64(1.0), 'octob': 0, 'govern': 0, 'claim': 0, 'surpass': 0, 'target': np.float64(4.0), 'indian': np.float64(1.0), 'expresshad': np.float64(1.0), 'found': np.float64(1.0), 'agenc': 0, 'gram': np.float64(2.0), 'panchayat': 0, 'task': 0, 'upload': 0, 'mandatori': 0, 'proof': np.float64(1.0), 'plantat': 0, 'special': np.float64(2.0), 'creat': 0, 'portal': np.float64(1.0), 'pune': np.float64(5.0), 'revenu': 0, 'divis': 0, 'sapl': 0, 'howev': 0, 'evid': 0, 'per': np.float64(1.0), 'cent': np.float64(1.0), 'also': np.float64(1.0), 'involv': 0, 'mani': 0, 'submit': 0, 'surviv': 0, 'report': 0, 'year': np.float64(5.0), 'set':

In [20]:
tfidf_df = pd.DataFrame(list(tfidf_scores.items()), columns=["Word", "TF-IDF Score"])
tfidf_df

Unnamed: 0,Word,TF-IDF Score
0,state,3.0
1,forest,13.0
2,depart,0.0
3,thebjpgovern,0.0
4,launch,0.0
...,...,...
175,integr,0.0
176,place,1.0
177,ensur,0.0
178,seamless,1.0
