### Importing Libraries

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize, sent_tokenize
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords 
nltk.download('stopwords')
from nltk.stem import PorterStemmer
nltk.download('averaged_perceptron_tagger')
from nltk import pos_tag

[nltk_data] Downloading package punkt to /home/pict/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/pict/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/pict/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


#### Implementing data preprocessing

In [3]:
sent = "Millions of people in India took part in an annual tree planting drive Sunday. More than 250 million saplings were planted in a single day across the country's most-populous state."

#### Tokenization

In [4]:
print(word_tokenize(sent))

['Millions', 'of', 'people', 'in', 'India', 'took', 'part', 'in', 'an', 'annual', 'tree', 'planting', 'drive', 'Sunday', '.', 'More', 'than', '250', 'million', 'saplings', 'were', 'planted', 'in', 'a', 'single', 'day', 'across', 'the', 'country', "'s", 'most-populous', 'state', '.']


In [5]:
print(sent_tokenize(sent))

['Millions of people in India took part in an annual tree planting drive Sunday.', "More than 250 million saplings were planted in a single day across the country's most-populous state."]


#### Stopwards

In [31]:
stop_words = stopwords.words('english')
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [7]:
cleaned_token = []
token = word_tokenize(sent)
for word in token:
    if word not in stop_words:
        cleaned_token.append(word)
print("This is the unclean version:", token)
print("This is the cleaned version:", cleaned_token)

This is the unclean version: ['Millions', 'of', 'people', 'in', 'India', 'took', 'part', 'in', 'an', 'annual', 'tree', 'planting', 'drive', 'Sunday', '.', 'More', 'than', '250', 'million', 'saplings', 'were', 'planted', 'in', 'a', 'single', 'day', 'across', 'the', 'country', "'s", 'most-populous', 'state', '.']
This is the cleaned version: ['Millions', 'people', 'India', 'took', 'part', 'annual', 'tree', 'planting', 'drive', 'Sunday', '.', 'More', '250', 'million', 'saplings', 'planted', 'single', 'day', 'across', 'country', "'s", 'most-populous', 'state', '.']


Stemming

In [8]:

stemmer = PorterStemmer()
words = ['plant', 'planting', 'plants', 'planted']
stemmed = [stemmer.stem(word) for word in words]
print(stemmed)

['plant', 'plant', 'plant', 'plant']


In [9]:
token = word_tokenize(sent)
stemmed = ""
for word in token:
    stemmed += stemmer.stem(word) + " "
print(stemmed)

million of peopl in india took part in an annual tree plant drive sunday . more than 250 million sapl were plant in a singl day across the countri 's most-popul state . 


In [10]:

sent2 = "The campaign was led by Uttar Pradesh state government officials, lawmakers, and activists, in a bid to reduce carbon emissions and combat climate change"

In [11]:
print(word_tokenize(sent2))

['The', 'campaign', 'was', 'led', 'by', 'Uttar', 'Pradesh', 'state', 'government', 'officials', ',', 'lawmakers', ',', 'and', 'activists', ',', 'in', 'a', 'bid', 'to', 'reduce', 'carbon', 'emissions', 'and', 'combat', 'climate', 'change']


In [12]:
print(sent_tokenize(sent2))

['The campaign was led by Uttar Pradesh state government officials, lawmakers, and activists, in a bid to reduce carbon emissions and combat climate change']


In [13]:
cleaned_token = []
token = word_tokenize(sent2)
for word in token:
    if word not in stop_words:
        cleaned_token.append(word)
print("This is the unclean version:", token)
print("This is the cleaned version:", cleaned_token)

This is the unclean version: ['The', 'campaign', 'was', 'led', 'by', 'Uttar', 'Pradesh', 'state', 'government', 'officials', ',', 'lawmakers', ',', 'and', 'activists', ',', 'in', 'a', 'bid', 'to', 'reduce', 'carbon', 'emissions', 'and', 'combat', 'climate', 'change']
This is the cleaned version: ['The', 'campaign', 'led', 'Uttar', 'Pradesh', 'state', 'government', 'officials', ',', 'lawmakers', ',', 'activists', ',', 'bid', 'reduce', 'carbon', 'emissions', 'combat', 'climate', 'change']


In [14]:
token = word_tokenize(sent2)
stemmed = ""
for word in token:
    stemmed += stemmer.stem(word) + " "
print(stemmed)

the campaign wa led by uttar pradesh state govern offici , lawmak , and activist , in a bid to reduc carbon emiss and combat climat chang 


#### POS_Tagging

In [15]:

token = word_tokenize(sent) + word_tokenize(sent2)
tagged = pos_tag(cleaned_token)                 
print(tagged)

[('The', 'DT'), ('campaign', 'NN'), ('led', 'VBD'), ('Uttar', 'NNP'), ('Pradesh', 'NNP'), ('state', 'NN'), ('government', 'NN'), ('officials', 'NNS'), (',', ','), ('lawmakers', 'NNS'), (',', ','), ('activists', 'NNS'), (',', ','), ('bid', 'NN'), ('reduce', 'VB'), ('carbon', 'NN'), ('emissions', 'NNS'), ('combat', 'VBP'), ('climate', 'NN'), ('change', 'NN')]


In [16]:
def computeTF(wordDict, bagOfWords):
    tfDict = {}
    bagOfWordsCount = len(bagOfWords)
    for word, count in wordDict.items():
        tfDict[word] = count / float(bagOfWordsCount)
    return tfDict

In [17]:
bagOfWordsA = sent.split(' ')
bagOfWordsB = sent2.split(' ')

In [18]:
uniqueWords = set(bagOfWordsA).union(set(bagOfWordsB))

In [19]:
numOfWordsA = dict.fromkeys(uniqueWords, 0)
for word in bagOfWordsA:
    numOfWordsA[word] += 1
numOfWordsB = dict.fromkeys(uniqueWords, 0)
for word in bagOfWordsB:
    numOfWordsB[word] += 1

####  Term Frequency 

In [20]:
tfA = computeTF(numOfWordsA, bagOfWordsA)
tfB = computeTF(numOfWordsB, bagOfWordsB)

In [21]:
print(tfA)

{'across': 0.03333333333333333, 'planting': 0.03333333333333333, 'planted': 0.03333333333333333, 'by': 0.0, 'Pradesh': 0.0, 'Millions': 0.03333333333333333, 'in': 0.1, 'million': 0.03333333333333333, 'took': 0.03333333333333333, 'most-populous': 0.03333333333333333, 'single': 0.03333333333333333, 'annual': 0.03333333333333333, 'state.': 0.03333333333333333, 'Sunday.': 0.03333333333333333, 'bid': 0.0, 'of': 0.03333333333333333, 'More': 0.03333333333333333, 'people': 0.03333333333333333, 'the': 0.03333333333333333, 'activists,': 0.0, 'reduce': 0.0, 'officials,': 0.0, 'The': 0.0, 'were': 0.03333333333333333, 'government': 0.0, '250': 0.03333333333333333, 'campaign': 0.0, 'an': 0.03333333333333333, 'than': 0.03333333333333333, 'day': 0.03333333333333333, 'emissions': 0.0, 'climate': 0.0, 'saplings': 0.03333333333333333, 'led': 0.0, 'to': 0.0, 'was': 0.0, 'Uttar': 0.0, 'combat': 0.0, 'state': 0.0, 'change': 0.0, "country's": 0.03333333333333333, 'drive': 0.03333333333333333, 'tree': 0.03333

In [22]:
print(tfB)

{'across': 0.0, 'planting': 0.0, 'planted': 0.0, 'by': 0.041666666666666664, 'Pradesh': 0.041666666666666664, 'Millions': 0.0, 'in': 0.041666666666666664, 'million': 0.0, 'took': 0.0, 'most-populous': 0.0, 'single': 0.0, 'annual': 0.0, 'state.': 0.0, 'Sunday.': 0.0, 'bid': 0.041666666666666664, 'of': 0.0, 'More': 0.0, 'people': 0.0, 'the': 0.0, 'activists,': 0.041666666666666664, 'reduce': 0.041666666666666664, 'officials,': 0.041666666666666664, 'The': 0.041666666666666664, 'were': 0.0, 'government': 0.041666666666666664, '250': 0.0, 'campaign': 0.041666666666666664, 'an': 0.0, 'than': 0.0, 'day': 0.0, 'emissions': 0.041666666666666664, 'climate': 0.041666666666666664, 'saplings': 0.0, 'led': 0.041666666666666664, 'to': 0.041666666666666664, 'was': 0.041666666666666664, 'Uttar': 0.041666666666666664, 'combat': 0.041666666666666664, 'state': 0.041666666666666664, 'change': 0.041666666666666664, "country's": 0.0, 'drive': 0.0, 'tree': 0.0, 'and': 0.08333333333333333, 'part': 0.0, 'India

#### Inverse Document Frequency

In [23]:
def computeIDF(documents):
    import math
    N = len(documents)
    
    idfDict = dict.fromkeys(documents[0].keys(), 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1
    
    for word, val in idfDict.items():
        idfDict[word] = math.log(N / float(val))
    return idfDict

In [24]:
idfs = computeIDF([numOfWordsA, numOfWordsB])

In [25]:
idfs

{'across': 0.6931471805599453,
 'planting': 0.6931471805599453,
 'planted': 0.6931471805599453,
 'by': 0.6931471805599453,
 'Pradesh': 0.6931471805599453,
 'Millions': 0.6931471805599453,
 'in': 0.0,
 'million': 0.6931471805599453,
 'took': 0.6931471805599453,
 'most-populous': 0.6931471805599453,
 'single': 0.6931471805599453,
 'annual': 0.6931471805599453,
 'state.': 0.6931471805599453,
 'Sunday.': 0.6931471805599453,
 'bid': 0.6931471805599453,
 'of': 0.6931471805599453,
 'More': 0.6931471805599453,
 'people': 0.6931471805599453,
 'the': 0.6931471805599453,
 'activists,': 0.6931471805599453,
 'reduce': 0.6931471805599453,
 'officials,': 0.6931471805599453,
 'The': 0.6931471805599453,
 'were': 0.6931471805599453,
 'government': 0.6931471805599453,
 '250': 0.6931471805599453,
 'campaign': 0.6931471805599453,
 'an': 0.6931471805599453,
 'than': 0.6931471805599453,
 'day': 0.6931471805599453,
 'emissions': 0.6931471805599453,
 'climate': 0.6931471805599453,
 'saplings': 0.69314718055994

In [26]:
def computeTFIDF(tfBagOfWords, idfs):
    tfidf = {}
    for word, val in tfBagOfWords.items():
        tfidf[word] = val * idfs[word]
    return tfidf

In [27]:
tfidfA = computeTFIDF(tfA, idfs)
tfidfB = computeTFIDF(tfB, idfs)
df = pd.DataFrame([tfidfA, tfidfB])

In [28]:
df

Unnamed: 0,across,planting,planted,by,Pradesh,Millions,in,million,took,most-populous,...,change,country's,drive,tree,and,part,India,carbon,"lawmakers,",a
0,0.023105,0.023105,0.023105,0.0,0.0,0.023105,0.0,0.023105,0.023105,0.023105,...,0.0,0.023105,0.023105,0.023105,0.0,0.023105,0.023105,0.0,0.0,0.0
1,0.0,0.0,0.0,0.028881,0.028881,0.0,0.0,0.0,0.0,0.0,...,0.028881,0.0,0.0,0.0,0.057762,0.0,0.0,0.028881,0.028881,0.0


In [29]:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([sent, sent2])
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)



In [30]:
df

Unnamed: 0,250,across,activists,an,and,annual,bid,by,campaign,carbon,...,state,sunday,than,the,to,took,tree,uttar,was,were
0,0.180868,0.180868,0.0,0.180868,0.0,0.180868,0.0,0.0,0.0,0.0,...,0.128689,0.180868,0.180868,0.128689,0.0,0.180868,0.180868,0.0,0.0,0.180868
1,0.0,0.0,0.206202,0.0,0.412404,0.0,0.206202,0.206202,0.206202,0.206202,...,0.146714,0.0,0.0,0.146714,0.206202,0.0,0.0,0.206202,0.206202,0.0
