Section A

In [1]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [2]:
#sample sentence

sentence1="Technology has revolutionized the way we live, work, and communicate. From smartphones to artificial intelligence, innovations continue to shape our daily lives. "
sentence2="While some argue that digital advancements lead to social isolation, others believe they foster global connectivity and creativity. As we move forward, it's crucial to balance innovation with ethical considerations."

In [3]:
#tokenization
from nltk import word_tokenize, sent_tokenize

In [4]:
print("Tokenized words:",word_tokenize(sentence1))
print("\nTokenized sentences:",sent_tokenize(sentence1))

Tokenized words: ['Technology', 'has', 'revolutionized', 'the', 'way', 'we', 'live', ',', 'work', ',', 'and', 'communicate', '.', 'From', 'smartphones', 'to', 'artificial', 'intelligence', ',', 'innovations', 'continue', 'to', 'shape', 'our', 'daily', 'lives', '.']

Tokenized sentences: ['Technology has revolutionized the way we live, work, and communicate.', 'From smartphones to artificial intelligence, innovations continue to shape our daily lives.']


In [5]:
#POS tagging

from nltk import pos_tag
token=word_tokenize(sentence1)+word_tokenize(sentence2)
tagged=pos_tag(token)
print("Tagging parts of speech:",tagged)

Tagging parts of speech: [('Technology', 'NN'), ('has', 'VBZ'), ('revolutionized', 'VBN'), ('the', 'DT'), ('way', 'NN'), ('we', 'PRP'), ('live', 'VBP'), (',', ','), ('work', 'NN'), (',', ','), ('and', 'CC'), ('communicate', 'NN'), ('.', '.'), ('From', 'IN'), ('smartphones', 'NNS'), ('to', 'TO'), ('artificial', 'JJ'), ('intelligence', 'NN'), (',', ','), ('innovations', 'NNS'), ('continue', 'VBP'), ('to', 'TO'), ('shape', 'VB'), ('our', 'PRP$'), ('daily', 'JJ'), ('lives', 'NNS'), ('.', '.'), ('While', 'IN'), ('some', 'DT'), ('argue', 'VBP'), ('that', 'IN'), ('digital', 'JJ'), ('advancements', 'NNS'), ('lead', 'VBP'), ('to', 'TO'), ('social', 'JJ'), ('isolation', 'NN'), (',', ','), ('others', 'NNS'), ('believe', 'VBP'), ('they', 'PRP'), ('foster', 'VBP'), ('global', 'JJ'), ('connectivity', 'NN'), ('and', 'CC'), ('creativity', 'NN'), ('.', '.'), ('As', 'IN'), ('we', 'PRP'), ('move', 'VBP'), ('forward', 'RB'), (',', ','), ('it', 'PRP'), ("'s", 'VBZ'), ('crucial', 'JJ'), ('to', 'TO'), ('bala

In [6]:
#stop word removal

from nltk.corpus import stopwords
stop_words=stopwords.words('english')
token=word_tokenize(sentence1)
cleaned_token=[]
for word in token:
    if word not in stop_words:
        cleaned_token.append(word)
print("Unclean version:",token)
print("\nCleaned version:",cleaned_token)

Unclean version: ['Technology', 'has', 'revolutionized', 'the', 'way', 'we', 'live', ',', 'work', ',', 'and', 'communicate', '.', 'From', 'smartphones', 'to', 'artificial', 'intelligence', ',', 'innovations', 'continue', 'to', 'shape', 'our', 'daily', 'lives', '.']

Cleaned version: ['Technology', 'revolutionized', 'way', 'live', ',', 'work', ',', 'communicate', '.', 'From', 'smartphones', 'artificial', 'intelligence', ',', 'innovations', 'continue', 'shape', 'daily', 'lives', '.']


In [7]:
#stemming

from nltk.stem import PorterStemmer
stemmer=PorterStemmer()
token=word_tokenize(sentence2)
stemmed=[stemmer.stem(word) for word in token]
print(" ".join(stemmed))

while some argu that digit advanc lead to social isol , other believ they foster global connect and creativ . as we move forward , it 's crucial to balanc innov with ethic consider .


In [9]:
#lemmatization

from nltk.stem import WordNetLemmatizer
nltk.download('omw-1.4')
lemmatizer=WordNetLemmatizer()
token=word_tokenize(sentence2)
lemmatized_output=[lemmatizer.lemmatize(word) for word in token]
print(" ".join(lemmatized_output))

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...


While some argue that digital advancement lead to social isolation , others believe they foster global connectivity and creativity . As we move forward , it 's crucial to balance innovation with ethical consideration .


Section B

In [10]:
import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
documentA='The old oak tree stood tall in the middle of the forest,its branches reaching out like ancient fingers'
documentB='The children laughed joyfully as they played in the park,their voices echoing through the trees'

In [12]:
#creating bag of words
bagOfWordsA=documentA.split(' ')
bagOfWordsB=documentB.split(' ')

In [13]:
uniqueWords=set(bagOfWordsA).union(bagOfWordsB)

In [14]:
numOfWordsA=dict.fromkeys(uniqueWords,0)
for word in bagOfWordsA:
    numOfWordsA[word]+=1

numOfWordsB=dict.fromkeys(uniqueWords,0)
for word in bagOfWordsB:
    numOfWordsB[word]+=1
    

In [15]:
#computing term frequency(TF)

def computeTF(wordDict,bagOfWords):
    tfDict={}
    bagOfWordsCount=len(bagOfWords)
    for word, count in wordDict.items():
        tfDict[word]=count/float(bagOfWordsCount)
    return tfDict

In [16]:
tfA=computeTF(numOfWordsA,bagOfWordsA)
tfB=computeTF(numOfWordsB,bagOfWordsB)

In [17]:
tfA

{'branches': 0.05555555555555555,
 'of': 0.05555555555555555,
 'the': 0.1111111111111111,
 'echoing': 0.0,
 'tree': 0.05555555555555555,
 'through': 0.0,
 'children': 0.0,
 'as': 0.0,
 'joyfully': 0.0,
 'park,their': 0.0,
 'stood': 0.05555555555555555,
 'out': 0.05555555555555555,
 'like': 0.05555555555555555,
 'played': 0.0,
 'trees': 0.0,
 'they': 0.0,
 'ancient': 0.05555555555555555,
 'tall': 0.05555555555555555,
 'oak': 0.05555555555555555,
 'The': 0.05555555555555555,
 'fingers': 0.05555555555555555,
 'forest,its': 0.05555555555555555,
 'old': 0.05555555555555555,
 'reaching': 0.05555555555555555,
 'in': 0.05555555555555555,
 'voices': 0.0,
 'middle': 0.05555555555555555,
 'laughed': 0.0}

In [18]:
tfB

{'branches': 0.0,
 'of': 0.0,
 'the': 0.13333333333333333,
 'echoing': 0.06666666666666667,
 'tree': 0.0,
 'through': 0.06666666666666667,
 'children': 0.06666666666666667,
 'as': 0.06666666666666667,
 'joyfully': 0.06666666666666667,
 'park,their': 0.06666666666666667,
 'stood': 0.0,
 'out': 0.0,
 'like': 0.0,
 'played': 0.06666666666666667,
 'trees': 0.06666666666666667,
 'they': 0.06666666666666667,
 'ancient': 0.0,
 'tall': 0.0,
 'oak': 0.0,
 'The': 0.06666666666666667,
 'fingers': 0.0,
 'forest,its': 0.0,
 'old': 0.0,
 'reaching': 0.0,
 'in': 0.06666666666666667,
 'voices': 0.06666666666666667,
 'middle': 0.0,
 'laughed': 0.06666666666666667}

In [20]:
#computing inverse document frequency(IDF)

def computeIDF(documents):
    import math
    N=len(documents)
    idfDict=dict.fromkeys(documents[0].keys(),0)
    for document in documents:
        for word,val in document.items():
            if val>0:
                idfDict[word]+=1
    
    for word,val in idfDict.items():
        idfDict[word]=math.log(N/float(val))
    return idfDict
idfs=computeIDF([numOfWordsA,numOfWordsB])
idfs

{'branches': 0.6931471805599453,
 'of': 0.6931471805599453,
 'the': 0.0,
 'echoing': 0.6931471805599453,
 'tree': 0.6931471805599453,
 'through': 0.6931471805599453,
 'children': 0.6931471805599453,
 'as': 0.6931471805599453,
 'joyfully': 0.6931471805599453,
 'park,their': 0.6931471805599453,
 'stood': 0.6931471805599453,
 'out': 0.6931471805599453,
 'like': 0.6931471805599453,
 'played': 0.6931471805599453,
 'trees': 0.6931471805599453,
 'they': 0.6931471805599453,
 'ancient': 0.6931471805599453,
 'tall': 0.6931471805599453,
 'oak': 0.6931471805599453,
 'The': 0.0,
 'fingers': 0.6931471805599453,
 'forest,its': 0.6931471805599453,
 'old': 0.6931471805599453,
 'reaching': 0.6931471805599453,
 'in': 0.0,
 'voices': 0.6931471805599453,
 'middle': 0.6931471805599453,
 'laughed': 0.6931471805599453}

In [21]:
#computing term frequency - inverse document frequency(TF/IDF)
def computeTFIDF(tfBagOfWords,idfs):
    tfidf={}
    for word, val in tfBagOfWords.items():
        tfidf[word]=val*idfs[word]
    return tfidf

tfidfA=computeTFIDF(tfA,idfs)
tfidfB=computeTFIDF(tfB,idfs)
df=pd.DataFrame([tfidfA,tfidfB])
df

Unnamed: 0,branches,of,the,echoing,tree,through,children,as,joyfully,"park,their",...,oak,The,fingers,"forest,its",old,reaching,in,voices,middle,laughed
0,0.038508,0.038508,0.0,0.0,0.038508,0.0,0.0,0.0,0.0,0.0,...,0.038508,0.0,0.038508,0.038508,0.038508,0.038508,0.0,0.0,0.038508,0.0
1,0.0,0.0,0.0,0.04621,0.0,0.04621,0.04621,0.04621,0.04621,0.04621,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04621,0.0,0.04621
