In [46]:
import pandas as pd
import math
from sklearn.feature_extraction.text import TfidfVectorizer

## Initialize the Documents

In [47]:
# The two-document toy corpus used by every later cell.
documentA, documentB = (
    'Jupiter is the largest Planet',
    'Mars is the fourth planet from the Sun',
)

## Create the Bag of Words (BoW) for Documents A and B

In [48]:
# Tokenise each document on single spaces. Tokens keep their original casing,
# so 'Planet' and 'planet' end up as two distinct words.
bagOfWordsA, bagOfWordsB = (
    text.split(' ') for text in (documentA, documentB)
)

## Create the collection of unique words from Documents A and B

In [49]:
# Vocabulary of the corpus: every distinct token seen in either document.
uniqueWords = set(bagOfWordsA) | set(bagOfWordsB)

## Create a dictionary of words and their occurrence for each document in the corpus

In [50]:
# Start both per-document counters at zero for every vocabulary word, so the
# two dictionaries share exactly the same keys.
# Both counters are created up front: initialising numOfWordsB inside the
# loop over document A would rebuild it on every iteration and leave it
# undefined if document A had no tokens.
numOfWordsA = dict.fromkeys(uniqueWords, 0)
numOfWordsB = dict.fromkeys(uniqueWords, 0)

# Count how often each word occurs in each document.
for word in bagOfWordsA:
    numOfWordsA[word] += 1
for word in bagOfWordsB:
    numOfWordsB[word] += 1

## Compute the term frequency for each of our documents

In [51]:
def computeTF(wordDict, bagOfWords):
    """Compute the term frequency of every word for a single document.

    Args:
        wordDict: Mapping of word -> number of occurrences in the document.
        bagOfWords: The document's tokens. Only its length is used, as the
            total that each count is divided by.

    Returns:
        A new dict mapping every key of ``wordDict`` to
        ``count / len(bagOfWords)``. When ``bagOfWords`` is empty, every
        term frequency is reported as 0.0.
    """
    bagOfWordsCount = len(bagOfWords)
    if bagOfWordsCount == 0:
        # An empty document contains no terms; returning zeros avoids a
        # ZeroDivisionError in the normalisation below.
        return {word: 0.0 for word in wordDict}

    total = float(bagOfWordsCount)
    return {word: count / total for word, count in wordDict.items()}
# Term frequencies for each document, normalised by that document's length.
tfA = computeTF(wordDict=numOfWordsA, bagOfWords=bagOfWordsA)
tfB = computeTF(wordDict=numOfWordsB, bagOfWords=bagOfWordsB)

## Compute the Inverse Document Frequency (IDF) of each term

In [52]:
def computeIDF(documents):
    """Compute the inverse document frequency of every word in a corpus.

    Args:
        documents: Sequence of dicts, one per document, each mapping
            word -> occurrence count in that document.

    Returns:
        A dict mapping each word to ``log(N / df)`` (natural logarithm),
        where ``N`` is the number of documents and ``df`` is the number of
        documents in which the word's count is greater than zero. Words
        whose count is zero in every document get an IDF of 0.0. An empty
        ``documents`` sequence yields an empty dict.
    """
    N = len(documents)

    # Collect the vocabulary from every document, not just the first one, so
    # a word that only appears in a later document does not raise KeyError.
    docFrequency = {}
    for document in documents:
        for word, val in document.items():
            docFrequency.setdefault(word, 0)
            if val > 0:
                docFrequency[word] += 1

    idfDict = {}
    for word, docCount in docFrequency.items():
        # A word present in no document would otherwise divide by zero.
        idfDict[word] = math.log(N / float(docCount)) if docCount > 0 else 0.0
    return idfDict
# IDF table over the two-document corpus; the bare name displays it.
idfs = computeIDF(documents=[numOfWordsA, numOfWordsB])
idfs

{'Jupiter': 0.6931471805599453,
 'Sun': 0.6931471805599453,
 'planet': 0.6931471805599453,
 'the': 0.0,
 'fourth': 0.6931471805599453,
 'is': 0.0,
 'largest': 0.6931471805599453,
 'Planet': 0.6931471805599453,
 'Mars': 0.6931471805599453,
 'from': 0.6931471805599453}

## Compute the TF-IDF score for all words

In [53]:
def computeTFIDF(tfBagOfWords, idfs):
    """Combine term frequencies with IDF weights into TF-IDF scores.

    Args:
        tfBagOfWords: Mapping of word -> term frequency for one document.
        idfs: Mapping of word -> inverse document frequency. It is indexed
            with every word of ``tfBagOfWords``, so it must contain them all.

    Returns:
        A new dict mapping each word of ``tfBagOfWords`` to ``tf * idf``.
    """
    return {term: tf * idfs[term] for term, tf in tfBagOfWords.items()}
# Weight each document's term frequencies by the shared IDF table, then put
# one row per document into a DataFrame for display.
tfidfA = computeTFIDF(tfBagOfWords=tfA, idfs=idfs)
tfidfB = computeTFIDF(tfBagOfWords=tfB, idfs=idfs)
df = pd.DataFrame(data=[tfidfA, tfidfB])
df

Unnamed: 0,Jupiter,Sun,planet,the,fourth,is,largest,Planet,Mars,from
0,0.138629,0.0,0.0,0.0,0.0,0.0,0.138629,0.138629,0.0,0.0
1,0.0,0.086643,0.086643,0.0,0.086643,0.0,0.0,0.0,0.086643,0.086643


## Assignment 2

In [54]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

def remove_stopwords(sentence):
    """Return ``sentence`` with NLTK's English stop words removed.

    The sentence is split on whitespace. A token is dropped when its
    lower-cased form is in the stop-word list; surviving tokens keep their
    original casing and are re-joined with single spaces.

    Args:
        sentence: The text to filter.

    Returns:
        The filtered text as a single space-separated string.
    """
    english_stops = frozenset(stopwords.words('english'))
    kept_tokens = (
        token for token in sentence.split()
        if token.lower() not in english_stops
    )
    return ' '.join(kept_tokens)

# Strip stop words from both documents (the names are rebound to the
# filtered text).
documentA = remove_stopwords(sentence=documentA)
documentB = remove_stopwords(sentence=documentB)


# Re-tokenise the filtered documents on single spaces.
bagOfWordsA, bagOfWordsB = (
    text.split(' ') for text in (documentA, documentB)
)

# Vocabulary of the filtered corpus.
uniqueWords = set(bagOfWordsA) | set(bagOfWordsB)

# Start both per-document counters at zero for every vocabulary word.
# Both are created before any counting: initialising numOfWordsB inside the
# loop over document A would rebuild it on every iteration and leave it
# undefined if document A had no tokens.
numOfWordsA = dict.fromkeys(uniqueWords, 0)
numOfWordsB = dict.fromkeys(uniqueWords, 0)

# Count how often each word occurs in each document.
for word in bagOfWordsA:
    numOfWordsA[word] += 1
for word in bagOfWordsB:
    numOfWordsB[word] += 1


# Term frequencies of the stop-word-free documents.
tfA = computeTF(wordDict=numOfWordsA, bagOfWords=bagOfWordsA)
tfB = computeTF(wordDict=numOfWordsB, bagOfWords=bagOfWordsB)

# IDF table over the filtered corpus.
idfs = computeIDF(documents=[numOfWordsA, numOfWordsB])
print(idfs)

# TF-IDF per document, one DataFrame row each; the bare name displays it.
tfidfA = computeTFIDF(tfBagOfWords=tfA, idfs=idfs)
tfidfB = computeTFIDF(tfBagOfWords=tfB, idfs=idfs)
df = pd.DataFrame(data=[tfidfA, tfidfB])
df

{'Jupiter': 0.6931471805599453, 'Sun': 0.6931471805599453, 'planet': 0.6931471805599453, 'largest': 0.6931471805599453, 'fourth': 0.6931471805599453, 'Planet': 0.6931471805599453, 'Mars': 0.6931471805599453}


[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Jupiter,Sun,planet,largest,fourth,Planet,Mars
0,0.231049,0.0,0.0,0.231049,0.0,0.231049,0.0
1,0.0,0.173287,0.173287,0.0,0.173287,0.0,0.173287
