In [1]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag

In [2]:
# Sample text: two short sentences that the following cells tokenize and process.
document = "Hello Everyone. Welcome to the Python Programming"

In [3]:
# Split the raw text into a list of sentence strings.
sentences = sent_tokenize(document)

In [6]:
# Show the raw text followed by its sentence split; sep="\n" places each
# argument on its own line, and the leading "\n" keeps the blank separator line.
print(
    "Original Document:",
    document,
    "\nSentence Tokenization:",
    sentences,
    sep="\n",
)

Original Document:
Hello Everyone. Welcome to the Python Programming

Sentence Tokenization:
['Hello Everyone.', 'Welcome to the Python Programming']


In [7]:
# Word-tokenize each sentence separately, giving one token list per sentence.
tokens = list(map(word_tokenize, sentences))

In [8]:
# Inspect the nested token lists (outer list: sentences, inner lists: tokens).
print(tokens)

[['Hello', 'Everyone', '.'], ['Welcome', 'to', 'the', 'Python', 'Programming']]


In [11]:
# Tag every sentence's tokens with part-of-speech labels in one batched call.
# NLTK recommends pos_tag_sents() over calling pos_tag() once per sentence when
# several sentences are tagged; the result keeps the same shape: one list of
# (token, tag) pairs per sentence.
pos_tags = nltk.pos_tag_sents(tokens)

In [12]:
# Inspect the (token, tag) pairs produced for each sentence.
print(pos_tags)

[[('Hello', 'NNP'), ('Everyone', 'NNP'), ('.', '.')], [('Welcome', 'VB'), ('to', 'TO'), ('the', 'DT'), ('Python', 'NNP'), ('Programming', 'NNP')]]


In [14]:
# Build the English stop-word lookup once as a set for fast membership tests.
stop_words = set(stopwords.words('english'))
# Drop stop words from each sentence; the comparison is case-insensitive, but
# the surviving tokens keep their original casing.
filtered_tokens = [
    [tok for tok in sent_tokens if tok.lower() not in stop_words]
    for sent_tokens in tokens
]

In [15]:
# Print the source text and, on the next line, the tokens left after stop-word removal.
print(document, filtered_tokens, sep="\n")

Hello Everyone. Welcome to the Python Programming
[['Hello', 'Everyone', '.'], ['Welcome', 'Python', 'Programming']]


In [16]:
# Reduce every token to its Porter stem, sentence by sentence. This runs on the
# unfiltered `tokens`, so stop words and punctuation are passed through the
# stemmer as well.
stemmer = PorterStemmer()
stemmed_tokens = [list(map(stemmer.stem, sent_tokens)) for sent_tokens in tokens]

In [17]:
# Print the source text and, on the next line, the stemmed tokens for comparison.
print(document, stemmed_tokens, sep="\n")

Hello Everyone. Welcome to the Python Programming
[['hello', 'everyon', '.'], ['welcom', 'to', 'the', 'python', 'program']]


In [18]:
# Lemmatize each token with WordNet, keeping the per-sentence nesting.
# lemmatize() is called without a `pos` argument, so it uses its default part
# of speech. NOTE(review): the recorded output is identical to `tokens`;
# presumably the default POS and the capitalised input are why nothing is
# reduced — confirm whether POS tags / lower-casing should be applied here.
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [
    [lemmatizer.lemmatize(tok) for tok in sent_tokens]
    for sent_tokens in tokens
]

In [19]:
# Print the source text and, on the next line, the lemmatized tokens for comparison.
print(document, lemmatized_tokens, sep="\n")

Hello Everyone. Welcome to the Python Programming
[['Hello', 'Everyone', '.'], ['Welcome', 'to', 'the', 'Python', 'Programming']]


In [20]:
from collections import Counter
import math

In [21]:
# Three short example documents used as input for the TF-IDF computation.
corpus = [
    'The quick brown fox jumps over the lazy dog',
    'The brown fox is quick',
    'The lazy dog is sleeping'
]

In [22]:
def tf_idf(corpus):
    """Compute raw-count TF-IDF weights for every document in ``corpus``.

    Each document is lower-cased and split on whitespace. A token's weight in
    a document is ``count_in_document * log(n_docs / document_frequency)``
    (natural logarithm, no smoothing), so a token that occurs in every
    document gets a weight of ``0.0``.

    Args:
        corpus: Sequence of document strings.

    Returns:
        A list with one dict per document, in corpus order, mapping each
        token of that document to its TF-IDF weight. An empty corpus yields
        an empty list.
    """
    tokenized_docs = [doc.lower().split() for doc in corpus]
    tf_docs = [Counter(tokens) for tokens in tokenized_docs]
    n_docs = len(corpus)
    # Count in a single pass how many documents contain each token. Iterating a
    # Counter yields every distinct token of that document exactly once, so
    # this avoids rescanning all token lists for every token.
    doc_freq = Counter(token for tf in tf_docs for token in tf)
    idf = {token: math.log(n_docs / df) for token, df in doc_freq.items()}
    return [
        {token: count * idf[token] for token, count in tf.items()}
        for tf in tf_docs
    ]

In [23]:
# Compute TF-IDF weights for the example corpus: one {token: weight} dict per document.
tfidf_docs = tf_idf(corpus)


In [24]:
# Print each document's weights, labelled with a 1-based document number.
for i, tfidf_doc in enumerate(tfidf_docs):
    print(f"Document {i+1}: {tfidf_doc}")

Document 1: {'the': 0.0, 'quick': 0.4054651081081644, 'brown': 0.4054651081081644, 'fox': 0.4054651081081644, 'jumps': 1.0986122886681098, 'over': 1.0986122886681098, 'lazy': 0.4054651081081644, 'dog': 0.4054651081081644}
Document 2: {'the': 0.0, 'brown': 0.4054651081081644, 'fox': 0.4054651081081644, 'is': 0.4054651081081644, 'quick': 0.4054651081081644}
Document 3: {'the': 0.0, 'lazy': 0.4054651081081644, 'dog': 0.4054651081081644, 'is': 0.4054651081081644, 'sleeping': 1.0986122886681098}
