In [43]:
import numpy as np
import pandas as pd
import nltk
import string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer

In [63]:
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt') 
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rohini\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Rohini\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rohini\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Rohini\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Rohini\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Rohini\AppData\Roamin

True

In [64]:
text = """Hello this side Rohini Gaikwad. I'm Third Year student at the PDEA's College of Enggineering Pune."""

In [65]:
sentence = sent_tokenize(text)
print("Sentence: ", sentence)

Sentence:  ['Hello this side Rohini Gaikwad.', "I'm Third Year student at the PDEA's College of Enggineering Pune."]


In [66]:
words = word_tokenize(text)
print("Word: ", words)

Word:  ['Hello', 'this', 'side', 'Rohini', 'Gaikwad', '.', 'I', "'m", 'Third', 'Year', 'student', 'at', 'the', 'PDEA', "'s", 'College', 'of', 'Enggineering', 'Pune', '.']


In [67]:
pos_tags = pos_tag(words)
print("Pos Tags: ", pos_tags)

Pos Tags:  [('Hello', 'NNP'), ('this', 'DT'), ('side', 'NN'), ('Rohini', 'NNP'), ('Gaikwad', 'NNP'), ('.', '.'), ('I', 'PRP'), ("'m", 'VBP'), ('Third', 'JJ'), ('Year', 'JJ'), ('student', 'NN'), ('at', 'IN'), ('the', 'DT'), ('PDEA', 'NNP'), ("'s", 'POS'), ('College', 'NNP'), ('of', 'IN'), ('Enggineering', 'NNP'), ('Pune', 'NNP'), ('.', '.')]


In [68]:
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in words if word.lower() not in stop_words and word not in string.punctuation]
print("Tokens after Stopwords Removal:", filtered_tokens)

Tokens after Stopwords Removal: ['Hello', 'side', 'Rohini', 'Gaikwad', "'m", 'Third', 'Year', 'student', 'PDEA', "'s", 'College', 'Enggineering', 'Pune']


In [69]:
stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(word) for word in filtered_tokens]
print("Stemmed Words:", stemmed_words)


Stemmed Words: ['hello', 'side', 'rohini', 'gaikwad', "'m", 'third', 'year', 'student', 'pdea', "'s", 'colleg', 'enggin', 'pune']


In [71]:
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_tokens]
print("Lemmatized Words:", lemmatized_words)


Lemmatized Words: ['Hello', 'side', 'Rohini', 'Gaikwad', "'m", 'Third', 'Year', 'student', 'PDEA', "'s", 'College', 'Enggineering', 'Pune']


In [72]:
vectorizer = TfidfVectorizer()
new_sentence=[' '.join(sentence)]
tfidf_matrix = vectorizer.fit_transform(new_sentence)
df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
print("\nTF-IDF Matrix:")
print(df_tfidf)


TF-IDF Matrix:
         at   college  enggineering   gaikwad     hello        of      pdea  \
0  0.258199  0.258199      0.258199  0.258199  0.258199  0.258199  0.258199   

       pune    rohini      side   student       the     third      this  \
0  0.258199  0.258199  0.258199  0.258199  0.258199  0.258199  0.258199   

       year  
0  0.258199  
