In [19]:
import pandas as pd
import numpy as np
import nltk

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

from nltk import pos_tag
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer

import string

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\prash\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\prash\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\prash\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prash\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [21]:
document = """ Data science combines statistics, machine learning, and domain knowledge
It helps businesses make informed decisions by uncovering patterns and trends. With the
data science plays a crucial role in various industries, but challenges like data quantity
Collaboration and ongoing research are key to maximizing its potential."""

In [22]:
words = word_tokenize(document)
print("Original Words:", words)

Original Words: ['Data', 'science', 'combines', 'statistics', ',', 'machine', 'learning', ',', 'and', 'domain', 'knowledge', 'It', 'helps', 'businesses', 'make', 'informed', 'decisions', 'by', 'uncovering', 'patterns', 'and', 'trends', '.', 'With', 'the', 'data', 'science', 'plays', 'a', 'crucial', 'role', 'in', 'various', 'industries', ',', 'but', 'challenges', 'like', 'data', 'quantity', 'Collaboration', 'and', 'ongoing', 'research', 'are', 'key', 'to', 'maximizing', 'its', 'potential', '.']


In [23]:
pos = pos_tag(words)
print("POS tagging:", pos)

POS tagging: [('Data', 'NNP'), ('science', 'NN'), ('combines', 'NNS'), ('statistics', 'NNS'), (',', ','), ('machine', 'NN'), ('learning', 'NN'), (',', ','), ('and', 'CC'), ('domain', 'NN'), ('knowledge', 'NN'), ('It', 'PRP'), ('helps', 'VBZ'), ('businesses', 'NNS'), ('make', 'VBP'), ('informed', 'VBN'), ('decisions', 'NNS'), ('by', 'IN'), ('uncovering', 'VBG'), ('patterns', 'NNS'), ('and', 'CC'), ('trends', 'NNS'), ('.', '.'), ('With', 'IN'), ('the', 'DT'), ('data', 'NNS'), ('science', 'NN'), ('plays', 'VBZ'), ('a', 'DT'), ('crucial', 'JJ'), ('role', 'NN'), ('in', 'IN'), ('various', 'JJ'), ('industries', 'NNS'), (',', ','), ('but', 'CC'), ('challenges', 'NNS'), ('like', 'IN'), ('data', 'NNS'), ('quantity', 'NN'), ('Collaboration', 'NNP'), ('and', 'CC'), ('ongoing', 'JJ'), ('research', 'NN'), ('are', 'VBP'), ('key', 'JJ'), ('to', 'TO'), ('maximizing', 'VBG'), ('its', 'PRP$'), ('potential', 'NN'), ('.', '.')]


In [24]:
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in words if word.lower() not in stop_words and word]
print("Stop words removed:", filtered_tokens)

Stop words removed: ['Data', 'science', 'combines', 'statistics', ',', 'machine', 'learning', ',', 'domain', 'knowledge', 'helps', 'businesses', 'make', 'informed', 'decisions', 'uncovering', 'patterns', 'trends', '.', 'data', 'science', 'plays', 'crucial', 'role', 'various', 'industries', ',', 'challenges', 'like', 'data', 'quantity', 'Collaboration', 'ongoing', 'research', 'key', 'maximizing', 'potential', '.']


In [25]:
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
print("Stemmed Tokens:", stemmed_tokens)

Stemmed Tokens: ['data', 'scienc', 'combin', 'statist', ',', 'machin', 'learn', ',', 'domain', 'knowledg', 'help', 'busi', 'make', 'inform', 'decis', 'uncov', 'pattern', 'trend', '.', 'data', 'scienc', 'play', 'crucial', 'role', 'variou', 'industri', ',', 'challeng', 'like', 'data', 'quantiti', 'collabor', 'ongo', 'research', 'key', 'maxim', 'potenti', '.']


In [29]:
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_tokens]
print("Lemmatization:", lemmatized_words)

Lemmatization: ['Data', 'science', 'combine', 'statistic', ',', 'machine', 'learning', ',', 'domain', 'knowledge', 'help', 'business', 'make', 'informed', 'decision', 'uncovering', 'pattern', 'trend', '.', 'data', 'science', 'play', 'crucial', 'role', 'various', 'industry', ',', 'challenge', 'like', 'data', 'quantity', 'Collaboration', 'ongoing', 'research', 'key', 'maximizing', 'potential', '.']


In [27]:
documents = [document]
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform (documents)
terms = tfidf_vectorizer.get_feature_names_out()
print("TF-IDF Matrix:")
print(tfidf_matrix.toarray())
print("Terms:", terms)

TF-IDF Matrix:
[[0.39056673 0.13018891 0.13018891 0.13018891 0.13018891 0.13018891
  0.13018891 0.13018891 0.13018891 0.39056673 0.13018891 0.13018891
  0.13018891 0.13018891 0.13018891 0.13018891 0.13018891 0.13018891
  0.13018891 0.13018891 0.13018891 0.13018891 0.13018891 0.13018891
  0.13018891 0.13018891 0.13018891 0.13018891 0.13018891 0.13018891
  0.13018891 0.13018891 0.26037782 0.13018891 0.13018891 0.13018891
  0.13018891 0.13018891 0.13018891 0.13018891]]
Terms: ['and' 'are' 'businesses' 'but' 'by' 'challenges' 'collaboration'
 'combines' 'crucial' 'data' 'decisions' 'domain' 'helps' 'in'
 'industries' 'informed' 'it' 'its' 'key' 'knowledge' 'learning' 'like'
 'machine' 'make' 'maximizing' 'ongoing' 'patterns' 'plays' 'potential'
 'quantity' 'research' 'role' 'science' 'statistics' 'the' 'to' 'trends'
 'uncovering' 'various' 'with']
