In [2]:
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer


In [3]:
text="Natural language processing (NLP) is a field of computer science, artificial intelligence, and linguistics concerned with the interactions between computers and human language."

In [4]:
print(word_tokenize(text))

['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'field', 'of', 'computer', 'science', ',', 'artificial', 'intelligence', ',', 'and', 'linguistics', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', 'language', '.']


In [5]:
print(sent_tokenize(text))

['Natural language processing (NLP) is a field of computer science, artificial intelligence, and linguistics concerned with the interactions between computers and human language.']


In [6]:
to_tag=word_tokenize(text)

In [7]:
print(pos_tag(to_tag))

[('Natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('(', '('), ('NLP', 'NNP'), (')', ')'), ('is', 'VBZ'), ('a', 'DT'), ('field', 'NN'), ('of', 'IN'), ('computer', 'NN'), ('science', 'NN'), (',', ','), ('artificial', 'JJ'), ('intelligence', 'NN'), (',', ','), ('and', 'CC'), ('linguistics', 'NNS'), ('concerned', 'VBN'), ('with', 'IN'), ('the', 'DT'), ('interactions', 'NNS'), ('between', 'IN'), ('computers', 'NNS'), ('and', 'CC'), ('human', 'JJ'), ('language', 'NN'), ('.', '.')]


In [9]:
stop_words=set(stopwords.words('english'))
print(stop_words)

{"mightn't", 'mustn', 'such', 'into', 'out', 'we', 'by', "doesn't", 'having', 'don', 'during', 'will', 'itself', 'mightn', 'there', 'on', 'myself', 'once', "wasn't", 'over', 'am', 'while', 'only', 'until', 'the', "you've", 'all', 'off', 'your', 'his', 'not', 'can', 'how', 'wouldn', 'these', 'have', 'and', 'between', 'both', 'be', 'any', 've', 'shouldn', 'after', 'here', "shan't", 'too', "it's", 'them', 'down', 'who', 'won', 'him', 'does', 'm', 'needn', "you'd", 'haven', 'their', 'at', 'it', 'hadn', "needn't", 'some', 'that', "that'll", 'ourselves', 'wasn', "weren't", 't', 'where', 'just', 'aren', 'shan', 'with', "you'll", "isn't", 'her', 'themselves', 'which', 'yours', 'its', 'they', 'my', 'yourselves', 'he', 'are', 'should', 'than', 'now', 'through', 's', 'then', 'against', 'ma', 'll', 'o', 'why', 'those', 'did', 'whom', 'further', 'ours', 'herself', 'has', 'from', 'of', 'an', 'being', 'ain', 'above', 'under', "should've", 'about', 'again', 'because', 'but', 'own', 'is', "haven't", "s

In [10]:
to_clean=word_tokenize(text)

In [11]:
no_stopwords_text=[]
for token in to_clean:
    if(token not in stop_words):
        no_stopwords_text.append(token)

print(no_stopwords_text)

['Natural', 'language', 'processing', '(', 'NLP', ')', 'field', 'computer', 'science', ',', 'artificial', 'intelligence', ',', 'linguistics', 'concerned', 'interactions', 'computers', 'human', 'language', '.']


In [12]:
stemmer=PorterStemmer()

In [14]:
stemmed_words=[]
for token in no_stopwords_text:
    stemmed_word=stemmer.stem(token)
    stemmed_words.append(stemmed_word)

print(stemmed_words)
    

['natur', 'languag', 'process', '(', 'nlp', ')', 'field', 'comput', 'scienc', ',', 'artifici', 'intellig', ',', 'linguist', 'concern', 'interact', 'comput', 'human', 'languag', '.']


In [15]:
lemmatizer=WordNetLemmatizer()

In [16]:
lemmatized_words=[]
for token in no_stopwords_text:
    lemmatized=lemmatizer.lemmatize(token)
    lemmatized_words.append(lemmatized)

print(lemmatized_words)

['Natural', 'language', 'processing', '(', 'NLP', ')', 'field', 'computer', 'science', ',', 'artificial', 'intelligence', ',', 'linguistics', 'concerned', 'interaction', 'computer', 'human', 'language', '.']


In [17]:
vectorizer=TfidfVectorizer()

In [18]:
corpus= [
    "I love to eat pizza",
    "Pizza is my favorite food",
    "I enjoy eating pizza with friends",
    "I like to have pizza for dinner",
    "Pizza toppings include cheese, pepperoni, and mushrooms"
]

In [19]:
tfidf_matrix=vectorizer.fit_transform(corpus)

In [20]:
print(tfidf_matrix)

  (0, 18)	0.2808823162882302
  (0, 3)	0.5894630806320427
  (0, 19)	0.47557510189256375
  (0, 14)	0.5894630806320427
  (1, 7)	0.48638584746139363
  (1, 6)	0.48638584746139363
  (1, 16)	0.48638584746139363
  (1, 12)	0.48638584746139363
  (1, 18)	0.2317654623904255
  (2, 9)	0.48638584746139363
  (2, 21)	0.48638584746139363
  (2, 4)	0.48638584746139363
  (2, 5)	0.48638584746139363
  (2, 18)	0.2317654623904255
  (3, 2)	0.4527727535876864
  (3, 8)	0.4527727535876864
  (3, 10)	0.4527727535876864
  (3, 13)	0.4527727535876864
  (3, 18)	0.21574864305928557
  (3, 19)	0.3652942067054634
  (4, 15)	0.4007361920444453
  (4, 0)	0.4007361920444453
  (4, 17)	0.4007361920444453
  (4, 1)	0.4007361920444453
  (4, 11)	0.4007361920444453
  (4, 20)	0.4007361920444453
  (4, 18)	0.19095294266992674


In [21]:
feature_names=vectorizer.get_feature_names_out()

In [22]:
print(feature_names)

['and' 'cheese' 'dinner' 'eat' 'eating' 'enjoy' 'favorite' 'food' 'for'
 'friends' 'have' 'include' 'is' 'like' 'love' 'mushrooms' 'my'
 'pepperoni' 'pizza' 'to' 'toppings' 'with']


In [23]:
tfidf_values = tfidf_matrix.toarray()[0]

# Create a dictionary to store TF-IDF values for each term
tfidf_dict = dict(zip(feature_names, tfidf_values))

print("TF-IDF representation:")
for term, tfidf in tfidf_dict.items():
    print(f"{term}: {tfidf}")

TF-IDF representation:
and: 0.0
cheese: 0.0
dinner: 0.0
eat: 0.5894630806320427
eating: 0.0
enjoy: 0.0
favorite: 0.0
food: 0.0
for: 0.0
friends: 0.0
have: 0.0
include: 0.0
is: 0.0
like: 0.0
love: 0.5894630806320427
mushrooms: 0.0
my: 0.0
pepperoni: 0.0
pizza: 0.2808823162882302
to: 0.47557510189256375
toppings: 0.0
with: 0.0
