In [1]:
# Sample paragraph used as input for the tokenising, tagging, stemming and
# lemmatising cells below. The trailing space is part of the original text.
doc = (
    "A paragraph is a distinct section of writing that focuses on a single "
    "idea or topic, typically consisting of several sentences. "
    "It is a way to organize text and help the reader follow the author's "
    "thoughts by clearly signaling shifts in ideas and giving the reader a break. "
)

In [2]:
import nltk

In [3]:
from nltk.tokenize import word_tokenize

In [4]:
# Split the paragraph into tokens; punctuation marks and the possessive "'s"
# come out as separate tokens (see the printed list).
tokens = word_tokenize(doc)
print(tokens)

['A', 'paragraph', 'is', 'a', 'distinct', 'section', 'of', 'writing', 'that', 'focuses', 'on', 'a', 'single', 'idea', 'or', 'topic', ',', 'typically', 'consisting', 'of', 'several', 'sentences', '.', 'It', 'is', 'a', 'way', 'to', 'organize', 'text', 'and', 'help', 'the', 'reader', 'follow', 'the', 'author', "'s", 'thoughts', 'by', 'clearly', 'signaling', 'shifts', 'in', 'ideas', 'and', 'giving', 'the', 'reader', 'a', 'break', '.']


In [5]:
from nltk import pos_tag
# Label every token with a part-of-speech tag; each item of the result is a
# (token, tag) pair such as ('paragraph', 'NN').
tags = pos_tag(tokens)
print(tags)

[('A', 'DT'), ('paragraph', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ('distinct', 'JJ'), ('section', 'NN'), ('of', 'IN'), ('writing', 'VBG'), ('that', 'DT'), ('focuses', 'VBZ'), ('on', 'IN'), ('a', 'DT'), ('single', 'JJ'), ('idea', 'NN'), ('or', 'CC'), ('topic', 'NN'), (',', ','), ('typically', 'RB'), ('consisting', 'VBG'), ('of', 'IN'), ('several', 'JJ'), ('sentences', 'NNS'), ('.', '.'), ('It', 'PRP'), ('is', 'VBZ'), ('a', 'DT'), ('way', 'NN'), ('to', 'TO'), ('organize', 'VB'), ('text', 'NN'), ('and', 'CC'), ('help', 'VB'), ('the', 'DT'), ('reader', 'NN'), ('follow', 'VBP'), ('the', 'DT'), ('author', 'NN'), ("'s", 'POS'), ('thoughts', 'NNS'), ('by', 'IN'), ('clearly', 'RB'), ('signaling', 'VBG'), ('shifts', 'NNS'), ('in', 'IN'), ('ideas', 'NNS'), ('and', 'CC'), ('giving', 'VBG'), ('the', 'DT'), ('reader', 'NN'), ('a', 'DT'), ('break', 'NN'), ('.', '.')]


In [10]:
from nltk.corpus import stopwords

# English stopwords as a set for fast membership tests.
# The corpus file id is lowercase ('english'). A capitalised 'English' only
# resolves on case-insensitive filesystems and raises OSError elsewhere.
# Requires the 'stopwords' corpus to be installed (nltk.download('stopwords')).
s_words = set(stopwords.words('english'))

In [13]:
import string

# Step 1: drop stopwords. The comparison is done on the lowercased token so
# capitalised stopwords such as 'A' and 'It' are removed as well.
tokens_without_stopwords = [word for word in tokens if word.lower() not in s_words]
print("\n Tokens Without Stopwords:",tokens_without_stopwords)

# Step 2: drop punctuation tokens. A token counts as punctuation when every
# one of its characters is a punctuation mark. Testing
# `word not in string.punctuation` would be a substring test against one long
# string, so tokens such as '...' or '--' would slip through. Tokens that mix
# letters and punctuation (e.g. "'s") are intentionally kept.
punctuation_chars = set(string.punctuation)
tokens_without_stopwords_and_punctuation = [
    word
    for word in tokens_without_stopwords
    if not all(char in punctuation_chars for char in word)
]
print("\n Tokens Without Stopwords and Punctuation:",tokens_without_stopwords_and_punctuation)

# Short alias used by the stemming and lemmatisation cells.
tswp = tokens_without_stopwords_and_punctuation


 Tokens Without Stopwords: ['paragraph', 'distinct', 'section', 'writing', 'focuses', 'single', 'idea', 'topic', ',', 'typically', 'consisting', 'several', 'sentences', '.', 'way', 'organize', 'text', 'help', 'reader', 'follow', 'author', "'s", 'thoughts', 'clearly', 'signaling', 'shifts', 'ideas', 'giving', 'reader', 'break', '.']

 Tokens Without Stopwords and Punctuation: ['paragraph', 'distinct', 'section', 'writing', 'focuses', 'single', 'idea', 'topic', 'typically', 'consisting', 'several', 'sentences', 'way', 'organize', 'text', 'help', 'reader', 'follow', 'author', "'s", 'thoughts', 'clearly', 'signaling', 'shifts', 'ideas', 'giving', 'reader', 'break']


In [15]:
from nltk.stem import PorterStemmer

# Porter stemming of the cleaned tokens: one stem per input token, same order.
Porter = PorterStemmer()
var_porter = list(map(Porter.stem, tswp))
print(var_porter)

['paragraph', 'distinct', 'section', 'write', 'focus', 'singl', 'idea', 'topic', 'typic', 'consist', 'sever', 'sentenc', 'way', 'organ', 'text', 'help', 'reader', 'follow', 'author', "'s", 'thought', 'clearli', 'signal', 'shift', 'idea', 'give', 'reader', 'break']


In [16]:
from nltk.stem import SnowballStemmer

# Snowball stemming of the same cleaned tokens, for comparison with the
# Porter output of the previous cell.
Snowball = SnowballStemmer('english')
var_snowball = [*map(Snowball.stem, tswp)]
print(var_snowball)

['paragraph', 'distinct', 'section', 'write', 'focus', 'singl', 'idea', 'topic', 'typic', 'consist', 'sever', 'sentenc', 'way', 'organ', 'text', 'help', 'reader', 'follow', 'author', "'s", 'thought', 'clear', 'signal', 'shift', 'idea', 'give', 'reader', 'break']


In [17]:
from nltk.stem import WordNetLemmatizer

# Lemmatise the cleaned tokens, treating every token as a verb (pos='v').
# NOTE(review): because of the fixed verb tag, plural nouns such as
# 'thoughts' and 'ideas' come back unchanged in the output below.
lemma = WordNetLemmatizer()
lemmatized = list(lemma.lemmatize(token, pos='v') for token in tswp)
print(lemmatized)

['paragraph', 'distinct', 'section', 'write', 'focus', 'single', 'idea', 'topic', 'typically', 'consist', 'several', 'sentence', 'way', 'organize', 'text', 'help', 'reader', 'follow', 'author', "'s", 'thoughts', 'clearly', 'signal', 'shift', 'ideas', 'give', 'reader', 'break']


In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer created with no arguments (library defaults); it is fitted
# on the sentence list in a later cell.
vectorizer = TfidfVectorizer()

In [2]:
# Toy corpus for the TF-IDF example: three unrelated sentences, one per row
# of the resulting matrix.
doc = [
    "The old library creaked with every step, hiding secrets between its dusty shelves.",
    "A bright blue kite danced wildly in the afternoon sky above the park.",
    "She poured a second cup of coffee, staring at the rain tapping against the windowpane.",
]

In [3]:
# Fit the vectorizer on the three sentences and compute their TF-IDF matrix
# in one step (one row per sentence, one column per vocabulary term).
tfidf_matrix = vectorizer.fit_transform(doc)

In [5]:
import pandas as pd

# Present the TF-IDF matrix as a labelled table: rows are the sentences,
# columns are the vocabulary terms learned by the vectorizer.
feature_names = vectorizer.get_feature_names_out()
dense_scores = tfidf_matrix.toarray()
df = pd.DataFrame(dense_scores, columns=feature_names)

print("TF-IDF Representation:")
print(df.round(2))

TF-IDF Representation:
   above  afternoon  against    at  between  blue  bright  coffee  creaked  \
0    0.0        0.0     0.00  0.00     0.28   0.0     0.0    0.00     0.28   
1    0.3        0.3     0.00  0.00     0.00   0.3     0.3    0.00     0.00   
2    0.0        0.0     0.27  0.27     0.00   0.0     0.0    0.27     0.00   

    cup  ...   she  shelves  sky  staring  step  tapping   the  wildly  \
0  0.00  ...  0.00     0.28  0.0     0.00  0.28     0.00  0.17     0.0   
1  0.00  ...  0.00     0.00  0.3     0.00  0.00     0.00  0.35     0.3   
2  0.27  ...  0.27     0.00  0.0     0.27  0.00     0.27  0.32     0.0   

   windowpane  with  
0        0.00  0.28  
1        0.00  0.00  
2        0.27  0.00  

[3 rows x 35 columns]
