In [1]:
# Sample text used throughout the notebook: four paragraphs about Washington.
# The raw text still contains digits, punctuation and bracketed citation
# markers such as [11]; the cleaning cells below strip them with a
# letters-only regex.
paragraph = """Washington played an indispensable role in adopting and ratifying the Constitution of the United States. He was then twice elected president by the Electoral College unanimously. As president, he implemented a strong, well-financed national government while remaining impartial in a fierce rivalry between cabinet members Thomas Jefferson and Alexander Hamilton. During the French Revolution, he proclaimed a policy of neutrality while sanctioning the Jay Treaty. He set enduring precedents for the office of president, including the title "Mr. President", and swearing the Oath of Office on the Bible. His Farewell Address is widely regarded as a pre-eminent statement on republicanism.

Washington was a slave owner who had a complicated relationship with slavery. During his lifetime he controlled a cumulative total of over 577 slaves, who were forced to work on his farms and wherever he lived, including the President's House in Philadelphia. As president, he signed laws passed by Congress that both protected and curtailed slavery. His will said that one of his slaves, William Lee, should be freed upon his death and that the other 123 slaves must work for his wife and be freed on her death. She freed them during her lifetime to remove the incentive for hastening her death.[11][12]

He endeavored to assimilate Native Americans into the Anglo-American culture. However, he waged military campaigns against hostile Native American nations during the Revolutionary War and the Northwest Indian War. He was a member of the Anglican Church and the Freemasons, and he urged broad religious freedom in his roles as general and president. Upon his death, he was eulogized by Henry "Light-Horse Harry" Lee as "first in war, first in peace, and first in the hearts of his countrymen".[13]

Washington has been memorialized by monuments, a federal holiday, various media depictions, geographical locations, including the national capital, the State of Washington, stamps, and currency, and many scholars and ordinary Americans alike rank him among the greatest U.S. presidents. In 1976, Washington was posthumously promoted to the rank of General of the Armies of the United States, the highest rank in the United States Army."""

In [2]:
!pip install nltk



In [3]:
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [4]:
## Tokenization: convert the paragraph into sentences.
# Fetch every NLTK resource this notebook relies on up front, so that a fresh
# environment can run all cells without a LookupError:
#   - 'punkt'     : sentence/word tokenizer models (older NLTK releases)
#   - 'punkt_tab' : the tokenizer data newer NLTK releases look up instead
#   - 'stopwords' : needed by stopwords.words('english') in later cells
#   - 'wordnet'   : needed by WordNetLemmatizer in later cells
# nltk.download() skips packages that are already up to date.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)
sentences = nltk.sent_tokenize(paragraph)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rajashree\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Number of sentences the tokenizer found in the paragraph.
len(sentences)

17

In [6]:
import re

In [7]:
# Normalise each sentence: replace every non-letter character with blanks,
# lower-case the result, then split/re-join so that runs of whitespace
# collapse to single spaces.
non_letters = re.compile('[^a-zA-Z]')
corpus = [
    ' '.join(non_letters.sub("  ", sentence).lower().split())
    for sentence in sentences
]

In [8]:
# Inspect the cleaned sentences (lower-case, letters only, single-spaced).
corpus

['washington played an indispensable role in adopting and ratifying the constitution of the united states',
 'he was then twice elected president by the electoral college unanimously',
 'as president he implemented a strong well financed national government while remaining impartial in a fierce rivalry between cabinet members thomas jefferson and alexander hamilton',
 'during the french revolution he proclaimed a policy of neutrality while sanctioning the jay treaty',
 'he set enduring precedents for the office of president including the title mr president and swearing the oath of office on the bible',
 'his farewell address is widely regarded as a pre eminent statement on republicanism',
 'washington was a slave owner who had a complicated relationship with slavery',
 'during his lifetime he controlled a cumulative total of over slaves who were forced to work on his farms and wherever he lived including the president s house in philadelphia',
 'as president he signed laws passed by co

In [9]:
# Porter stemmer instance, used below to reduce each token to its stem.
stemmer = PorterStemmer()

In [10]:
# Show NLTK's English stopword list that the following cells filter against.
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [11]:
# Print the Porter stem of every non-stopword token in the cleaned corpus.
# The stopword set is built once up front: the list is invariant across the
# loop, and set membership tests are O(1).
english_stopwords = set(stopwords.words('english'))
for sentence in corpus:
    for word in nltk.word_tokenize(sentence):
        if word not in english_stopwords:
            print(stemmer.stem(word))
        

washington
play
indispens
role
adopt
ratifi
constitut
unit
state
twice
elect
presid
elector
colleg
unanim
presid
implement
strong
well
financ
nation
govern
remain
imparti
fierc
rivalri
cabinet
member
thoma
jefferson
alexand
hamilton
french
revolut
proclaim
polici
neutral
sanction
jay
treati
set
endur
preced
offic
presid
includ
titl
mr
presid
swear
oath
offic
bibl
farewel
address
wide
regard
pre
emin
statement
republican
washington
slave
owner
complic
relationship
slaveri
lifetim
control
cumul
total
slave
forc
work
farm
wherev
live
includ
presid
hous
philadelphia
presid
sign
law
pass
congress
protect
curtail
slaveri
said
one
slave
william
lee
freed
upon
death
slave
must
work
wife
freed
death
freed
lifetim
remov
incent
hasten
death
endeavor
assimil
nativ
american
anglo
american
cultur
howev
wage
militari
campaign
hostil
nativ
american
nation
revolutionari
war
northwest
indian
war
member
anglican
church
freemason
urg
broad
religi
freedom
role
gener
presid
upon
death
eulog
henri
light
hors

In [12]:
from nltk.stem import WordNetLemmatizer
# WordNet lemmatizer instance used by the cells below.
# NOTE(review): lemmatize() is called below without a POS tag, so it presumably
# treats every token as a noun — which would explain why verb forms such as
# 'played' stay unchanged in the output. Confirm against the NLTK docs.
lemmatizer = WordNetLemmatizer()

In [13]:
# Print the WordNet lemma of every non-stopword token in the cleaned corpus.
# The stopword set is built once up front: the list is invariant across the
# loop, and set membership tests are O(1).
english_stopwords = set(stopwords.words('english'))
for sentence in corpus:
    for word in nltk.word_tokenize(sentence):
        if word not in english_stopwords:
            print(lemmatizer.lemmatize(word))
        

washington
played
indispensable
role
adopting
ratifying
constitution
united
state
twice
elected
president
electoral
college
unanimously
president
implemented
strong
well
financed
national
government
remaining
impartial
fierce
rivalry
cabinet
member
thomas
jefferson
alexander
hamilton
french
revolution
proclaimed
policy
neutrality
sanctioning
jay
treaty
set
enduring
precedent
office
president
including
title
mr
president
swearing
oath
office
bible
farewell
address
widely
regarded
pre
eminent
statement
republicanism
washington
slave
owner
complicated
relationship
slavery
lifetime
controlled
cumulative
total
slave
forced
work
farm
wherever
lived
including
president
house
philadelphia
president
signed
law
passed
congress
protected
curtailed
slavery
said
one
slave
william
lee
freed
upon
death
slave
must
work
wife
freed
death
freed
lifetime
remove
incentive
hastening
death
endeavored
assimilate
native
american
anglo
american
culture
however
waged
military
campaign
hostile
native
american
nat

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer. binary=True marks each vocabulary term as present (1)
# or absent (0) in a sentence instead of counting its occurrences.
cv =CountVectorizer(binary=True)

In [15]:
# Learn the vocabulary from the cleaned sentences and encode them as a
# document-term matrix with one row per sentence.
X = cv.fit_transform(corpus)

In [16]:
# Mapping from each vocabulary term to its column index in X.
cv.vocabulary_

{'washington': 179,
 'played': 122,
 'an': 8,
 'indispensable': 85,
 'role': 143,
 'in': 81,
 'adopting': 1,
 'and': 9,
 'ratifying': 133,
 'the': 162,
 'constitution': 30,
 'of': 111,
 'united': 172,
 'states': 158,
 'he': 67,
 'was': 178,
 'then': 164,
 'twice': 170,
 'elected': 40,
 'president': 127,
 'by': 22,
 'electoral': 41,
 'college': 27,
 'unanimously': 171,
 'as': 14,
 'implemented': 80,
 'strong': 159,
 'well': 180,
 'financed': 50,
 'national': 105,
 'government': 60,
 'while': 183,
 'remaining': 137,
 'impartial': 79,
 'fierce': 49,
 'rivalry': 142,
 'between': 18,
 'cabinet': 23,
 'members': 99,
 'thomas': 165,
 'jefferson': 89,
 'alexander': 3,
 'hamilton': 63,
 'during': 39,
 'french': 57,
 'revolution': 140,
 'proclaimed': 129,
 'policy': 123,
 'neutrality': 108,
 'sanctioning': 146,
 'jay': 88,
 'treaty': 169,
 'set': 148,
 'enduring': 44,
 'precedents': 126,
 'for': 52,
 'office': 112,
 'including': 83,
 'title': 166,
 'mr': 103,
 'swearing': 160,
 'oath': 110,
 'on

In [17]:
# First cleaned sentence, shown for comparison with its vector in the next cell.
corpus[0]

'washington played an indispensable role in adopting and ratifying the constitution of the united states'

In [18]:
# Dense 0/1 row vector for the first sentence (1 = term occurs in it).
X[0].toarray()

array([[0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [19]:
# Rebuild the corpus with stopwords removed and the remaining tokens
# lemmatized. Each sentence is reduced to lower-case letters only, split on
# whitespace, filtered, lemmatized and re-joined with single spaces.
# The stopword list is loaded once into a set so it is not re-read and
# linearly scanned for every word.
english_stopwords = set(stopwords.words('english'))
corpus = []
for sentence in sentences:
    tokens = re.sub('[^a-zA-Z]', ' ', sentence).lower().split()
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in english_stopwords
    ]
    corpus.append(' '.join(lemmas))


In [20]:
# Inspect the rebuilt corpus: stopwords removed, remaining tokens lemmatized.
corpus

['washington played indispensable role adopting ratifying constitution united state',
 'twice elected president electoral college unanimously',
 'president implemented strong well financed national government remaining impartial fierce rivalry cabinet member thomas jefferson alexander hamilton',
 'french revolution proclaimed policy neutrality sanctioning jay treaty',
 'set enduring precedent office president including title mr president swearing oath office bible',
 'farewell address widely regarded pre eminent statement republicanism',
 'washington slave owner complicated relationship slavery',
 'lifetime controlled cumulative total slave forced work farm wherever lived including president house philadelphia',
 'president signed law passed congress protected curtailed slavery',
 'said one slave william lee freed upon death slave must work wife freed death',
 'freed lifetime remove incentive hastening death',
 'endeavored assimilate native american anglo american culture',
 'however w

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
# Rebind cv to a trigram-only vectorizer: ngram_range=(3, 3) keeps only
# three-word sequences, still with binary presence/absence values.
cv =CountVectorizer(binary=True, ngram_range=(3, 3))

In [22]:
# Refit on the lemmatized corpus; X now holds the trigram features and
# replaces the earlier unigram matrix.
X = cv.fit_transform(corpus)

In [23]:
# Mapping from each trigram to its column index in X.
cv.vocabulary_

{'washington played indispensable': 149,
 'played indispensable role': 94,
 'indispensable role adopting': 61,
 'role adopting ratifying': 119,
 'adopting ratifying constitution': 1,
 'ratifying constitution united': 111,
 'constitution united state': 18,
 'twice elected president': 138,
 'elected president electoral': 25,
 'president electoral college': 99,
 'electoral college unanimously': 26,
 'president implemented strong': 101,
 'implemented strong well': 56,
 'strong well financed': 133,
 'well financed national': 153,
 'financed national government': 35,
 'national government remaining': 82,
 'government remaining impartial': 47,
 'remaining impartial fierce': 114,
 'impartial fierce rivalry': 55,
 'fierce rivalry cabinet': 34,
 'rivalry cabinet member': 118,
 'cabinet member thomas': 12,
 'member thomas jefferson': 74,
 'thomas jefferson alexander': 135,
 'jefferson alexander hamilton': 62,
 'french revolution proclaimed': 44,
 'revolution proclaimed policy': 116,
 'proclaimed 

In [24]:
## TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer
# Trigram-only TF-IDF vectorizer. Per the scikit-learn docs, binary=True makes
# only the term-frequency part 0/1; the resulting weights are still
# fractional, as the row shown further down demonstrates.
tf =TfidfVectorizer(binary=True, ngram_range=(3, 3))

In [25]:
# Fit TF-IDF on the lemmatized corpus; X is overwritten again, this time with
# the weighted trigram matrix.
X = tf.fit_transform(corpus)

In [26]:
# First lemmatized sentence, shown for comparison with its TF-IDF row below.
corpus[0]

'washington played indispensable role adopting ratifying constitution united state'

In [27]:
# Dense TF-IDF row for the first sentence; non-zero entries are fractional
# weights rather than 0/1 flags.
X[0].toarray()

array([[0.        , 0.37796447, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.37796447, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.37796447, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  

In [28]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [29]:
# Build one token list per sentence: each entry of `sentences` is passed
# through sent_tokenize again, and every resulting piece is tokenized with
# gensim's simple_preprocess.
words = [
    simple_preprocess(piece)
    for sentence in sentences
    for piece in sent_tokenize(sentence)
]

In [30]:
# Token list for the second sentence (index 1).
words[1]

['he',
 'was',
 'then',
 'twice',
 'elected',
 'president',
 'by',
 'the',
 'electoral',
 'college',
 'unanimously']