In [1]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer
from nltk.stem import WordNetLemmatizer

In [2]:
# Sample paragraph fed to the tokenizers below. It is written as adjacent
# literals so the line breaks and the trailing spaces after "for" and
# "first" are visible rather than hidden at the end of source lines.
text = (
    "Python is an interpreted high-level programming language for \n"
    "general-purpose programming. Created by Guido van Rossum and first \n"
    "released in 1991, Python has a design philosophy that emphasizes code\n"
    "readability, notably using significant whitespace."
)

In [4]:
# Split the paragraph into word-level tokens; punctuation marks come out as
# separate tokens and hyphenated words stay in one piece.
print(word_tokenize(text))

['Python', 'is', 'an', 'interpreted', 'high-level', 'programming', 'language', 'for', 'general-purpose', 'programming', '.', 'Created', 'by', 'Guido', 'van', 'Rossum', 'and', 'first', 'released', 'in', '1991', ',', 'Python', 'has', 'a', 'design', 'philosophy', 'that', 'emphasizes', 'code', 'readability', ',', 'notably', 'using', 'significant', 'whitespace', '.']


In [5]:
# Split the paragraph into sentences; the newlines embedded in `text` are
# kept inside each returned sentence string.
print(sent_tokenize(text))

['Python is an interpreted high-level programming language for \ngeneral-purpose programming.', 'Created by Guido van Rossum and first \nreleased in 1991, Python has a design philosophy that emphasizes code\nreadability, notably using significant whitespace.']


In [6]:
# Keep the word-level tokens; the stop-word filtering cells read from this list.
tokens = word_tokenize(text)

In [8]:
# Number of entries in NLTK's English stop-word list.
print(len(stopwords.words("english")))

179


In [9]:
# Remove English stop words from `tokens`, keeping the original token order.
#
# The stop-word list is fetched once and stored in a set. Previously
# stopwords.words("english") was called inside the loop condition, i.e. once
# per token, and membership was tested against a list.
# NOTE(review): the comparison is case-sensitive; if the stop-word list is all
# lower-case, capitalised stop words (e.g. at sentence start) would be kept -
# confirm whether that is intended.
english_stopwords = set(stopwords.words("english"))

words = []

for word in tokens:
    if word not in english_stopwords:
        words.append(word)

In [10]:
# Show the tokens left after stop-word removal (punctuation tokens are still present).
print(words)

['Python', 'interpreted', 'high-level', 'programming', 'language', 'general-purpose', 'programming', '.', 'Created', 'Guido', 'van', 'Rossum', 'first', 'released', '1991', ',', 'Python', 'design', 'philosophy', 'emphasizes', 'code', 'readability', ',', 'notably', 'using', 'significant', 'whitespace', '.']


In [11]:
# Comprehension form of the stop-word filter: keeps every token that is not in
# the English stop-word list, in original order.
# The list is loaded once into a set instead of calling
# stopwords.words("english") again for every token in the condition.
english_stopwords = set(stopwords.words("english"))
words = [word for word in tokens if word not in english_stopwords]

In [12]:
# Porter stemming: print the stem of every filtered token, one per line.
ps = PorterStemmer()
for porter_stem in map(ps.stem, words):
    print(porter_stem)

python
interpret
high-level
program
languag
general-purpos
program
.
creat
guido
van
rossum
first
releas
1991
,
python
design
philosophi
emphas
code
readabl
,
notabl
use
signific
whitespac
.


In [13]:
# Lancaster stemming over the same filtered tokens, one stem per line, so the
# output can be compared against the Porter stems.
ls = LancasterStemmer()
for token in words:
    lancaster_stem = ls.stem(token)
    print(lancaster_stem)

python
interpret
high-level
program
langu
general-purpose
program
.
cre
guido
van
ross
first
releas
1991
,
python
design
philosoph
emphas
cod
read
,
not
us
sign
whitespac
.


In [14]:
# WordNet-based lemmatizer, used by the noun/verb lemmatization cell.
wnet = WordNetLemmatizer()

In [15]:
# Lemmatize every filtered token twice - once as a noun, once as a verb - and
# print both results side by side on one line.
for token in words:
    as_noun = wnet.lemmatize(token, pos='n')
    as_verb = wnet.lemmatize(token, pos='v')
    print(as_noun, as_verb)

Python Python
interpreted interpret
high-level high-level
programming program
language language
general-purpose general-purpose
programming program
. .
Created Created
Guido Guido
van van
Rossum Rossum
first first
released release
1991 1991
, ,
Python Python
design design
philosophy philosophy
emphasizes emphasize
code code
readability readability
, ,
notably notably
using use
significant significant
whitespace whitespace
. .


In [16]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
# Bag-of-words count vectorizer, constructed with default settings.
cv = CountVectorizer()

In [18]:
# Learn the vocabulary from the filtered tokens and keep the fitted vectorizer.
# NOTE(review): `words` is a list of single tokens, and each list element is
# treated as its own document - confirm that per-token documents (rather than
# e.g. the sentences from sent_tokenize) are what is wanted here.
vect = cv.fit(words)

In [21]:
# Show the learned term -> column-index mapping.
# Read it from `cv` rather than `vect`: `vect` is rebound to a sparse matrix by
# the later fit_transform cell, after which `vect.vocabulary_` would raise
# AttributeError if this cell were re-run. `cv` is the fitted vectorizer
# either way, so the printed mapping is the same.
print(cv.vocabulary_)

{'emphasizes': 4, 'notably': 12, 'created': 2, 'rossum': 19, 'python': 16, '1991': 0, 'code': 1, 'released': 18, 'significant': 20, 'programming': 14, 'general': 6, 'purpose': 15, 'guido': 7, 'whitespace': 23, 'using': 21, 'language': 10, 'level': 11, 'philosophy': 13, 'interpreted': 9, 'first': 5, 'van': 22, 'high': 8, 'readability': 17, 'design': 3}


In [22]:
# Fit and transform in one step. Note that this rebinds `vect`: it previously
# held the fitted vectorizer and now holds the sparse count matrix.
vect = cv.fit_transform(words)

In [23]:
# Display the sparse matrix summary (shape, dtype, number of stored elements).
vect

<28x24 sparse matrix of type '<class 'numpy.int64'>'
	with 26 stored elements in Compressed Sparse Row format>

In [24]:
# Expand to a dense array for inspection: one row per element of `words`,
# one column per vocabulary term.
vect.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0],
       [0, 0, 0, 0, 0

In [25]:
# TF-IDF vectorizer, constructed with default settings.
tf = TfidfVectorizer()

In [26]:
# Fit on the same filtered token list and compute the TF-IDF weighted matrix.
tfVect = tf.fit_transform(words)

In [27]:
# Display the sparse matrix summary (shape, dtype, number of stored elements).
tfVect

<28x24 sparse matrix of type '<class 'numpy.float64'>'
	with 26 stored elements in Compressed Sparse Row format>

In [28]:
# Expand the TF-IDF matrix to a dense array for inspection.
# NOTE(review): rows look length-normalised (a token that splits into two
# terms shows 0.7071 in both columns) - confirm against the vectorizer's
# `norm` setting.
tfVect.toarray()

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.70710678,  0.        ,
         0.        ,  0.70710678,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
  