In [3]:
# Tokenization
from nltk.tokenize import word_tokenize
# Remove Stopwords
from nltk.corpus import stopwords
# Lemmatization / Stemming
from nltk.stem import PorterStemmer, WordNetLemmatizer
# Vectorization
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [4]:
# Toy corpus used throughout the notebook: three short English sentences that
# are tokenized, cleaned, lemmatized and finally vectorized in the cells below.
documents = [
    "Hello, this is python programming, and python is a language",
    "Python is used in machine learning and game development",
    "Python is one of the most popular programming today and it is loved by everyone"
]

In [5]:
# Preview tokenization on the first, unmodified document: punctuation comes out
# as separate tokens and the original casing is kept (see the output below).
word_tokenize(documents[0])

['Hello',
 ',',
 'this',
 'is',
 'python',
 'programming',
 ',',
 'and',
 'python',
 'is',
 'a',
 'language']

In [6]:
# Lower-case each document before tokenizing so that tokens differing only in
# case (e.g. "Python" / "python") end up identical. Result: one token list per
# document, in the same order as `documents`.
tokens = [word_tokenize(text.lower()) for text in documents]

In [9]:
# Inspect the lower-cased tokens of the first document.
print(tokens[0])

['hello', ',', 'this', 'is', 'python', 'programming', ',', 'and', 'python', 'is', 'a', 'language']


In [11]:
# NLTK's English stopword list; it is extended with punctuation in the next cell.
# NOTE(review): assumes the NLTK 'stopwords' corpus is already available
# locally; no download step is visible in these cells, so confirm.
eng_stopwords = stopwords.words("english")

In [12]:
# Treat common punctuation tokens as stopwords too, so they are filtered out
# together with the English function words.
# Only symbols that are not already present are appended: a plain extend()
# would add another copy of every symbol each time this cell is re-run.
eng_stopwords.extend(
    [symbol for symbol in [',', '.', '!', '@', '#', '?'] if symbol not in eng_stopwords]
)

In [13]:
# Drop stopwords and punctuation from every tokenized document.
# `eng_stopwords` is a list, so each `in` test against it is a linear scan;
# copying it into a set once turns every membership check into a hash lookup
# while producing exactly the same filtered tokens.
stopword_lookup = set(eng_stopwords)
main_words = [
    [token for token in doc_tokens if token not in stopword_lookup]
    for doc_tokens in tokens
]

In [14]:
# Check the remaining content words of each document after stopword removal.
print(main_words)

[['hello', 'python', 'programming', 'python', 'language'], ['python', 'used', 'machine', 'learning', 'game', 'development'], ['python', 'one', 'popular', 'programming', 'today', 'loved', 'everyone']]


In [15]:
# Porter stemmer instance, used in the next cell to demonstrate stemming on a
# single word.
ps = PorterStemmer()

In [20]:
# Stemming demo: the stem need not be a real word, e.g. 'crying' comes out as
# 'cri' (see the output below).
# Other inputs to try:
# ps.stem('playing')
# ps.stem('wanted')
# ps.stem('flying')
ps.stem('crying')

'cri'

In [21]:
# WordNet lemmatizer instance, used below to reduce each token to a base form.
wnet = WordNetLemmatizer()

In [25]:
# Lemmatization demo: with pos='v' the word is treated as a verb, so the
# irregular past tense 'bought' is mapped to 'buy' (see the output below).
# Other inputs to try:
# wnet.lemmatize("playing",pos='v')
# wnet.lemmatize("flying",pos='v')
wnet.lemmatize("bought",pos='v')

'buy'

In [26]:
# Lemmatize every remaining token in place, treating each one as a verb
# (pos='v'). The inner lists are updated element by element, so `main_words`
# stays a list of token lists.
for doc_tokens in main_words:
    for idx, word in enumerate(doc_tokens):
        doc_tokens[idx] = wnet.lemmatize(word, pos='v')

In [27]:
# Check the tokens after lemmatization (e.g. 'programming' -> 'program').
print(main_words)

[['hello', 'python', 'program', 'python', 'language'], ['python', 'use', 'machine', 'learn', 'game', 'development'], ['python', 'one', 'popular', 'program', 'today', 'love', 'everyone']]


In [28]:
# Collapse each document's token list back into one space-separated string;
# the CountVectorizer / TfidfVectorizer cells further down are fitted on these
# strings.
# Entries that are already strings are left untouched: joining a string would
# put a space between every character if this cell were executed a second time.
# Slice assignment keeps the update in place on the existing list object.
main_words[:] = [
    doc if isinstance(doc, str) else " ".join(doc)
    for doc in main_words
]

In [29]:
# Display the cleaned documents, now one string per document.
main_words

['hello python program python language',
 'python use machine learn game development',
 'python one popular program today love everyone']

In [30]:
# Bag-of-words (term count) vectorizer created with its default settings.
cv = CountVectorizer()

In [31]:
# Learn the vocabulary from the cleaned documents. `vect` is bound to the
# object returned by fit().
# NOTE(review): `vect` is rebound to the transformed matrix a few cells later,
# so the name refers to two different kinds of object in this notebook.
vect = cv.fit(main_words)

In [34]:
# Show the learned word -> column-index mapping.
# It is read from `cv` directly: fit() returns the vectorizer itself, so this
# is the same mapping, but unlike `vect` the name `cv` is never rebound to the
# transformed matrix, so this cell keeps working when it is re-executed later.
print(cv.vocabulary_)

{'hello': 3, 'python': 11, 'program': 10, 'language': 4, 'use': 13, 'machine': 7, 'learn': 5, 'game': 2, 'development': 0, 'one': 8, 'popular': 9, 'today': 12, 'love': 6, 'everyone': 1}


In [35]:
# Encode the documents as a sparse document-term count matrix using the fitted
# vocabulary. `vect` is rebound here: from this point on it is the matrix, not
# the object returned by cv.fit().
vect = cv.transform(main_words)

In [36]:
# Display the matrix summary (shape, dtype, number of stored entries).
vect

<3x14 sparse matrix of type '<class 'numpy.int64'>'
	with 17 stored elements in Compressed Sparse Row format>

In [38]:
# Show the input documents again for comparison with the count matrix.
main_words

['hello python program python language',
 'python use machine learn game development',
 'python one popular program today love everyone']

In [37]:
# Densify the sparse matrix for display: one row per document, one column per
# vocabulary index (e.g. column 11 is 'python', which occurs twice in the
# first document).
vect.toarray()

array([[0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 2, 0, 0],
       [1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1],
       [0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0]], dtype=int64)

In [39]:
# TF-IDF vectorizer created with its default settings.
tfidf = TfidfVectorizer()

In [40]:
# Fit the TF-IDF vectorizer and transform the documents in one step, then
# densify for display. In the output below, 'python' (column 11) appears in
# every document and gets a lower weight than words unique to one document
# whenever both occur once; each row has unit Euclidean length.
tfidf.fit_transform(main_words).toarray()

array([[0.        , 0.        , 0.        , 0.50165133, 0.50165133,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.38151877, 0.59256672, 0.        , 0.        ],
       [0.43238509, 0.        , 0.43238509, 0.        , 0.        ,
        0.43238509, 0.        , 0.43238509, 0.        , 0.        ,
        0.        , 0.2553736 , 0.        , 0.43238509],
       [0.        , 0.41074684, 0.        , 0.        , 0.        ,
        0.        , 0.41074684, 0.        , 0.41074684, 0.41074684,
        0.31238356, 0.2425937 , 0.41074684, 0.        ]])