In [1]:
# 1. Tokenization
from nltk.tokenize import word_tokenize
# 2. Remove Stopwords
from nltk.corpus import stopwords
# 3. Stemming / Lemmatization
from nltk.stem import PorterStemmer, WordNetLemmatizer
# 4. Vectorization
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [69]:
# Toy corpus: three short cricket-themed sentences used by every later cell.
# NOTE(review): "bought" in the last sentence is probably a typo for
# "brought"; it is kept as-is because the recorded outputs depend on it.
data = [
    "Hello ram how are you. "
    "Did you watch the yesterday cricket match ?",
    "In yesterday's cricket match virat kohli scored a century "
    "and rohit also scored a century",
    "virat kohli is now the captain of indian cricket team "
    "because he played very well and bought us to number one position",
]

In [70]:
# 1. Tokenization
# Print the raw (case-preserving) tokens of every sentence, one list per line.
# Iterating over `data` instead of hard-coding indices 0..2 keeps this cell
# correct if sentences are added to or removed from the corpus.
for sentence in data:
    print(word_tokenize(sentence))

['Hello', 'ram', 'how', 'are', 'you', '.', 'Did', 'you', 'watch', 'the', 'yesterday', 'cricket', 'match', '?']
['In', 'yesterday', "'s", 'cricket', 'match', 'virat', 'kohli', 'scored', 'a', 'century', 'and', 'rohit', 'also', 'scored', 'a', 'century']
['virat', 'kohli', 'is', 'now', 'the', 'captain', 'of', 'indian', 'cricket', 'team', 'because', 'he', 'played', 'very', 'well', 'and', 'bought', 'us', 'to', 'number', 'one', 'position']


In [71]:
# Tokenize a lower-cased copy of every sentence; tokens[k] is the token list
# for data[k].
tokens = [word_tokenize(sentence.lower()) for sentence in data]

In [72]:
# 2. Removing Stopwords
# NLTK's English stop-word list followed by the punctuation / filler tokens
# that should also be dropped from this corpus.
engStopwords = [*stopwords.words('english'), '.', '?', "'s", "also", ","]

In [73]:
# Drop every stop word / punctuation token. wordList[k] keeps the surviving
# tokens of tokens[k] in their original order.
# The stop words are copied into a set once so each membership test is a hash
# lookup rather than a linear scan of the list for every token.
stopwordSet = set(engStopwords)
wordList = [
    [token for token in tokenList if token not in stopwordSet]
    for tokenList in tokens
]

In [74]:
# Inspect the filtered tokens of each sentence.
wordList

[['hello', 'ram', 'watch', 'yesterday', 'cricket', 'match'],
 ['yesterday',
  'cricket',
  'match',
  'virat',
  'kohli',
  'scored',
  'century',
  'rohit',
  'scored',
  'century'],
 ['virat',
  'kohli',
  'captain',
  'indian',
  'cricket',
  'team',
  'played',
  'well',
  'bought',
  'us',
  'number',
  'one',
  'position']]

In [75]:
# 3. Lemmatization
# A WordNet lemmatizer is used for normalization; PorterStemmer is imported
# above but is not instantiated here.
wnet = WordNetLemmatizer()

In [76]:
# Replace every token, in place, with its lemma. Each token is looked up as a
# verb (pos='v').
for sentenceTokens in wordList:
    for position, word in enumerate(sentenceTokens):
        sentenceTokens[position] = wnet.lemmatize(word, pos='v')

In [77]:
# Collapse each sentence's token list into one space-separated string, the
# input format the vectorizers below are given.
# Entries that are already strings are left untouched so that re-running this
# cell is harmless; joining a string again would insert a space between every
# character.
wordList = [
    words if isinstance(words, str) else ' '.join(words)
    for words in wordList
]

In [78]:
# Spot-check the third sentence after lemmatization and joining.
wordList[2]

'virat kohli captain indian cricket team play well buy us number one position'

In [79]:
# 4. Vectorization (bag of words)
# Learn the vocabulary from the joined sentences. Because fit() is the last
# expression, the fitted estimator's repr is displayed as the cell output.
cv = CountVectorizer()
cv.fit(wordList)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [80]:
# vocabulary_ maps each learned term to its column index in the count matrix.
print(cv.vocabulary_)

{'hello': 4, 'ram': 12, 'watch': 18, 'yesterday': 20, 'cricket': 3, 'match': 7, 'virat': 17, 'kohli': 6, 'score': 14, 'century': 2, 'rohit': 13, 'captain': 1, 'indian': 5, 'team': 15, 'play': 10, 'well': 19, 'buy': 0, 'us': 16, 'number': 8, 'one': 9, 'position': 11}


In [81]:
# Encode every sentence as a row of term counts using the fitted vocabulary.
vect = cv.transform(wordList)

In [82]:
# Summary repr of the encoded result (a sparse matrix, per the output below).
vect

<3x21 sparse matrix of type '<class 'numpy.int64'>'
	with 27 stored elements in Compressed Sparse Row format>

In [83]:
# Convert to a dense array so the individual counts are visible.
vect.toarray()

array([[0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1],
       [0, 0, 2, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 2, 0, 0, 1, 0, 0, 1],
       [1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0]],
      dtype=int64)

In [85]:
# wordList now holds one space-joined string per sentence.
wordList

['hello ram watch yesterday cricket match',
 'yesterday cricket match virat kohli score century rohit score century',
 'virat kohli captain indian cricket team play well buy us number one position']

In [86]:
# fit_transform() learns the vocabulary and encodes the input in one call.
# This re-fits cv on the same sentences, so the resulting matrix matches the
# one produced by the separate fit() + transform() calls.
vect = cv.fit_transform(wordList)

In [87]:
# Dense view of the counts produced by fit_transform().
vect.toarray()

array([[0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1],
       [0, 0, 2, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 2, 0, 0, 1, 0, 0, 1],
       [1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0]],
      dtype=int64)

In [88]:
# TF-IDF vectorizer constructed with its default settings.
tfidf = TfidfVectorizer()

In [89]:
# Fit the TF-IDF vectorizer on the sentences and encode them in one step.
tf_vect = tfidf.fit_transform(wordList)

In [91]:
# Dense view of the TF-IDF weights, one row per sentence.
tf_vect.toarray()

array([[0.        , 0.        , 0.        , 0.27824521, 0.4711101 ,
        0.        , 0.        , 0.35829137, 0.        , 0.        ,
        0.        , 0.        , 0.4711101 , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.4711101 , 0.        ,
        0.35829137],
       [0.        , 0.        , 0.58564651, 0.17294613, 0.        ,
        0.        , 0.22269963, 0.22269963, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.29282326, 0.58564651,
        0.        , 0.        , 0.22269963, 0.        , 0.        ,
        0.22269963],
       [0.2948118 , 0.2948118 , 0.        , 0.1741206 , 0.        ,
        0.2948118 , 0.22421198, 0.        , 0.2948118 , 0.2948118 ,
        0.2948118 , 0.2948118 , 0.        , 0.        , 0.        ,
        0.2948118 , 0.2948118 , 0.22421198, 0.        , 0.2948118 ,
        0.        ]])