In [2]:
# 1. Tokenization
from nltk.tokenize import word_tokenize
# 2. Stopwords and punctuation
from nltk.corpus import stopwords
import string
# 3. Stemming / Lemmatization
from nltk.stem import PorterStemmer, WordNetLemmatizer
# 4. Vectorization
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [9]:
# Toy corpus: three short sentences used as the running example for the
# preprocessing and vectorization steps in this notebook.
documents = ["I was playing cricket yesterday and suddenly I saw Sachin passing by",
            "I was watching IPL and I think cricket is a boring game now, cricket used to be fun earlier",
            "Cricket is being watched a lot in india. India has a huge audience to watch cricket!!"]

In [10]:
# Sanity-check the tokenizer on the first, still uncleaned, document.
print(word_tokenize(documents[0]))

['I', 'was', 'playing', 'cricket', 'yesterday', 'and', 'suddenly', 'I', 'saw', 'Sachin', 'passing', 'by']


In [13]:
# The ASCII punctuation characters that will be removed from the documents.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [14]:
# Deletion table for str.translate: every punctuation code point maps to None,
# which makes translate() drop that character.
table = {ord(symbol): None for symbol in string.punctuation}

In [16]:
# Inspect the table: keys are Unicode code points, None means "delete this character".
print(table)

{33: None, 34: None, 35: None, 36: None, 37: None, 38: None, 39: None, 40: None, 41: None, 42: None, 43: None, 44: None, 45: None, 46: None, 47: None, 58: None, 59: None, 60: None, 61: None, 62: None, 63: None, 64: None, 91: None, 92: None, 93: None, 94: None, 95: None, 96: None, 123: None, 124: None, 125: None, 126: None}


In [19]:
# Preview punctuation removal on the third document; translate() returns a new
# string, so documents itself is unchanged by this cell.
documents[2].translate(table)

'Cricket is being watched a lot in india India has a huge audience to watch cricket'

In [20]:
# Normalise the corpus in place: lower-case each document, then delete punctuation.
documents[:] = [text.lower().translate(table) for text in documents]

In [21]:
# Split every cleaned document into its list of word tokens.
tokens = [word_tokenize(text) for text in documents]

In [22]:
# One token list per document, all lower-case and free of punctuation.
print(tokens)

[['i', 'was', 'playing', 'cricket', 'yesterday', 'and', 'suddenly', 'i', 'saw', 'sachin', 'passing', 'by'], ['i', 'was', 'watching', 'ipl', 'and', 'i', 'think', 'cricket', 'is', 'a', 'boring', 'game', 'now', 'cricket', 'used', 'to', 'be', 'fun', 'earlier'], ['cricket', 'is', 'being', 'watched', 'a', 'lot', 'in', 'india', 'india', 'has', 'a', 'huge', 'audience', 'to', 'watch', 'cricket']]


In [27]:
# Load NLTK's English stop-word list (the 'stopwords' corpus must be available locally).
eng_stopwords = stopwords.words('english')

In [28]:
# Drop stop words from every tokenised document.
# Membership is tested against a set so each lookup is constant-time instead of
# a linear scan over the stop-word list.
# NOTE(review): the documents had their punctuation stripped before tokenising,
# so a contraction token such as "dont" will not match a stop-word entry that is
# spelled with an apostrophe -- confirm whether that matters for real input.
stopword_set = set(eng_stopwords)
wordsList = [
    [token for token in tokenList if token not in stopword_set]
    for tokenList in tokens
]

In [29]:
# Remaining content words per document after stop-word removal.
print(wordsList)

[['playing', 'cricket', 'yesterday', 'suddenly', 'saw', 'sachin', 'passing'], ['watching', 'ipl', 'think', 'cricket', 'boring', 'game', 'cricket', 'used', 'fun', 'earlier'], ['cricket', 'watched', 'lot', 'india', 'india', 'huge', 'audience', 'watch', 'cricket']]


In [30]:
# Porter stemmer: reduces a word to a stem by rule-based suffix stripping.
ps = PorterStemmer()

In [31]:
# A regular "-ing" form.
ps.stem('playing')

'play'

In [32]:
# Another regular "-ing" form.
ps.stem('going')

'go'

In [34]:
# A regular "-ed" past tense.
ps.stem('watched')

'watch'

In [35]:
# The stem need not be a dictionary word: 'flying' comes out as 'fli'.
ps.stem('flying')

'fli'

In [36]:
# An irregular past tense has no suffix to strip, so the stemmer returns it unchanged.
ps.stem('bought')

'bought'

In [37]:
# WordNet lemmatizer: maps a word to its dictionary base form for a given part of speech.
wnet = WordNetLemmatizer()

In [38]:
# pos='v' asks for the word to be treated as a verb.
wnet.lemmatize('watching', pos='v')

'watch'

In [39]:
# Unlike the stemmer's 'fli', the lemmatizer yields the real word 'fly'.
wnet.lemmatize('flying', pos='v')

'fly'

In [40]:
# The irregular past tense is resolved to its base form, which the stemmer could not do.
wnet.lemmatize('bought', pos='v')

'buy'

In [41]:
# Replace every remaining token with its verb lemma, updating each inner list in place.
# NOTE(review): pos='v' is applied to every token, whatever its actual part of speech.
for doc_words in wordsList:
    doc_words[:] = [wnet.lemmatize(word, pos='v') for word in doc_words]

In [42]:
# Token lists after lemmatization.
print(wordsList)

[['play', 'cricket', 'yesterday', 'suddenly', 'saw', 'sachin', 'pass'], ['watch', 'ipl', 'think', 'cricket', 'bore', 'game', 'cricket', 'use', 'fun', 'earlier'], ['cricket', 'watch', 'lot', 'india', 'india', 'huge', 'audience', 'watch', 'cricket']]


In [43]:
# Collapse each document's token list back into one space-separated string,
# ready to be handed to the vectorizer.
# Entries that are already strings are kept as they are, so re-running this
# cell is harmless; a plain ' '.join on a string would space out its characters.
wordsList[:] = [
    entry if isinstance(entry, str) else ' '.join(entry)
    for entry in wordsList
]

In [44]:
# Each document is now a single cleaned string.
wordsList

['play cricket yesterday suddenly saw sachin pass',
 'watch ipl think cricket bore game cricket use fun earlier',
 'cricket watch lot india india huge audience watch cricket']

In [45]:
# Bag-of-words model: fit() learns the vocabulary from the cleaned documents.
cv = CountVectorizer()
cv.fit(wordsList)

CountVectorizer()

In [47]:
# Mapping from each learned term to its column index in the count matrix.
print(cv.vocabulary_)

{'play': 11, 'cricket': 2, 'yesterday': 18, 'suddenly': 14, 'saw': 13, 'sachin': 12, 'pass': 10, 'watch': 17, 'ipl': 8, 'think': 15, 'bore': 1, 'game': 5, 'use': 16, 'fun': 4, 'earlier': 3, 'lot': 9, 'india': 7, 'huge': 6, 'audience': 0}


In [50]:
# transform() returns a sparse matrix, so only its summary is displayed here.
cv.transform(wordsList)

<3x19 sparse matrix of type '<class 'numpy.int64'>'
	with 22 stored elements in Compressed Sparse Row format>

In [51]:
# Densify to read the counts: one row per document, one column per vocabulary term.
cv.transform(wordsList).toarray()

array([[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1],
       [0, 1, 2, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0],
       [1, 0, 2, 0, 0, 0, 1, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0]],
      dtype=int64)