In [1]:
import nltk
# Tokenization
from nltk.tokenize import word_tokenize
# Removal of Stopwords
from nltk.corpus import stopwords
# Stemming
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer
# Lemmatization
from nltk.stem import WordNetLemmatizer
# Bag of Words
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import string

In [3]:
# Toy corpus: three raw English sentences used throughout the notebook.
# Punctuation and mixed case are still present at this point.
document = [
    "I like to play cricket. Yesterday I was playing cricket with my friends...",
    "But my other friends like to play football, so they didn't enjoyed much.",
    "Upcoming match is of India Vs Australia. India has challenged Australia..!!"
]

In [4]:
# Show the first raw sentence of the corpus.
document[0]

'I like to play cricket. Yesterday I was playing cricket with my friends...'

In [7]:
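# Whitespace-only split (disabled). str.split() would leave punctuation
# attached to the neighbouring word, e.g. 'cricket.' and 'friends...'.
# document[0].split()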

In [9]:
# Tokenize the first raw sentence with NLTK; punctuation marks ('.', '...')
# come out as their own tokens.
word_tokenize(document[0])

['I',
 'like',
 'to',
 'play',
 'cricket',
 '.',
 'Yesterday',
 'I',
 'was',
 'playing',
 'cricket',
 'with',
 'my',
 'friends',
 '...']

In [13]:
# Translation table whose only job is to delete every character listed in
# string.punctuation (the third argument of maketrans is the "delete" set).
table = str.maketrans('', '', string.punctuation)

# Overwrite the corpus in place (slice assignment keeps the same list object)
# with the punctuation-free version of each sentence.
document[:] = [sentence.translate(table) for sentence in document]

In [20]:
# Show the characters that get stripped, a 30-character rule, and the
# translation table (code point -> None), one item per line.
print(string.punctuation, "=" * 30, table, sep="\n")

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
{33: None, 34: None, 35: None, 36: None, 37: None, 38: None, 39: None, 40: None, 41: None, 42: None, 43: None, 44: None, 45: None, 46: None, 47: None, 58: None, 59: None, 60: None, 61: None, 62: None, 63: None, 64: None, 91: None, 92: None, 93: None, 94: None, 95: None, 96: None, 123: None, 124: None, 125: None, 126: None}


In [14]:
# Display the corpus after the punctuation-stripping cell has run.
document

['I like to play cricket Yesterday I was playing cricket with my friends',
 'But my other friends like to play football so they didnt enjoyed much',
 'Upcoming match is of India Vs Australia India has challenged Australia']

In [15]:
# Lower-case each sentence and split it into word tokens; the result holds
# one token list per sentence, in corpus order.
tokenization = [word_tokenize(sentence.lower()) for sentence in document]

In [16]:
# Print the nested token lists (one inner list per sentence).
print(tokenization)

[['i', 'like', 'to', 'play', 'cricket', 'yesterday', 'i', 'was', 'playing', 'cricket', 'with', 'my', 'friends'], ['but', 'my', 'other', 'friends', 'like', 'to', 'play', 'football', 'so', 'they', 'didnt', 'enjoyed', 'much'], ['upcoming', 'match', 'is', 'of', 'india', 'vs', 'australia', 'india', 'has', 'challenged', 'australia']]


In [21]:
# English stopword list from NLTK. This reads the NLTK 'stopwords' corpus,
# which must already be available locally (nltk.download('stopwords')).
eg_stopwords = stopwords.words('english')

In [23]:
# Print the full stopword list for inspection.
print(eg_stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [24]:
# Number of entries in the stopword list.
len(eg_stopwords)

179

In [39]:
# Remove English stopwords from every tokenized sentence, keeping token order.
#
# Membership is checked against a set rather than the original list, so each
# lookup is a hash probe instead of a linear scan over all stopwords.
#
# NOTE(review): punctuation is stripped before this step, so contractions
# arrive without their apostrophe (e.g. "didnt") and survive the filter --
# presumably because the stopword list stores the apostrophised forms.
# Confirm whether that is intended.
stopword_lookup = set(eg_stopwords)

words = [
    [token for token in sentence_tokens if token not in stopword_lookup]
    for sentence_tokens in tokenization
]

In [40]:
# Print the token lists that remain after stopword removal.
print(words)

[['like', 'play', 'cricket', 'yesterday', 'playing', 'cricket', 'friends'], ['friends', 'like', 'play', 'football', 'didnt', 'enjoyed', 'much'], ['upcoming', 'match', 'india', 'vs', 'australia', 'india', 'challenged', 'australia']]


In [41]:
# Porter stemmer instance used by the stemming examples that follow.
ps = PorterStemmer()

In [42]:
# Stemming example: 'upcoming' -> 'upcom' (a stem need not be a real word).
ps.stem('upcoming')

'upcom'

In [43]:
# Stemming example: 'playing' -> 'play'.
ps.stem('playing')

'play'

In [44]:
# Stemming example: 'challenged' -> 'challeng' (again not a dictionary word).
ps.stem('challenged')

'challeng'

In [45]:
# Stemming example: the irregular form 'bought' is returned unchanged.
ps.stem('bought')

'bought'

In [46]:
# WordNet lemmatizer instance used by the lemmatization examples and by the
# corpus-wide lemmatization loop.
wnet = WordNetLemmatizer()

In [47]:
# pos='n' asks the lemmatizer to treat the word as a noun;
# 'upcoming' comes back unchanged.
wnet.lemmatize('upcoming', pos='n')

'upcoming'

In [48]:
# pos='v' asks the lemmatizer to treat the word as a verb;
# 'upcoming' is still returned unchanged.
wnet.lemmatize('upcoming', pos='v')

'upcoming'

In [49]:
# As a verb, 'challenged' is reduced to the real word 'challenge'.
wnet.lemmatize('challenged', pos='v')

'challenge'

In [50]:
# As a verb, the irregular form 'bought' is mapped to its lemma 'buy'.
wnet.lemmatize('bought', pos='v')

'buy'

In [51]:
# Lemmatize every remaining token in place, treating each one as a verb.
# NOTE(review): pos='v' is applied to all tokens regardless of their actual
# part of speech, so plural nouns such as 'friends' come through unchanged
# (see the printed result) -- confirm this is intended.
for sentence_tokens in words:
    for idx, token in enumerate(sentence_tokens):
        sentence_tokens[idx] = wnet.lemmatize(token, pos='v')

In [53]:
# Print the token lists after lemmatization.
print(words)

[['like', 'play', 'cricket', 'yesterday', 'play', 'cricket', 'friends'], ['friends', 'like', 'play', 'football', 'didnt', 'enjoy', 'much'], ['upcoming', 'match', 'india', 'vs', 'australia', 'india', 'challenge', 'australia']]


In [56]:
# Collapse each sentence's token list into a single space-separated string,
# which is the form passed to the vectorizers' fit/transform calls.
#
# The isinstance guard keeps the cell safe to re-run: once an entry is already
# a string, calling ' '.join on it again would insert a space between every
# character ("like play" -> "l i k e   p l a y"), so such entries are left
# as they are.
words = [
    entry if isinstance(entry, str) else ' '.join(entry)
    for entry in words
]

In [57]:
# Display the cleaned sentences (one string per original sentence).
words

['like play cricket yesterday play cricket friends',
 'friends like play football didnt enjoy much',
 'upcoming match india vs australia india challenge australia']

In [54]:
# Bag-of-words vectorizer with all-default settings.
cv = CountVectorizer()

In [58]:
# Learn the vocabulary from the cleaned sentences. With the default
# token_pattern shown in the repr, only tokens of two or more word
# characters are kept.
cv.fit(words)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [60]:
# Print the learned mapping from term to column index.
print(cv.vocabulary_)

{'like': 8, 'play': 11, 'cricket': 2, 'yesterday': 14, 'friends': 6, 'football': 5, 'didnt': 3, 'enjoy': 4, 'much': 10, 'upcoming': 12, 'match': 9, 'india': 7, 'vs': 13, 'australia': 0, 'challenge': 1}


In [61]:
# Encode the sentences as a document-term count matrix; the result is a
# SciPy sparse matrix, so only its summary is displayed.
cv.transform(words)

<3x15 sparse matrix of type '<class 'numpy.int64'>'
	with 18 stored elements in Compressed Sparse Row format>

In [62]:
# Same count matrix converted to a dense array so the individual counts are
# visible (rows = sentences, columns = vocabulary indices).
cv.transform(words).toarray()

array([[0, 0, 2, 0, 0, 0, 1, 0, 1, 0, 0, 2, 0, 0, 1],
       [0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0],
       [2, 1, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 1, 1, 0]], dtype=int64)

In [63]:
# Re-display the cleaned sentences that are fed to the TF-IDF vectorizer.
words

['like play cricket yesterday play cricket friends',
 'friends like play football didnt enjoy much',
 'upcoming match india vs australia india challenge australia']

In [64]:
# TF-IDF vectorizer with all-default settings.
tfidf = TfidfVectorizer()

In [65]:
# Fit the TF-IDF vectorizer on the cleaned sentences. The repr shows the
# defaults in effect: L2 row normalisation (norm='l2') and smoothed IDF
# (smooth_idf=True).
tfidf.fit(words)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [67]:
# Print the learned term -> column-index mapping; it matches the one
# printed for the CountVectorizer.
print(tfidf.vocabulary_)

{'like': 8, 'play': 11, 'cricket': 2, 'yesterday': 14, 'friends': 6, 'football': 5, 'didnt': 3, 'enjoy': 4, 'much': 10, 'upcoming': 12, 'match': 9, 'india': 7, 'vs': 13, 'australia': 0, 'challenge': 1}


In [68]:
# TF-IDF weights as a dense array (rows = sentences, columns = vocabulary
# indices).
tfidf.transform(words).toarray()

array([[0.        , 0.        , 0.68719204, 0.        , 0.        ,
        0.        , 0.26131363, 0.        , 0.26131363, 0.        ,
        0.        , 0.52262726, 0.        , 0.        , 0.34359602],
       [0.        , 0.        , 0.        , 0.41756662, 0.41756662,
        0.41756662, 0.31757018, 0.        , 0.31757018, 0.        ,
        0.41756662, 0.31757018, 0.        , 0.        , 0.        ],
       [0.57735027, 0.28867513, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.57735027, 0.        , 0.28867513,
        0.        , 0.        , 0.28867513, 0.28867513, 0.        ]])