In [1]:
import nltk
# 1. Tokenization
from nltk.tokenize import word_tokenize, sent_tokenize
# 2. Removing Stopwords
from nltk.corpus import stopwords
# 3. Stemming/Lemmatization
from nltk.stem import PorterStemmer, WordNetLemmatizer
# 4. Vectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
# Toy corpus: three short English sentences on a cricket theme, used as the
# input documents for the preprocessing steps below.
data = [
    "Hello Ram, How are you ? I was planning to watch a cricket match...",
    "Virat kohli is captain of indian cricket team. Indian cricket has improved a lot",
    "Sachin and Dhoni Played very well in last two matches, greatest innings ever..."
]

In [5]:
# Lower-case each sentence and split it into word/punctuation tokens with NLTK.
# `data` keeps its name because the following cells read the token lists from it.
# The isinstance guard makes the cell safe to re-run: entries that are already
# token lists are kept as-is instead of raising AttributeError on `.lower()`.
data = [
    word_tokenize(sentence.lower()) if isinstance(sentence, str) else sentence
    for sentence in data
]

In [8]:
# Show the tokens of the first sentence: lower-cased, with punctuation kept
# as separate tokens.
print(data[0])

['hello', 'ram', ',', 'how', 'are', 'you', '?', 'i', 'was', 'planning', 'to', 'watch', 'a', 'cricket', 'match', '...']


In [30]:
# Tokens to discard: NLTK's English stop words plus the punctuation tokens
# that appear in the tokenized corpus.
myStopwords = [',','...','?','.']
s = stopwords.words('english')
s.extend(myStopwords)

# A list membership test scans every entry for each token; a set gives the
# same answers with a constant-time lookup.
stopword_set = set(s)

# One filtered token list per input document, in the original order.
words = [
    [word for word in itemList if word not in stopword_set]
    for itemList in data
]
        

In [31]:
# First document after stop-word and punctuation removal.
words[0]

['hello', 'ram', 'planning', 'watch', 'cricket', 'match']

In [32]:
# Sample inflected words used to compare stemming with lemmatization.
demo = [
    'played', 'greatest', 'watched', 'hated',
    'loved', 'loving', 'went', 'bought',
]

# Print the Porter stem of every sample word, one per line.
ps = PorterStemmer()
for stemmed in map(ps.stem, demo):
    print(stemmed)

play
greatest
watch
hate
love
love
went
bought


In [33]:
# Print each sample word lemmatized as a noun and as a verb, side by side,
# to show how the part-of-speech argument changes the result.
wnet = WordNetLemmatizer()
for sample in demo:
    as_noun = wnet.lemmatize(sample, pos='n')
    as_verb = wnet.lemmatize(sample, pos='v')
    print(as_noun, "====>", as_verb)

played ====> play
greatest ====> greatest
watched ====> watch
hated ====> hat
loved ====> love
loving ====> love
went ====> go
bought ====> buy


In [34]:
# Replace every token with its WordNet lemma, treating each token as a verb
# (pos='v'). Each document's token list is updated in place.
for doc_tokens in words:
    doc_tokens[:] = [wnet.lemmatize(token, pos='v') for token in doc_tokens]

In [35]:
# First document after verb lemmatization.
words[0]

['hello', 'ram', 'plan', 'watch', 'cricket', 'match']

In [37]:
# Second document after verb lemmatization.
print(words[1])

['virat', 'kohli', 'captain', 'indian', 'cricket', 'team', 'indian', 'cricket', 'improve', 'lot']


In [38]:
# Third document after verb lemmatization.
print(words[2])

['sachin', 'dhoni', 'play', 'well', 'last', 'two', 'match', 'greatest', 'innings', 'ever']


In [39]:
# Bag-of-words vectorizer with scikit-learn's default settings.
cv = CountVectorizer()

In [40]:
# Re-assemble each token list into one space-separated string, the form in
# which the documents are passed to the vectorizers in the cells below.
# Entries that are already strings are kept as-is, so re-running this cell
# does not break a document up into space-separated characters.
words = [
    doc if isinstance(doc, str) else ' '.join(doc)
    for doc in words
]

In [41]:
# The corpus as space-joined strings, one per document.
words

['hello ram plan watch cricket match',
 'virat kohli captain indian cricket team indian cricket improve lot',
 'sachin dhoni play well last two match greatest innings ever']

In [42]:
# Learn the vocabulary (token -> column index) from the joined documents.
vect = cv.fit(words)

In [45]:
# Show the learned token -> column-index mapping.
print(vect.vocabulary_)

{'hello': 5, 'ram': 15, 'plan': 13, 'watch': 20, 'cricket': 1, 'match': 12, 'virat': 19, 'kohli': 9, 'captain': 0, 'indian': 7, 'team': 17, 'improve': 6, 'lot': 11, 'sachin': 16, 'dhoni': 2, 'play': 14, 'well': 21, 'last': 10, 'two': 18, 'greatest': 4, 'innings': 8, 'ever': 3}


In [46]:
# Encode each document as a row of token counts over the fitted vocabulary.
vect_transform = cv.transform(words)

In [47]:
# Display the sparse matrix summary (shape, dtype, stored elements).
vect_transform

<3x22 sparse matrix of type '<class 'numpy.int64'>'
	with 24 stored elements in Compressed Sparse Row format>

In [48]:
# Densify the count matrix to inspect it: one row per document, one column
# per vocabulary entry.
vect_transform.toarray()

array([[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0],
       [1, 2, 0, 0, 0, 0, 1, 2, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0],
       [0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1]],
      dtype=int64)

In [50]:
# Re-print the joined documents that are fed to the TF-IDF vectorizer next.
print(words)

['hello ram plan watch cricket match', 'virat kohli captain indian cricket team indian cricket improve lot', 'sachin dhoni play well last two match greatest innings ever']


In [51]:
# TF-IDF vectorizer with scikit-learn's default settings.
tfidf = TfidfVectorizer()

In [52]:
# Fit the TF-IDF vectorizer on the joined documents; as the last expression
# of the cell, the fitted estimator's repr is displayed.
tfidf.fit(words)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [53]:
# Encode each document as a row of TF-IDF weights over the fitted vocabulary.
tf = tfidf.transform(words)

In [55]:
# Densify the TF-IDF matrix to inspect it: one row per document, one column
# per vocabulary entry.
tf.toarray()

array([[0.        , 0.3349067 , 0.        , 0.        , 0.        ,
        0.44036207, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.3349067 , 0.44036207, 0.        ,
        0.44036207, 0.        , 0.        , 0.        , 0.        ,
        0.44036207, 0.        ],
       [0.2849755 , 0.43346242, 0.        , 0.        , 0.        ,
        0.        , 0.2849755 , 0.56995099, 0.        , 0.2849755 ,
        0.        , 0.2849755 , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.2849755 , 0.        , 0.2849755 ,
        0.        , 0.        ],
       [0.        , 0.        , 0.32311233, 0.32311233, 0.32311233,
        0.        , 0.        , 0.        , 0.32311233, 0.        ,
        0.32311233, 0.        , 0.24573525, 0.        , 0.32311233,
        0.        , 0.32311233, 0.        , 0.32311233, 0.        ,
        0.        , 0.32311233]])