### 1. Data collection/ Corpus
Here, we will be using our own data.

In [1]:
data = ''' Ram loves to play cricket. Sita is the wife of Ram. Ram and Laxman are brothers. Bharat likes archery.'''
sentence = " Good Evening, Saurabh how are you doing today? "

### 2. Tokenization and Stop words removal
#### (a) Tokenization

In [13]:
import nltk
import warnings
# Silence every Python warning for the rest of the session so library notices do not clutter cell output.
warnings.filterwarnings('ignore')
from nltk.tokenize import sent_tokenize,word_tokenize
# NOTE(review): newer NLTK releases appear to load this model from the "punkt_tab" resource instead — confirm against the installed version.
nltk.download("punkt") # punkt is NLTK's pre-trained sentence-boundary (Punkt) tokenizer model needed by the tokenizers imported above; it is not a list of English words.

[nltk_data] Downloading package punkt to C:\Users\SRISHTI
[nltk_data]     GUPTA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
# sent_tokenize() function tokenizes the document/data/corpus into list of sentences.
sents = sent_tokenize(data)
print(len(sents))
print(sents)

4
[' Ram loves to play cricket.', 'Sita is the wife of Ram.', 'Ram and Laxman are brothers.', 'Bharat likes archery.']


In [10]:
# word_tokenize() function tokenizes sentences into list of words.
words = word_tokenize(sentence)
print(len(words))
print(words)

10
['Good', 'Evening', ',', 'Saurabh', 'how', 'are', 'you', 'doing', 'today', '?']


#### (b) Stop words removal

In [15]:
nltk.download("stopwords")
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to C:\Users\SRISHTI
[nltk_data]     GUPTA\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [17]:
sw = set(stopwords.words('english'))
# sw is a set of stopwords generally used in english language and we can ignore them while making vocab.
print(sw)

{'these', 'a', 'me', 'themselves', 'be', 'shouldn', 'wouldn', 'through', 'both', 'few', 'herself', 'other', "needn't", 'doesn', 'yourself', 'very', 'some', 'itself', "you've", 'but', 'himself', 'ours', 'mightn', "mightn't", 'haven', 'ourselves', 'by', "won't", 'same', 'won', 'with', 'hasn', 'all', 'no', 'ma', 'your', 'further', 'the', 'then', 'm', 'isn', 'before', "wouldn't", "you're", 'his', 'which', 'until', 'is', 'off', 'more', "should've", 'once', 'into', 'out', 'their', 'to', 'had', 'any', 'if', 'so', 't', 'yours', 'its', 'who', 'doing', 'at', 'ain', 'shan', 'hadn', 'he', 'below', 'can', 'above', 'too', 'over', 'needn', "hadn't", 'my', 'only', 'now', "didn't", 'than', 'how', 'on', 'don', "don't", 'y', 'down', 'under', 'i', 'wasn', 'not', 'd', 'up', 'and', 'here', "she's", 'did', 'when', 'she', 'whom', 'myself', 'such', 's', 'was', 'am', 'while', 'after', 'they', 'own', 'couldn', 'between', 'what', "shan't", 'against', 'mustn', "aren't", "shouldn't", 'as', 'were', 'should', 'do', '

In [20]:
def remove_stopwords(words,stopwards):
    """Filter stop words out of a token sequence.

    Args:
        words: Iterable of tokens to filter.
        stopwards: Container of tokens to discard; membership is checked
            with ``in``, so the comparison is exact and case-sensitive.

    Returns:
        A new list holding, in their original order, the tokens of
        ``words`` that are not contained in ``stopwards``.
    """
    kept_tokens = []
    for token in words:
        if token in stopwards:
            # Stop word: leave it out of the result.
            continue
        kept_tokens.append(token)
    return kept_tokens

In [21]:
useful_words = remove_stopwords(words,sw) # 'how', 'are', 'you' and 'doing' are removed from the list of words as they are all in sw; capitalised words and punctuation are kept.
print(useful_words)

['Good', 'Evening', ',', 'Saurabh', 'today', '?']


 - One thing to keep in mind is that we do not always need to use these inbuilt stopwords. We can create our own list of stopwords according to our needs and use case. Otherwise, the sentiment of a sentence can sometimes change: for example, the inbuilt set contains 'not', so "not good" would be reduced to just "good".

### Tokenization using Regular Expressions

In [23]:
from nltk.tokenize import RegexpTokenizer
# We can take help from online resource such as regexpal.com to play with regular expressions.

In [32]:
sent_1 = "My email id is abc@xyz.in"
sent_2 = "Ram has 3 cars, 2 tractors and 8 bikes"

In [28]:
words_1 = word_tokenize(sent_1)
print(words_1)

['My', 'email', 'id', 'is', 'abc', '@', 'xyz.in']


 - Notice in the above example that the email id got split into several tokens by default; in certain cases we do not want this to happen. So, we can use a regex to customise our tokenization.

In [39]:
tokenizer_1 = RegexpTokenizer('[a-zA-Z@.]+')
useful_words_1 = tokenizer_1.tokenize(sent_1)
print(useful_words_1)

['My', 'email', 'id', 'is', 'abc@xyz.in']


In [33]:
words_2 = word_tokenize(sent_2)
print(words_2)

['Ram', 'has', '3', 'cars', ',', '2', 'tractors', 'and', '8', 'bikes']


 - Suppose we do not need the numbers in the above example; then we can use a regex to discard numeric values.

In [40]:
tokenizer_2 = RegexpTokenizer('[a-zA-Z]+')
useful_words_2 = tokenizer_2.tokenize(sent_2)
print(useful_words_2)

['Ram', 'has', 'cars', 'tractors', 'and', 'bikes']


### 3. Stemming and Lemmatization

#### (a) Stemming
 - Three of the stemmers that NLTK provides are:
 1. Snowball Stemmer
 2. Porter Stemmer
 3. Lancaster Stemmer

In [41]:
text = ''' Ram is a singer and loves singing. Ram has been practicing singing since he was 6 years.
He sings beutifully and everyone gets mesmerized when he sings.''' 

In [42]:
from nltk.stem import SnowballStemmer, PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

In [43]:
ps = PorterStemmer()

In [44]:
ps.stem("singing")

'sing'

In [46]:
ps.stem("sings")

'sing'

In [48]:
print(ps.stem("jumps"))
print(ps.stem("jumping"))

jump
jump


##### Working with SnowballStemmer

 - SnowballStemmer is a multilingual stemmer: the language is chosen when the stemmer is created (here, `'english'`).

In [49]:
ss = SnowballStemmer('english')

In [50]:
ss.stem('jumps')

'jump'

In [51]:
ss.stem('jumping')

'jump'

In [53]:
ss.stem('lovely')

'love'

In [54]:
ss.stem('loving')

'love'

#### (b) Lemmatization

In [56]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to C:\Users\SRISHTI
[nltk_data]     GUPTA\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


In [61]:
wn = WordNetLemmatizer()
wn.lemmatize('jumps')

'jump'

In [63]:
# No `pos` argument is passed here, and WordNetLemmatizer treats a word as a noun by default,
# so the verb form 'jumping' comes back unchanged.
# NOTE(review): wn.lemmatize('jumping', pos='v') should return 'jump' — confirm.
wn.lemmatize('jumping')

'jumping'

In [64]:
wn.lemmatize('dancing')

'dancing'

In [60]:
wn.lemmatize('dances')

'dance'

### 4. Building a Vocab and Vectorization

In [65]:
# dummy_corpus contains 4 documents, one each from the Sports, Politics, Movie and Technology categories. Each document can have 1 or more sentences.
dummy_corpus = [ 'Virat Kohli is classy cricketer and he dominates in all three formats of the game.',
'Narendra Modi is the prime minister of India and comes from Bhartiya Janta Party.',
'Dangal is a movie based on two sisters Geeta and Babita who are wrestlers.',
'Iphone 12 got launched in India recently by Apple. ']

In [66]:
from sklearn.feature_extraction.text import CountVectorizer

In [67]:
cv = CountVectorizer()

In [70]:
# In fit_transform(), fit learns what the dictionary is from the given document and transform converts the data into vectorized format
vectorized_corpus = cv.fit_transform(dummy_corpus)

In [72]:
vectorized_corpus = vectorized_corpus.toarray()

In [86]:
# Each row has one column per vocabulary entry, so the row length is the vocabulary size.
# Note: the single-character word 'a' occurs in the corpus but is not part of the learned vocabulary.
print(len(vectorized_corpus[0])) # It tells there are 43 unique tokens in the vocabulary learned from the entire corpus.
vectorized_corpus[0]

43


array([0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0],
      dtype=int64)

In [90]:
# To get the idea how mapping is done
print(cv.vocabulary_)
# It is a dictionary (key-value) pair which shows at what location words are present in the vocab

{'virat': 40, 'kohli': 25, 'is': 23, 'classy': 9, 'cricketer': 11, 'and': 2, 'he': 19, 'dominates': 13, 'in': 20, 'all': 1, 'three': 38, 'formats': 14, 'of': 31, 'the': 37, 'game': 16, 'narendra': 30, 'modi': 28, 'prime': 34, 'minister': 27, 'india': 21, 'comes': 10, 'from': 15, 'bhartiya': 7, 'janta': 24, 'party': 33, 'dangal': 12, 'movie': 29, 'based': 6, 'on': 32, 'two': 39, 'sisters': 36, 'geeta': 17, 'babita': 5, 'who': 41, 'are': 4, 'wrestlers': 42, 'iphone': 22, '12': 0, 'got': 18, 'launched': 26, 'recently': 35, 'by': 8, 'apple': 3}


In [88]:
print(len(cv.vocabulary_.keys())) # This shows there are 43 unique tokens in the learned vocabulary.

43


In [91]:
# Reverse Mapping

In [92]:
numbers = vectorized_corpus[0]
numbers

array([0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0],
      dtype=int64)

In [94]:
# A bag of words discards word order, so the original sentence cannot be rebuilt;
# inverse_transform() only lists the unique vocabulary terms present in the document.
# `numbers` is a single 1-D row, while inverse_transform() expects a 2-D
# (n_samples, n_features) matrix, so it is reshaped into a one-row matrix first.
cv.inverse_transform(numbers.reshape(1, -1))

[array(['all', 'and', 'classy', 'cricketer', 'dominates', 'formats',
        'game', 'he', 'in', 'is', 'kohli', 'of', 'the', 'three', 'virat'],
       dtype='<U9')]

#### Vectorization with stop words removal

 - When we create a `CountVectorizer` object, we can pass it our own custom tokenizer, which can also be used to remove stop words.

In [98]:
def myTokenizer(sent):
    """Lower-case a sentence, split it into regex tokens and drop stop words.

    Intended to be passed as the ``tokenizer`` argument of ``CountVectorizer``.

    Args:
        sent: The raw document/sentence string.

    Returns:
        List of lower-cased tokens that are not in the stop-word set ``sw``.
    """
    # Use `tokenizer_1` (pattern '[a-zA-Z@.]+'), which is defined earlier in the
    # notebook; no global named `tokenizer` exists on a fresh kernel, so the
    # previous reference raised NameError under Restart & Run All.
    # This pattern drops digits but keeps '.', so a sentence-final word retains
    # its trailing period (e.g. 'testing.').
    words = tokenizer_1.tokenize(sent.lower())
    # Lower-casing above is what lets the lower-case stop-word set match.
    words = remove_stopwords(words,sw)
    return words

In [99]:
myTokenizer("This is a sentance written for testing.")

['sentance', 'written', 'testing.']

In [100]:
cv = CountVectorizer(tokenizer= myTokenizer)

In [101]:
vectorized_corpus = cv.fit_transform(dummy_corpus).toarray()

In [103]:
print(len(vectorized_corpus[0]))
print(vectorized_corpus)

30
[[0 0 0 0 1 0 1 0 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0]
 [0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 1 0 0 1 1 0 1 1 1 0 0 0 0 0 0]
 [0 1 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 1]
 [1 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0]]


In [104]:
cv.inverse_transform(vectorized_corpus)

[array(['classy', 'cricketer', 'dominates', 'formats', 'game.', 'kohli',
        'three', 'virat'], dtype='<U10'),
 array(['bhartiya', 'comes', 'india', 'janta', 'minister', 'modi',
        'narendra', 'party.', 'prime'], dtype='<U10'),
 array(['babita', 'based', 'dangal', 'geeta', 'movie', 'sisters', 'two',
        'wrestlers.'], dtype='<U10'),
 array(['apple.', 'got', 'india', 'iphone', 'launched', 'recently'],
       dtype='<U10')]

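  - We don't need to learn a new vocabulary for the test data; we reuse the vocabulary learned from the training corpus. Words that never appeared in the training corpus are ignored, which is why the vector below is all zeros.
  - For test data: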

In [107]:
test_corpus = [' This is a test corpus.']

In [108]:
cv.transform(test_corpus).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

 - The fit_transform() method is used for training data and the transform() method is used for test data.