## INTRODUCTION TO NATURAL LANGUAGE PROCESSING
### Natural Language Toolkit
pip install nltk

In [1]:
import nltk

In [2]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

# 1.)GET THE DATA
- Get the data from NLTK corpus
- OR scrape data /use API

##### CORPUS - A large collection of text


In [3]:
#brown is a corpus
from nltk.corpus import brown

In [4]:
# brown corpus contain following categories
print(len(brown.categories()))
print(brown.categories())

15
['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [5]:
#from brown corpus lets choose sentences from fiction categories
data = brown.sents(categories='fiction')
print(len(data))
print(data[110])
#seperate the sentence by a space
' '.join(data[110])

4249
['She', 'did', 'not', 'touch', 'him', '.']


'She did not touch him .'

# BAG OF WORDS
##### Steps
- Get the Data/corpus
- Tokenization,stopword removal
- Stemming
- Building the vocabulary
- Vectorization
- Classification

# 2.) TOKENIZATION & STOPWORD REMOVAL

- TOKENIZE

In [6]:
document = """It was a very pleasant day. The weather was cool and there were light showers. 
I went to the market to buy some fruits."""

sentence = "Send all the 50 documents related to chapters 1,2,3 at prateek@cb.com"

In [7]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [8]:
#lets break the document into sentences
sents = sent_tokenize(document)
print(sents)
print(len(sents))

['It was a very pleasant day.', 'The weather was cool and there were light showers.', 'I went to the market to buy some fruits.']
3


In [9]:
sents[1]

'The weather was cool and there were light showers.'

In [10]:
#lets break sentence into words
words = word_tokenize(sentence)
words

['Send',
 'all',
 'the',
 '50',
 'documents',
 'related',
 'to',
 'chapters',
 '1,2,3',
 'at',
 'prateek',
 '@',
 'cb.com']

- STOPWORDS

In [11]:
from nltk.corpus import stopwords

In [12]:
#select stopwords from engloshj language
sw = set(stopwords.words('english'))

In [13]:
print(sw)

{'off', 'against', 'until', 'most', 'couldn', 'wouldn', 'was', 'why', 'ours', 'are', 'into', 'shan', 'once', "needn't", 'than', 'these', 'it', "you've", 'by', 'have', 'such', 'be', 'who', "wouldn't", 's', 'at', 'as', 'him', 'before', 'does', 'hasn', 'themselves', 'am', "aren't", 'shouldn', 'did', 'mustn', 'their', 'her', 'and', 'here', 'no', 'out', "should've", 'do', 'didn', 'an', 'which', 'having', 'aren', 'were', 'wasn', 'further', 'very', 'hadn', 'my', 'ain', 'from', "weren't", 'not', 'the', 'after', "mightn't", "didn't", "hadn't", 'needn', 'y', 'its', 'so', 'few', 'being', "haven't", 'there', 'with', 'down', 'whom', 'for', 'd', 'yourself', 'will', "you'd", 'below', 'll', 'some', 'too', 'doesn', "it's", 'again', 'don', 'herself', 'i', "won't", 'yours', 'had', 'me', "hasn't", 'myself', 'ourselves', 'that', 'isn', "she's", 'your', 'itself', 'been', 'up', 'doing', 'each', 'has', 'can', 'what', "don't", "wasn't", "that'll", 'during', 'our', "you'll", 'other', 'about', 'above', 'himself'

In [14]:
#this function removes stopwords from a piece of text
def remove_stopWords(text,stopwords):
    useful_words = [word for word in text if word not in stopwords]
    return useful_words

In [15]:
# lets use the above function to remove stopswords
# split() is used to seperate each word from string
text = "i am not bothered about her very much".split()
print(remove_stopWords(text,sw))

['bothered', 'much']


- TOKENIZATION USING REGULAR EXPRESSIONS
-https://regexr.com/

In [16]:
sentence = "Send all the 50 documents related to chapters 1,2,3 at prateek@cb.com"

In [17]:
from nltk.tokenize import RegexpTokenizer

In [18]:
#this regex will choose A-Z a-z and @ and rest other will be ignored
# by making a custom regex we can remove stopwords
tokenizer = RegexpTokenizer('[a-zA-Z@]+')
useful_text = tokenizer.tokenize(sentence)

In [19]:
useful_text

['Send',
 'all',
 'the',
 'documents',
 'related',
 'to',
 'chapters',
 'at',
 'prateek@cb',
 'com']

# 3.) STEMMING

- Process that transforms particular words(verbs,plurals)into their radical form 
- Preserve the semantics of the sentence without increasing the number of unique tokens
- Example - jumps, jumping, jumped, jump ==> jump

In [20]:
text= """Foxes love to make jumps.The quick brown fox was seen jumping over the 
        lovely dog from a 6ft feet high wall"""

In [21]:
from nltk.stem.snowball import SnowballStemmer, PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
#Snowball Stemmer, Porter, Lancaster Stemmer are different stemming tools

In [22]:
ps = PorterStemmer()

In [23]:
ps.stem('jumping')

'jump'

In [24]:
ps.stem('jumped')

'jump'

- Stemming can also be done using Lemmatization

In [25]:
from nltk.stem import WordNetLemmatizer
wn = WordNetLemmatizer()
print(wn.lemmatize('jumping'))
print(wn.lemmatize('jump'))

jumping
jump


# 4.) BUILDING VOCABULARY AND VECTORIZATION

In [26]:
# Sample Corpus - Contains 4 Documents, each document can have 1 or more sentences
corpus = [
        'Indian cricket team will wins World Cup, says Capt. Virat Kohli. World cup will be held at Sri Lanka.',
        'We will win next Lok Sabha Elections, says confident Indian PM',
        'The nobel laurate won the hearts of the people.',
        'The movie Raazi is an exciting Indian Spy thriller based upon a real story.'
]

In [27]:
from sklearn.feature_extraction.text import CountVectorizer

In [28]:
#make a object of CountVectorizer class 
cv = CountVectorizer()

In [29]:
# fit_transform will break the doc into tokens and will make a vocabulary and then acc. to 
# vocab it will fill the vector by filling in the word frequency
vectorized_corpus = cv.fit_transform(corpus)

In [30]:
vectorized_corpus = vectorized_corpus.toarray()

In [31]:
#document's vectorization
print(vectorized_corpus)

[[0 1 0 1 1 0 1 2 0 0 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1
  0 2 0 1 0 2]
 [0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 1 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0
  1 1 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 3 0 0 0
  0 0 0 0 1 0]
 [1 0 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 1 0 0 0 0 0 1 1 0 0 1 0 1 0 1 1 1 0
  0 0 0 0 0 0]]


In [32]:
# we can also see the vocabulary of document
# it is basically a dictionary with index as value() and word as key()
print(cv.vocabulary_)

{'indian': 12, 'cricket': 6, 'team': 31, 'will': 37, 'wins': 39, 'world': 41, 'cup': 7, 'says': 27, 'capt': 4, 'virat': 35, 'kohli': 14, 'be': 3, 'held': 11, 'at': 1, 'sri': 29, 'lanka': 15, 'we': 36, 'win': 38, 'next': 19, 'lok': 17, 'sabha': 26, 'elections': 8, 'confident': 5, 'pm': 23, 'the': 32, 'nobel': 20, 'laurate': 16, 'won': 40, 'hearts': 10, 'of': 21, 'people': 22, 'movie': 18, 'raazi': 24, 'is': 13, 'an': 0, 'exciting': 9, 'spy': 28, 'thriller': 33, 'based': 2, 'upon': 34, 'real': 25, 'story': 30}


In [33]:
#jo ek single vector ki len hai it should be equal to vocabulary ki length
print(len(vectorized_corpus[0]))
print(len(cv.vocabulary_))

42
42


- REVERSE MAPPING


In [34]:
#here we will give the vocalbulary and try to find corpus 
# this will be jumbled in order

In [35]:
numbers = vectorized_corpus[2]

In [36]:
s = cv.inverse_transform?

In [None]:
s = cv.inverse_transform

In [37]:
s = cv.inverse_transform(numbers)
print(s)
# real sentence - 
#The nobel laurate won the hearts of the people 


[array(['hearts', 'laurate', 'nobel', 'of', 'people', 'the', 'won'],
      dtype='<U9')]


# VECTORIZATION WITH STOP WORD REMOVAL

In [47]:
def myTokenizer(document):
    word = tokenizer.tokenize(document.lower())
    # Remove Stopwords
    word = remove_stopWords(word,sw)
    return word

In [51]:
#we can also make our own made tokenizer and pass as custom parameter
cv = CountVectorizer(tokenizer=myTokenizer)
vectorized_corpus = cv.fit_transform(corpus).toarray()

In [52]:
print(vectorized_corpus)

[[0 1 0 1 2 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 1 2]
 [0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 1 1 0 0 1 0 1 0 1 1 0 0 0 0]]


In [53]:
cv.inverse_transform(vectorized_corpus)

[array(['capt', 'cricket', 'cup', 'held', 'indian', 'kohli', 'lanka',
        'says', 'sri', 'team', 'virat', 'wins', 'world'], dtype='<U9'),
 array(['confident', 'elections', 'indian', 'lok', 'next', 'pm', 'sabha',
        'says', 'win'], dtype='<U9'),
 array(['hearts', 'laurate', 'nobel', 'people'], dtype='<U9'),
 array(['based', 'exciting', 'indian', 'movie', 'raazi', 'real', 'spy',
        'story', 'thriller', 'upon'], dtype='<U9')]

# MORE WAYS TO CREATE FEATURES
- Unigram - every single word is a feature
- Bigram
- Trigram
- n-gram
- TF-IDF Normalization

In [54]:
sent_1  = ["this is good movie"]
sent_2 = ["this is good movie but actor is not present"]
sent_3 = ["this is not good movie"]

In [56]:
cv = CountVectorizer(ngram_range=(2,2))

In [59]:
docs = [sent_1[0],sent_2[0]]
cv.fit_transform(docs).toarray()

array([[0, 0, 1, 1, 0, 0, 0, 1],
       [1, 1, 1, 1, 1, 1, 1, 1]], dtype=int64)

In [61]:
#2 2 words are considered and then made a feature
cv.vocabulary_

{'this is': 7,
 'is good': 3,
 'good movie': 2,
 'movie but': 5,
 'but actor': 1,
 'actor is': 0,
 'is not': 4,
 'not present': 6}