In [1]:
!pip install nltk



# Introduction to Natural Language Processing

In [2]:
import nltk

In [63]:
#nltk.download()
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [35]:
# Fetch the NLTK resources used further down so the notebook also runs on a
# machine where they have not been downloaded before.
nltk.download('punkt')      # models behind sent_tokenize / word_tokenize
nltk.download('stopwords')  # stopword lists read via nltk.corpus.stopwords
nltk.download('brown')      # Brown corpus read via nltk.corpus.brown

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [4]:
# Corpus - A large collection of text
from nltk.corpus import brown

In [7]:
print(brown.categories())
print(len(brown.categories()))

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
15


In [12]:
data = brown.sents(categories='adventure')
data

[['Dan', 'Morgan', 'told', 'himself', 'he', 'would', 'forget', 'Ann', 'Turner', '.'], ['He', 'was', 'well', 'rid', 'of', 'her', '.'], ...]

In [14]:
# second sentence (index 1) in the adventure category, joined back into one string
' '.join(data[1])

'He was well rid of her .'

# Bag of Words Pipeline
  -> Get the Data/Corpus
  -> Tokenisation, Stopword Removal
  -> Stemming
  -> Building a Vocab
  -> Vectorization 
  -> Classification

-> Tokenisation and Stopword Removal

In [25]:
document = """ It was a very pleasant day. The weather was cool and there
 were light showers. I went to the market to buy some fruits. """

sentence = "Send all the 50 documents related to chapters 1,2,3 at prateek@cb.com"

In [16]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [28]:
sents = sent_tokenize(document)
sents

[' It was a very pleasant day.',
 'The weather was cool and there\n were light showers.',
 'I went to the market to buy some fruits.']

In [27]:
print(len(sents))
sents[0]

3


' It was a very pleasant day.'

In [30]:
sentence.split()

['Send',
 'all',
 'the',
 '50',
 'documents',
 'related',
 'to',
 'chapters',
 '1,2,3',
 'at',
 'prateek@cb.com']

In [31]:
words = word_tokenize(sentence)
words

['Send',
 'all',
 'the',
 '50',
 'documents',
 'related',
 'to',
 'chapters',
 '1,2,3',
 'at',
 'prateek',
 '@',
 'cb.com']

In [None]:
## Stopwords removal

In [32]:
from nltk.corpus import stopwords

In [37]:
sw = set(stopwords.words('english'))

In [38]:
print(sw)

{'haven', "couldn't", "you'll", 'because', 'didn', 'both', "mightn't", 'some', 'all', 'those', 'in', 've', 'as', 'on', 't', 'but', 'while', 'shouldn', "weren't", 'of', 'did', "won't", 'the', "hadn't", 'most', "aren't", 'will', 'with', "doesn't", 'mustn', 'who', 'should', 'to', 'being', "that'll", 'no', 'themselves', 'our', "it's", 'wouldn', 'into', 'further', "should've", 'o', 're', 'out', "haven't", "you'd", 'below', "you've", 'above', 'her', 'theirs', 'an', 'from', 'are', 'yourself', 'don', 'before', 'when', 'has', 's', 'hadn', "isn't", 'again', 'after', 'aren', 'if', 'll', 'so', 'than', 'nor', 'ma', 'other', 'this', 'isn', 'too', 'them', 'its', 'same', "hasn't", 'weren', 'my', 'him', 'these', 'shan', "you're", 'am', 'then', 'been', 'between', 'his', 'won', 'you', 'be', 'during', 'not', 'how', 'about', 'now', 'himself', 'was', 'do', 'off', 'herself', 'is', 'yours', 'for', 'a', 'what', 'there', "she's", 'me', 'had', 'only', 'under', 'your', "wasn't", 'through', "wouldn't", 'few', 'he'

In [39]:
def remove_stopwords(text,stopwords):
    """Return the tokens of ``text`` that are not stopwords.

    Args:
        text: iterable of tokens (e.g. a list of words).
        stopwords: container of tokens to discard; membership is tested
            with ``in``.

    Returns:
        A new list with the surviving tokens in their original order.
    """
    kept = []
    for token in text:
        if token in stopwords:
            continue
        kept.append(token)
    return kept

In [41]:
# example of remove_stopwords functionality:
# Note: 'not' is itself in the stopword set, so the negation is dropped as well.
text = "i am not bothered about her very much".split()
useful_text = remove_stopwords(text,sw)
print(useful_text)

['bothered', 'much']


-> Tokenisation using Regular Expression

In [None]:
# learn about regular expressions on regexpal.com
# refer to the cheat sheet on regexpal.com too when creating your own regex

In [42]:
sentence.split()

['Send',
 'all',
 'the',
 '50',
 'documents',
 'related',
 'to',
 'chapters',
 '1,2,3',
 'at',
 'prateek@cb.com']

In [44]:
from nltk.tokenize import RegexpTokenizer

In [48]:
# A token is any run of ASCII letters, '@' and '.'. Digits and commas are not
# part of the pattern, so "50" and "1,2,3" disappear, while an e-mail address
# stays in one piece. A '.' directly after a word stays attached to that word.
tokenizer = RegexpTokenizer('[a-zA-Z@.]+')
useful_text = tokenizer.tokenize(sentence)

In [49]:
useful_text

['Send',
 'all',
 'the',
 'documents',
 'related',
 'to',
 'chapters',
 'at',
 'prateek@cb.com']

# Stemming

In [None]:
# -> Process that transforms particular words (verbs, plurals) into their radical (root) form
# -> Preserves the semantics of the sentence without increasing the number of unique tokens
# -> Example - jumps, jumping, jumped, jump ==> jump

In [50]:
test = """Foxes love to make jumps. The quick brown fox was seen jumping over the 
          lovely dog from a 6ft feet high wall """

In [51]:
# nltk provides three types of stemmers: Snowball, Porter and Lancaster
from nltk.stem.snowball import SnowballStemmer,PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

In [52]:
# Porter stemmer
ps = PorterStemmer()

In [53]:
ps.stem('jumping')

'jump'

In [54]:
ps.stem('jumps')

'jump'

In [56]:
ps.stem('lovely')

'love'

In [57]:
ps.stem('loving')

'love'

In [58]:
# Snowball Stemmer
ss = SnowballStemmer('english')

In [59]:
ss.stem('lovely')

'love'

In [60]:
ss.stem('jumpingly')

'jump'

In [64]:
## Lemmatization - similar goal to stemming, but maps a word to its dictionary form (lemma)
from nltk.stem import WordNetLemmatizer

wn = WordNetLemmatizer()
# No part of speech is passed here. NLTK documents `pos` as defaulting to
# noun ('n'), which is why 'jumping' comes back unchanged below; pass pos='v'
# to lemmatize it as a verb.
wn.lemmatize('jumping')

'jumping'

# Building a Vocab & Vectorization

In [96]:
# Sample Corpus - Contains 4 Documents , each document can have 1 or more sentences
corpus = [
    'Indian cricket team will wins World Cup, says Capt. Virat Kohli. Word cup will be held at Sri Lanka',
    'We will win next Lok Sabha Elections, says confident Indian PM',
    'The nobel laurate won the hearts of the people.',
    'The movie Raazi is an exciting Indian Spy thriller based upon a real story.'
]

In [97]:
from sklearn.feature_extraction.text import CountVectorizer

In [98]:
cv = CountVectorizer()

In [99]:
vectorized_corpus = cv.fit_transform(corpus)

In [100]:
vectorized_corpus = vectorized_corpus.toarray()

In [106]:
vectorized_corpus[0]
len(vectorized_corpus[0])

43

In [103]:
print(cv.vocabulary_)

{'indian': 12, 'cricket': 6, 'team': 31, 'will': 37, 'wins': 39, 'world': 42, 'cup': 7, 'says': 27, 'capt': 4, 'virat': 35, 'kohli': 14, 'word': 41, 'be': 3, 'held': 11, 'at': 1, 'sri': 29, 'lanka': 15, 'we': 36, 'win': 38, 'next': 19, 'lok': 17, 'sabha': 26, 'elections': 8, 'confident': 5, 'pm': 23, 'the': 32, 'nobel': 20, 'laurate': 16, 'won': 40, 'hearts': 10, 'of': 21, 'people': 22, 'movie': 18, 'raazi': 24, 'is': 13, 'an': 0, 'exciting': 9, 'spy': 28, 'thriller': 33, 'based': 2, 'upon': 34, 'real': 25, 'story': 30}


In [104]:
len(cv.vocabulary_.keys())

43

In [105]:
# Reverse Mapping!
numbers = vectorized_corpus[2] # 2nd index
numbers

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
      dtype=int64)

In [110]:
# inverse_transform is documented for a 2-D (n_samples, n_features) input, so
# present the single document vector as a one-row matrix instead of a 1-D array.
s = cv.inverse_transform(numbers.reshape(1, -1))
s # the words that were present in that document

[array(['hearts', 'laurate', 'nobel', 'of', 'people', 'the', 'won'],
       dtype='<U9')]

# Vectorization with Stopword Removal

In [113]:
def myTokenizer(document):
    """Tokenize ``document`` for the vectorizer.

    The text is lower-cased first (so "Indian" and "indian" become the same
    token), split with the notebook-level ``tokenizer``, and then filtered
    through ``remove_stopwords`` using the notebook-level stopword set ``sw``.

    Returns:
        The list of remaining tokens.
    """
    lowered = document.lower()
    tokens = tokenizer.tokenize(lowered)
    return remove_stopwords(tokens, sw)

In [114]:
myTokenizer(sentence)

['send', 'documents', 'related', 'chapters', 'prateek@cb.com']

In [115]:
cv = CountVectorizer(tokenizer=myTokenizer)

In [116]:
vectorized_corpus = cv.fit_transform(corpus).toarray()

In [117]:
print(vectorized_corpus)

[[0 1 0 1 2 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 1 1 1]
 [0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 1 1 0 0 1 0 1 0 1 1 0 0 0 0 0]]


In [118]:
print(len(vectorized_corpus[0])) # here len is reduced as earlier it was 43

34


In [119]:
cv.inverse_transform(vectorized_corpus)

[array(['capt.', 'cricket', 'cup', 'held', 'indian', 'kohli.', 'lanka',
        'says', 'sri', 'team', 'virat', 'wins', 'word', 'world'],
       dtype='<U9'),
 array(['confident', 'elections', 'indian', 'lok', 'next', 'pm', 'sabha',
        'says', 'win'], dtype='<U9'),
 array(['hearts', 'laurate', 'nobel', 'people.'], dtype='<U9'),
 array(['based', 'exciting', 'indian', 'movie', 'raazi', 'real', 'spy',
        'story.', 'thriller', 'upon'], dtype='<U9')]

In [120]:
# For Test Data - you will work on training result
test_corpus = [
    'Indian cricket team rocks!'
]

In [121]:
# Very important note:
# Use transform(), not fit_transform(), on test data.
# fit_transform() would learn a new vocabulary from the test corpus alone and
# overwrite the one learned from the training corpus, so the test vectors
# would no longer have the same columns as the training vectors.
# transform() reuses the already fitted vocabulary; a word that is not in it
# (here "rocks") simply gets no column.
cv.transform(test_corpus).toarray()

array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

# More Ways to Create Features
    => Unigram - every word as a feature
    => Bigram - pair of word as feature
    => Trigram 
    => n-gram
    => TF-IDF Normalisation

In [122]:
sent_1 = ["this is good movie"]
sent_2 = ["this is not good movie"]

In [124]:
# Unigram feature
cv = CountVectorizer()

In [132]:
docs = [sent_1[0],sent_2[0]]
cv.fit_transform(docs).toarray()

array([[1, 0, 0, 1, 0],
       [0, 1, 1, 0, 1]], dtype=int64)

In [127]:
# Bigram feature: every pair of adjacent words is one feature.
# Fit on `docs` right away; otherwise `cv.vocabulary_` does not exist yet when
# the next cell reads it on a fresh top-to-bottom run.
cv = CountVectorizer(ngram_range=(2,2)).fit(docs)

In [130]:
cv.vocabulary_

{'this is': 4, 'is good': 1, 'good movie': 0, 'is not': 2, 'not good': 3}

In [131]:
# Trigram feature: every run of three adjacent words is one feature.
# Fit on `docs` right away; otherwise `cv.vocabulary_` does not exist yet when
# the next cell reads it on a fresh top-to-bottom run.
cv = CountVectorizer(ngram_range=(3,3)).fit(docs)

In [133]:
cv.vocabulary_

{'this is good': 3,
 'is good movie': 0,
 'this is not': 4,
 'is not good': 1,
 'not good movie': 2}

# Tf-idf Normalisation
    => Avoid features that occur very often, because they contain less information
    => Information decreases as the number of occurrences increases across different types of documents
    => So we define another term, the inverse document frequency (idf), which associates a weight with every term
    

In [135]:
sent_1 = "this is a good movie"
sent_2 = "this was good movie"
sent_3 = "this is not good movie"

corpus = [sent_1,sent_2,sent_3]

In [None]:
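# tf-idf(t,d) = tf(t,d) * idf(t) : term frequency * inverse document frequency
# tf(t,d) = number of times term t occurs in document d
# idf(t) = log{N/(1 + count(t))} ; N = total no. of documents, count(t) = no. of documents containing t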


In [136]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [137]:
tfidf = TfidfVectorizer()

In [140]:
vc = tfidf.fit_transform(corpus).toarray()

In [141]:
print(vc)

[[0.46333427 0.59662724 0.46333427 0.         0.46333427 0.        ]
 [0.41285857 0.         0.41285857 0.         0.41285857 0.69903033]
 [0.3645444  0.46941728 0.3645444  0.61722732 0.3645444  0.        ]]


In [142]:
tfidf.vocabulary_

{'this': 4, 'is': 1, 'good': 0, 'movie': 2, 'was': 5, 'not': 3}