In [1]:
import nltk

In [2]:
# One-time setup: uncomment to download the NLTK data this notebook relies on
# (e.g. the Brown corpus, the stopword lists and WordNet)
#nltk.download()

In [3]:
from nltk.corpus import brown


In [4]:
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [5]:
data=brown.sents(categories='adventure')
print(len(data))

4637


In [6]:
print(data[90])

['He', "wasn't", 'so', 'sure', 'about', 'the', 'boy', '.']


# Bag of words pipeline
- Get the data
- Tokenisation, stopword removal
- Stemming/lemmatisation
- Building a vocab
- vectorization
- classification

## TOKENISATION

In [7]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [8]:
document="""It was a very pleasant day. The weather was cool and there were light showers. I went to the market to buy some fruits. """

sentence="send all the 50 documents related to chapter 1,2,3 at prateek@cb."

In [9]:
sents=sent_tokenize(document)

In [10]:
print(sents)
print(len(sents))

['It was a very pleasant day.', 'The weather was cool and there were light showers.', 'I went to the market to buy some fruits.']
3


In [11]:
print(sents[0])

It was a very pleasant day.


In [12]:
sentence.split()

['send',
 'all',
 'the',
 '50',
 'documents',
 'related',
 'to',
 'chapter',
 '1,2,3',
 'at',
 'prateek@cb.']

In [13]:
words=word_tokenize(sentence)

In [14]:
print(words)

['send', 'all', 'the', '50', 'documents', 'related', 'to', 'chapter', '1,2,3', 'at', 'prateek', '@', 'cb', '.']


## STOPWORDS

In [15]:
from nltk.corpus import stopwords

In [16]:
sw=set(stopwords.words('english'))

In [17]:
# Common words occurring in the English language (e.g. 'the', 'is', 'at');
# print the whole set followed by its size
print(sw)
print(len(sw))

{'each', "couldn't", 'while', 'should', 'yourself', 'again', 'most', 'itself', 'if', 'how', 'are', 'through', 'herself', 'her', 'be', "isn't", "weren't", 'him', 'm', 'as', 'only', 'aren', 'you', 'being', "mightn't", 'hers', 'more', 'doing', 'hadn', 'our', 'didn', 'shan', 'me', 'they', 'what', 'wouldn', "doesn't", 'doesn', 'she', 'nor', 'mustn', 'these', 'between', 'too', 'y', "hasn't", 'by', 'am', 'whom', 'in', 'at', 'theirs', 'same', 'into', 'off', 'no', "aren't", 'did', 'those', 'yours', 'up', 'himself', 'a', 'for', "don't", 'isn', 'can', 'but', 'on', 'few', 'd', 'was', 'won', 'it', 'once', 'until', 'the', 'or', 'of', 'this', "you're", 'will', 'to', "shouldn't", 'not', 'them', 'other', 'who', 'do', 'than', 'their', 'under', 'i', 'why', 'mightn', 'been', 'from', "haven't", "should've", 'an', 'is', "she's", 't', 'my', 'below', "you'll", "shan't", 's', 'couldn', 'so', 'ma', 'myself', "hadn't", "didn't", 'out', 'such', "you've", "wasn't", 'ain', 'don', "wouldn't", 'then', 'because', 'the

In [18]:
def removal_stopwords(text,stopwords):
    """Return the tokens of `text` that are not stopwords, in their original order.

    text      -- iterable of tokens (e.g. a list of words)
    stopwords -- container tested with `in` (a set in this notebook)

    Returns a new list; `text` itself is not modified.
    """
    kept=[]
    for token in text:
        if token in stopwords:
            continue  # drop stopwords, keep everything else
        kept.append(token)
    return kept

In [19]:
text="i am not bothered about her very much".split()
list=removal_stopwords(text,sw)

In [20]:
print(list)

['bothered', 'much']


## Tokenization using regular expressions

In [21]:
sentence="send all the 50 documents related to chapter 1,2,3 at prateek@cb."

In [22]:
from nltk.tokenize import RegexpTokenizer
tokenizer=RegexpTokenizer('[a-zA-Z]+')

In [23]:
useful_words=tokenizer.tokenize(sentence)

In [24]:
print(useful_words)

['send', 'all', 'the', 'documents', 'related', 'to', 'chapter', 'at', 'prateek', 'cb']


## Stemming 
- process that transforms inflected words (verb forms, plurals) into their root form
- preserves the semantics of the sentence without increasing the number of unique tokens
- Example - jumps,jumping ,jump,jumped changes to jump

In [25]:
text="""foxes love to make jumps. The quick brown fox was seen jumping over the lovely dog fom a 6 feet high wall"""

In [26]:
# Snowball stemmer, Porter stemmer, Lancaster stemmer
from nltk.stem.snowball import PorterStemmer,SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

In [27]:
ps=PorterStemmer()

In [28]:
ps.stem("lovely")

'love'

In [29]:
ps.stem("loving")

'love'

In [30]:
# Snowball stemmer: multilingual (also supports e.g. French and German)
ss=SnowballStemmer('english')

In [31]:
ss.stem('jumping')

'jump'

In [32]:
ss.stem('holiness')

'holi'

In [33]:
# Lemmatization

In [34]:
from nltk.stem import WordNetLemmatizer

In [35]:
wn=WordNetLemmatizer()
wn.lemmatize('jumper')

'jumper'

In [36]:
# NOTE(review): 'gone' comes back unchanged below. No part-of-speech tag is passed here,
# and lemmatize() presumably treats the word as a noun by default; passing pos='v'
# should reduce verb forms to their base form -- confirm against the NLTK docs
wn.lemmatize('gone')

'gone'

## BUILDING A VOCAB AND VECTORIZATION

In [37]:

# Sample Corpus - Contains 4 Documents, each document can have 1 or more sentences
corpus = [
        'Indian cricket team will wins World Cup, says Capt. Virat Kohli. World cup will be held at Sri Lanka.',
        'We will win next Lok Sabha Elections, says confident Indian PM',
        'The nobel laurate won the hearts of the people.',
        'The movie Raazi is an exciting Indian Spy thriller based upon a real story.'
]

In [38]:
from sklearn.feature_extraction.text import CountVectorizer

In [39]:
cv=CountVectorizer()

In [40]:
transformed_cv=cv.fit_transform(corpus)

In [41]:
# Convert the sparse count matrix to a dense array for display.
# The guard keeps this cell safe to re-run: after the first run the name is already
# bound to a NumPy array, which has no .toarray() method.
if hasattr(transformed_cv,"toarray"):
    transformed_cv=transformed_cv.toarray()
print(transformed_cv)

[[0 1 0 1 1 0 1 2 0 0 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1
  0 2 0 1 0 2]
 [0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 1 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0
  1 1 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 3 0 0 0
  0 0 0 0 1 0]
 [1 0 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 1 0 0 0 0 0 1 1 0 0 1 0 1 0 1 1 1 0
  0 0 0 0 0 0]]


In [42]:
cv.vocabulary_

{'indian': 12,
 'cricket': 6,
 'team': 31,
 'will': 37,
 'wins': 39,
 'world': 41,
 'cup': 7,
 'says': 27,
 'capt': 4,
 'virat': 35,
 'kohli': 14,
 'be': 3,
 'held': 11,
 'at': 1,
 'sri': 29,
 'lanka': 15,
 'we': 36,
 'win': 38,
 'next': 19,
 'lok': 17,
 'sabha': 26,
 'elections': 8,
 'confident': 5,
 'pm': 23,
 'the': 32,
 'nobel': 20,
 'laurate': 16,
 'won': 40,
 'hearts': 10,
 'of': 21,
 'people': 22,
 'movie': 18,
 'raazi': 24,
 'is': 13,
 'an': 0,
 'exciting': 9,
 'spy': 28,
 'thriller': 33,
 'based': 2,
 'upon': 34,
 'real': 25,
 'story': 30}

In [43]:
cv.vocabulary_.keys()

dict_keys(['indian', 'cricket', 'team', 'will', 'wins', 'world', 'cup', 'says', 'capt', 'virat', 'kohli', 'be', 'held', 'at', 'sri', 'lanka', 'we', 'win', 'next', 'lok', 'sabha', 'elections', 'confident', 'pm', 'the', 'nobel', 'laurate', 'won', 'hearts', 'of', 'people', 'movie', 'raazi', 'is', 'an', 'exciting', 'spy', 'thriller', 'based', 'upon', 'real', 'story'])

In [44]:
# Reverse mapping: recover the vocabulary terms present in the third document
numbers=transformed_cv[2]
print(numbers)
# inverse_transform expects a 2-D (n_samples, n_features) input, so the single
# document is passed as a 1 x n_features matrix rather than a bare 1-D vector
print(cv.inverse_transform(numbers.reshape(1,-1)))

[0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 3 0 0 0 0
 0 0 0 1 0]
[array(['hearts', 'laurate', 'nobel', 'of', 'people', 'the', 'won'],
      dtype='<U9')]


# Vectorization with stopword removal

In [45]:
def myTokenizer(document):
    """Turn a raw document string into a list of tokens with stopwords removed.

    The document is lowercased, split with the notebook-level `tokenizer`, and the
    resulting tokens are filtered through `removal_stopwords` using the stopword set `sw`.
    """
    lowered=document.lower()
    return removal_stopwords(tokenizer.tokenize(lowered),sw)

In [46]:
# Vectorizer that delegates tokenization (and stopword removal) to myTokenizer.
# token_pattern=None: the default regex pattern is unused once a custom tokenizer is
# supplied, and leaving it set makes recent scikit-learn releases emit a UserWarning.
cv=CountVectorizer(tokenizer=myTokenizer,token_pattern=None)

In [47]:
vectorized_corpus=cv.fit_transform(corpus).toarray()

In [48]:
print(vectorized_corpus)

[[0 1 0 1 2 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 1 2]
 [0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 1 1 0 0 1 0 1 0 1 1 0 0 0 0]]


In [49]:
print(len(vectorized_corpus[0]))

33


In [50]:
cv.inverse_transform(vectorized_corpus)

[array(['capt', 'cricket', 'cup', 'held', 'indian', 'kohli', 'lanka',
        'says', 'sri', 'team', 'virat', 'wins', 'world'], dtype='<U9'),
 array(['confident', 'elections', 'indian', 'lok', 'next', 'pm', 'sabha',
        'says', 'win'], dtype='<U9'),
 array(['hearts', 'laurate', 'nobel', 'people'], dtype='<U9'),
 array(['based', 'exciting', 'indian', 'movie', 'raazi', 'real', 'spy',
        'story', 'thriller', 'upon'], dtype='<U9')]

In [51]:

# For Test Data
test_corpus = [
        'Indian cricket rock !',        
]

In [52]:
cv.transform(test_corpus).toarray()

array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [53]:
cv.vocabulary_

{'indian': 9,
 'cricket': 3,
 'team': 26,
 'wins': 31,
 'world': 32,
 'cup': 4,
 'says': 22,
 'capt': 1,
 'virat': 29,
 'kohli': 10,
 'held': 8,
 'sri': 24,
 'lanka': 11,
 'win': 30,
 'next': 15,
 'lok': 13,
 'sabha': 21,
 'elections': 5,
 'confident': 2,
 'pm': 18,
 'nobel': 16,
 'laurate': 12,
 'hearts': 7,
 'people': 17,
 'movie': 14,
 'raazi': 19,
 'exciting': 6,
 'spy': 23,
 'thriller': 27,
 'based': 0,
 'upon': 28,
 'real': 20,
 'story': 25}

# More ways to Create Features
- Unigram - every word as a feature
- Bigrams
- Trigrams
- n-grams
- TF-IDF Normalisation

# More ways to Create Features
- Unigram - every word as a feature
- Bigrams
- Trigrams
- n-grams
- TF-IDF Normalisation

In [54]:
sent_1  = ["this is good movie"]
sent_2 = ["this is good movie but actor is not present"]
sent_3 = ["this is not good movie"]

In [55]:
# Unigrams, bigrams and trigrams together: ngram_range=(1,3) keeps every 1- to 3-word sequence
cv=CountVectorizer(ngram_range=(1,3)) #(1,1) is for unigrams
docs=[sent_1[0],sent_2[0]]
cv.fit_transform(docs).toarray()

array([[0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
        1],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1]], dtype=int64)

In [56]:
cv.vocabulary_

{'this': 20,
 'is': 9,
 'good': 6,
 'movie': 14,
 'this is': 21,
 'is good': 10,
 'good movie': 7,
 'this is good': 22,
 'is good movie': 11,
 'but': 3,
 'actor': 0,
 'not': 17,
 'present': 19,
 'movie but': 15,
 'but actor': 4,
 'actor is': 1,
 'is not': 12,
 'not present': 18,
 'good movie but': 8,
 'movie but actor': 16,
 'but actor is': 5,
 'actor is not': 2,
 'is not present': 13}

## Tf-idf Normalisation
- Avoid features that occur very often, because they contain less information
- Information decreases as the number of occurrences increases across different types of documents
- So we define another term, the inverse document frequency (IDF), which associates a weight with every term: the more documents a term appears in, the lower its weight

In [57]:
sent_1="this is good movie"
sent_2="this was good movie"
sent_3="this is not good movie"

corpus=[sent_1,sent_2,sent_3]

In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [60]:
tfidf=TfidfVectorizer()

In [61]:
vc=tfidf.fit_transform(corpus).toarray()

In [62]:
print(vc)

[[0.46333427 0.59662724 0.46333427 0.         0.46333427 0.        ]
 [0.41285857 0.         0.41285857 0.         0.41285857 0.69903033]
 [0.3645444  0.46941728 0.3645444  0.61722732 0.3645444  0.        ]]


In [63]:
tfidf.vocabulary_

{'this': 4, 'is': 1, 'good': 0, 'movie': 2, 'was': 5, 'not': 3}